#9 dev

Merged
zhengxiawu merged 6 commits from dev into master 2 years ago
  1. +4 -0      .gitignore
  2. +13 -8     README.md
  3. +17 -0     configs/search/RMINAS/darts_cifar10.yaml
  4. +17 -0     configs/search/RMINAS/darts_cifar100.yaml
  5. +17 -0     configs/search/RMINAS/darts_imagenet.yaml
  6. +15 -0     configs/search/RMINAS/nb201_cifar10.yaml
  7. +15 -0     configs/search/RMINAS/nb201_cifar100.yaml
  8. +15 -0     configs/search/RMINAS/nb201_imagenet16.yaml
  9. +73 -0     doc/RMI_NAS.md
  10. BIN       doc/images/nasbench201.png
  11. BIN       doc/images/normal.png
  12. BIN       doc/images/reduce.png
  13. +74 -0    search/RMINAS/README.md
  14. +209 -0   search/RMINAS/RMINAS_darts.py
  15. +154 -0   search/RMINAS/RMINAS_mb_imagenet.py
  16. +220 -0   search/RMINAS/RMINAS_nb201.py
  17. +5 -0     search/RMINAS/download_weight.sh
  18. +18 -0    xnas/core/config.py
  19. +270 -0   xnas/search_algorithm/RMINAS/sampler/RF_sampling.py
  20. +1 -0     xnas/search_algorithm/RMINAS/sampler/available_archs.txt
  21. +100 -0   xnas/search_algorithm/RMINAS/sampler/sampling.py
  22. +46 -0    xnas/search_algorithm/RMINAS/sampler/sampling_darts.py
  23. +253 -0   xnas/search_algorithm/RMINAS/teacher_model/fbresnet_imagenet/fbresnet.py
  24. +133 -0   xnas/search_algorithm/RMINAS/teacher_model/resnet101_cifar100/resnet.py
  25. +173 -0   xnas/search_algorithm/RMINAS/teacher_model/resnet20_cifar10/resnet.py
  26. +182 -0   xnas/search_algorithm/RMINAS/utils/RMI_torch.py
  27. +207 -0   xnas/search_algorithm/RMINAS/utils/get_accuracy.ipynb
  28. +182 -0   xnas/search_algorithm/RMINAS/utils/imagenet16120_loader.py
  29. +91 -0    xnas/search_algorithm/RMINAS/utils/loader.py
  30. +1 -1     xnas/search_space/DARTS/cnn.py
  31. +195 -0   xnas/search_space/RMINAS/DARTS/darts_cnn.py
  32. +222 -0   xnas/search_space/RMINAS/DARTS/darts_img.py
  33. +52 -0    xnas/search_space/RMINAS/DARTS/darts_plot.py
  34. +104 -0   xnas/search_space/RMINAS/DARTS/pcdarts_op.py
  35. +151 -0   xnas/search_space/RMINAS/MBConv/mb_v3_cnn.py
  36. +274 -0   xnas/search_space/RMINAS/NB201/geno.py
  37. +554 -0   xnas/search_space/RMINAS/NB201/ops.py
  38. +203 -0   xnas/search_space/RMINAS/NB201/utils.py

+ 4
- 0
.gitignore View File

@@ -17,6 +17,10 @@ test.py
test.ipynb
nohup*.out

# model weights
*.pth
*.th

# C extensions
*.so



+ 13
- 8
README.md View File

@@ -11,11 +11,11 @@ This project is now supported by PengCheng Lab
### Beta

- DARTS
`python search/DARTS.py --cfg configs/search/DARTS.yaml`
- `python search/DARTS.py --cfg configs/search/DARTS.yaml`
- PCDARTS
`python search/PDARTS.py --cfg configs/search/PDARTS.yaml`
- `python search/PDARTS.py --cfg configs/search/PDARTS.yaml`
- PDARTS
`python search/PCDARTS.py --cfg configs/search/PCDARTS.yaml`
- `python search/PCDARTS.py --cfg configs/search/PCDARTS.yaml`
- SNG
- ASNG
- MDENAS
@@ -23,11 +23,16 @@ This project is now supported by PengCheng Lab
- MIGONAS
- GridSearch
- DrNAS
`python search/DrNAS/nb201space.py --cfg configs/search/DrNAS/nb201_cifar10_Dirichlet.yaml`
`python search/DrNAS/nb201space.py --cfg configs/search/DrNAS/nb201_cifar100_Dirichlet.yaml`
`python search/DrNAS/DARTSspace.py --cfg configs/search/DrNAS/DARTS_cifar10.yaml`
- `python search/DrNAS/nb201space.py --cfg configs/search/DrNAS/nb201_cifar10_Dirichlet.yaml`
- `python search/DrNAS/nb201space.py --cfg configs/search/DrNAS/nb201_cifar100_Dirichlet.yaml`
- `python search/DrNAS/DARTSspace.py --cfg configs/search/DrNAS/DARTS_cifar10.yaml`
- TENAS
`python search/TENAS.py --cfg configs/search/TENAS/nb201_cifar10.yaml`
- `python search/TENAS.py --cfg configs/search/TENAS/nb201_cifar10.yaml`
- RMINAS
- `./search/RMINAS/download_weight.sh # prepare weights of teacher models`
- `python search/RMINAS/RMINAS_nb201.py --cfg configs/search/RMINAS/nb201_cifar10.yaml`
- `python search/RMINAS/RMINAS_darts.py --cfg configs/search/RMINAS/darts_cifar10.yaml`


## Supported Search Spaces

@@ -125,7 +130,7 @@ We reimplement several widely used NAS methods including:
| dynamic_SNG |2 |2.927 |0.0 |24.13 |96.87|473.156 |- |78.07 |cell-based |
| dynamic_SNG |3 |2.724 |0.0 |28.07 |97.45|442.826 |- |77.68 |cell-based |
| dynamic_SNG |4 |3.323 |0.0 |31.85 |96.65|528.784 |- |79.78 |cell-based |
| RMINAS |- |- |1.92 |31.9 |97.36|- |- |- |cell-based |

### TODO



+ 17
- 0
configs/search/RMINAS/darts_cifar10.yaml View File

@@ -0,0 +1,17 @@
RNG_SEED: 2
SEARCH:
DATASET: 'cifar10'
NUM_CLASSES: 10
IM_SIZE: 32
DATA_LOADER:
BACKEND: 'custom'
OUT_DIR: 'experiment/train_test'
OPTIM:
BASE_LR: 0.025
MOMENTUM: 0.9
WEIGHT_DECAY: 0.0003
MAX_EPOCH: 250
TRAIN:
BATCH_SIZE: 128
CHANNELS: 16
LAYERS: 8

+ 17
- 0
configs/search/RMINAS/darts_cifar100.yaml View File

@@ -0,0 +1,17 @@
RNG_SEED: 2
SEARCH:
DATASET: 'cifar100'
NUM_CLASSES: 100
IM_SIZE: 32
DATA_LOADER:
BACKEND: 'custom'
OUT_DIR: 'experiment/train_test'
OPTIM:
BASE_LR: 0.025
MOMENTUM: 0.9
WEIGHT_DECAY: 0.0003
MAX_EPOCH: 250
TRAIN:
BATCH_SIZE: 128
CHANNELS: 16
LAYERS: 8

+ 17
- 0
configs/search/RMINAS/darts_imagenet.yaml View File

@@ -0,0 +1,17 @@
RNG_SEED: 2
SEARCH:
DATASET: 'imagenet'
NUM_CLASSES: 1000
IM_SIZE: 32
DATA_LOADER:
BACKEND: 'custom'
OUT_DIR: 'experiment/train_test'
OPTIM:
BASE_LR: 0.025
MOMENTUM: 0.9
WEIGHT_DECAY: 0.0003
MAX_EPOCH: 250
TRAIN:
BATCH_SIZE: 32
CHANNELS: 16
LAYERS: 8

+ 15
- 0
configs/search/RMINAS/nb201_cifar10.yaml View File

@@ -0,0 +1,15 @@
RNG_SEED: 7
SEARCH:
DATASET: 'cifar10'
NUM_CLASSES: 10
DATA_LOADER:
BACKEND: 'custom'
OUT_DIR: 'experiment/nb201_train'
OPTIM:
BASE_LR: 0.1
MOMENTUM: 0.9
WEIGHT_DECAY: 0.0005
MAX_EPOCH: 150
TRAIN:
BATCH_SIZE: 32
CHECKPOINT_PERIOD: 10

+ 15
- 0
configs/search/RMINAS/nb201_cifar100.yaml View File

@@ -0,0 +1,15 @@
RNG_SEED: 7
SEARCH:
DATASET: 'cifar100'
NUM_CLASSES: 100
DATA_LOADER:
BACKEND: 'custom'
OUT_DIR: 'experiment/nb201_train'
OPTIM:
BASE_LR: 0.1
MOMENTUM: 0.9
WEIGHT_DECAY: 0.0005
MAX_EPOCH: 150
TRAIN:
BATCH_SIZE: 32
CHECKPOINT_PERIOD: 10

+ 15
- 0
configs/search/RMINAS/nb201_imagenet16.yaml View File

@@ -0,0 +1,15 @@
RNG_SEED: 7
SEARCH:
DATASET: 'imagenet16_120'
NUM_CLASSES: 120
DATA_LOADER:
BACKEND: 'custom'
OUT_DIR: 'experiment/nb201_train'
OPTIM:
BASE_LR: 0.1
MOMENTUM: 0.9
WEIGHT_DECAY: 0.0005
MAX_EPOCH: 150
TRAIN:
BATCH_SIZE: 32
CHECKPOINT_PERIOD: 10

+ 73
- 0
doc/RMI_NAS.md View File

@@ -0,0 +1,73 @@
## Introduction

Code for paper: **Neural Architecture Search with Representation Mutual Information**

RMI-NAS is an efficient architecture search method based on Representation Mutual Information (RMI) theory. It aims to speed up performance evaluation by ranking architectures with RMI, an accurate and effective indicator for NAS. RMI-NAS uses only one batch of data to complete training and generalizes well to different search spaces. For more details, please refer to our paper.
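The ranking indicator is computed from feature similarity on that single batch. The snippet below is only an illustrative sketch of a linear CKA similarity between two flattened feature maps; the repository's actual implementation lives in `xnas/search_algorithm/RMINAS/utils/RMI_torch.py`, and the helper names here are hypothetical.

```python
import torch

def gram_linear(x):
    # x: (n, d) features of one batch, flattened per example
    return x @ x.t()

def center_gram(gram):
    # double-center the Gram matrix: H G H with H = I - (1/n) * ones
    n = gram.size(0)
    unit = torch.ones(n, n, device=gram.device) / n
    return gram - unit @ gram - gram @ unit + unit @ gram @ unit

def linear_cka(gram_x, gram_y):
    gx, gy = center_gram(gram_x), center_gram(gram_y)
    return (gx * gy).sum() / (gx.norm() * gy.norm())

# toy example: similarity between student and teacher features of one batch
f_student, f_teacher = torch.randn(128, 256), torch.randn(128, 256)
print(linear_cka(gram_linear(f_student), gram_linear(f_teacher)))
```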



## Usage

### Installation

```bash
git clone https://github.com/MAC-AutoML/XNAS.git
cd XNAS
# set root path
export PYTHONPATH=$PYTHONPATH:/Path/to/XNAS
```

The file [`NAS-Bench-201-v1_0-e61699.pth`](https://drive.google.com/open?id=1SKW0Cu0u8-gb18zDpaAGi0f74UdXeGKs) is required because we use a previous version of `NAS-Bench-201`. Download it and put it into the `utils` directory.
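For reference, the search scripts load this benchmark file through the `nas_201_api` package, roughly as below (the path mirrors `search/RMINAS/RMINAS_nb201.py`; adjust it to wherever you place the file):

```python
from nas_201_api import NASBench201API as API

# path used in search/RMINAS/RMINAS_nb201.py; change it to your own location
nb201_api = API('./data/NAS-Bench-201-v1_0-e61699.pth')
print(nb201_api.arch(0))  # genotype string of architecture index 0
```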

#### Search

```bash
# download weight files for teacher models
./search/RMINAS/download_weight.sh
# NAS-Bench-201 + CIFAR-10
python search/RMINAS/RMINAS_nb201.py --cfg configs/search/RMINAS/nb201_cifar10.yaml
# NAS-Bench-201 + CIFAR-100
python search/RMINAS/RMINAS_nb201.py --cfg configs/search/RMINAS/nb201_cifar100.yaml
# NAS-Bench-201 + ImageNet
python search/RMINAS/RMINAS_nb201.py --cfg configs/search/RMINAS/nb201_imagenet16.yaml
# DARTS + CIFAR-10
python search/RMINAS/RMINAS_darts.py --cfg configs/search/RMINAS/darts_cifar10.yaml
# DARTS + CIFAR-100
python search/RMINAS/RMINAS_darts.py --cfg configs/search/RMINAS/darts_cifar100.yaml
# DARTS + ImageNet
python search/RMINAS/RMINAS_darts.py --cfg configs/search/RMINAS/darts_imagenet.yaml
```
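
All of the scripts above share the same random-forest-guided loop. The outline below is a simplified sketch whose names follow `RMINAS_nb201.py`; details such as the architecture encoding, `cfg`/`logger` setup, and the per-script `train_arch` function are omitted.

```python
from xnas.search_algorithm.RMINAS.sampler.RF_sampling import RF_suggest

# cfg, logger and train_arch come from the surrounding search script
RFS = RF_suggest(space='nasbench201', logger=logger,
                 thres_rate=cfg.RMINAS.RF_THRESRATE, seed=cfg.RNG_SEED)

# 1) warm-up: train cfg.RMINAS.RF_WARMUP randomly sampled archs on one batch
for arch in RFS.warmup_samples(cfg.RMINAS.RF_WARMUP):
    RFS.trained_arch.append({'arch': arch, 'loss': train_arch(arch)})
RFS.Warmup()                       # fit the random forest on the warm-up data

# 2) sampling: propose archs until cfg.RMINAS.RF_SUCC "good" ones are found
sampling_cnt = 0
while sampling_cnt < cfg.RMINAS.RF_SUCC:
    arch = RFS.fitting_samples()
    RFS.trained_arch.append({'arch': arch, 'loss': train_arch(arch)})
    sampling_cnt += RFS.Fitting()  # refit; reports whether the arch beat the loss threshold

# 3) aggregate the best sampled archs into the final architecture
best_arch = RFS.optimal_arch(method='sum', top=50)
```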

## Results

### Results on NAS-Bench-201

| Method | Search Cost<br />(seconds) | CIFAR-10 <br />Test Acc.(%) | CIFAR-100 <br />Test Acc.(%) | ImageNet16-120 <br />Test Acc.(%) |
| ----------- | -------------------------- | --------------------------- | ---------------------------- | --------------------------------- |
| RL | 27870.7 | 93.85±0.37 | 71.71±1.09 | 45.24±1.18 |
| DARTS-V2 | 35781.8 | 54.30±0.00 | 15.61±0.00 | 16.32±0.00 |
| GDAS | 31609.8 | 93.61±0.09 | 70.70±0.30 | 41.71±0.98 |
| FairNAS | 9845.0 | 93.23±0.18 | 71.00±1.46 | 42.19±0.31 |
| **RMI-NAS** | **1258.2** | **94.28±0.10** | **73.36±0.19** | **46.34±0.00** |

![img.png](images/nasbench201.png)
### Results on DARTS

| Method | Search Cost<br />(gpu-days) | CIFAR-10 <br />Test Error (%)<br />(paper) | CIFAR-10 <br />Test Error (%)<br />(retrain) |
| ----------- |-----------------------------| ---------------------------------------- | ------------------------------------------ |
| AmoebaNet-B | 3150 | 2.55±0.05 | - |
| NASNet-A | 1800 | 2.65 | - |
| DARTS (1st) | 0.4 | 3.00±0.14 | 2.75 |
| DARTS (2nd) | 1 | 2.76±0.09 | 2.60 |
| SNAS | 1.5 | 2.85±0.02 | 2.68 |
| PC-DARTS | 1 | 2.57±0.07 | 2.71±0.11 |
| FairDARTS-D | 0.4 | 2.54±0.05 | 2.71 |
| **RMI-NAS** | **0.08** | - | 2.64±0.04 |

Comparison with other methods in the DARTS search space. We also report retrained results under exactly the same settings to ensure a fair comparison. Our method delivers comparable accuracy with a substantial reduction in time consumption.

#### Normal cell
![img.png](images/normal.png)

#### Reduce cell
![img.png](images/reduce.png)

BIN
doc/images/nasbench201.png View File

Width: 729  |  Height: 649  |  Size: 55 KiB

BIN
doc/images/normal.png View File

Width: 1208  |  Height: 538  |  Size: 86 KiB

BIN
doc/images/reduce.png View File

Width: 1208  |  Height: 538  |  Size: 86 KiB

+ 74
- 0
search/RMINAS/README.md View File

@@ -0,0 +1,74 @@
## Introduction

Code for paper: **Neural Architecture Search with Representation Mutual Information**

RMI-NAS is an efficient architecture search method based on Representation Mutual Information (RMI) theory. It aims to speed up performance evaluation by ranking architectures with RMI, an accurate and effective indicator for NAS. RMI-NAS uses only one batch of data to complete training and generalizes well to different search spaces. For more details, please refer to our paper.
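The single-batch training objective mixes a CKA-based feature term with an ordinary cross-entropy term. The sketch below mirrors how the search scripts combine them; `beta` corresponds to `RMINAS.LOSS_BETA` (default 0.80), and `cka_loss_fn` stands in for the `CKA_loss` module defined in the scripts.

```python
import torch.nn as nn

def mixed_loss(student_feats, teacher_feats, logits, labels, cka_loss_fn, beta=0.80):
    """beta corresponds to cfg.RMINAS.LOSS_BETA in xnas/core/config.py."""
    loss_cka = cka_loss_fn(student_feats, teacher_feats)  # feature-similarity term vs. the teacher
    loss_ce = nn.CrossEntropyLoss()(logits, labels)       # ordinary classification term
    return beta * loss_cka + (1.0 - beta) * loss_ce
```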

## Results

### Results on NAS-Bench-201

| Method | Search Cost<br />(seconds) | CIFAR-10 <br />Test Acc.(%) | CIFAR-100 <br />Test Acc.(%) | ImageNet16-120 <br />Test Acc.(%) |
| ----------- | -------------------------- | --------------------------- | ---------------------------- | --------------------------------- |
| RL | 27870.7 | 93.85±0.37 | 71.71±1.09 | 45.24±1.18 |
| DARTS-V2 | 35781.8 | 54.30±0.00 | 15.61±0.00 | 16.32±0.00 |
| GDAS | 31609.8 | 93.61±0.09 | 70.70±0.30 | 41.71±0.98 |
| FairNAS | 9845.0 | 93.23±0.18 | 71.00±1.46 | 42.19±0.31 |
| **RMI-NAS** | **1258.2** | **94.28±0.10** | **73.36±0.19** | **46.34±0.00** |

Our method shows significant efficiency and accuracy improvements.

### Results on DARTS

| Method | Search Cost<br />(gpu-days) | CIFAR-10 <br />Test Error (%)<br />(paper) | CIFAR-10 <br />Test Error (%)<br />(retrain) |
| ----------- | -------------------------- | ---------------------------------------- | ------------------------------------------ |
| AmoebaNet-B | 3150 | 2.55±0.05 | - |
| NASNet-A | 1800 | 2.65 | - |
| DARTS (1st) | 0.4 | 3.00±0.14 | 2.75 |
| DARTS (2nd) | 1 | 2.76±0.09 | 2.60 |
| SNAS | 1.5 | 2.85±0.02 | 2.68 |
| PC-DARTS | 1 | 2.57±0.07 | 2.71±0.11 |
| FairDARTS-D | 0.4 | 2.54±0.05 | 2.71 |
| **RMI-NAS** | **0.08** | - | 2.64±0.04 |

Comparison with other methods in the DARTS search space. We also report retrained results under exactly the same settings to ensure a fair comparison. Our method delivers comparable accuracy with a substantial reduction in time consumption.



## Usage

#### Install RMI-NAS

Our code uses functions from the XNAS repository, which must be installed first.

```bash
# install XNAS
git clone https://github.com/MAC-AutoML/XNAS.git
export PYTHONPATH=$PYTHONPATH:/PATH/to/XNAS

# prepare environment for RMI-NAS (conda)
conda env create --file environment.yaml

# download weight files for teacher models
chmod +x search/RMINAS/download_weight.sh
bash search/RMINAS/download_weight.sh
```

The file [`NAS-Bench-201-v1_0-e61699.pth`](https://drive.google.com/open?id=1SKW0Cu0u8-gb18zDpaAGi0f74UdXeGKs) is required because we use a previous version of `NAS-Bench-201`. Download it and put it into the `utils` directory.
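Architectures in this benchmark are addressed by index; the sampler converts the genotype string returned by the API into the 6x5 one-hot array that the random forest operates on. A small illustration (file path and index are arbitrary):

```python
from nas_201_api import NASBench201API as API
import xnas.search_algorithm.RMINAS.sampler.sampling as sampling

api = API('./data/NAS-Bench-201-v1_0-e61699.pth')  # adjust to your file location
geno_str = api.arch(123)                     # e.g. '|none~0|+|nor_conv_1x1~0|none~1|+|...'
arch_arr = sampling.genostr2array(geno_str)  # (6, 5) one-hot encoding of the 6 cell edges
print(geno_str)
print(arch_arr)
```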

#### Search

```bash
# NAS-Bench-201 + CIFAR-10
python search/RMINAS/RMINAS_nb201.py --cfg configs/search/RMINAS/nb201_cifar10.yaml

# DARTS + CIFAR-100 + specific exp path
python search/RMINAS/RMINAS_darts.py --cfg configs/search/RMINAS/darts_cifar100.yaml OUT_DIR experiments/
```


## Related work

[NAS-Bench-201](https://github.com/D-X-Y/NAS-Bench-201)

[XNAS](https://github.com/MAC-AutoML/XNAS)

+ 209
- 0
search/RMINAS/RMINAS_darts.py View File

@@ -0,0 +1,209 @@
import time
import numpy as np

import xnas.search_algorithm.RMINAS.utils.RMI_torch as RMI
from xnas.search_algorithm.RMINAS.sampler.RF_sampling import RF_suggest
import xnas.search_algorithm.RMINAS.sampler.sampling_darts as sampling

import torch
import torch.nn as nn
import torch.nn.parallel
import torch.optim

import xnas.core.config as config
import xnas.core.logging as logging
from xnas.core.config import cfg
from xnas.core.trainer import setup_env

from xnas.search_space.RMINAS.DARTS.darts_cnn import AugmentCNN, geno_from_alpha, reformat_DARTS


class CKA_loss(nn.Module):
    """Sums (1 - linear CKA similarity) over the three compared feature maps."""
    def __init__(self, datasize):
        super(CKA_loss, self).__init__()
        self.datasize = datasize

    def forward(self, features_1, features_2):
        # CKA similarity between each pair of flattened student/teacher feature maps
        s = []
        for i in range(len(features_1)):
            s.append(RMI.tensor_cka(RMI.tensor_gram_linear(features_1[i].view(self.datasize, -1)),
                                    RMI.tensor_gram_linear(features_2[i].view(self.datasize, -1))))
        return torch.sum(3 - s[0] - s[1] - s[2])


def main():
logger = logging.get_logger(__name__)
# Load config and check
config.load_cfg_fom_args()
config.assert_and_infer_cfg()
cfg.freeze()
setup_env()
print(cfg.SEARCH.DATASET)
# assert cfg.SEARCH.DATASET in ['cifar10', 'cifar100'], 'dataset error'
assert cfg.SEARCH.DATASET in ['cifar10', 'cifar100', 'imagenet'], 'dataset error'
if cfg.SEARCH.DATASET == 'imagenet':
print('='*30+' NOTE '+'='*30)
print('Our method does not directly search in ImageNet.')
print('Only partial tests have been conducted, please use with caution.')
print('='*66)

if cfg.SEARCH.DATASET == 'cifar10':
from xnas.search_algorithm.RMINAS.utils.loader import cifar10_data
import xnas.search_algorithm.RMINAS.teacher_model.resnet20_cifar10.resnet as resnet
"""Data preparing"""
more_data_X, more_data_y = cifar10_data(cfg.TRAIN.BATCH_SIZE, cfg.DATA_LOADER.NUM_WORKERS)

"""ResNet codes"""
checkpoint_res = torch.load('xnas/search_algorithm/RMINAS/teacher_model/resnet20_cifar10/resnet20.th')
model_res = torch.nn.DataParallel(resnet.__dict__['resnet20']())
model_res.cuda()
model_res.load_state_dict(checkpoint_res['state_dict'])
"""selecting well-performed data."""
with torch.no_grad():
ce_loss = torch.nn.CrossEntropyLoss(reduction='none').cuda()
more_logits = model_res(more_data_X)
_, indices = torch.topk(-ce_loss(more_logits, more_data_y).cpu().detach(), cfg.TRAIN.BATCH_SIZE)
data_y = torch.Tensor([more_data_y[i] for i in indices]).long().cuda()
data_X = torch.Tensor([more_data_X[i].cpu().numpy() for i in indices]).cuda()
with torch.no_grad():
feature_res = model_res.module.feature_extractor(data_X)

elif cfg.SEARCH.DATASET == 'cifar100':
from xnas.search_algorithm.RMINAS.utils.loader import cifar100_data
from xnas.search_algorithm.RMINAS.teacher_model.resnet101_cifar100.resnet import resnet101
"""Data preparing"""
more_data_X, more_data_y = cifar100_data(cfg.TRAIN.BATCH_SIZE, cfg.DATA_LOADER.NUM_WORKERS)
"""ResNet codes"""
net = resnet101()
net.load_state_dict(torch.load('xnas/search_algorithm/RMINAS/teacher_model/resnet101_cifar100/resnet101.pth'))
net.cuda()
"""selecting well-performed data."""
with torch.no_grad():
ce_loss = torch.nn.CrossEntropyLoss(reduction='none').cuda()
more_logits = net(more_data_X)
_, indices = torch.topk(-ce_loss(more_logits, more_data_y).cpu().detach(), cfg.TRAIN.BATCH_SIZE)
data_y = torch.Tensor([more_data_y[i] for i in indices]).long().cuda()
data_X = torch.Tensor([more_data_X[i].cpu().numpy() for i in indices]).cuda()
with torch.no_grad():
feature_res = net.feature_extractor(data_X)

elif cfg.SEARCH.DATASET == 'imagenet':
from xnas.search_algorithm.RMINAS.utils.loader import imagenet_data
import xnas.search_algorithm.RMINAS.teacher_model.fbresnet_imagenet.fbresnet as fbresnet
"""Data preparing"""
more_data_X, more_data_y = imagenet_data(cfg.TRAIN.BATCH_SIZE, cfg.DATA_LOADER.NUM_WORKERS, '/media/DATASET/ILSVRC2012/')
"""ResNet codes"""
model_res = fbresnet.fbresnet152()
model_res.cuda()
"""selecting well-performed data."""
with torch.no_grad():
ce_loss = torch.nn.CrossEntropyLoss(reduction='none').cuda()
more_logits = model_res(more_data_X)
_, indices = torch.topk(-ce_loss(more_logits, more_data_y).cpu().detach(), cfg.TRAIN.BATCH_SIZE)
data_y = torch.Tensor([more_data_y[i] for i in indices]).long().cuda()
data_X = torch.Tensor([more_data_X[i].cpu().numpy() for i in indices]).cuda()
with torch.no_grad():
feature_res = model_res.features_extractor(data_X)

RFS = RF_suggest(space='darts', logger=logger, thres_rate=cfg.RMINAS.RF_THRESRATE, seed=cfg.RNG_SEED)
# loss function
loss_fun_cka = CKA_loss(data_X.size()[0])
loss_fun_cka = loss_fun_cka.requires_grad_()
loss_fun_cka.cuda()
loss_fun_log = torch.nn.CrossEntropyLoss().cuda()
def train_arch(genotype):
s_time = time.time()
model = AugmentCNN(
cfg.SEARCH.IM_SIZE,
cfg.SEARCH.INPUT_CHANNEL,
cfg.TRAIN.CHANNELS,
cfg.SEARCH.NUM_CLASSES,
cfg.TRAIN.LAYERS,
False, # don't use auxiliary head
genotype)
model.cuda()
model.train()
# weights optimizer
optimizer = torch.optim.SGD(
model.parameters(),
cfg.OPTIM.BASE_LR,
momentum=cfg.OPTIM.MOMENTUM,
weight_decay=cfg.OPTIM.WEIGHT_DECAY)

for cur_epoch in range(1, cfg.OPTIM.MAX_EPOCH+1):
optimizer.zero_grad()

features, logits, aux_logits = model(data_X)
loss_cka = loss_fun_cka(features, feature_res)
loss_logits = loss_fun_log(logits, data_y)
loss = cfg.RMINAS.LOSS_BETA * loss_cka + (1-cfg.RMINAS.LOSS_BETA)*loss_logits
loss.backward()

optimizer.step()

if cur_epoch == cfg.OPTIM.MAX_EPOCH:
logger.info("training arch cost: {}".format(time.time()-s_time))
return loss.cpu().detach().numpy()
start_time = time.time()
trained_arch, trained_loss = [], []

# ====== Warmup ======
warmup_samples = RFS.warmup_samples(cfg.RMINAS.RF_WARMUP)
logger.info("Warming up with {} archs".format(cfg.RMINAS.RF_WARMUP))
for sample in warmup_samples:
sample_alpha = sampling.ransug2alpha(sample) # shape=(28, 8)
sample_geno = geno_from_alpha(sample_alpha) # type=Genotype
# if cfg.SEARCH.DATASET == 'imagenet' :
# sample_geno = reformat_DARTS(sample_geno)
mixed_loss = train_arch(sample_geno)
mixed_loss = np.inf if np.isnan(mixed_loss) else mixed_loss
trained_arch.append(str(sample_geno))
trained_loss.append(mixed_loss)
RFS.trained_arch.append({'arch':sample, 'loss':mixed_loss})
RFS.Warmup()
logger.info('warmup time cost: {}'.format(str(time.time() - start_time)))
# ====== RF Sampling ======
sampling_time = time.time()
sampling_cnt = 0
while sampling_cnt < cfg.RMINAS.RF_SUCC:
sample = RFS.fitting_samples()
sample_alpha = sampling.ransug2alpha(sample) # shape=(28, 8)
sample_geno = geno_from_alpha(sample_alpha) # type=Genotype
# if cfg.SEARCH.DATASET == 'imagenet' :
# sample_geno = reformat_DARTS(sample_geno)
mixed_loss = train_arch(sample_geno)
mixed_loss = np.inf if np.isnan(mixed_loss) else mixed_loss
trained_arch.append(str(sample_geno))
trained_loss.append(mixed_loss)
RFS.trained_arch.append({'arch':sample, 'loss':mixed_loss})
sampling_cnt += RFS.Fitting()
if sampling_cnt >= cfg.RMINAS.RF_SUCC:
logger.info('successfully sampling good archs for {} times'.format(sampling_cnt))
else:
logger.info('failed sampling good archs for only {} times'.format(sampling_cnt))
logger.info('RF sampling time cost: {}'.format(str(time.time() - sampling_time)))
# ====== Evaluation ======
logger.info('Total time cost:{}'.format(str(time.time() - start_time)))
logger.info('Actual training times: {}'.format(len(trained_arch)))
op_sample = RFS.optimal_arch(method='sum', top=50)
op_alpha = torch.from_numpy(np.r_[op_sample, op_sample])
op_geno = reformat_DARTS(geno_from_alpha(op_alpha))
logger.info('Searched architecture@top50:\n{}'.format(str(op_geno)))

if __name__ == "__main__":
main()


+ 154
- 0
search/RMINAS/RMINAS_mb_imagenet.py View File

@@ -0,0 +1,154 @@
import numpy as np
import random
import os
import time

import torch
import torch.nn as nn
from torch.optim import lr_scheduler
import torch.utils
import torchvision.datasets as dset
import torchvision.transforms as transforms

import xnas.core.logging as logging
import xnas.core.config as config

from xnas.core.utils import one_hot_to_index
from xnas.core.trainer import setup_env
from xnas.core.config import cfg

from xnas.search_space.RMINAS.MBConv.mb_v3_cnn import MobileNetV3
import xnas.search_algorithm.RMINAS.utils.RMI_torch as RMI
from xnas.search_algorithm.RMINAS.sampler.RF_sampling import RF_suggest

from xnas.search_algorithm.RMINAS.utils.loader import imagenet_data

import xnas.search_algorithm.RMINAS.teacher_model.fbresnet_imagenet.fbresnet as fbresnet


# NOTE: this code is not fully tested.
# OBSERVE_EPO = 250
# RF_WARMUP = 200


class CKA_loss(nn.Module):
    """Sums (1 - linear CKA similarity) over the three compared feature maps."""
    def __init__(self, datasize):
        super(CKA_loss, self).__init__()
        self.datasize = datasize

    def forward(self, features_1, features_2):
        # CKA similarity between each pair of flattened student/teacher feature maps
        s = []
        for i in range(len(features_1)):
            s.append(RMI.tensor_cka(RMI.tensor_gram_linear(features_1[i].view(self.datasize, -1)),
                                    RMI.tensor_gram_linear(features_2[i].view(self.datasize, -1))))
        return torch.sum(3 - s[0] - s[1] - s[2])

def main():
# Load config and check
config.load_cfg_fom_args()
config.assert_and_infer_cfg()
cfg.freeze()
setup_env()

logger = logging.get_logger(__name__)
"""Data preparing"""
more_data_X, more_data_y = imagenet_data(cfg.TRAIN.BATCH_SIZE, cfg.DATA_LOADER.NUM_WORKERS, '/media/DATASET/ILSVRC2012/')
"""ResNet codes"""
model_res = fbresnet.fbresnet152()
model_res.cuda()
"""selecting well-performed data."""
with torch.no_grad():
ce_loss = torch.nn.CrossEntropyLoss(reduction='none').cuda()
more_logits = model_res(more_data_X)
_, indices = torch.topk(-ce_loss(more_logits, more_data_y).cpu().detach(), cfg.TRAIN.BATCH_SIZE)

data_y = torch.Tensor([more_data_y[i] for i in indices]).long().cuda()
data_X = torch.Tensor([more_data_X[i].cpu().numpy() for i in indices]).cuda()
with torch.no_grad():
feature_res = model_res.features_extractor(data_X)
RFS = RF_suggest(space='mb', logger=logger, thres_rate=cfg.RMINAS.RF_THRESRATE, seed=cfg.RNG_SEED)
# loss function
loss_fun_cka = CKA_loss(data_X.size()[0])
loss_fun_cka = loss_fun_cka.requires_grad_()
loss_fun_cka.cuda()
loss_fun_log = torch.nn.CrossEntropyLoss().cuda()
def train_arch(sample):
model = MobileNetV3(n_classes=1000)
model.cuda()
w_optim = torch.optim.SGD(model.parameters(),
cfg.OPTIM.BASE_LR,
momentum=cfg.OPTIM.MOMENTUM,
weight_decay=cfg.OPTIM.WEIGHT_DECAY)
lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(w_optim, cfg.OPTIM.MAX_EPOCH, eta_min=cfg.OPTIM.MIN_LR)
model.train()
logger.info("Sampling: {}".format(one_hot_to_index(sample)))
for cur_epoch in range(1, cfg.OPTIM.MAX_EPOCH+1):
lr = w_optim.param_groups[0]['lr']

logits, features = model(data_X, sample)
loss_cka = loss_fun_cka(features, feature_res)
loss_logits = loss_fun_log(logits, data_y)
loss = cfg.RMINAS.LOSS_BETA * loss_cka + (1-cfg.RMINAS.LOSS_BETA)*loss_logits

w_optim.zero_grad()
loss.backward()
nn.utils.clip_grad_norm_(model.parameters(), cfg.OPTIM.GRAD_CLIP)
w_optim.step()

lr_scheduler.step()
if cur_epoch == cfg.OPTIM.MAX_EPOCH:
return loss.cpu().detach().numpy()
start_time = time.time()

# ====== Warmup ======
warmup_samples = RFS.warmup_samples(cfg.RMINAS.RF_WARMUP)
logger.info("Warming up with {} archs".format(cfg.RMINAS.RF_WARMUP))
for sample in warmup_samples:
mixed_loss = train_arch(sample)
mixed_loss = np.inf if np.isnan(mixed_loss) else mixed_loss
RFS.trained_arch.append({'arch':sample, 'loss':mixed_loss})
# print(str(sample_geno), mixed_loss)
RFS.Warmup()
logger.info('warmup time cost: {}'.format(str(time.time() - start_time)))
# ====== RF Sampling ======
sampling_time = time.time()
sampling_cnt = 0
while sampling_cnt < cfg.RMINAS.RF_SUCC:
sample = RFS.fitting_samples()
mixed_loss = train_arch(sample)
mixed_loss = np.inf if np.isnan(mixed_loss) else mixed_loss
RFS.trained_arch.append({'arch':sample, 'loss':mixed_loss})
# print(str(sample_geno), mixed_loss)
sampling_cnt += RFS.Fitting()
if sampling_cnt >= cfg.RMINAS.RF_SUCC:
logger.info('successfully sampling good archs for {} times'.format(sampling_cnt))
else:
logger.info('failed sampling good archs for only {} times'.format(sampling_cnt))
logger.info('RF sampling time cost: {}'.format(str(time.time() - sampling_time)))
# ====== Evaluation ======
logger.info('Total time cost:{}'.format(str(time.time() - start_time)))
logger.info('Actual training times: {}'.format(len(RFS.trained_arch)))
op_sample = RFS.optimal_arch(method='sum', top=30)
logger.info('Searched architecture@top30:\n{}'.format(str(op_sample)))
# logger.info(model.genotype(torch.Tensor(op_sample)))

if __name__ == "__main__":
main()

+ 220
- 0
search/RMINAS/RMINAS_nb201.py View File

@@ -0,0 +1,220 @@
import time
import random
import numpy as np

import xnas.search_algorithm.RMINAS.utils.RMI_torch as RMI
from xnas.search_algorithm.RMINAS.sampler.RF_sampling import RF_suggest
import xnas.search_algorithm.RMINAS.sampler.sampling as sampling
from xnas.search_space.RMINAS.NB201.utils import *
from nas_201_api import NASBench201API as api

import torch
import torch.nn as nn
import torch.nn.parallel
import torch.optim

import xnas.core.config as config
import xnas.core.logging as logging
from xnas.core.config import cfg
from xnas.core.trainer import setup_env


class CKA_loss(nn.Module):
    """Sums (1 - linear CKA similarity) over the three compared feature maps."""
    def __init__(self, datasize):
        super(CKA_loss, self).__init__()
        self.datasize = datasize

    def forward(self, features_1, features_2):
        # CKA similarity between each pair of flattened student/teacher feature maps
        s = []
        for i in range(len(features_1)):
            s.append(RMI.tensor_cka(RMI.tensor_gram_linear(features_1[i].view(self.datasize, -1)),
                                    RMI.tensor_gram_linear(features_2[i].view(self.datasize, -1))))
        return torch.sum(3 - s[0] - s[1] - s[2])


def main():
logger = logging.get_logger(__name__)

# Load config and check
config.load_cfg_fom_args()
config.assert_and_infer_cfg()
cfg.freeze()
setup_env()
print(cfg.SEARCH.DATASET)
assert cfg.SEARCH.DATASET in ['cifar10', 'cifar100', 'imagenet16_120'], 'dataset error'

if cfg.SEARCH.DATASET == 'cifar10':
from xnas.search_algorithm.RMINAS.utils.loader import cifar10_data
import xnas.search_algorithm.RMINAS.teacher_model.resnet20_cifar10.resnet as resnet

"""Data preparing"""
more_data_X, more_data_y = cifar10_data(cfg.TRAIN.BATCH_SIZE, cfg.DATA_LOADER.NUM_WORKERS)

"""ResNet codes"""
checkpoint_res = torch.load('xnas/search_algorithm/RMINAS/teacher_model/resnet20_cifar10/resnet20.th')
model_res = torch.nn.DataParallel(resnet.__dict__['resnet20']())
model_res.cuda()
model_res.load_state_dict(checkpoint_res['state_dict'])
"""selecting well-performed data."""
with torch.no_grad():
ce_loss = torch.nn.CrossEntropyLoss(reduction='none').cuda()
more_logits = model_res(more_data_X)
_, indices = torch.topk(-ce_loss(more_logits, more_data_y).cpu().detach(), cfg.TRAIN.BATCH_SIZE)
data_y = torch.Tensor([more_data_y[i] for i in indices]).long().cuda()
data_X = torch.Tensor([more_data_X[i].cpu().numpy() for i in indices]).cuda()
with torch.no_grad():
feature_res = model_res.module.feature_extractor(data_X)
elif cfg.SEARCH.DATASET == 'cifar100':
from xnas.search_algorithm.RMINAS.utils.loader import cifar100_data
from xnas.search_algorithm.RMINAS.teacher_model.resnet101_cifar100.resnet import resnet101

"""Data preparing"""
more_data_X, more_data_y = cifar100_data(cfg.TRAIN.BATCH_SIZE, cfg.DATA_LOADER.NUM_WORKERS)

"""ResNet codes"""
model_res = resnet101()
model_res.load_state_dict(torch.load('xnas/search_algorithm/RMINAS/teacher_model/resnet101_cifar100/resnet101.pth'))
model_res.cuda()
"""selecting well-performed data."""
with torch.no_grad():
ce_loss = torch.nn.CrossEntropyLoss(reduction='none').cuda()
more_logits = model_res(more_data_X)
_, indices = torch.topk(-ce_loss(more_logits, more_data_y).cpu().detach(), cfg.TRAIN.BATCH_SIZE)
data_y = torch.Tensor([more_data_y[i] for i in indices]).long().cuda()
data_X = torch.Tensor([more_data_X[i].cpu().numpy() for i in indices]).cuda()
with torch.no_grad():
feature_res = model_res.feature_extractor(data_X)
elif cfg.SEARCH.DATASET == 'imagenet16_120':
import xnas.search_algorithm.RMINAS.utils.imagenet16120_loader as imagenetloader
from xnas.search_space.RMINAS.NB201.geno import Structure as cellstructure
from nas_201_api import ResultsCount

"""Data preparing"""
train_loader, _ = imagenetloader.get_loader(batch_size=cfg.TRAIN.BATCH_SIZE*16)
target_i = random.randint(0, len(train_loader)-1)
more_data_X, more_data_y = None, None
for i, (more_data_X, more_data_y) in enumerate(train_loader):
if i == target_i:
break
more_data_X = more_data_X.cuda()
more_data_y = more_data_y.cuda()

"""Teacher Network: using best arch searched from cifar10 and weight from nb201."""
filename = 'xnas/search_algorithm/RMINAS/teacher_model/nb201model_imagenet16120/009930-FULL.pth'
xdata = torch.load(filename)
odata = xdata['full']['all_results'][('ImageNet16-120', 777)]
result = ResultsCount.create_from_state_dict(odata)
result.get_net_param()
arch_config = result.get_config(cellstructure.str2structure) # create the network with params
net_config = dict2config(arch_config, None)
network = get_cell_based_tiny_net(net_config)
network.load_state_dict(result.get_net_param())
network.cuda()
"""selecting well-performed data."""
with torch.no_grad():
ce_loss = torch.nn.CrossEntropyLoss(reduction='none').cuda()
_, more_logits = network(more_data_X)
_, indices = torch.topk(-ce_loss(more_logits, more_data_y).cpu().detach(), cfg.TRAIN.BATCH_SIZE)
data_y = torch.Tensor([more_data_y[i] for i in indices]).long().cuda()
data_X = torch.Tensor([more_data_X[i].cpu().numpy() for i in indices]).cuda()
with torch.no_grad():
feature_res, _ = network(data_X)
"""Codes: build from config file."""
nb201_api = api('./data/NAS-Bench-201-v1_0-e61699.pth')
RFS = RF_suggest(space='nasbench201', logger=logger, api=nb201_api, thres_rate=cfg.RMINAS.RF_THRESRATE, seed=cfg.RNG_SEED)

# loss function
loss_fun_cka = CKA_loss(data_X.size()[0])
loss_fun_cka = loss_fun_cka.requires_grad_()
loss_fun_cka.cuda()
loss_fun_log = torch.nn.CrossEntropyLoss().cuda()
def train_arch(arch_index):
# get arch
arch_config = {
'name': 'infer.tiny',
'C': 16, 'N': 5,
'arch_str':nb201_api.arch(arch_index),
'num_classes': cfg.SEARCH.NUM_CLASSES}
net_config = dict2config(arch_config, None)
model = get_cell_based_tiny_net(net_config)
model.cuda()
model.train()

# weights optimizer
optimizer = torch.optim.SGD(
model.parameters(),
cfg.OPTIM.BASE_LR,
momentum=cfg.OPTIM.MOMENTUM,
weight_decay=cfg.OPTIM.WEIGHT_DECAY)

for cur_epoch in range(1, cfg.OPTIM.MAX_EPOCH+1):
optimizer.zero_grad()
features, logits = model(data_X)
loss_logits = loss_fun_log(logits, data_y)
loss_cka = loss_fun_cka(features, feature_res)
loss = cfg.RMINAS.LOSS_BETA * loss_cka + (1-cfg.RMINAS.LOSS_BETA)*loss_logits
loss.backward()

optimizer.step()
if cur_epoch == cfg.OPTIM.MAX_EPOCH:
logger.info('Arch:{} Loss:{}'.format(str(arch_index), str(loss.cpu().detach().numpy())))
return loss.cpu().detach().numpy()
start_time = time.time()
trained_loss = []
# ====== Warmup ======
warmup_samples = RFS.warmup_samples(cfg.RMINAS.RF_WARMUP)
logger.info("Warming up with {} archs".format(cfg.RMINAS.RF_WARMUP))
for arch_index in warmup_samples:
mixed_loss = train_arch(arch_index)
mixed_loss = np.inf if np.isnan(mixed_loss) else mixed_loss
trained_loss.append(mixed_loss)
arch_arr = sampling.genostr2array(nb201_api.arch(arch_index))
RFS.trained_arch.append({'arch':arch_arr, 'loss':mixed_loss})
RFS.trained_arch_index.append(arch_index)
# print(arch_index, mixed_loss)
RFS.Warmup()
logger.info('warmup time cost: {}'.format(str(time.time() - start_time)))
# ====== RF Sampling ======
sampling_time = time.time()
sampling_cnt= 0
while sampling_cnt < cfg.RMINAS.RF_SUCC:
arch_index = RFS.fitting_samples()
assert arch_index not in list(RFS.trained_arch_index), "RFS.trained_arch_index error"
mixed_loss = train_arch(arch_index)
mixed_loss = np.inf if np.isnan(mixed_loss) else mixed_loss
RFS.trained_arch_index.append(arch_index)
trained_loss.append(mixed_loss)
arch_arr = sampling.genostr2array(nb201_api.arch(arch_index))
RFS.trained_arch.append({'arch':arch_arr, 'loss':mixed_loss})
# print(arch_index, mixed_loss)
sampling_cnt += RFS.Fitting()
if sampling_cnt >= cfg.RMINAS.RF_SUCC:
logger.info('successfully sampling good archs for {} times'.format(sampling_cnt))
else:
logger.info('failed sampling good archs for only {} times'.format(sampling_cnt))
logger.info('RF sampling time cost:{}'.format(str(time.time() - sampling_time)))
# ====== Evaluation ======
logger.info('Total time cost: {}'.format(str(time.time() - start_time)))
logger.info('Actual training times: {}'.format(len(RFS.trained_arch_index)))
logger.info('Searched architecture:\n{}'.format(str(RFS.optimal_arch(method='sum', top=50))))
# logger.info('Searched architecture:\n{}'.format(str(RFS.optimal_arch(method='greedy', top=50))))

if __name__ == '__main__':
main()

+ 5
- 0
search/RMINAS/download_weight.sh View File

@@ -0,0 +1,5 @@
(cd xnas/search_algorithm/RMINAS/teacher_model/resnet20_cifar10 && wget http://cdn.thrase.cn/rmi/resnet20.th)
(cd xnas/search_algorithm/RMINAS/teacher_model/nb201model_imagenet16120 && wget http://cdn.thrase.cn/rmi/009930-FULL.pth)
(cd xnas/search_algorithm/RMINAS/teacher_model/fbresnet_imagenet && wget http://cdn.thrase.cn/rmi/fbresnet152.pth)
(cd xnas/search_algorithm/RMINAS/teacher_model/resnet101_cifar100 && wget http://cdn.thrase.cn/rmi/resnet101.pth)
echo "Finish downloading weight files."

+ 18
- 0
xnas/core/config.py View File

@@ -462,6 +462,24 @@ _C.TENAS.REPEAT = 3
_C.TENAS.PRUNE_NUMBER = 1


# ------------------------------------------------------------------------------------ #
# RMINAS options
# ------------------------------------------------------------------------------------ #
_C.RMINAS = CfgNode()

# beta of mixed loss
_C.RMINAS.LOSS_BETA = 0.80

# number of archs for random forest warming up
_C.RMINAS.RF_WARMUP = 100

# threshold of random forest to choose good archs
_C.RMINAS.RF_THRESRATE = 0.05

# number of good archs when random forest terminates
_C.RMINAS.RF_SUCC = 100
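
# Example: these options can be overridden per experiment from a YAML config
# (hypothetical snippet; the values shown are the defaults defined above,
# merged through the CfgNode mechanism used by this file):
#
#   RMINAS:
#     LOSS_BETA: 0.80
#     RF_WARMUP: 100
#     RF_THRESRATE: 0.05
#     RF_SUCC: 100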



def dump_cfg():
"""Dumps the config to the output directory."""


+ 270
- 0
xnas/search_algorithm/RMINAS/sampler/RF_sampling.py View File

@@ -0,0 +1,270 @@
import numpy as np
import pickle
import copy
import time
# import torch.nn as nn
import scipy
import torch
from scipy.stats import ks_2samp
from scipy import stats
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier

import xnas.search_algorithm.RMINAS.sampler.sampling as sampling

def softmax(x):
"""Compute softmax values for each sets of scores in x."""
return np.exp(x) / np.sum(np.exp(x), axis=-1, keepdims=True)

class RF_suggest():
def __init__(self, space, logger, api=None, thres_rate=0.05, batch=1000, seed=10):
np.random.seed(seed)
self.sampled_history = [] # list[arch_index] / list[arch.ravel()]
self.trained_arch = [] # list[dict{'arch':arch, 'loss':loss}]
self.trained_arch_index = []
self.thres_rate = thres_rate
self.loss_thres = 0.
self.batch = batch
self.space = space
self.logger = logger
self.times_suggest = 0 # without warmup
if self.space == 'nasbench201':
self.api = api
self.max_space = 15625
self.num_estimator = 30
elif self.space == 'darts':
self.num_estimator = 98
elif self.space == 'mb':
self.num_estimator = 140
self.model = RandomForestClassifier(n_estimators=self.num_estimator)
def _update_lossthres(self):
losses = [i['loss'] for i in self.trained_arch]
# losses_wo_inf = []
# for i in losses:
# if not np.isinf(i):
# losses_wo_inf.append(i)
self.loss_thres = np.quantile(losses, self.thres_rate) + 1e-9
self.logger.info("CKA loss_thres: {}".format(self.loss_thres))
good_arch = (np.array(losses) < self.loss_thres).tolist()
assert np.sum(good_arch) > 1, "not enough good architectures"
def _index2arch_nb201(self, index):
assert self.space == 'nasbench201', 'api mismatch'
_arch_str = self.api.arch(index)
_arch_arr = sampling.genostr2array(_arch_str)
return _arch_arr
def _trainedarch2xy(self):
features = []
labels = []
for i in self.trained_arch:
features.append(i['arch'].ravel())
labels.append(i['loss'] < self.loss_thres if self.loss_thres else False)
return features, labels
def warmup_samples(self, num_warmup):
if self.space == 'nasbench201':
sampled = list(np.random.choice(self.max_space, size=num_warmup, replace=False))
self.sampled_history = copy.deepcopy(sampled)
return sampled
elif self.space == 'darts':
return [self._single_sample() for _ in range(num_warmup)]
elif self.space == 'mb':
return [self._single_sample() for _ in range(num_warmup)]
def _single_sample(self, unique=True):
if self.space == 'nasbench201':
assert len(self.sampled_history) < self.max_space, "error: oversampled"
while True:
sample = np.random.randint(self.max_space)
if sample not in self.sampled_history:
self.sampled_history.append(sample)
return sample
elif self.space == 'darts':
if unique:
while True:
sample = np.zeros((14, 7)) # 14 edges, 7 ops
node_ids = np.asarray([np.random.choice(range(x,x+i+2), size=2, replace=False) for i, x in enumerate((0,2,5,9))]).ravel() # choose which 8 edges to keep
op = np.random.multinomial(1,[1/7.]*7, size=8) # one of the 7 meaningful ops for each of the 8 chosen edges
sample[node_ids] = op
if str(sample) not in self.sampled_history:
self.sampled_history.append(str(sample))
return sample
else:
sample = np.zeros((14, 7)) # 14 edges, 7 ops
node_ids = np.asarray([np.random.choice(range(x,x+i+2), size=2, replace=False) for i, x in enumerate((0,2,5,9))]).ravel() # choose which 8 edges to keep
op = np.random.multinomial(1,[1/7.]*7, size=8) # one of the 7 meaningful ops for each of the 8 chosen edges
sample[node_ids] = op
return sample
elif self.space == 'mb':
if unique:
while True:
c = np.zeros((20, 7))
for i in range(20):
j = np.random.randint(7)
c[i, j] = True
if str(c) not in self.sampled_history:
self.sampled_history.append(str(c))
return c
else:
c = np.zeros((20, 7))
for i in range(20):
j = np.random.randint(7)
c[i, j] = True
return c

def Warmup(self):
self._update_lossthres()
features, labels = self._trainedarch2xy()
self.model.fit(np.asarray(features, dtype='float'), np.asarray(labels, dtype='float'))
def fitting_samples(self):
self.times_suggest += 1
start_time = time.time()
if self.space == 'nasbench201':
_sample_indexes = np.random.choice(self.max_space, size=self.batch, replace=False)
_sample_archs = []
_sample_archs_idx = []
for i in _sample_indexes:
if i not in self.trained_arch_index:
_sample_archs.append(self._index2arch_nb201(i).ravel())
_sample_archs_idx.append(i)
# print("sample {} archs/batch, cost time: {}".format(len(_sample_archs), time.time()-start_time))
_sample_archs = np.array(_sample_archs)
best_id = np.argmax(self.model.predict_proba(_sample_archs)[:,1])
best_arch_id = _sample_archs_idx[best_id]
return best_arch_id
elif self.space == 'darts':
_sample_batch = np.array([self._single_sample(unique=False).ravel() for _ in range(self.batch)])
_tmp_trained_arch = [str(i['arch'].ravel()) for i in self.trained_arch]
_sample_archs = []
for i in _sample_batch:
if str(i) not in _tmp_trained_arch:
_sample_archs.append(i)
# print("sample {} archs/batch, cost time: {}".format(len(_sample_archs), time.time()-start_time))
best_id = np.argmax(self.model.predict_proba(_sample_archs)[:,1])
best_arch = _sample_archs[best_id].reshape((14, 7))
return best_arch
elif self.space == 'mb':
_sample_batch = np.array([self._single_sample(unique=False).ravel() for _ in range(self.batch)])
_tmp_trained_arch = [str(i['arch'].ravel()) for i in self.trained_arch]
_sample_archs = []
for i in _sample_batch:
if str(i) not in _tmp_trained_arch:
_sample_archs.append(i)
# print("sample {} archs/batch, cost time: {}".format(len(_sample_archs), time.time()-start_time))
best_id = np.argmax(self.model.predict_proba(_sample_archs)[:,1])
best_arch = _sample_archs[best_id].reshape((20, 7))
return best_arch
def Fitting(self):
# Called after adding data into trained_arch list.
loss = self.trained_arch[-1]['loss']
features, labels = self._trainedarch2xy()
self.model.fit(np.asarray(features, dtype='float'), np.asarray(labels, dtype='float'))
return loss < self.loss_thres if self.loss_thres else False
def optimal_arch(self, method, top=300, use_softmax=True):
assert method in ['sum', 'greedy'], 'method error.'
# with open('RF_sampling.pkl', 'wb') as f:
# pickle.dump((self.loss_thres, self.trained_arch, self.sampled_history), f)
self.logger.info("#times suggest: {}".format(self.times_suggest))
_tmp_trained_arch = [i['arch'].ravel() for i in self.trained_arch]
# self.logger.info("Unique archs {} in total archs {}".format(len(np.unique(_tmp_trained_arch, axis=0)), len(self.trained_arch)))
estimate_archs_tmp = []
for i in self.trained_arch:
if (i['loss'] < self.loss_thres if self.loss_thres else False):
estimate_archs_tmp.append(i)
self.logger.info("#arch < CKA loss_thres: {}".format(len(estimate_archs_tmp)))

_est_archs_sort = sorted(estimate_archs_tmp, key=lambda d: d['loss'])
estimate_archs = []
if top>len(_est_archs_sort):
self.logger.info('top>all, using all archs.')
for i in range(min(top, len(_est_archs_sort))):
estimate_archs.append(_est_archs_sort[i]['arch'])
if self.space == 'nasbench201':
result = []
if method == 'sum':
all_sum = estimate_archs[0]
for i in estimate_archs[1:]:
all_sum = np.add(all_sum, i)
# print(all_sum)
sum_max = list(np.argmax(all_sum, axis=1))
result = copy.deepcopy(sum_max)
elif method == 'greedy':
path_info =[[[0 for _ in range(5)] for _ in range(5)] for _ in range(6)]
for i in estimate_archs:
for j in range(1, 6):
path_info[j][np.argmax(i[j-1])][np.argmax(i[j])] += 1
_esti_arch_0 = [0]*5
for i in estimate_archs:
_esti_arch_0 = np.add(i[0], _esti_arch_0)

startindex = np.argmax(_esti_arch_0)
path_max = [startindex]
for i in range(1, 6):
# path_max.append(np.argmax(path_info[i][path_max[i-1]]))
# one more step
max_op_sum = np.max(path_info[i][path_max[i-1]])
_tmp_max_idx = []
for j in range(5):
if path_info[i][path_max[i-1]][j] == max_op_sum:
_tmp_max_idx.append(j)
if len(_tmp_max_idx) == 1 or i==5:
path_max.append(np.argmax(path_info[i][path_max[i-1]]))
else:
_next_step = np.array([np.sum(path_info[i+1][j]) for j in _tmp_max_idx])
_chosen_op = _tmp_max_idx[np.argmax(_next_step)]
path_max.append(_chosen_op)
self.logger.info("path info:\n{}".format(str(path_info)))
result = copy.deepcopy(path_max)
_tmp_np = np.array(result)
op_arr = np.zeros((_tmp_np.size, 5))
op_arr[np.arange(_tmp_np.size),_tmp_np] = 1
return op_arr
elif self.space == 'darts':
assert method == 'sum', 'only sum is supported in darts.'
all_sum = estimate_archs[0]
for i in estimate_archs[1:]:
all_sum = np.add(all_sum, i)
if use_softmax:
all_sum = softmax(all_sum)
sum_max = np.argmax(all_sum, axis=1)
start_index = 0
end_index = 0
for i in range(2, 6):
end_index += i
_, top_index = torch.topk(torch.from_numpy(sum_max[start_index:end_index]), 2)
mask = list(set(range(i)) - set(list(top_index.numpy())))
for j in mask:
sum_max[start_index+j] = 7
start_index = end_index
# print(sum_max)
_tmp_np = np.array(sum_max)
op_arr = np.zeros((_tmp_np.size, 8))
op_arr[np.arange(_tmp_np.size),_tmp_np] = 1
return op_arr
elif self.space == 'mb':
assert method == 'sum', 'only sum is supported in mb.'
all_sum = estimate_archs[0]
for i in estimate_archs[1:]:
all_sum = np.add(all_sum, i)
print(all_sum)
if use_softmax:
all_sum = softmax(all_sum)
sum_max = np.argmax(all_sum, axis=1)
print(sum_max)
_tmp_np = np.array(sum_max)
op_arr = np.zeros((_tmp_np.size, 7))
op_arr[np.arange(_tmp_np.size),_tmp_np] = 1
return op_arr

+ 1
- 0
xnas/search_algorithm/RMINAS/sampler/available_archs.txt
File diff suppressed because it is too large
View File


+ 100
- 0
xnas/search_algorithm/RMINAS/sampler/sampling.py View File

@@ -0,0 +1,100 @@
import random
import numpy as np
from scipy import stats

true_list = []
with open('xnas/search_algorithm/RMINAS/sampler/available_archs.txt', 'r') as f:
true_list = eval(f.readline())

def random_sampling(times):
sample_list = []
if times > sum(true_list):
print('can only sample {} times.'.format(sum(true_list)))
times = sum(true_list)
for _ in range(times):
i = random.randint(0, 15624)
while (not true_list[i]) or (i in sample_list):
i = random.randint(0, 15624)
sample_list.append(i)
return sample_list

def genostr2array(geno_str):
# |none~0|+|nor_conv_1x1~0|none~1|+|avg_pool_3x3~0|skip_connect~1|nor_conv_3x3~2|
OPS = ["none", "skip_connect", "nor_conv_1x1", "nor_conv_3x3", "avg_pool_3x3"]
_tmp = geno_str.split('|')
_tmp2 = []
for i in range(len(_tmp)):
if i in [1,3,4,6,7,8]:
_tmp2.append(_tmp[i][:-2])
_tmp_np = np.array([0]*6)
for i in range(6):
_tmp_np[i] = OPS.index(_tmp2[i])
_tmp_oh = np.zeros((_tmp_np.size, 5))
_tmp_oh[np.arange(_tmp_np.size),_tmp_np] = 1
return _tmp_oh

def array2genostr(arr):
OPS = ["none", "skip_connect", "nor_conv_1x1", "nor_conv_3x3", "avg_pool_3x3"]
"""[[1. 0. 0. 0. 0.]
[0. 0. 1. 0. 0.]
[1. 0. 0. 0. 0.]
[0. 0. 0. 0. 1.]
[0. 1. 0. 0. 0.]
[0. 0. 0. 1. 0.]]"""
idx = [list(i).index(1.) for i in arr]
op = [OPS[x] for x in idx]
mixed = '|' + op[0] + '~0|+|' + op[1] + '~0|' + op[2] + '~1|+|' + op[3] + '~0|' + op[4] + '~1|' + op[5] + '~2|'
return mixed

def base_transform(n, x):
a=[0,1,2,3,4,5,6,7,8,9,'A','b','C','D','E','F']
b=[]
while True:
s=n//x
y=n%x
b=b+[y]
if s==0:
break
n=s
b.reverse()
zero_arr = [0]*(6-len(b))
return zero_arr+b

def array_morearch(arr, distance):
"""[[1. 0. 0. 0. 0.]
[0. 0. 1. 0. 0.]
[1. 0. 0. 0. 0.]
[0. 0. 0. 0. 1.]
[0. 1. 0. 0. 0.]
[0. 0. 0. 1. 0.]]"""
am = list(arr.argmax(axis=1)) # [0,2,0,4,1,3]
morearch = []
if distance == 1:
for i in range(len(am)):
for j in range(5):
if am[i]!=j:
_tmp = am[:]
_tmp[i] = j
_tmp_np = np.array(_tmp)
_tmp_oh = np.zeros((_tmp_np.size, 5))
_tmp_oh[np.arange(_tmp_np.size),_tmp_np] = 1
morearch.append(_tmp_oh)
else:
for i in range(15625):
arr = base_transform(i, 5)
if distance == 6-sum([arr[i]==am[i] for i in range(6)]):
_tmp_np = np.array(arr)
_tmp_oh = np.zeros((_tmp_np.size, 5))
_tmp_oh[np.arange(_tmp_np.size),_tmp_np] = 1
morearch.append(_tmp_oh)
# morearch.append(arr)
return morearch



# test_arr = np.array([[1., 0., 0., 0., 0.],
# [0., 0., 1., 0., 0.],
# [1., 0., 0., 0., 0.],
# [0., 0., 0., 0., 1.],
# [0., 1., 0., 0., 0.],
# [0., 0., 0., 1., 0.]])

+ 46
- 0
xnas/search_algorithm/RMINAS/sampler/sampling_darts.py View File

@@ -0,0 +1,46 @@
import numpy as np
import torch
from collections import namedtuple

basic_op_list = ['max_pool_3x3', 'avg_pool_3x3', 'skip_connect', 'sep_conv_3x3', 'sep_conv_5x5', 'dil_conv_3x3', 'dil_conv_5x5', 'none']
Genotype = namedtuple('Genotype', 'normal normal_concat reduce reduce_concat')

def random_suggest():
sample = np.zeros((14, 7)) # 14 edges, 7 ops
node_ids = np.asarray([np.random.choice(range(x,x+i+2), size=2, replace=False) for i, x in enumerate((0,2,5,9))]).ravel() # choose which 8 edges to keep
op = np.random.multinomial(1,[1/7.]*7, size=8) # one of the 7 meaningful ops for each of the 8 chosen edges
sample[node_ids] = op
return sample

def ransug2alpha(suggest_sample):
b = np.c_[suggest_sample, np.zeros(14)]
return torch.from_numpy(np.r_[b,b])

def geno2147array(genotype):
"""
Genotype(normal=[[('max_pool_3x3', 0), ('dil_conv_3x3', 1)], [('max_pool_3x3', 0), ('dil_conv_5x5', 1)], [('avg_pool_3x3', 1), ('dil_conv_3x3', 0)], [('dil_conv_3x3', 0), ('sep_conv_3x3', 3)]], normal_concat=range(2, 6), reduce=[[('max_pool_3x3', 0), ('dil_conv_3x3', 1)], [('max_pool_3x3', 0), ('dil_conv_5x5', 1)], [('avg_pool_3x3', 1), ('dil_conv_3x3', 0)], [('dil_conv_3x3', 0), ('sep_conv_3x3', 3)]], reduce_concat=range(2, 6))
"""
genotype = eval(genotype)
sample = np.zeros([28, 7])
norm_gene = genotype[0]
reduce_gene = genotype[2]
num_select = list(range(2, 6))
for j, _gene in enumerate([norm_gene, reduce_gene]):
for i, node in enumerate(_gene):
for op in node:
op_name = op[0]
op_id = op[1]
if i == 0:
true_id = op_id + j * 14
else:
if i == 1:
_temp = num_select[0]
else:
_temp = sum(num_select[0:i])
true_id = op_id + _temp + j * 14
sample[true_id, basic_op_list.index(op_name)] = 1
# for i in range(28):
# if np.sum(sample[i, :]) == 0:
# sample[i, 7] = 1
return sample[0:14]

+ 253
- 0
xnas/search_algorithm/RMINAS/teacher_model/fbresnet_imagenet/fbresnet.py View File

@@ -0,0 +1,253 @@
"""code from https://github.com/Cadene/pretrained-models.pytorch.git"""

from __future__ import print_function, division, absolute_import
import torch.nn as nn
import torch.nn.functional as F
import math
import torch.utils.model_zoo as model_zoo
import torch

WEIGHT_PATH = 'teacher_model/fbresnet_imagenet/fbresnet152.pth'

__all__ = ['FBResNet',
#'fbresnet18', 'fbresnet34', 'fbresnet50', 'fbresnet101',
'fbresnet152']

pretrained_settings = {
'fbresnet152': {
'imagenet': {
'url': 'http://data.lip6.fr/cadene/pretrainedmodels/fbresnet152-2e20f6b4.pth',
'input_space': 'RGB',
'input_size': [3, 224, 224],
'input_range': [0, 1],
'mean': [0.485, 0.456, 0.406],
'std': [0.229, 0.224, 0.225],
'num_classes': 1000
}
}
}


def conv3x3(in_planes, out_planes, stride=1):
"3x3 convolution with padding"
return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride,
padding=1, bias=True)


class BasicBlock(nn.Module):
expansion = 1

def __init__(self, inplanes, planes, stride=1, downsample=None):
super(BasicBlock, self).__init__()
self.conv1 = conv3x3(inplanes, planes, stride)
self.bn1 = nn.BatchNorm2d(planes)
self.relu = nn.ReLU(inplace=True)
self.conv2 = conv3x3(planes, planes)
self.bn2 = nn.BatchNorm2d(planes)
self.downsample = downsample
self.stride = stride

def forward(self, x):
residual = x

out = self.conv1(x)
out = self.bn1(out)
out = self.relu(out)

out = self.conv2(out)
out = self.bn2(out)

if self.downsample is not None:
residual = self.downsample(x)

out += residual
out = self.relu(out)

return out


class Bottleneck(nn.Module):
expansion = 4

def __init__(self, inplanes, planes, stride=1, downsample=None):
super(Bottleneck, self).__init__()
self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=True)
self.bn1 = nn.BatchNorm2d(planes)
self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride,
padding=1, bias=True)
self.bn2 = nn.BatchNorm2d(planes)
self.conv3 = nn.Conv2d(planes, planes * 4, kernel_size=1, bias=True)
self.bn3 = nn.BatchNorm2d(planes * 4)
self.relu = nn.ReLU(inplace=True)
self.downsample = downsample
self.stride = stride

def forward(self, x):
residual = x

out = self.conv1(x)
out = self.bn1(out)
out = self.relu(out)

out = self.conv2(out)
out = self.bn2(out)
out = self.relu(out)

out = self.conv3(out)
out = self.bn3(out)

if self.downsample is not None:
residual = self.downsample(x)

out += residual
out = self.relu(out)

return out

class FBResNet(nn.Module):

def __init__(self, block, layers, num_classes=1000):
self.inplanes = 64
# Special attributs
self.input_space = None
self.input_size = (299, 299, 3)
self.mean = None
self.std = None
super(FBResNet, self).__init__()
# Modules
self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3,
bias=True)
self.bn1 = nn.BatchNorm2d(64)
self.relu = nn.ReLU(inplace=True)
self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
self.layer1 = self._make_layer(block, 64, layers[0])
self.layer2 = self._make_layer(block, 128, layers[1], stride=2)
self.layer3 = self._make_layer(block, 256, layers[2], stride=2)
self.layer4 = self._make_layer(block, 512, layers[3], stride=2)
self.last_linear = nn.Linear(512 * block.expansion, num_classes)

for m in self.modules():
if isinstance(m, nn.Conv2d):
n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
m.weight.data.normal_(0, math.sqrt(2. / n))
elif isinstance(m, nn.BatchNorm2d):
m.weight.data.fill_(1)
m.bias.data.zero_()

def _make_layer(self, block, planes, blocks, stride=1):
downsample = None
if stride != 1 or self.inplanes != planes * block.expansion:
downsample = nn.Sequential(
nn.Conv2d(self.inplanes, planes * block.expansion,
kernel_size=1, stride=stride, bias=True),
nn.BatchNorm2d(planes * block.expansion),
)

layers = []
layers.append(block(self.inplanes, planes, stride, downsample))
self.inplanes = planes * block.expansion
for i in range(1, blocks):
layers.append(block(self.inplanes, planes))

return nn.Sequential(*layers)

def features(self, input):
x = self.conv1(input)
self.conv1_input = x.clone()
x = self.bn1(x)
x = self.relu(x)
x = self.maxpool(x)
x = self.layer1(x)
x = self.layer2(x)
x = self.layer3(x)
x = self.layer4(x)
return x
def features_extractor(self, input):
features = []
x = self.conv1(input)
self.conv1_input = x.clone()
x = self.bn1(x)
x = self.relu(x)
x = self.maxpool(x)
features.append(x)
x = self.layer1(x)
x = self.layer2(x)
features.append(x)
x = self.layer3(x)
x = self.layer4(x)
features.append(x)
return features

def logits(self, features):
adaptiveAvgPoolWidth = features.shape[2]
x = F.avg_pool2d(features, kernel_size=adaptiveAvgPoolWidth)
x = x.view(x.size(0), -1)
x = self.last_linear(x)
return x

def forward(self, input):
x = self.features(input)
x = self.logits(x)
return x


def fbresnet18(num_classes=1000):
"""Constructs a ResNet-18 model.

Args:
pretrained (bool): If True, returns a model pre-trained on ImageNet
"""
model = FBResNet(BasicBlock, [2, 2, 2, 2], num_classes=num_classes)
return model


def fbresnet34(num_classes=1000):
"""Constructs a ResNet-34 model.

Args:
pretrained (bool): If True, returns a model pre-trained on ImageNet
"""
model = FBResNet(BasicBlock, [3, 4, 6, 3], num_classes=num_classes)
return model


def fbresnet50(num_classes=1000):
"""Constructs a ResNet-50 model.

Args:
pretrained (bool): If True, returns a model pre-trained on ImageNet
"""
model = FBResNet(Bottleneck, [3, 4, 6, 3], num_classes=num_classes)
return model


def fbresnet101(num_classes=1000):
"""Constructs a ResNet-101 model.

Args:
pretrained (bool): If True, returns a model pre-trained on ImageNet
"""
model = FBResNet(Bottleneck, [3, 4, 23, 3], num_classes=num_classes)
return model


def fbresnet152(num_classes=1000, pretrained='imagenet'):
"""Constructs a ResNet-152 model.

Args:
pretrained (bool): If True, returns a model pre-trained on ImageNet
"""
model = FBResNet(Bottleneck, [3, 8, 36, 3], num_classes=num_classes)
if pretrained is not None:
settings = pretrained_settings['fbresnet152'][pretrained]
assert num_classes == settings['num_classes'], \
"num_classes should be {}, but is {}".format(settings['num_classes'], num_classes)
model.load_state_dict(torch.load(WEIGHT_PATH))
model.input_space = settings['input_space']
model.input_size = settings['input_size']
model.input_range = settings['input_range']
model.mean = settings['mean']
model.std = settings['std']
return model

+ 133
- 0
xnas/search_algorithm/RMINAS/teacher_model/resnet101_cifar100/resnet.py View File

@@ -0,0 +1,133 @@
"""
Reference:
[1] Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun
Deep Residual Learning for Image Recognition. arXiv:1512.03385
"""

import torch
import torch.nn as nn

class BasicBlock(nn.Module):
expansion = 1

def __init__(self, in_channels, out_channels, stride=1):
super().__init__()

self.residual_function = nn.Sequential(
nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=stride, padding=1, bias=False),
nn.BatchNorm2d(out_channels),
nn.ReLU(inplace=True),
nn.Conv2d(out_channels, out_channels * BasicBlock.expansion, kernel_size=3, padding=1, bias=False),
nn.BatchNorm2d(out_channels * BasicBlock.expansion)
)

self.shortcut = nn.Sequential()
if stride != 1 or in_channels != BasicBlock.expansion * out_channels:
self.shortcut = nn.Sequential(
nn.Conv2d(in_channels, out_channels * BasicBlock.expansion, kernel_size=1, stride=stride, bias=False),
nn.BatchNorm2d(out_channels * BasicBlock.expansion)
)

def forward(self, x):
return nn.ReLU(inplace=True)(self.residual_function(x) + self.shortcut(x))

class BottleNeck(nn.Module):
expansion = 4
def __init__(self, in_channels, out_channels, stride=1):
super().__init__()
self.residual_function = nn.Sequential(
nn.Conv2d(in_channels, out_channels, kernel_size=1, bias=False),
nn.BatchNorm2d(out_channels),
nn.ReLU(inplace=True),
nn.Conv2d(out_channels, out_channels, stride=stride, kernel_size=3, padding=1, bias=False),
nn.BatchNorm2d(out_channels),
nn.ReLU(inplace=True),
nn.Conv2d(out_channels, out_channels * BottleNeck.expansion, kernel_size=1, bias=False),
nn.BatchNorm2d(out_channels * BottleNeck.expansion),
)

self.shortcut = nn.Sequential()

if stride != 1 or in_channels != out_channels * BottleNeck.expansion:
self.shortcut = nn.Sequential(
nn.Conv2d(in_channels, out_channels * BottleNeck.expansion, stride=stride, kernel_size=1, bias=False),
nn.BatchNorm2d(out_channels * BottleNeck.expansion)
)

def forward(self, x):
return nn.ReLU(inplace=True)(self.residual_function(x) + self.shortcut(x))

class ResNet(nn.Module):

def __init__(self, block, num_block, num_classes=100):
super().__init__()

self.in_channels = 64

self.conv1 = nn.Sequential(
nn.Conv2d(3, 64, kernel_size=3, padding=1, bias=False),
nn.BatchNorm2d(64),
nn.ReLU(inplace=True))
        # we use a different input size than the original paper,
        # so conv2_x's stride is 1
self.conv2_x = self._make_layer(block, 64, num_block[0], 1)
self.conv3_x = self._make_layer(block, 128, num_block[1], 2)
self.conv4_x = self._make_layer(block, 256, num_block[2], 2)
self.conv5_x = self._make_layer(block, 512, num_block[3], 2)
self.avg_pool = nn.AdaptiveAvgPool2d((1, 1))
self.fc = nn.Linear(512 * block.expansion, num_classes)

def _make_layer(self, block, out_channels, num_blocks, stride):
strides = [stride] + [1] * (num_blocks - 1)
layers = []
for stride in strides:
layers.append(block(self.in_channels, out_channels, stride))
self.in_channels = out_channels * block.expansion

return nn.Sequential(*layers)

def forward(self, x):
output = self.conv1(x)
output = self.conv2_x(output)
output = self.conv3_x(output)
output = self.conv4_x(output)
output = self.conv5_x(output)
output = self.avg_pool(output)
output = output.view(output.size(0), -1)
output = self.fc(output)

return output
def feature_extractor(self, x):
features = []
output = self.conv1(x)
output = self.conv2_x(output)
features.append(output)
output = self.conv3_x(output)
output = self.conv4_x(output)
features.append(output)
output = self.conv5_x(output)
features.append(output)
# output = self.avg_pool(output)
# output = output.view(output.size(0), -1)
# output = self.fc(output)

return features

def resnet18():
return ResNet(BasicBlock, [2, 2, 2, 2])

def resnet34():
return ResNet(BasicBlock, [3, 4, 6, 3])

def resnet50():
return ResNet(BottleNeck, [3, 4, 6, 3])

def resnet101():
return ResNet(BottleNeck, [3, 4, 23, 3])

def resnet152():
return ResNet(BottleNeck, [3, 8, 36, 3])




+ 173
- 0
xnas/search_algorithm/RMINAS/teacher_model/resnet20_cifar10/resnet.py View File

@@ -0,0 +1,173 @@
'''
Properly implemented ResNet-s for CIFAR10 as described in paper [1].

The implementation and structure of this file are heavily influenced by [2],
which is implemented for ImageNet and doesn't have option A for identity.
Moreover, most of the implementations on the web are copy-pasted from
torchvision's resnet and have the wrong number of params.

Proper ResNet-s for CIFAR10 (for fair comparison, etc.) have the following
number of layers and parameters:

name | layers | params
ResNet20 | 20 | 0.27M
ResNet32 | 32 | 0.46M
ResNet44 | 44 | 0.66M
ResNet56 | 56 | 0.85M
ResNet110 | 110 | 1.7M
ResNet1202| 1202 | 19.4M

which this implementation indeed has.

Reference:
[1] Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun
Deep Residual Learning for Image Recognition. arXiv:1512.03385
[2] https://github.com/pytorch/vision/blob/master/torchvision/models/resnet.py

If you use this implementation in your work, please don't forget to mention the
author, Yerlan Idelbayev.
'''
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.nn.init as init

from torch.autograd import Variable

__all__ = ['ResNet', 'resnet20', 'resnet32', 'resnet44', 'resnet56', 'resnet110', 'resnet1202']

def _weights_init(m):
classname = m.__class__.__name__
#print(classname)
if isinstance(m, nn.Linear) or isinstance(m, nn.Conv2d):
init.kaiming_normal_(m.weight)

class LambdaLayer(nn.Module):
def __init__(self, lambd):
super(LambdaLayer, self).__init__()
self.lambd = lambd

def forward(self, x):
return self.lambd(x)


class BasicBlock(nn.Module):
expansion = 1

def __init__(self, in_planes, planes, stride=1, option='A'):
super(BasicBlock, self).__init__()
self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=3, stride=stride, padding=1, bias=False)
self.bn1 = nn.BatchNorm2d(planes)
self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=1, padding=1, bias=False)
self.bn2 = nn.BatchNorm2d(planes)

self.shortcut = nn.Sequential()
if stride != 1 or in_planes != planes:
if option == 'A':
"""
For CIFAR10 ResNet paper uses option A.
"""
self.shortcut = LambdaLayer(lambda x:
F.pad(x[:, :, ::2, ::2], (0, 0, 0, 0, planes//4, planes//4), "constant", 0))
elif option == 'B':
self.shortcut = nn.Sequential(
nn.Conv2d(in_planes, self.expansion * planes, kernel_size=1, stride=stride, bias=False),
nn.BatchNorm2d(self.expansion * planes)
)

def forward(self, x):
out = F.relu(self.bn1(self.conv1(x)))
out = self.bn2(self.conv2(out))
out += self.shortcut(x)
out = F.relu(out)
return out


class ResNet(nn.Module):
def __init__(self, block, num_blocks, num_classes=10):
super(ResNet, self).__init__()
self.in_planes = 16

self.conv1 = nn.Conv2d(3, 16, kernel_size=3, stride=1, padding=1, bias=False)
self.bn1 = nn.BatchNorm2d(16)
self.layer1 = self._make_layer(block, 16, num_blocks[0], stride=1)
self.layer2 = self._make_layer(block, 32, num_blocks[1], stride=2)
self.layer3 = self._make_layer(block, 64, num_blocks[2], stride=2)
self.linear = nn.Linear(64, num_classes)

self.apply(_weights_init)

def _make_layer(self, block, planes, num_blocks, stride):
strides = [stride] + [1]*(num_blocks-1)
layers = []
for stride in strides:
layers.append(block(self.in_planes, planes, stride))
self.in_planes = planes * block.expansion

return nn.Sequential(*layers)

def forward(self, x):
out = F.relu(self.bn1(self.conv1(x)))
out = self.layer1(out)
out = self.layer2(out)
out = self.layer3(out)
out = F.avg_pool2d(out, out.size()[3])
out = out.view(out.size(0), -1)
out = self.linear(out)
return out

def feature_extractor(self, x):
features = []
out = F.relu(self.bn1(self.conv1(x)))
out = self.layer1(out)
features.append(out)
out = self.layer2(out)
features.append(out)
out = self.layer3(out)
features.append(out)
# out = F.avg_pool2d(out, out.size()[3])
# out = out.view(out.size(0), -1)
# out = self.linear(out)
return features


def resnet20():
return ResNet(BasicBlock, [3, 3, 3])


def resnet32():
return ResNet(BasicBlock, [5, 5, 5])


def resnet44():
return ResNet(BasicBlock, [7, 7, 7])


def resnet56():
return ResNet(BasicBlock, [9, 9, 9])


def resnet110():
return ResNet(BasicBlock, [18, 18, 18])


def resnet1202():
return ResNet(BasicBlock, [200, 200, 200])


def test(net):
import numpy as np
total_params = 0

for x in filter(lambda p: p.requires_grad, net.parameters()):
total_params += np.prod(x.data.numpy().shape)
print("Total number of params", total_params)
print("Total layers", len(list(filter(lambda p: p.requires_grad and len(p.data.size())>1, net.parameters()))))


if __name__ == "__main__":
for net_name in __all__:
if net_name.startswith('resnet'):
print(net_name)
test(globals()[net_name]())
print()
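
    # Illustrative extra check (not part of the original PR): RMINAS consumes
    # feature_extractor(), which returns the outputs of the three stages of the
    # CIFAR-10 teacher. Shapes below assume a 32x32 CIFAR-sized input batch.
    net = resnet20()
    net.eval()
    with torch.no_grad():
        feats = net.feature_extractor(torch.randn(2, 3, 32, 32))
    print([tuple(f.shape) for f in feats])  # (2,16,32,32), (2,32,16,16), (2,64,8,8)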

+ 182
- 0
xnas/search_algorithm/RMINAS/utils/RMI_torch.py View File

@@ -0,0 +1,182 @@
import numpy as np
import torch
import sys
import pdb

def gram_linear(x):  # this can be used for both numpy arrays and tensors
"""Compute Gram (kernel) matrix for a linear kernel.

Args:
x: A num_examples x num_features matrix of features.

Returns:
A num_examples x num_examples Gram matrix of examples.
"""
return x.dot(x.T)

def tensor_gram_linear(x):
return torch.mm(x, x.T)

def gram_rbf(x, threshold=1.0):
"""Compute Gram (kernel) matrix for an RBF kernel.

Args:
x: A num_examples x num_features matrix of features.
threshold: Fraction of median Euclidean distance to use as RBF kernel
bandwidth. (This is the heuristic we use in the paper. There are other
possible ways to set the bandwidth; we didn't try them.)

Returns:
A num_examples x num_examples Gram matrix of examples.
"""
dot_products = x.dot(x.T)
sq_norms = np.diag(dot_products)
sq_distances = -2 * dot_products + sq_norms[:, None] + sq_norms[None, :]
sq_median_distance = np.median(sq_distances)
return np.exp(-sq_distances / (2 * threshold ** 2 * sq_median_distance))


def center_gram(gram, unbiased=False):
"""Center a symmetric Gram matrix.

    This is equivalent to centering the (possibly infinite-dimensional) features
induced by the kernel before computing the Gram matrix.

Args:
gram: A num_examples x num_examples symmetric matrix.
unbiased: Whether to adjust the Gram matrix in order to compute an unbiased
estimate of HSIC. Note that this estimator may be negative.

Returns:
A symmetric matrix with centered columns and rows.
"""
if not np.allclose(gram, gram.T):
raise ValueError('Input must be a symmetric matrix.')
gram = gram.copy()

if unbiased:
# This formulation of the U-statistic, from Szekely, G. J., & Rizzo, M.
# L. (2014). Partial distance correlation with methods for dissimilarities.
# The Annals of Statistics, 42(6), 2382-2412, seems to be more numerically
# stable than the alternative from Song et al. (2007).
n = gram.shape[0]
np.fill_diagonal(gram, 0)
means = np.sum(gram, 0, dtype=np.float64) / (n - 2)
means -= np.sum(means) / (2 * (n - 1))
gram -= means[:, None]
gram -= means[None, :]
np.fill_diagonal(gram, 0)
else:#(256, 256)
means = np.mean(gram, 0, dtype=np.float64) #(256,)
means -= np.mean(means) / 2 #(256,)
gram -= means[:, None] ##(256, 256)
gram -= means[None, :] ##(256, 256)

return gram

def tensor_center_gram(gram, unbiased=False):
# if not torch.allclose(gram, gram.T):
# raise ValueError('Input must be a symmetric matrix.')
if unbiased:
n = gram.shape[0]
        gram.fill_diagonal_(0)  # in-place: torch tensors only provide fill_diagonal_
means = torch.sum(gram, 0, dtype=torch.float64) / (n-2)
means -= torch.sum(means) / (2 * (n-1))
gram -= means[:, None]
gram -= means[None, :]
        gram.fill_diagonal_(0)
else:
means = torch.mean(gram, 0, dtype=torch.float64)
means -= torch.mean(means) / 2
gram -= means[:, None]
gram -= means[None, :]
return gram

def cka(gram_x, gram_y, debiased=False):
"""Compute CKA.

Args:
gram_x: A num_examples x num_examples Gram matrix.
gram_y: A num_examples x num_examples Gram matrix.
debiased: Use unbiased estimator of HSIC. CKA may still be biased.

Returns:
The value of CKA between X and Y.
"""
gram_x = center_gram(gram_x, unbiased=debiased)
gram_y = center_gram(gram_y, unbiased=debiased)

# Note: To obtain HSIC, this should be divided by (n-1)**2 (biased variant) or
# n*(n-3) (unbiased variant), but this cancels for CKA.
scaled_hsic = gram_x.ravel().dot(gram_y.ravel())

normalization_x = np.linalg.norm(gram_x)
normalization_y = np.linalg.norm(gram_y)
return scaled_hsic / (normalization_x * normalization_y)

def _ravel(gram):
return torch.reshape(gram, (-1,))

def tensor_cka(gram_x, gram_y, debiased=False):
gram_x = tensor_center_gram(gram_x, unbiased=debiased)
gram_y = tensor_center_gram(gram_y, unbiased=debiased)
# scaled_hsic = gram_x.ravel().dot(gram_y.ravel())
scaled_hsic = _ravel(gram_x).dot(_ravel(gram_y)) # works under pytorch 1.5. Same for below.
# normalization_x = torch.linalg.norm(gram_x)
# normalization_y = torch.linalg.norm(gram_y)
normalization_x = torch.norm(gram_x)
normalization_y = torch.norm(gram_y)
return scaled_hsic / (normalization_x * normalization_y)

def _debiased_dot_product_similarity_helper(
xty, sum_squared_rows_x, sum_squared_rows_y, squared_norm_x, squared_norm_y,
n):
"""Helper for computing debiased dot product similarity (i.e. linear HSIC)."""
# This formula can be derived by manipulating the unbiased estimator from
# Song et al. (2007).
return (
xty - n / (n - 2.) * sum_squared_rows_x.dot(sum_squared_rows_y)
+ squared_norm_x * squared_norm_y / ((n - 1) * (n - 2)))

def feature_space_linear_cka(features_x, features_y, debiased=False):
"""Compute CKA with a linear kernel, in feature space.

This is typically faster than computing the Gram matrix when there are fewer
features than examples.

Args:
features_x: A num_examples x num_features matrix of features.
features_y: A num_examples x num_features matrix of features.
debiased: Use unbiased estimator of dot product similarity. CKA may still be
biased. Note that this estimator may be negative.

Returns:
The value of CKA between X and Y.
"""
features_x = features_x - np.mean(features_x, 0, keepdims=True)
features_y = features_y - np.mean(features_y, 0, keepdims=True)

dot_product_similarity = np.linalg.norm(features_x.T.dot(features_y)) ** 2
normalization_x = np.linalg.norm(features_x.T.dot(features_x))
normalization_y = np.linalg.norm(features_y.T.dot(features_y))

if debiased:
n = features_x.shape[0]
# Equivalent to np.sum(features_x ** 2, 1) but avoids an intermediate array.
sum_squared_rows_x = np.einsum('ij,ij->i', features_x, features_x)
sum_squared_rows_y = np.einsum('ij,ij->i', features_y, features_y)
squared_norm_x = np.sum(sum_squared_rows_x)
squared_norm_y = np.sum(sum_squared_rows_y)

dot_product_similarity = _debiased_dot_product_similarity_helper(
dot_product_similarity, sum_squared_rows_x, sum_squared_rows_y,
squared_norm_x, squared_norm_y, n)
normalization_x = np.sqrt(_debiased_dot_product_similarity_helper(
normalization_x ** 2, sum_squared_rows_x, sum_squared_rows_x,
squared_norm_x, squared_norm_x, n))
normalization_y = np.sqrt(_debiased_dot_product_similarity_helper(
normalization_y ** 2, sum_squared_rows_y, sum_squared_rows_y,
squared_norm_y, squared_norm_y, n))

return dot_product_similarity / (normalization_x * normalization_y)
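

if __name__ == "__main__":
    # Minimal self-check (illustrative, not part of the original PR): linear-kernel
    # CKA computed with the numpy path and the torch path should agree, and CKA of
    # two strongly correlated feature matrices should be close to 1.
    rng = np.random.RandomState(0)
    feats_a = rng.randn(64, 32)
    feats_b = feats_a + 0.1 * rng.randn(64, 32)

    cka_np = cka(gram_linear(feats_a), gram_linear(feats_b))
    cka_th = tensor_cka(tensor_gram_linear(torch.from_numpy(feats_a)),
                        tensor_gram_linear(torch.from_numpy(feats_b)))
    print('numpy CKA : {:.4f}'.format(cka_np))
    print('torch CKA : {:.4f}'.format(cka_th.item()))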

+ 207
- 0
xnas/search_algorithm/RMINAS/utils/get_accuracy.ipynb View File

@@ -0,0 +1,207 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"try to create the NAS-Bench-201 api from ./NAS-Bench-201-v1_0-e61699.pth\n",
"done.\n"
]
}
],
"source": [
"import copy\n",
"from scipy import stats\n",
"import numpy as np\n",
"import sampler.sampling as sampling\n",
"from nas_201_api import NASBench201API as api\n",
"\n",
"nb201_api = api('./NAS-Bench-201-v1_0-e61699.pth')\n",
"print('done.')"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"\n",
"def get_acc_valid(dataset, index):\n",
" \"\"\"dataset in 'cifar10', 'cifar100', 'imgagenet'.\"\"\"\n",
" strings = nb201_api.query_by_arch(nb201_api.arch(index))\n",
" strings = strings.split('\\n')\n",
" if dataset == 'cifar10':\n",
" cifar10_valid_res = strings[3]\n",
" startpoint = cifar10_valid_res.find('valid : [loss = ') + len('valid : [loss = ')\n",
" toppoint = cifar10_valid_res.find('top1 = ', startpoint) + len('top1 = ')\n",
" endpoint = cifar10_valid_res.find('%]', toppoint)\n",
" ans = cifar10_valid_res[toppoint:endpoint]\n",
" return float(ans)\n",
" elif dataset == 'cifar100':\n",
" cifar100_res = strings[7]\n",
" startpoint = cifar100_res.find('valid : [loss = ') + len('valid : [loss = ')\n",
" toppoint = cifar100_res.find('top1 = ', startpoint) + len('top1 = ')\n",
" endpoint = cifar100_res.find('%]', toppoint)\n",
" ans = cifar100_res[toppoint:endpoint]\n",
" return float(ans)\n",
" elif dataset == 'imagenet':\n",
" imagenet_res = strings[9]\n",
" startpoint = imagenet_res.find('valid : [loss = ') + len('valid : [loss = ')\n",
" toppoint = imagenet_res.find('top1 = ', startpoint) + len('top1 = ')\n",
" endpoint = imagenet_res.find('%]', toppoint)\n",
" ans = imagenet_res[toppoint:endpoint]\n",
" return float(ans)\n",
" else:\n",
" print('dataset error')\n",
" exit(1)\n",
"\n",
"def get_acc_test(dataset, index):\n",
" \"\"\"dataset in 'cifar10', 'cifar100', 'imgagenet'.\"\"\"\n",
" strings = nb201_api.query_by_arch(nb201_api.arch(index))\n",
" strings = strings.split('\\n')\n",
" if dataset == 'cifar10':\n",
" cifar10_test_res = strings[5]\n",
" startpoint = cifar10_test_res.find('test : [loss = ') + len('test : [loss = ')\n",
" toppoint = cifar10_test_res.find('top1 = ', startpoint) + len('top1 = ')\n",
" endpoint = cifar10_test_res.find('%]', toppoint)\n",
" ans = cifar10_test_res[toppoint:endpoint]\n",
" return float(ans)\n",
" elif dataset == 'cifar100':\n",
" cifar100_res = strings[7]\n",
" startpoint = cifar100_res.find('test : [loss = ') + len('test : [loss = ')\n",
" toppoint = cifar100_res.find('top1 = ', startpoint) + len('top1 = ')\n",
" endpoint = cifar100_res.find('%]', toppoint)\n",
" ans = cifar100_res[toppoint:endpoint]\n",
" return float(ans)\n",
" elif dataset == 'imagenet':\n",
" imagenet_res = strings[9]\n",
" startpoint = imagenet_res.find('test : [loss = ') + len('test : [loss = ')\n",
" toppoint = imagenet_res.find('top1 = ', startpoint) + len('top1 = ')\n",
" endpoint = imagenet_res.find('%]', toppoint)\n",
" ans = imagenet_res[toppoint:endpoint]\n",
" return float(ans)\n",
" else:\n",
" print('dataset error')\n",
" exit(1)\n"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"cifar10_valid = []\n",
"cifar100_valid = []\n",
"imagenet_valid = []\n",
"\n",
"cifar10_test = []\n",
"cifar100_test = []\n",
"imagenet_test = []\n",
"\n",
"for i in range(15625):\n",
" cifar10_valid.append(get_acc_valid('cifar10', i))\n",
" cifar100_valid.append(get_acc_valid('cifar100', i))\n",
" imagenet_valid.append(get_acc_valid('imagenet', i))\n",
" cifar10_test.append(get_acc_test('cifar10', i))\n",
" cifar100_test.append(get_acc_test('cifar100', i))\n",
" imagenet_test.append(get_acc_test('imagenet', i))\n",
" \n",
"\n",
"cifar10_valid_sort = copy.deepcopy(cifar10_valid)\n",
"cifar10_valid_sort.sort(reverse=True)\n",
"cifar100_valid_sort = copy.deepcopy(cifar100_valid)\n",
"cifar100_valid_sort.sort(reverse=True)\n",
"imagenet_valid_sort = copy.deepcopy(imagenet_valid)\n",
"imagenet_valid_sort.sort(reverse=True)\n",
"\n",
"cifar10_test_sort = copy.deepcopy(cifar10_test)\n",
"cifar10_test_sort.sort(reverse=True)\n",
"cifar100_test_sort = copy.deepcopy(cifar100_test)\n",
"cifar100_test_sort.sort(reverse=True)\n",
"imagenet_test_sort = copy.deepcopy(imagenet_test)\n",
"imagenet_test_sort.sort(reverse=True)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"valid\n",
"acc_cifar10: 91.22, rank_cifar10: 43\n",
"acc_cifar100: 71.97, rank_cifar100: 45\n",
"acc_imgnet16: 45.59, rank_imgnet16: 71\n",
"test\n",
"acc_cifar10: 93.98, rank_cifar10: 48\n",
"acc_cifar100: 71.69, rank_cifar100: 79\n",
"acc_imgnet16: 45.82, rank_imgnet16: 66\n"
]
}
],
"source": [
"res = [2,3,3,1,0,3]\n",
"\n",
"\n",
"import numpy as np\n",
"\n",
"def array2genostr(arr):\n",
" OPS = [\"none\", \"skip_connect\", \"nor_conv_1x1\", \"nor_conv_3x3\", \"avg_pool_3x3\"]\n",
" idx = [list(i).index(1.) for i in arr]\n",
" op = [OPS[x] for x in idx]\n",
" mixed = '|' + op[0] + '~0|+|' + op[1] + '~0|' + op[2] + '~1|+|' + op[3] + '~0|' + op[4] + '~1|' + op[5] + '~2|'\n",
" return mixed\n",
"\n",
"_tmp_np = np.array(res)\n",
"_tmp_oh = np.zeros((_tmp_np.size, 5))\n",
"_tmp_oh[np.arange(_tmp_np.size),_tmp_np] = 1\n",
"# print(_tmp_oh)\n",
"geno_str = array2genostr(_tmp_oh)\n",
"# print(geno_str)\n",
"index = nb201_api.query_index_by_arch(geno_str)\n",
"# print(index)\n",
"\n",
"print('valid')\n",
"print('acc_cifar10: {}, rank_cifar10: {}'.format(cifar10_valid[index], cifar10_valid_sort.index(cifar10_valid[index])))\n",
"print('acc_cifar100: {}, rank_cifar100: {}'.format(cifar100_valid[index], cifar100_valid_sort.index(cifar100_valid[index])))\n",
"print('acc_imgnet16: {}, rank_imgnet16: {}'.format(imagenet_valid[index], imagenet_valid_sort.index(imagenet_valid[index])))\n",
"\n",
"print('test')\n",
"print('acc_cifar10: {}, rank_cifar10: {}'.format(cifar10_test[index], cifar10_test_sort.index(cifar10_test[index])))\n",
"print('acc_cifar100: {}, rank_cifar100: {}'.format(cifar100_test[index], cifar100_test_sort.index(cifar100_test[index])))\n",
"print('acc_imgnet16: {}, rank_imgnet16: {}'.format(imagenet_test[index], imagenet_test_sort.index(imagenet_test[index])))\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.9"
}
},
"nbformat": 4,
"nbformat_minor": 4
}

+ 182
- 0
xnas/search_algorithm/RMINAS/utils/imagenet16120_loader.py View File

@@ -0,0 +1,182 @@
import os, sys, torch
import numpy as np
import torchvision.datasets as dset
import torchvision.transforms as transforms
from copy import deepcopy
from PIL import Image

import hashlib
import pickle
import torch.utils.data as data

DATA_PATH = 'data/ImageNet16'

def calculate_md5(fpath, chunk_size=1024 * 1024):
md5 = hashlib.md5()
with open(fpath, "rb") as f:
for chunk in iter(lambda: f.read(chunk_size), b""):
md5.update(chunk)
return md5.hexdigest()


def check_md5(fpath, md5, **kwargs):
return md5 == calculate_md5(fpath, **kwargs)


def check_integrity(fpath, md5=None):
if not os.path.isfile(fpath):
return False
if md5 is None:
return True
else:
return check_md5(fpath, md5)

class ImageNet16(data.Dataset):
# http://image-net.org/download-images
# A Downsampled Variant of ImageNet as an Alternative to the CIFAR datasets
# https://arxiv.org/pdf/1707.08819.pdf

train_list = [
["train_data_batch_1", "27846dcaa50de8e21a7d1a35f30f0e91"],
["train_data_batch_2", "c7254a054e0e795c69120a5727050e3f"],
["train_data_batch_3", "4333d3df2e5ffb114b05d2ffc19b1e87"],
["train_data_batch_4", "1620cdf193304f4a92677b695d70d10f"],
["train_data_batch_5", "348b3c2fdbb3940c4e9e834affd3b18d"],
["train_data_batch_6", "6e765307c242a1b3d7d5ef9139b48945"],
["train_data_batch_7", "564926d8cbf8fc4818ba23d2faac7564"],
["train_data_batch_8", "f4755871f718ccb653440b9dd0ebac66"],
["train_data_batch_9", "bb6dd660c38c58552125b1a92f86b5d4"],
["train_data_batch_10", "8f03f34ac4b42271a294f91bf480f29b"],
]
valid_list = [
["val_data", "3410e3017fdaefba8d5073aaa65e4bd6"],
]

def __init__(self, root, train, transform, use_num_of_class_only=None):
self.root = root
self.transform = transform
self.train = train # training set or valid set
if not self._check_integrity():
raise RuntimeError("Dataset not found or corrupted.")

if self.train:
downloaded_list = self.train_list
else:
downloaded_list = self.valid_list
self.data = []
self.targets = []

# now load the picked numpy arrays
for i, (file_name, checksum) in enumerate(downloaded_list):
file_path = os.path.join(self.root, file_name)
# print ('Load {:}/{:02d}-th : {:}'.format(i, len(downloaded_list), file_path))
with open(file_path, "rb") as f:
if sys.version_info[0] == 2:
entry = pickle.load(f)
else:
entry = pickle.load(f, encoding="latin1")
self.data.append(entry["data"])
self.targets.extend(entry["labels"])
self.data = np.vstack(self.data).reshape(-1, 3, 16, 16)
self.data = self.data.transpose((0, 2, 3, 1)) # convert to HWC
if use_num_of_class_only is not None:
assert (
isinstance(use_num_of_class_only, int)
and use_num_of_class_only > 0
and use_num_of_class_only < 1000
), "invalid use_num_of_class_only : {:}".format(use_num_of_class_only)
new_data, new_targets = [], []
for I, L in zip(self.data, self.targets):
if 1 <= L <= use_num_of_class_only:
new_data.append(I)
new_targets.append(L)
self.data = new_data
self.targets = new_targets

def __repr__(self):
return "{name}({num} images, {classes} classes)".format(
name=self.__class__.__name__,
num=len(self.data),
classes=len(set(self.targets)),
)

def __getitem__(self, index):
img, target = self.data[index], self.targets[index] - 1

img = Image.fromarray(img)

if self.transform is not None:
img = self.transform(img)

return img, target

def __len__(self):
return len(self.data)

def _check_integrity(self):
root = self.root
for fentry in self.train_list + self.valid_list:
filename, md5 = fentry[0], fentry[1]
fpath = os.path.join(root, filename)
if not check_integrity(fpath, md5):
return False
return True

class CUTOUT(object):
def __init__(self, length):
self.length = length

def __repr__(self):
return "{name}(length={length})".format(
name=self.__class__.__name__, **self.__dict__
)

def __call__(self, img):
h, w = img.size(1), img.size(2)
mask = np.ones((h, w), np.float32)
y = np.random.randint(h)
x = np.random.randint(w)

y1 = np.clip(y - self.length // 2, 0, h)
y2 = np.clip(y + self.length // 2, 0, h)
x1 = np.clip(x - self.length // 2, 0, w)
x2 = np.clip(x + self.length // 2, 0, w)

mask[y1:y2, x1:x2] = 0.0
mask = torch.from_numpy(mask)
mask = mask.expand_as(img)
img *= mask
return img

def get_loader(cutout=0, batch_size=32, workers=8):
mean = [x / 255 for x in [122.68, 116.66, 104.01]]
std = [x / 255 for x in [63.22, 61.26, 65.09]]
lists = [
transforms.RandomHorizontalFlip(),
transforms.RandomCrop(16, padding=2),
transforms.ToTensor(),
transforms.Normalize(mean, std),
]
if cutout > 0:
lists += [CUTOUT(cutout)]
train_transform = transforms.Compose(lists)
test_transform = transforms.Compose(
[transforms.ToTensor(), transforms.Normalize(mean, std)]
)
# xshape = (1, 3, 16, 16)

train_data = ImageNet16(DATA_PATH, True, train_transform, 120)
test_data = ImageNet16(DATA_PATH, False, test_transform, 120)
assert len(train_data) == 151700 and len(test_data) == 6000
# assert len(train_data) == 151700
num_classes = 120
train_loader = torch.utils.data.DataLoader(train_data, batch_size, shuffle=True, num_workers=workers)
valid_loader = torch.utils.data.DataLoader(test_data, batch_size, num_workers=workers)
# return train_data, test_data, xshape, num_classes
return train_loader, valid_loader
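
if __name__ == "__main__":
    # Illustrative usage (not part of the original PR); it assumes the downsampled
    # ImageNet16 batch files are already present under DATA_PATH.
    train_loader, valid_loader = get_loader(cutout=0, batch_size=32, workers=4)
    images, labels = next(iter(train_loader))
    print(images.shape, labels.shape)  # torch.Size([32, 3, 16, 16]) torch.Size([32])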

+ 91
- 0
xnas/search_algorithm/RMINAS/utils/loader.py View File

@@ -0,0 +1,91 @@
import os
import random
import torch
import torchvision.transforms as transforms
from torchvision.datasets import CIFAR10
from torchvision.datasets import CIFAR100
from torchvision.datasets import ImageFolder


def cifar10_data(batchsize, workers):
normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
std=[0.229, 0.224, 0.225])

_train_loader = torch.utils.data.DataLoader(
CIFAR10(root='./data', train=True, transform=transforms.Compose([
transforms.RandomHorizontalFlip(),
transforms.RandomCrop(32, 4),
transforms.ToTensor(),
normalize,
]), download=True),
batch_size=batchsize*16, shuffle=True,
num_workers=workers, pin_memory=True)

target_i = random.randint(0, len(_train_loader)-1)
more_data_X, more_data_y = None, None
for i, (more_data_X, more_data_y) in enumerate(_train_loader):
if i == target_i:
break
more_data_X = more_data_X.cuda()
more_data_y = more_data_y.cuda()
return more_data_X, more_data_y


def cifar100_data(batchsize, workers):
CIFAR100_TRAIN_MEAN = (
0.5070751592371323, 0.48654887331495095, 0.4409178433670343)
CIFAR100_TRAIN_STD = (
0.2673342858792401, 0.2564384629170883, 0.27615047132568404)

transform_train = transforms.Compose([
# transforms.ToPILImage(),
transforms.RandomCrop(32, padding=4),
transforms.RandomHorizontalFlip(),
transforms.RandomRotation(15),
transforms.ToTensor(),
transforms.Normalize(CIFAR100_TRAIN_MEAN, CIFAR100_TRAIN_STD)
])
cifar100_training = CIFAR100(
root='./data', train=True, download=True, transform=transform_train)
cifar100_training_loader = torch.utils.data.DataLoader(
cifar100_training, shuffle=True,
batch_size=batchsize*16, num_workers=workers)

target_i = random.randint(0, len(cifar100_training_loader)-1)
more_data_X, more_data_y = None, None
for i, (more_data_X, more_data_y) in enumerate(cifar100_training_loader):
if i == target_i:
break
more_data_X = more_data_X.cuda()
more_data_y = more_data_y.cuda()
return more_data_X, more_data_y

def imagenet_data(batchsize, workers, data_dir='/gdata/ImageNet2012/'):
"""Data preparing"""
traindir = os.path.join(data_dir, 'train')
normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
train_data = ImageFolder(
traindir,
transforms.Compose([
transforms.RandomResizedCrop(224),
transforms.RandomHorizontalFlip(),
transforms.ColorJitter(
brightness=0.4,
contrast=0.4,
saturation=0.4,
hue=0.2),
transforms.ToTensor(),
normalize,
]))

train_loader = torch.utils.data.DataLoader(
train_data, batch_size=batchsize*16, shuffle=True, pin_memory=True, num_workers=workers)

target_i = random.randint(0, len(train_loader)-1)
more_data_X, more_data_y = None, None
for i, (more_data_X, more_data_y) in enumerate(train_loader):
if i == target_i:
break
more_data_X = more_data_X.cuda()
more_data_y = more_data_y.cuda()
return more_data_X, more_data_y
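
if __name__ == "__main__":
    # Illustrative usage (not part of the original PR); a CUDA device is assumed
    # because the helpers move the randomly sampled batch to the GPU.
    data_X, data_y = cifar10_data(batchsize=16, workers=4)
    print(data_X.shape, data_y.shape)  # one randomly chosen batch of batchsize*16 images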

+ 1
- 1
xnas/search_space/DARTS/cnn.py View File

@@ -146,7 +146,7 @@ class DartsCNN(nn.Module):
sample[true_id, self.basic_op_list.index(op_name)] = 1
for i in range(self.all_edges):
if np.sum(sample[i, :]) == 0:
sample[i, 7] = 1
sample[i, len(self.basic_op_list)-1] = 1
return sample

def _node_index(self, n_nodes, input_nodes=2, start_index=0):


+ 195
- 0
xnas/search_space/RMINAS/DARTS/darts_cnn.py View File

@@ -0,0 +1,195 @@
from xnas.search_space.cellbased_basic_ops import *
import xnas.search_space.cellbased_basic_genotypes as gt

basic_op_list = ['max_pool_3x3', 'avg_pool_3x3', 'skip_connect', 'sep_conv_3x3', 'sep_conv_5x5', 'dil_conv_3x3', 'dil_conv_5x5', 'none']

# Augmented DARTS

def geno_from_alpha(theta):
Genotype = namedtuple(
'Genotype', 'normal normal_concat reduce reduce_concat')
theta_norm = darts_weight_unpack(
theta[0:14], 4)
theta_reduce = darts_weight_unpack(
theta[14:], 4)
gene_normal = parse_from_numpy(
theta_norm, k=2, basic_op_list=basic_op_list)
gene_reduce = parse_from_numpy(
theta_reduce, k=2, basic_op_list=basic_op_list)
concat = range(2, 6) # concat all intermediate nodes
return Genotype(normal=gene_normal, normal_concat=concat,
reduce=gene_reduce, reduce_concat=concat)

def reformat_DARTS(genotype):
"""
format genotype for DARTS-like
from:
Genotype(normal=[[('sep_conv_3x3', 1), ('sep_conv_5x5', 0)], [('sep_conv_3x3', 2), ('max_pool_3x3', 1)], [('sep_conv_3x3', 3), ('dil_conv_3x3', 2)], [('dil_conv_5x5', 4), ('dil_conv_5x5', 3)]], normal_concat=range(2, 6), reduce=[[('max_pool_3x3', 0), ('sep_conv_5x5', 1)], [('max_pool_3x3', 0), ('dil_conv_5x5', 2)], [('max_pool_3x3', 0), ('sep_conv_5x5', 1)], [('dil_conv_5x5', 4), ('max_pool_3x3', 0)]], reduce_concat=range(2, 6))
to:
Genotype(normal=[('sep_conv_3x3', 0), ('sep_conv_3x3', 1), ('sep_conv_3x3', 0), ('sep_conv_3x3', 1), ('sep_conv_3x3', 1), ('skip_connect', 0), ('skip_connect', 0), ('dil_conv_3x3', 2)], normal_concat=[2, 3, 4, 5], reduce=[('max_pool_3x3', 0), ('max_pool_3x3', 1), ('skip_connect', 2), ('max_pool_3x3', 1), ('max_pool_3x3', 0), ('skip_connect', 2), ('skip_connect', 2), ('max_pool_3x3', 1)], reduce_concat=[2, 3, 4, 5])
"""
Genotype = namedtuple('Genotype', 'normal normal_concat reduce reduce_concat')
_normal = []
_reduce = []
for i in genotype.normal:
for j in i:
_normal.append(j)
for i in genotype.reduce:
for j in i:
_reduce.append(j)
_normal_concat = [i for i in genotype.normal_concat]
_reduce_concat = [i for i in genotype.reduce_concat]
r_genotype = Genotype(
normal=_normal,
normal_concat=_normal_concat,
reduce=_reduce,
reduce_concat=_reduce_concat
)
return r_genotype

class AuxiliaryHead(nn.Module):
""" Auxiliary head in 2/3 place of network to let the gradient flow well """
def __init__(self, input_size, C, n_classes):
""" assuming input size 7x7 or 8x8 """
# assert input_size in [7, 8]
super().__init__()
if input_size in [7, 8]:
self.net = nn.Sequential(
nn.ReLU(inplace=True),
nn.AvgPool2d(5, stride=input_size-5, padding=0, count_include_pad=False), # 2x2 out
nn.Conv2d(C, 128, kernel_size=1, bias=False),
nn.BatchNorm2d(128),
nn.ReLU(inplace=True),
nn.Conv2d(128, 768, kernel_size=2, bias=False), # 1x1 out
nn.BatchNorm2d(768),
nn.ReLU(inplace=True))
else:
self.net = nn.Sequential(
nn.ReLU(inplace=True),
nn.AdaptiveAvgPool2d((2, 2)),
nn.Conv2d(C, 128, kernel_size=1, bias=False),
nn.BatchNorm2d(128),
nn.ReLU(inplace=True),
nn.Conv2d(128, 768, kernel_size=2, bias=False), # 1x1 out
nn.BatchNorm2d(768),
nn.ReLU(inplace=True))
self.linear = nn.Linear(768, n_classes)

def forward(self, x):
out = self.net(x)
out = out.view(out.size(0), -1) # flatten
logits = self.linear(out)
return logits


class AugmentCell(nn.Module):
""" Cell for augmentation
Each edge is discrete.
"""
def __init__(self, genotype, C_pp, C_p, C, reduction_p, reduction):
super().__init__()
self.reduction = reduction
self.n_nodes = len(genotype.normal)

if reduction_p:
self.preproc0 = FactorizedReduce(C_pp, C)
else:
self.preproc0 = StdConv(C_pp, C, 1, 1, 0)
self.preproc1 = StdConv(C_p, C, 1, 1, 0)

# generate dag
if reduction:
gene = genotype.reduce
self.concat = genotype.reduce_concat
else:
gene = genotype.normal
self.concat = genotype.normal_concat

self.dag = gt.to_dag(C, gene, reduction)

def forward(self, s0, s1):
s0 = self.preproc0(s0)
s1 = self.preproc1(s1)

states = [s0, s1]
for edges in self.dag:
s_cur = sum(op(states[op.s_idx]) for op in edges)
states.append(s_cur)

s_out = torch.cat([states[i] for i in self.concat], dim=1)

return s_out


class AugmentCNN(nn.Module):
""" Augmented CNN model """
def __init__(self, input_size, C_in, C, n_classes, n_layers, auxiliary, genotype,
stem_multiplier=3):
"""
Args:
input_size: size of height and width (assuming height = width)
C_in: # of input channels
C: # of starting model channels
"""
super().__init__()
self.C_in = C_in
self.C = C
self.n_classes = n_classes
self.n_layers = n_layers
# self.genotype = gt.from_str(genotype)
self.genotype = genotype
# aux head position
self.aux_pos = 2*n_layers//3 if auxiliary else -1

C_cur = stem_multiplier * C
self.stem = nn.Sequential(
nn.Conv2d(C_in, C_cur, 3, 1, 1, bias=False),
nn.BatchNorm2d(C_cur)
)

C_pp, C_p, C_cur = C_cur, C_cur, C

self.cells = nn.ModuleList()
reduction_p = False
for i in range(n_layers):
if i in [n_layers//3, 2*n_layers//3]:
C_cur *= 2
reduction = True
else:
reduction = False

cell = AugmentCell(self.genotype, C_pp, C_p, C_cur, reduction_p, reduction)
reduction_p = reduction
self.cells.append(cell)
C_cur_out = C_cur * len(cell.concat)
C_pp, C_p = C_p, C_cur_out

if i == self.aux_pos:
# [!] this auxiliary head is ignored in computing parameter size
# by the name 'aux_head'
self.aux_head = AuxiliaryHead(input_size//4, C_p, n_classes)

self.gap = nn.AdaptiveAvgPool2d(1)
self.linear = nn.Linear(C_p, n_classes)

def forward(self, x):
s0 = s1 = self.stem(x)
features = []
aux_logits = None
for i, cell in enumerate(self.cells):
s0, s1 = s1, cell(s0, s1)
if i in [int(self.n_layers//3-1), int(2*self.n_layers//3-1), int(self.n_layers-1)]:
features.append(s1)
if i == self.aux_pos and self.training:
aux_logits = self.aux_head(s1)
out = self.gap(s1)
out = out.view(out.size(0), -1) # flatten
logits = self.linear(out)
return features, logits, aux_logits

def drop_path_prob(self, p):
""" Set drop path probability """
for module in self.modules():
if isinstance(module, DropPath_):
module.p = p
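
if __name__ == "__main__":
    # Illustrative round trip (not part of the original PR): turn a random
    # 28 x len(basic_op_list) architecture-weight matrix into a genotype and
    # flatten it into the standard DARTS format. This assumes the helpers
    # darts_weight_unpack/parse_from_numpy imported above accept a NumPy array.
    import numpy as np
    alpha = np.random.rand(28, len(basic_op_list))
    genotype = geno_from_alpha(alpha)
    print(reformat_DARTS(genotype))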

+ 222
- 0
xnas/search_space/RMINAS/DARTS/darts_img.py View File

@@ -0,0 +1,222 @@
import torch
import torch.nn as nn
from xnas.search_space.RMINAS.DARTS.pcdarts_op import *  # absolute import so this module resolves inside the xnas package
from torch.autograd import Variable

def drop_path(x, drop_prob):
if drop_prob > 0.:
keep_prob = 1.-drop_prob
mask = Variable(torch.cuda.FloatTensor(x.size(0), 1, 1, 1).bernoulli_(keep_prob))
x.div_(keep_prob)
x.mul_(mask)
return x

class Cell(nn.Module):

def __init__(self, genotype, C_prev_prev, C_prev, C, reduction, reduction_prev):
super(Cell, self).__init__()
print(C_prev_prev, C_prev, C)

if reduction_prev:
self.preprocess0 = FactorizedReduce(C_prev_prev, C)
else:
self.preprocess0 = ReLUConvBN(C_prev_prev, C, 1, 1, 0)
self.preprocess1 = ReLUConvBN(C_prev, C, 1, 1, 0)
if reduction:
op_names, indices = zip(*genotype.reduce)
concat = genotype.reduce_concat
else:
op_names, indices = zip(*genotype.normal)
concat = genotype.normal_concat
self._compile(C, op_names, indices, concat, reduction)

def _compile(self, C, op_names, indices, concat, reduction):
assert len(op_names) == len(indices)
self._steps = len(op_names) // 2
self._concat = concat
self.multiplier = len(concat)

self._ops = nn.ModuleList()
for name, index in zip(op_names, indices):
stride = 2 if reduction and index < 2 else 1
op = OPS[name](C, stride, True)
self._ops += [op]
self._indices = indices

def forward(self, s0, s1, drop_prob):
s0 = self.preprocess0(s0)
s1 = self.preprocess1(s1)

states = [s0, s1]
for i in range(self._steps):
h1 = states[self._indices[2*i]]
h2 = states[self._indices[2*i+1]]
op1 = self._ops[2*i]
op2 = self._ops[2*i+1]
h1 = op1(h1)
h2 = op2(h2)
if self.training and drop_prob > 0.:
if not isinstance(op1, Identity):
h1 = drop_path(h1, drop_prob)
if not isinstance(op2, Identity):
h2 = drop_path(h2, drop_prob)
s = h1 + h2
states += [s]
return torch.cat([states[i] for i in self._concat], dim=1)


class AuxiliaryHeadCIFAR(nn.Module):

def __init__(self, C, num_classes):
"""assuming input size 8x8"""
super(AuxiliaryHeadCIFAR, self).__init__()
self.features = nn.Sequential(
nn.ReLU(inplace=True),
nn.AvgPool2d(5, stride=3, padding=0, count_include_pad=False), # image size = 2 x 2
nn.Conv2d(C, 128, 1, bias=False),
nn.BatchNorm2d(128),
nn.ReLU(inplace=True),
nn.Conv2d(128, 768, 2, bias=False),
nn.BatchNorm2d(768),
nn.ReLU(inplace=True)
)
self.classifier = nn.Linear(768, num_classes)

def forward(self, x):
x = self.features(x)
x = self.classifier(x.view(x.size(0),-1))
return x


class AuxiliaryHeadImageNet(nn.Module):

def __init__(self, C, num_classes):
"""assuming input size 14x14"""
super(AuxiliaryHeadImageNet, self).__init__()
self.features = nn.Sequential(
nn.ReLU(inplace=True),
nn.AvgPool2d(5, stride=2, padding=0, count_include_pad=False),
nn.Conv2d(C, 128, 1, bias=False),
nn.BatchNorm2d(128),
nn.ReLU(inplace=True),
nn.Conv2d(128, 768, 2, bias=False),
# NOTE: This batchnorm was omitted in my earlier implementation due to a typo.
# Commenting it out for consistency with the experiments in the paper.
# nn.BatchNorm2d(768),
nn.ReLU(inplace=True)
)
self.classifier = nn.Linear(768, num_classes)

def forward(self, x):
x = self.features(x)
x = self.classifier(x.view(x.size(0),-1))
return x


class NetworkCIFAR(nn.Module):

def __init__(self, C, num_classes, layers, auxiliary, genotype):
super(NetworkCIFAR, self).__init__()
self._layers = layers
self._auxiliary = auxiliary

stem_multiplier = 3
C_curr = stem_multiplier*C
self.stem = nn.Sequential(
nn.Conv2d(3, C_curr, 3, padding=1, bias=False),
nn.BatchNorm2d(C_curr)
)
C_prev_prev, C_prev, C_curr = C_curr, C_curr, C
self.cells = nn.ModuleList()
reduction_prev = False
for i in range(layers):
if i in [layers//3, 2*layers//3]:
C_curr *= 2
reduction = True
else:
reduction = False
cell = Cell(genotype, C_prev_prev, C_prev, C_curr, reduction, reduction_prev)
reduction_prev = reduction
self.cells += [cell]
C_prev_prev, C_prev = C_prev, cell.multiplier*C_curr
if i == 2*layers//3:
C_to_auxiliary = C_prev

if auxiliary:
self.auxiliary_head = AuxiliaryHeadCIFAR(C_to_auxiliary, num_classes)
self.global_pooling = nn.AdaptiveAvgPool2d(1)
self.classifier = nn.Linear(C_prev, num_classes)

def forward(self, input):
logits_aux = None
s0 = s1 = self.stem(input)
for i, cell in enumerate(self.cells):
s0, s1 = s1, cell(s0, s1, self.drop_path_prob)
if i == 2*self._layers//3:
if self._auxiliary and self.training:
logits_aux = self.auxiliary_head(s1)
out = self.global_pooling(s1)
logits = self.classifier(out.view(out.size(0),-1))
return logits, logits_aux


class NetworkImageNet(nn.Module):

def __init__(self, C, num_classes, layers, auxiliary, genotype):
super(NetworkImageNet, self).__init__()
self._layers = layers
self._auxiliary = auxiliary

self.stem0 = nn.Sequential(
nn.Conv2d(3, C // 2, kernel_size=3, stride=2, padding=1, bias=False),
nn.BatchNorm2d(C // 2),
nn.ReLU(inplace=True),
nn.Conv2d(C // 2, C, 3, stride=2, padding=1, bias=False),
nn.BatchNorm2d(C),
)

self.stem1 = nn.Sequential(
nn.ReLU(inplace=True),
nn.Conv2d(C, C, 3, stride=2, padding=1, bias=False),
nn.BatchNorm2d(C),
)

C_prev_prev, C_prev, C_curr = C, C, C

self.cells = nn.ModuleList()
reduction_prev = True
for i in range(layers):
if i in [layers // 3, 2 * layers // 3]:
C_curr *= 2
reduction = True
else:
reduction = False
cell = Cell(genotype, C_prev_prev, C_prev, C_curr, reduction, reduction_prev)
reduction_prev = reduction
self.cells += [cell]
C_prev_prev, C_prev = C_prev, cell.multiplier * C_curr
if i == 2 * layers // 3:
C_to_auxiliary = C_prev

if auxiliary:
self.auxiliary_head = AuxiliaryHeadImageNet(C_to_auxiliary, num_classes)
self.global_pooling = nn.AvgPool2d(7)
self.classifier = nn.Linear(C_prev, num_classes)

def forward(self, input):
logits_aux = None
features = []
s0 = self.stem0(input)
s1 = self.stem1(s0)
for i, cell in enumerate(self.cells):
s0, s1 = s1, cell(s0, s1, self.drop_path_prob)
if i in [int(self._layers//3-1), int(2*self._layers//3-1), int(self._layers-1)]:
features.append(s1)
if i == 2 * self._layers // 3:
if self._auxiliary and self.training:
logits_aux = self.auxiliary_head(s1)
out = self.global_pooling(s1)
logits = self.classifier(out.view(out.size(0), -1))
return features, logits, logits_aux

+ 52
- 0
xnas/search_space/RMINAS/DARTS/darts_plot.py View File

@@ -0,0 +1,52 @@
from collections import namedtuple
from graphviz import Digraph
import sys

Genotype = namedtuple('Genotype', 'normal normal_concat reduce reduce_concat')

# assign the genotype so the plotting calls in __main__ below can use it
genotype = Genotype(normal=[('sep_conv_5x5', 0), ('dil_conv_3x3', 1), ('dil_conv_5x5', 0), ('skip_connect', 1), ('dil_conv_5x5', 2), ('avg_pool_3x3', 3), ('dil_conv_5x5', 2), ('avg_pool_3x3', 4)], normal_concat=[2, 3, 4, 5], reduce=[('sep_conv_5x5', 0), ('dil_conv_3x3', 1), ('dil_conv_5x5', 0), ('skip_connect', 1), ('dil_conv_5x5', 2), ('avg_pool_3x3', 3), ('dil_conv_5x5', 2), ('avg_pool_3x3', 4)], reduce_concat=[2, 3, 4, 5])

def plot(genotype, filename):
g = Digraph(
format='pdf',
edge_attr=dict(fontsize='20', fontname="times"),
node_attr=dict(style='filled', shape='rect', align='center', fontsize='20', height='0.5', width='0.5', penwidth='2', fontname="times"),
engine='dot')
g.body.extend(['rankdir=LR'])

g.node("c_{k-2}", fillcolor='darkseagreen2')
g.node("c_{k-1}", fillcolor='darkseagreen2')
assert len(genotype) % 2 == 0
steps = len(genotype) // 2

for i in range(steps):
g.node(str(i), fillcolor='lightblue')

for i in range(steps):
for k in [2*i, 2*i + 1]:
op, j = genotype[k]
if j == 0:
u = "c_{k-2}"
elif j == 1:
u = "c_{k-1}"
else:
u = str(j-2)
v = str(i)
g.edge(u, v, label=op, fillcolor="gray")

g.node("c_{k}", fillcolor='palegoldenrod')
for i in range(steps):
g.edge(str(i), "c_{k}", fillcolor="gray")

g.render(filename, view=False)


if __name__ == '__main__':
# try:
# genotype = eval('genotypes.{}'.format(genotype))
# except AttributeError:
# print("{} is not specified in genotypes.py".format(genotype_name))
# sys.exit(1)

plot(genotype.normal, "normal")
plot(genotype.reduce, "reduction")

+ 104
- 0
xnas/search_space/RMINAS/DARTS/pcdarts_op.py View File

@@ -0,0 +1,104 @@
import torch
import torch.nn as nn

OPS = {
'none' : lambda C, stride, affine: Zero(stride),
'avg_pool_3x3' : lambda C, stride, affine: nn.AvgPool2d(3, stride=stride, padding=1, count_include_pad=False),
'max_pool_3x3' : lambda C, stride, affine: nn.MaxPool2d(3, stride=stride, padding=1),
'skip_connect' : lambda C, stride, affine: Identity() if stride == 1 else FactorizedReduce(C, C, affine=affine),
'sep_conv_3x3' : lambda C, stride, affine: SepConv(C, C, 3, stride, 1, affine=affine),
'sep_conv_5x5' : lambda C, stride, affine: SepConv(C, C, 5, stride, 2, affine=affine),
'sep_conv_7x7' : lambda C, stride, affine: SepConv(C, C, 7, stride, 3, affine=affine),
'dil_conv_3x3' : lambda C, stride, affine: DilConv(C, C, 3, stride, 2, 2, affine=affine),
'dil_conv_5x5' : lambda C, stride, affine: DilConv(C, C, 5, stride, 4, 2, affine=affine),
'conv_7x1_1x7' : lambda C, stride, affine: nn.Sequential(
nn.ReLU(inplace=False),
nn.Conv2d(C, C, (1,7), stride=(1, stride), padding=(0, 3), bias=False),
nn.Conv2d(C, C, (7,1), stride=(stride, 1), padding=(3, 0), bias=False),
nn.BatchNorm2d(C, affine=affine)
),
}

class ReLUConvBN(nn.Module):

def __init__(self, C_in, C_out, kernel_size, stride, padding, affine=True):
super(ReLUConvBN, self).__init__()
self.op = nn.Sequential(
nn.ReLU(inplace=False),
nn.Conv2d(C_in, C_out, kernel_size, stride=stride, padding=padding, bias=False),
nn.BatchNorm2d(C_out, affine=affine)
)

def forward(self, x):
return self.op(x)

class DilConv(nn.Module):
def __init__(self, C_in, C_out, kernel_size, stride, padding, dilation, affine=True):
super(DilConv, self).__init__()
self.op = nn.Sequential(
nn.ReLU(inplace=False),
nn.Conv2d(C_in, C_in, kernel_size=kernel_size, stride=stride, padding=padding, dilation=dilation, groups=C_in, bias=False),
nn.Conv2d(C_in, C_out, kernel_size=1, padding=0, bias=False),
nn.BatchNorm2d(C_out, affine=affine),
)

def forward(self, x):
return self.op(x)


class SepConv(nn.Module):
def __init__(self, C_in, C_out, kernel_size, stride, padding, affine=True):
super(SepConv, self).__init__()
self.op = nn.Sequential(
nn.ReLU(inplace=False),
nn.Conv2d(C_in, C_in, kernel_size=kernel_size, stride=stride, padding=padding, groups=C_in, bias=False),
nn.Conv2d(C_in, C_in, kernel_size=1, padding=0, bias=False),
nn.BatchNorm2d(C_in, affine=affine),
nn.ReLU(inplace=False),
nn.Conv2d(C_in, C_in, kernel_size=kernel_size, stride=1, padding=padding, groups=C_in, bias=False),
nn.Conv2d(C_in, C_out, kernel_size=1, padding=0, bias=False),
nn.BatchNorm2d(C_out, affine=affine),
)

def forward(self, x):
return self.op(x)


class Identity(nn.Module):

def __init__(self):
super(Identity, self).__init__()

def forward(self, x):
return x


class Zero(nn.Module):

def __init__(self, stride):
super(Zero, self).__init__()
self.stride = stride

def forward(self, x):
if self.stride == 1:
return x.mul(0.)
return x[:,:,::self.stride,::self.stride].mul(0.)


class FactorizedReduce(nn.Module):

def __init__(self, C_in, C_out, affine=True):
super(FactorizedReduce, self).__init__()
assert C_out % 2 == 0
self.relu = nn.ReLU(inplace=False)
self.conv_1 = nn.Conv2d(C_in, C_out // 2, 1, stride=2, padding=0, bias=False)
self.conv_2 = nn.Conv2d(C_in, C_out // 2, 1, stride=2, padding=0, bias=False)
self.bn = nn.BatchNorm2d(C_out, affine=affine)

def forward(self, x):
x = self.relu(x)
out = torch.cat([self.conv_1(x), self.conv_2(x[:,:,1:,1:])], dim=1)
out = self.bn(out)
return out

+ 151
- 0
xnas/search_space/RMINAS/MBConv/mb_v3_cnn.py View File

@@ -0,0 +1,151 @@
from xnas.search_space.mb_ops import *
from xnas.search_space.proxyless_cnn import ProxylessNASNets
from xnas.search_space.utils import profile, make_divisible
import json
import xnas.core.logging as logging
import numpy as np
import os
from xnas.core.config import cfg

logger = logging.get_logger(__name__)


class MobileNetV3(MyNetwork):

def __init__(self, n_classes=1000, width_mult=1.2, depth=4):
super(MobileNetV3, self).__init__()

self.width_mult = width_mult
self.depth = depth
self.conv_candidates = [
'3x3_MBConv3', '3x3_MBConv6',
'5x5_MBConv3', '5x5_MBConv6',
'7x7_MBConv3', '7x7_MBConv6',
] if len(cfg.MB.BASIC_OP) == 0 else cfg.MB.BASIC_OP

# ofa
self.base_stage_width = [16, 24, 40, 80, 112, 160, 960, 1280]

final_expand_width = make_divisible(
self.base_stage_width[-2] * self.width_mult, 8)
last_channel = make_divisible(
self.base_stage_width[-1] * self.width_mult, 8)

self.stride_stages = [1, 2, 2, 2, 1, 2] if len(
cfg.MB.STRIDE_STAGES) == 0 else cfg.MB.STRIDE_STAGES
self.act_stages = ['relu', 'relu', 'relu', 'h_swish',
'h_swish', 'h_swish'] if len(cfg.MB.ACT_STAGES) == 0 else cfg.MB.ACT_STAGES
self.se_stages = [False, False, True, False, True, True] if len(
cfg.MB.SE_STAGES) == 0 else cfg.MB.SE_STAGES
n_block_list = [1] + [self.depth] * 5
width_list = []
for base_width in self.base_stage_width[:-2]:
width = make_divisible(base_width * self.width_mult, 8)
width_list.append(width)
input_channel = width_list[0]

# first conv layer
first_conv = ConvLayer(
3, input_channel, kernel_size=3, stride=2, act_func='h_swish')

# first block
first_block_conv = MBInvertedConvLayer(
in_channels=input_channel, out_channels=input_channel, kernel_size=3, stride=self.stride_stages[0],
expand_ratio=1, act_func=self.act_stages[0], use_se=self.se_stages[0],
)
first_block = MobileInvertedResidualBlock(
first_block_conv, IdentityLayer(input_channel, input_channel))

# inverted residual blocks
blocks = nn.ModuleList()
blocks.append(first_block)
feature_dim = input_channel
self.candidate_ops = []

for width, n_block, s, act_func, use_se in zip(width_list[1:], n_block_list[1:],
self.stride_stages[1:], self.act_stages[1:], self.se_stages[1:]):

for i in range(n_block):
if i == 0:
stride = s
else:
stride = 1
# conv
if stride == 1 and feature_dim == width:
modified_conv_candidates = self.conv_candidates + ['Zero']
else:
modified_conv_candidates = self.conv_candidates + \
['3x3_MBConv1']
self.candidate_ops.append(modified_conv_candidates)
conv_op = MixedEdge(candidate_ops=build_candidate_ops(
modified_conv_candidates, feature_dim, width, stride, 'weight_bn_act',
act_func=act_func, use_se=use_se), )
if stride == 1 and feature_dim == width:
shortcut = IdentityLayer(feature_dim, feature_dim)
else:
shortcut = None
blocks.append(MobileInvertedResidualBlock(conv_op, shortcut))
feature_dim = width
# final expand layer, feature mix layer & classifier
final_expand_layer = ConvLayer(
feature_dim, final_expand_width, kernel_size=1, act_func='h_swish')
feature_mix_layer = ConvLayer(
final_expand_width, last_channel, kernel_size=1, bias=False, use_bn=False, act_func='h_swish',
)
classifier = LinearLayer(last_channel, n_classes)

self.first_conv = first_conv
self.blocks = blocks
self.final_expand_layer = final_expand_layer
self.feature_mix_layer = feature_mix_layer
self.classifier = classifier
self.global_avg_pooling = nn.AdaptiveAvgPool2d(1)

self.all_edges = len(self.blocks) - 1
self.num_edges = len(self.blocks) - 1
self.num_ops = len(self.conv_candidates) + 1

""" MyNetwork required methods """

@staticmethod
def name():
return 'OFAMobileNetV3'

def forward(self, x, sample):
features = []
# first conv
x = self.first_conv(x)
assert len(self.blocks) - 1 == len(sample)
for i in range(len(self.blocks[1:])):
this_block_conv = self.blocks[i+1].mobile_inverted_conv
if isinstance(this_block_conv, MixedEdge):
# one hot like vector
this_block_conv.active_vector = sample[i]
else:
raise NotImplementedError
for k,block in enumerate(self.blocks):
x = block(x)
if k in [4,12,20]:
features.append(x)
x = self.final_expand_layer(x)
x = self.global_avg_pooling(x)
x = self.feature_mix_layer(x)
x = x.view(x.size(0), -1) # flatten
x = self.classifier(x)
return x, features

def genotype(self, theta):
genotype = []
for i in range(theta.shape[0]):
genotype.append(self.candidate_ops[i][np.argmax(theta[i])])
return genotype


def _MobileNetV3CNN():
# remember to add cuda() for it.
return MobileNetV3(
n_classes=cfg.SEARCH.NUM_CLASSES,
width_mult=cfg.MB.WIDTH_MULTI,
depth=cfg.MB.DEPTH)
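
if __name__ == "__main__":
    # Illustrative forward pass (not part of the original PR). It assumes the
    # MB.* defaults registered in xnas.core.config are usable as-is and that
    # MixedEdge accepts a NumPy one-hot row as its active_vector.
    import torch
    net = MobileNetV3(n_classes=10, width_mult=1.2, depth=4)
    sample = np.eye(net.num_ops)[np.random.choice(net.num_ops, net.num_edges)]
    logits, features = net(torch.randn(2, 3, 224, 224), sample)
    print(logits.shape, [tuple(f.shape) for f in features])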

+ 274
- 0
xnas/search_space/RMINAS/NB201/geno.py View File

@@ -0,0 +1,274 @@
##################################################
# Copyright (c) Xuanyi Dong [GitHub D-X-Y], 2019 #
##################################################

from copy import deepcopy

def get_combination(space, num):
combs = []
for i in range(num):
if i == 0:
for func in space:
combs.append([(func, i)])
else:
new_combs = []
for string in combs:
for func in space:
xstring = string + [(func, i)]
new_combs.append(xstring)
combs = new_combs
return combs


class Structure:
def __init__(self, genotype):
assert isinstance(genotype, list) or isinstance(
genotype, tuple
), "invalid class of genotype : {:}".format(type(genotype))
self.node_num = len(genotype) + 1
self.nodes = []
self.node_N = []
for idx, node_info in enumerate(genotype):
assert isinstance(node_info, list) or isinstance(
node_info, tuple
), "invalid class of node_info : {:}".format(type(node_info))
assert len(node_info) >= 1, "invalid length : {:}".format(len(node_info))
for node_in in node_info:
assert isinstance(node_in, list) or isinstance(
node_in, tuple
), "invalid class of in-node : {:}".format(type(node_in))
assert (
len(node_in) == 2 and node_in[1] <= idx
), "invalid in-node : {:}".format(node_in)
self.node_N.append(len(node_info))
self.nodes.append(tuple(deepcopy(node_info)))

def tolist(self, remove_str):
        # convert this class to a list; if remove_str is 'none', the 'none' operation is removed.
        # note that we re-order the input nodes in this function
        # returns (genotype_list, success); if success is False, the genotype is not a valid connectivity
genotypes = []
for node_info in self.nodes:
node_info = list(node_info)
node_info = sorted(node_info, key=lambda x: (x[1], x[0]))
node_info = tuple(filter(lambda x: x[0] != remove_str, node_info))
if len(node_info) == 0:
return None, False
genotypes.append(node_info)
return genotypes, True

def node(self, index):
assert index > 0 and index <= len(self), "invalid index={:} < {:}".format(
index, len(self)
)
return self.nodes[index]

def tostr(self):
strings = []
for node_info in self.nodes:
string = "|".join([x[0] + "~{:}".format(x[1]) for x in node_info])
string = "|{:}|".format(string)
strings.append(string)
return "+".join(strings)

def check_valid(self):
nodes = {0: True}
for i, node_info in enumerate(self.nodes):
sums = []
for op, xin in node_info:
if op == "none" or nodes[xin] is False:
x = False
else:
x = True
sums.append(x)
nodes[i + 1] = sum(sums) > 0
return nodes[len(self.nodes)]

def to_unique_str(self, consider_zero=False):
        # this is used to identify isomorphic cells, which requires prior knowledge of the operations
# two operations are special, i.e., none and skip_connect
nodes = {0: "0"}
for i_node, node_info in enumerate(self.nodes):
cur_node = []
for op, xin in node_info:
if consider_zero is None:
x = "(" + nodes[xin] + ")" + "@{:}".format(op)
elif consider_zero:
if op == "none" or nodes[xin] == "#":
x = "#" # zero
elif op == "skip_connect":
x = nodes[xin]
else:
x = "(" + nodes[xin] + ")" + "@{:}".format(op)
else:
if op == "skip_connect":
x = nodes[xin]
else:
x = "(" + nodes[xin] + ")" + "@{:}".format(op)
cur_node.append(x)
nodes[i_node + 1] = "+".join(sorted(cur_node))
return nodes[len(self.nodes)]

def check_valid_op(self, op_names):
for node_info in self.nodes:
for inode_edge in node_info:
# assert inode_edge[0] in op_names, 'invalid op-name : {:}'.format(inode_edge[0])
if inode_edge[0] not in op_names:
return False
return True

def __repr__(self):
return "{name}({node_num} nodes with {node_info})".format(
name=self.__class__.__name__, node_info=self.tostr(), **self.__dict__
)

def __len__(self):
return len(self.nodes) + 1

def __getitem__(self, index):
return self.nodes[index]

@staticmethod
def str2structure(xstr):
if isinstance(xstr, Structure):
return xstr
assert isinstance(xstr, str), "must take string (not {:}) as input".format(
type(xstr)
)
nodestrs = xstr.split("+")
genotypes = []
for i, node_str in enumerate(nodestrs):
inputs = list(filter(lambda x: x != "", node_str.split("|")))
for xinput in inputs:
assert len(xinput.split("~")) == 2, "invalid input length : {:}".format(
xinput
)
inputs = (xi.split("~") for xi in inputs)
input_infos = tuple((op, int(IDX)) for (op, IDX) in inputs)
genotypes.append(input_infos)
return Structure(genotypes)

@staticmethod
def str2fullstructure(xstr, default_name="none"):
assert isinstance(xstr, str), "must take string (not {:}) as input".format(
type(xstr)
)
nodestrs = xstr.split("+")
genotypes = []
for i, node_str in enumerate(nodestrs):
inputs = list(filter(lambda x: x != "", node_str.split("|")))
for xinput in inputs:
assert len(xinput.split("~")) == 2, "invalid input length : {:}".format(
xinput
)
inputs = (xi.split("~") for xi in inputs)
input_infos = list((op, int(IDX)) for (op, IDX) in inputs)
all_in_nodes = list(x[1] for x in input_infos)
for j in range(i):
if j not in all_in_nodes:
input_infos.append((default_name, j))
node_info = sorted(input_infos, key=lambda x: (x[1], x[0]))
genotypes.append(tuple(node_info))
return Structure(genotypes)

@staticmethod
def gen_all(search_space, num, return_ori):
assert isinstance(search_space, list) or isinstance(
search_space, tuple
), "invalid class of search-space : {:}".format(type(search_space))
assert (
num >= 2
), "There should be at least two nodes in a neural cell instead of {:}".format(
num
)
all_archs = get_combination(search_space, 1)
for i, arch in enumerate(all_archs):
all_archs[i] = [tuple(arch)]

for inode in range(2, num):
cur_nodes = get_combination(search_space, inode)
new_all_archs = []
for previous_arch in all_archs:
for cur_node in cur_nodes:
new_all_archs.append(previous_arch + [tuple(cur_node)])
all_archs = new_all_archs
if return_ori:
return all_archs
else:
return [Structure(x) for x in all_archs]


ResNet_CODE = Structure(
[
(("nor_conv_3x3", 0),), # node-1
(("nor_conv_3x3", 1),), # node-2
(("skip_connect", 0), ("skip_connect", 2)),
] # node-3
)

AllConv3x3_CODE = Structure(
[
(("nor_conv_3x3", 0),), # node-1
(("nor_conv_3x3", 0), ("nor_conv_3x3", 1)), # node-2
(("nor_conv_3x3", 0), ("nor_conv_3x3", 1), ("nor_conv_3x3", 2)),
] # node-3
)

AllFull_CODE = Structure(
[
(
("skip_connect", 0),
("nor_conv_1x1", 0),
("nor_conv_3x3", 0),
("avg_pool_3x3", 0),
), # node-1
(
("skip_connect", 0),
("nor_conv_1x1", 0),
("nor_conv_3x3", 0),
("avg_pool_3x3", 0),
("skip_connect", 1),
("nor_conv_1x1", 1),
("nor_conv_3x3", 1),
("avg_pool_3x3", 1),
), # node-2
(
("skip_connect", 0),
("nor_conv_1x1", 0),
("nor_conv_3x3", 0),
("avg_pool_3x3", 0),
("skip_connect", 1),
("nor_conv_1x1", 1),
("nor_conv_3x3", 1),
("avg_pool_3x3", 1),
("skip_connect", 2),
("nor_conv_1x1", 2),
("nor_conv_3x3", 2),
("avg_pool_3x3", 2),
),
] # node-3
)

AllConv1x1_CODE = Structure(
[
(("nor_conv_1x1", 0),), # node-1
(("nor_conv_1x1", 0), ("nor_conv_1x1", 1)), # node-2
(("nor_conv_1x1", 0), ("nor_conv_1x1", 1), ("nor_conv_1x1", 2)),
] # node-3
)

AllIdentity_CODE = Structure(
[
(("skip_connect", 0),), # node-1
(("skip_connect", 0), ("skip_connect", 1)), # node-2
(("skip_connect", 0), ("skip_connect", 1), ("skip_connect", 2)),
] # node-3
)

architectures = {
"resnet": ResNet_CODE,
"all_c3x3": AllConv3x3_CODE,
"all_c1x1": AllConv1x1_CODE,
"all_idnt": AllIdentity_CODE,
"all_full": AllFull_CODE,
}
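

# Usage sketch (illustration only, not part of the original file): round-tripping
# the NB201 string encoding through Structure and the preset codes above.
if __name__ == "__main__":
    example = "|nor_conv_1x1~0|+|none~0|none~1|+|none~0|none~1|skip_connect~2|"
    arch = Structure.str2structure(example)        # parse the string encoding
    print(len(arch))                               # node count, input node included
    print(arch.to_unique_str(consider_zero=True))  # canonical form for isomorphism checks
    print(architectures["resnet"].tostr())         # preset cell mimicking a residual block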

+ 554
- 0
xnas/search_space/RMINAS/NB201/ops.py View File

@@ -0,0 +1,554 @@
##################################################
# Copyright (c) Xuanyi Dong [GitHub D-X-Y], 2019 #
##################################################

import torch
import torch.nn as nn

__all__ = ["OPS", "RAW_OP_CLASSES", "ResNetBasicblock", "SearchSpaceNames"]

OPS = {
"none": lambda C_in, C_out, stride, affine, track_running_stats: Zero(
C_in, C_out, stride
),
"avg_pool_3x3": lambda C_in, C_out, stride, affine, track_running_stats: POOLING(
C_in, C_out, stride, "avg", affine, track_running_stats
),
"max_pool_3x3": lambda C_in, C_out, stride, affine, track_running_stats: POOLING(
C_in, C_out, stride, "max", affine, track_running_stats
),
"nor_conv_7x7": lambda C_in, C_out, stride, affine, track_running_stats: ReLUConvBN(
C_in,
C_out,
(7, 7),
(stride, stride),
(3, 3),
(1, 1),
affine,
track_running_stats,
),
"nor_conv_3x3": lambda C_in, C_out, stride, affine, track_running_stats: ReLUConvBN(
C_in,
C_out,
(3, 3),
(stride, stride),
(1, 1),
(1, 1),
affine,
track_running_stats,
),
"nor_conv_1x1": lambda C_in, C_out, stride, affine, track_running_stats: ReLUConvBN(
C_in,
C_out,
(1, 1),
(stride, stride),
(0, 0),
(1, 1),
affine,
track_running_stats,
),
"dua_sepc_3x3": lambda C_in, C_out, stride, affine, track_running_stats: DualSepConv(
C_in,
C_out,
(3, 3),
(stride, stride),
(1, 1),
(1, 1),
affine,
track_running_stats,
),
"dua_sepc_5x5": lambda C_in, C_out, stride, affine, track_running_stats: DualSepConv(
C_in,
C_out,
(5, 5),
(stride, stride),
(2, 2),
(1, 1),
affine,
track_running_stats,
),
"dil_sepc_3x3": lambda C_in, C_out, stride, affine, track_running_stats: SepConv(
C_in,
C_out,
(3, 3),
(stride, stride),
(2, 2),
(2, 2),
affine,
track_running_stats,
),
"dil_sepc_5x5": lambda C_in, C_out, stride, affine, track_running_stats: SepConv(
C_in,
C_out,
(5, 5),
(stride, stride),
(4, 4),
(2, 2),
affine,
track_running_stats,
),
"skip_connect": lambda C_in, C_out, stride, affine, track_running_stats: Identity()
if stride == 1 and C_in == C_out
else FactorizedReduce(C_in, C_out, stride, affine, track_running_stats),
}

CONNECT_NAS_BENCHMARK = ["none", "skip_connect", "nor_conv_3x3"]
NAS_BENCH_201 = ["none", "skip_connect", "nor_conv_1x1", "nor_conv_3x3", "avg_pool_3x3"]
DARTS_SPACE = [
"none",
"skip_connect",
"dua_sepc_3x3",
"dua_sepc_5x5",
"dil_sepc_3x3",
"dil_sepc_5x5",
"avg_pool_3x3",
"max_pool_3x3",
]

SearchSpaceNames = {
"connect-nas": CONNECT_NAS_BENCHMARK,
"nats-bench": NAS_BENCH_201,
"nas-bench-201": NAS_BENCH_201,
"darts": DARTS_SPACE,
}


class ReLUConvBN(nn.Module):
def __init__(
self,
C_in,
C_out,
kernel_size,
stride,
padding,
dilation,
affine,
track_running_stats=True,
):
super(ReLUConvBN, self).__init__()
self.op = nn.Sequential(
nn.ReLU(inplace=False),
nn.Conv2d(
C_in,
C_out,
kernel_size,
stride=stride,
padding=padding,
dilation=dilation,
bias=not affine,
),
nn.BatchNorm2d(
C_out, affine=affine, track_running_stats=track_running_stats
),
)

def forward(self, x):
return self.op(x)


class SepConv(nn.Module):
def __init__(
self,
C_in,
C_out,
kernel_size,
stride,
padding,
dilation,
affine,
track_running_stats=True,
):
super(SepConv, self).__init__()
self.op = nn.Sequential(
nn.ReLU(inplace=False),
nn.Conv2d(
C_in,
C_in,
kernel_size=kernel_size,
stride=stride,
padding=padding,
dilation=dilation,
groups=C_in,
bias=False,
),
nn.Conv2d(C_in, C_out, kernel_size=1, padding=0, bias=not affine),
nn.BatchNorm2d(
C_out, affine=affine, track_running_stats=track_running_stats
),
)

def forward(self, x):
return self.op(x)


class DualSepConv(nn.Module):
def __init__(
self,
C_in,
C_out,
kernel_size,
stride,
padding,
dilation,
affine,
track_running_stats=True,
):
super(DualSepConv, self).__init__()
self.op_a = SepConv(
C_in,
C_in,
kernel_size,
stride,
padding,
dilation,
affine,
track_running_stats,
)
self.op_b = SepConv(
C_in, C_out, kernel_size, 1, padding, dilation, affine, track_running_stats
)

def forward(self, x):
x = self.op_a(x)
x = self.op_b(x)
return x


class ResNetBasicblock(nn.Module):
def __init__(self, inplanes, planes, stride, affine=True, track_running_stats=True):
super(ResNetBasicblock, self).__init__()
assert stride == 1 or stride == 2, "invalid stride {:}".format(stride)
self.conv_a = ReLUConvBN(
inplanes, planes, 3, stride, 1, 1, affine, track_running_stats
)
self.conv_b = ReLUConvBN(
planes, planes, 3, 1, 1, 1, affine, track_running_stats
)
if stride == 2:
self.downsample = nn.Sequential(
nn.AvgPool2d(kernel_size=2, stride=2, padding=0),
nn.Conv2d(
inplanes, planes, kernel_size=1, stride=1, padding=0, bias=False
),
)
elif inplanes != planes:
self.downsample = ReLUConvBN(
inplanes, planes, 1, 1, 0, 1, affine, track_running_stats
)
else:
self.downsample = None
self.in_dim = inplanes
self.out_dim = planes
self.stride = stride
self.num_conv = 2

def extra_repr(self):
string = "{name}(inC={in_dim}, outC={out_dim}, stride={stride})".format(
name=self.__class__.__name__, **self.__dict__
)
return string

def forward(self, inputs):

basicblock = self.conv_a(inputs)
basicblock = self.conv_b(basicblock)

if self.downsample is not None:
residual = self.downsample(inputs)
else:
residual = inputs
return residual + basicblock


class POOLING(nn.Module):
def __init__(
self, C_in, C_out, stride, mode, affine=True, track_running_stats=True
):
super(POOLING, self).__init__()
if C_in == C_out:
self.preprocess = None
else:
self.preprocess = ReLUConvBN(
C_in, C_out, 1, 1, 0, 1, affine, track_running_stats
)
if mode == "avg":
self.op = nn.AvgPool2d(3, stride=stride, padding=1, count_include_pad=False)
elif mode == "max":
self.op = nn.MaxPool2d(3, stride=stride, padding=1)
else:
raise ValueError("Invalid mode={:} in POOLING".format(mode))

def forward(self, inputs):
if self.preprocess:
x = self.preprocess(inputs)
else:
x = inputs
return self.op(x)


class Identity(nn.Module):
def __init__(self):
super(Identity, self).__init__()

def forward(self, x):
return x


class Zero(nn.Module):
def __init__(self, C_in, C_out, stride):
super(Zero, self).__init__()
self.C_in = C_in
self.C_out = C_out
self.stride = stride
self.is_zero = True

def forward(self, x):
if self.C_in == self.C_out:
if self.stride == 1:
return x.mul(0.0)
else:
return x[:, :, :: self.stride, :: self.stride].mul(0.0)
else:
shape = list(x.shape)
shape[1] = self.C_out
zeros = x.new_zeros(shape, dtype=x.dtype, device=x.device)
return zeros

def extra_repr(self):
return "C_in={C_in}, C_out={C_out}, stride={stride}".format(**self.__dict__)


class FactorizedReduce(nn.Module):
def __init__(self, C_in, C_out, stride, affine, track_running_stats):
super(FactorizedReduce, self).__init__()
self.stride = stride
self.C_in = C_in
self.C_out = C_out
self.relu = nn.ReLU(inplace=False)
if stride == 2:
# assert C_out % 2 == 0, 'C_out : {:}'.format(C_out)
C_outs = [C_out // 2, C_out - C_out // 2]
self.convs = nn.ModuleList()
for i in range(2):
self.convs.append(
nn.Conv2d(
C_in, C_outs[i], 1, stride=stride, padding=0, bias=not affine
)
)
self.pad = nn.ConstantPad2d((0, 1, 0, 1), 0)
elif stride == 1:
self.conv = nn.Conv2d(
C_in, C_out, 1, stride=stride, padding=0, bias=not affine
)
else:
raise ValueError("Invalid stride : {:}".format(stride))
self.bn = nn.BatchNorm2d(
C_out, affine=affine, track_running_stats=track_running_stats
)

def forward(self, x):
if self.stride == 2:
x = self.relu(x)
y = self.pad(x)
out = torch.cat([self.convs[0](x), self.convs[1](y[:, :, 1:, 1:])], dim=1)
else:
out = self.conv(x)
out = self.bn(out)
return out

def extra_repr(self):
return "C_in={C_in}, C_out={C_out}, stride={stride}".format(**self.__dict__)


# Auto-ReID: Searching for a Part-Aware ConvNet for Person Re-Identification, ICCV 2019
class PartAwareOp(nn.Module):
def __init__(self, C_in, C_out, stride, part=4):
super().__init__()
        self.part = part  # number of horizontal stripes the feature map is split into
self.hidden = C_in // 3
self.avg_pool = nn.AdaptiveAvgPool2d(1)
self.local_conv_list = nn.ModuleList()
for i in range(self.part):
self.local_conv_list.append(
nn.Sequential(
nn.ReLU(),
nn.Conv2d(C_in, self.hidden, 1),
nn.BatchNorm2d(self.hidden, affine=True),
)
)
self.W_K = nn.Linear(self.hidden, self.hidden)
self.W_Q = nn.Linear(self.hidden, self.hidden)

        # FactorizedReduce in this file takes affine / track_running_stats explicitly;
        # they are fixed to True here, matching the BatchNorm layers above.
        if stride == 2:
            self.last = FactorizedReduce(C_in + self.hidden, C_out, 2, True, True)
        elif stride == 1:
            self.last = FactorizedReduce(C_in + self.hidden, C_out, 1, True, True)
        else:
            raise ValueError("Invalid Stride : {:}".format(stride))

def forward(self, x):
batch, C, H, W = x.size()
assert H >= self.part, "input size too small : {:} vs {:}".format(
x.shape, self.part
)
IHs = [0]
for i in range(self.part):
IHs.append(min(H, int((i + 1) * (float(H) / self.part))))
local_feat_list = []
for i in range(self.part):
feature = x[:, :, IHs[i] : IHs[i + 1], :]
xfeax = self.avg_pool(feature)
xfea = self.local_conv_list[i](xfeax)
local_feat_list.append(xfea)
part_feature = torch.cat(local_feat_list, dim=2).view(batch, -1, self.part)
part_feature = part_feature.transpose(1, 2).contiguous()
part_K = self.W_K(part_feature)
part_Q = self.W_Q(part_feature).transpose(1, 2).contiguous()
weight_att = torch.bmm(part_K, part_Q)
attention = torch.softmax(weight_att, dim=2)
aggreateF = torch.bmm(attention, part_feature).transpose(1, 2).contiguous()
features = []
for i in range(self.part):
feature = aggreateF[:, :, i : i + 1].expand(
batch, self.hidden, IHs[i + 1] - IHs[i]
)
feature = feature.view(batch, self.hidden, IHs[i + 1] - IHs[i], 1)
features.append(feature)
features = torch.cat(features, dim=2).expand(batch, self.hidden, H, W)
final_fea = torch.cat((x, features), dim=1)
outputs = self.last(final_fea)
return outputs


def drop_path(x, drop_prob):
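    # Stochastic drop-path: with probability drop_prob an entire sample's path is
    # zeroed out (per-sample mask of shape (B, 1, 1, 1)); surviving samples are
    # rescaled by 1/keep_prob so the expected activation magnitude is unchanged.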
if drop_prob > 0.0:
keep_prob = 1.0 - drop_prob
mask = x.new_zeros(x.size(0), 1, 1, 1)
mask = mask.bernoulli_(keep_prob)
x = torch.div(x, keep_prob)
x.mul_(mask)
return x


# Searching for A Robust Neural Architecture in Four GPU Hours
class GDAS_Reduction_Cell(nn.Module):
def __init__(
self, C_prev_prev, C_prev, C, reduction_prev, affine, track_running_stats
):
super(GDAS_Reduction_Cell, self).__init__()
if reduction_prev:
self.preprocess0 = FactorizedReduce(
C_prev_prev, C, 2, affine, track_running_stats
)
else:
self.preprocess0 = ReLUConvBN(
C_prev_prev, C, 1, 1, 0, 1, affine, track_running_stats
)
self.preprocess1 = ReLUConvBN(
C_prev, C, 1, 1, 0, 1, affine, track_running_stats
)

self.reduction = True
self.ops1 = nn.ModuleList(
[
nn.Sequential(
nn.ReLU(inplace=False),
nn.Conv2d(
C,
C,
(1, 3),
stride=(1, 2),
padding=(0, 1),
groups=8,
bias=not affine,
),
nn.Conv2d(
C,
C,
(3, 1),
stride=(2, 1),
padding=(1, 0),
groups=8,
bias=not affine,
),
nn.BatchNorm2d(
C, affine=affine, track_running_stats=track_running_stats
),
nn.ReLU(inplace=False),
nn.Conv2d(C, C, 1, stride=1, padding=0, bias=not affine),
nn.BatchNorm2d(
C, affine=affine, track_running_stats=track_running_stats
),
),
nn.Sequential(
nn.ReLU(inplace=False),
nn.Conv2d(
C,
C,
(1, 3),
stride=(1, 2),
padding=(0, 1),
groups=8,
bias=not affine,
),
nn.Conv2d(
C,
C,
(3, 1),
stride=(2, 1),
padding=(1, 0),
groups=8,
bias=not affine,
),
nn.BatchNorm2d(
C, affine=affine, track_running_stats=track_running_stats
),
nn.ReLU(inplace=False),
nn.Conv2d(C, C, 1, stride=1, padding=0, bias=not affine),
nn.BatchNorm2d(
C, affine=affine, track_running_stats=track_running_stats
),
),
]
)

self.ops2 = nn.ModuleList(
[
nn.Sequential(
nn.MaxPool2d(3, stride=2, padding=1),
nn.BatchNorm2d(
C, affine=affine, track_running_stats=track_running_stats
),
),
nn.Sequential(
nn.MaxPool2d(3, stride=2, padding=1),
nn.BatchNorm2d(
C, affine=affine, track_running_stats=track_running_stats
),
),
]
)

@property
def multiplier(self):
return 4

def forward(self, s0, s1, drop_prob=-1):
s0 = self.preprocess0(s0)
s1 = self.preprocess1(s1)

X0 = self.ops1[0](s0)
X1 = self.ops1[1](s1)
if self.training and drop_prob > 0.0:
X0, X1 = drop_path(X0, drop_prob), drop_path(X1, drop_prob)

# X2 = self.ops2[0] (X0+X1)
X2 = self.ops2[0](s0)
X3 = self.ops2[1](s1)
if self.training and drop_prob > 0.0:
X2, X3 = drop_path(X2, drop_prob), drop_path(X3, drop_prob)
return torch.cat([X0, X1, X2, X3], dim=1)


# To manage the useful classes in this file.
RAW_OP_CLASSES = {"gdas_reduction": GDAS_Reduction_Cell}
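

# Usage sketch (illustration only, not part of the original file): every entry in
# OPS shares the (C_in, C_out, stride, affine, track_running_stats) signature, so a
# cell can instantiate any candidate operation uniformly.
if __name__ == "__main__":
    x = torch.randn(2, 16, 32, 32)
    conv = OPS["nor_conv_3x3"](16, 16, 1, True, True)
    pool = OPS["avg_pool_3x3"](16, 32, 2, True, True)
    print(conv(x).shape)  # torch.Size([2, 16, 32, 32])
    print(pool(x).shape)  # torch.Size([2, 32, 16, 16])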

+ 203
- 0
xnas/search_space/RMINAS/NB201/utils.py View File

@@ -0,0 +1,203 @@
##################################################
# Copyright (c) Xuanyi Dong [GitHub D-X-Y], 2019 #
##################################################

from collections import namedtuple
import torch.nn as nn
from xnas.search_space.RMINAS.NB201.ops import OPS, ResNetBasicblock

from copy import deepcopy

from xnas.search_space.RMINAS.NB201.geno import Structure as CellStructure


def str2lists(arch_str):
    """
    This function shows how to read the string-based architecture encoding.
    It is the same as the `str2structure` func in `AutoDL-Projects/lib/models/cell_searchs/genotypes.py`
    :param
      arch_str: a string describing the architecture topology, such as
                |nor_conv_1x1~0|+|none~0|none~1|+|none~0|none~1|skip_connect~2|
    :return: a list of tuples; each tuple contains multiple (op, input_node_index) pairs.
    :usage
      arch = str2lists('|nor_conv_1x1~0|+|none~0|none~1|+|none~0|none~1|skip_connect~2|')
      print('there are {:} nodes in this arch'.format(len(arch) + 1))  # arch is a list
      for i, node in enumerate(arch):
          print('the {:}-th node is the sum of these {:} nodes with op: {:}'.format(i + 1, len(node), node))
    """
    node_strs = arch_str.split("+")
    genotypes = []
    for i, node_str in enumerate(node_strs):
        inputs = list(filter(lambda x: x != "", node_str.split("|")))
        for xinput in inputs:
            assert len(xinput.split("~")) == 2, "invalid input length : {:}".format(
                xinput
            )
        inputs = (xi.split("~") for xi in inputs)
        input_infos = tuple((op, int(IDX)) for (op, IDX) in inputs)
        genotypes.append(input_infos)
    return genotypes

def dict2config(xdict, logger):
assert isinstance(xdict, dict), "invalid type : {:}".format(type(xdict))
Arguments = namedtuple("Configure", " ".join(xdict.keys()))
content = Arguments(**xdict)
if hasattr(logger, "log"):
logger.log("{:}".format(content))
return content

def config2dict(content):
return content._asdict()

def get_cell_based_tiny_net(config):

if hasattr(config, "genotype"):
genotype = config.genotype
elif hasattr(config, "arch_str"):
genotype = CellStructure.str2structure(config.arch_str)
else:
raise ValueError(
"Can not find genotype from this config : {:}".format(config)
)
return TinyNetwork(config.C, config.N, genotype, config.num_classes)


# Cell for NAS-Bench-201
class InferCell(nn.Module):
def __init__(
self, genotype, C_in, C_out, stride, affine=True, track_running_stats=True
):
super(InferCell, self).__init__()

self.layers = nn.ModuleList()
self.node_IN = []
self.node_IX = []
self.genotype = deepcopy(genotype)
for i in range(1, len(genotype)):
node_info = genotype[i - 1]
cur_index = []
cur_innod = []
for (op_name, op_in) in node_info:
if op_in == 0:
layer = OPS[op_name](
C_in, C_out, stride, affine, track_running_stats
)
else:
layer = OPS[op_name](C_out, C_out, 1, affine, track_running_stats)
cur_index.append(len(self.layers))
cur_innod.append(op_in)
self.layers.append(layer)
self.node_IX.append(cur_index)
self.node_IN.append(cur_innod)
self.nodes = len(genotype)
self.in_dim = C_in
self.out_dim = C_out

def extra_repr(self):
string = "info :: nodes={nodes}, inC={in_dim}, outC={out_dim}".format(
**self.__dict__
)
laystr = []
for i, (node_layers, node_innods) in enumerate(zip(self.node_IX, self.node_IN)):
y = [
"I{:}-L{:}".format(_ii, _il)
for _il, _ii in zip(node_layers, node_innods)
]
x = "{:}<-({:})".format(i + 1, ",".join(y))
laystr.append(x)
return (
string
+ ", [{:}]".format(" | ".join(laystr))
+ ", {:}".format(self.genotype.tostr())
)

def forward(self, inputs):
nodes = [inputs]
for i, (node_layers, node_innods) in enumerate(zip(self.node_IX, self.node_IN)):
node_feature = sum(
self.layers[_il](nodes[_ii])
for _il, _ii in zip(node_layers, node_innods)
)
nodes.append(node_feature)
return nodes[-1]


# The macro structure for architectures in NAS-Bench-201
class TinyNetwork(nn.Module):
def __init__(self, C, N, genotype, num_classes):
super(TinyNetwork, self).__init__()
self._C = C
self._layerN = N
# self._datasize = datasize
# self._feature_res = feature_res

self.stem = nn.Sequential(
nn.Conv2d(3, C, kernel_size=3, padding=1, bias=False), nn.BatchNorm2d(C)
)

layer_channels = [C] * N + [C * 2] + [C * 2] * N + [C * 4] + [C * 4] * N
layer_reductions = [False] * N + [True] + [False] * N + [True] + [False] * N

C_prev = C
self.cells = nn.ModuleList()
for index, (C_curr, reduction) in enumerate(
zip(layer_channels, layer_reductions)
):
if reduction:
cell = ResNetBasicblock(C_prev, C_curr, 2, True)
else:
cell = InferCell(genotype, C_prev, C_curr, 1)
self.cells.append(cell)
C_prev = cell.out_dim
self._Layer = len(self.cells)

self.lastact = nn.Sequential(nn.BatchNorm2d(C_prev), nn.ReLU(inplace=True))
self.global_pooling = nn.AdaptiveAvgPool2d(1)
self.classifier = nn.Linear(C_prev, num_classes)

def get_message(self):
string = self.extra_repr()
for i, cell in enumerate(self.cells):
string += "\n {:02d}/{:02d} :: {:}".format(
i, len(self.cells), cell.extra_repr()
)
return string

def extra_repr(self):
return "{name}(C={_C}, N={_layerN}, L={_Layer})".format(
name=self.__class__.__name__, **self.__dict__
        )

    def feature_extractor(self, inputs):
features = []
feature = self.stem(inputs)
features.append(feature)
for i, cell in enumerate(self.cells):
feature = cell(feature)
features.append(feature)
out = self.lastact(feature)
features.append(out)
return features

    def forward(self, inputs):
        feature = self.stem(inputs)

        for i, cell in enumerate(self.cells):
            feature = cell(feature)
            if i == 4:
                tensor1 = feature
            elif i == 10:
                tensor2 = feature
        feature = self.lastact(feature)
        tensor3 = feature

        # intermediate feature maps after cells 4 and 10 (the ends of the first two
        # stages when N=5), plus the final activation
        features = [tensor1, tensor2, tensor3]

out = self.global_pooling(feature)
out = out.view(out.size(0), -1)
logits = self.classifier(out)

return features, logits
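

# Usage sketch (illustration only, not part of the original file): building a
# NB201 network from an architecture string via dict2config / get_cell_based_tiny_net
# and reading the intermediate feature maps returned by forward().
if __name__ == "__main__":
    import torch
    cfg = dict2config(
        {
            "C": 16,
            "N": 5,
            "arch_str": "|nor_conv_3x3~0|+|nor_conv_3x3~0|none~1|+|skip_connect~0|none~1|nor_conv_3x3~2|",
            "num_classes": 10,
        },
        None,  # no logger
    )
    net = get_cell_based_tiny_net(cfg)
    features, logits = net(torch.randn(2, 3, 32, 32))
    print([f.shape for f in features], logits.shape)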
