diff --git a/README.md b/README.md
index 798a10f..42f3f75 100644
--- a/README.md
+++ b/README.md
@@ -26,7 +26,7 @@ Note that SpikingJelly is based on PyTorch. Please make sure that you have insta
The odd version number is the developing version, which is updated with GitHub/OpenI repository. The even version number is the stable version and available at PyPI.
-**Install the last stable version (0.0.0.0.8) from** [**PyPI**](https://pypi.org/project/spikingjelly/):
+**Install the latest stable version from** [**PyPI**](https://pypi.org/project/spikingjelly/):
```bash
pip install spikingjelly
```
@@ -40,7 +40,7 @@ git clone https://github.com/fangwei123456/spikingjelly.git
cd spikingjelly
python setup.py install
```
-From [OpenI](https://git.openi.org.cn/OpenI/spikingjelly):
+From [OpenI](https://git.openi.org.cn/OpenI/spikingjelly):
```bash
git clone https://git.openi.org.cn/OpenI/spikingjelly.git
cd spikingjelly
python setup.py install
```
@@ -80,7 +80,7 @@ Read [spikingjelly.clock_driven.examples](https://spikingjelly.readthedocs.io/zh
## Fast And Handy ANN-SNN Conversion
-SpikingJelly implements a relatively general ANN-SNN Conversion interface. Users can realize the conversion through PyTorch or ONNX packages. What's more, users can customize the conversion module to add to the conversion.
+SpikingJelly implements a relatively general ANN-SNN Conversion interface. Users can perform the conversion through PyTorch. What's more, users can customize the conversion mode.
```python
class ANN(nn.Module):
@@ -103,8 +103,7 @@ class ANN(nn.Module):
            nn.AvgPool2d(2, 2),
            nn.Flatten(),
-            nn.Linear(32, 10),
-            nn.ReLU()
+            nn.Linear(32, 10)
        )
    def forward(self,x):
@@ -112,7 +111,7 @@ class ANN(nn.Module):
        return x
```
-This simple network with analog encoding can achieve 98.51% accuracy after converiosn on MNIST test dataset. Read [the tutorial of ann2snn](https://spikingjelly.readthedocs.io/zh_CN/latest/clock_driven/5_ann2snn.html) for more details. You can also run this code in Python terminal for training on classifying MNIST using converted model:
+This simple network with analog encoding can achieve 98.44% accuracy after conversion on the MNIST test dataset. Read [the tutorial of ann2snn](https://spikingjelly.readthedocs.io/zh_CN/latest/clock_driven/5_ann2snn.html) for more details. You can also run this code in a Python terminal to train on classifying MNIST using the converted model:
```python
>>> import spikingjelly.clock_driven.ann2snn.examples.cnn_mnist as cnn_mnist
@@ -146,21 +145,71 @@ As simple as using PyTorch.
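The hunks above show the converted network's definition and the packaged `cnn_mnist` example, but not an explicit conversion call. Below is a minimal sketch (not part of this diff) of how such a call might look, assuming a `Converter`-style interface with a `mode` argument and a calibration `dataloader`; the class name, module path, and `'max'` mode string are assumptions inferred from the "conversion mode" wording above, and the ann2snn tutorial linked in the README remains the authoritative reference.

```python
# A minimal sketch, assuming an ann2snn.Converter(mode=..., dataloader=...) interface;
# the exact class and argument names may differ between SpikingJelly versions.
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from spikingjelly.clock_driven import ann2snn

# Stand-in for the trained ANN defined in the README (note: no ReLU after the last Linear).
ann = nn.Sequential(nn.Flatten(), nn.Linear(28 * 28, 10))

# A small calibration set used to estimate activation ranges during conversion.
calib_set = TensorDataset(torch.rand(128, 1, 28, 28), torch.zeros(128, dtype=torch.long))
calib_loader = DataLoader(calib_set, batch_size=64)

converter = ann2snn.Converter(mode='max', dataloader=calib_loader)  # 'max' = assumed conversion mode
snn = converter(ann)                                                # spiking copy of the ANN

# Rate-coded inference: run the SNN for T time steps and average its outputs.
T = 50
x = torch.rand(1, 1, 28, 28)
out = torch.zeros(1, 10)
for t in range(T):
    out += snn(x)
print(out / T)
```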
## Neuromorphic Datasets Supports
SpikingJelly includes the following neuromorphic datasets:
-| Dataset | Source |
+| Dataset | Source |
| -------------- | ------------------------------------------------------------ |
-| ASL-DVS | Graph-based Object Classification for Neuromorphic Vision Sensing |
-| CIFAR10-DVS | CIFAR10-DVS: An Event-Stream Dataset for Object Classification |
-| DVS128 Gesture | A Low Power, Fully Event-Based Gesture Recognition System |
-| N-Caltech101 | Converting Static Image Datasets to Spiking Neuromorphic Datasets Using Saccades |
-| N-MNIST | Converting Static Image Datasets to Spiking Neuromorphic Datasets Using Saccades |
+| ASL-DVS | [Graph-based Object Classification for Neuromorphic Vision Sensing](https://openaccess.thecvf.com/content_ICCV_2019/html/Bi_Graph-Based_Object_Classification_for_Neuromorphic_Vision_Sensing_ICCV_2019_paper.html) |
+| CIFAR10-DVS | [CIFAR10-DVS: An Event-Stream Dataset for Object Classification](https://internal-journal.frontiersin.org/articles/10.3389/fnins.2017.00309/full) |
+| DVS128 Gesture | [A Low Power, Fully Event-Based Gesture Recognition System](https://openaccess.thecvf.com/content_cvpr_2017/html/Amir_A_Low_Power_CVPR_2017_paper.html) |
+| ES-ImageNet | [ES-ImageNet: A Million Event-Stream Classification Dataset for Spiking Neural Networks](https://www.frontiersin.org/articles/10.3389/fnins.2021.726582/full) |
+| N-Caltech101 | [Converting Static Image Datasets to Spiking Neuromorphic Datasets Using Saccades](https://www.frontiersin.org/articles/10.3389/fnins.2015.00437/full) |
+| N-MNIST | [Converting Static Image Datasets to Spiking Neuromorphic Datasets Using Saccades](https://www.frontiersin.org/articles/10.3389/fnins.2015.00437/full) |
+| Nav Gesture | [Event-Based Gesture Recognition With Dynamic Background Suppression Using Smartphone Computational Capabilities](https://www.frontiersin.org/articles/10.3389/fnins.2020.00275/full) |
Users can use both the origin events data and frames data integrated by SpikingJelly:
```python
+import torch
+from torch.utils.data import DataLoader
+from spikingjelly.datasets import pad_sequence_collate, padded_sequence_mask
from spikingjelly.datasets.dvs128_gesture import DVS128Gesture
root_dir = 'D:/datasets/DVS128Gesture'
event_set = DVS128Gesture(root_dir, train=True, data_type='event')
-frame_set = DVS128Gesture(root_dir, train=True, data_type='frame', frames_number=20, split_by='number')
+event, label = event_set[0]
+for k in event.keys():
+    print(k, event[k])
+
+# t [80048267 80048277 80048278 ... 85092406 85092538 85092700]
+# x [49 55 55 ... 60 85 45]
+# y [82 92 92 ... 96 86 90]
+# p [1 0 0 ... 1 0 0]
+# label 0
+
+fixed_frames_number_set = DVS128Gesture(root_dir, train=True, data_type='frame', frames_number=20, split_by='number')
+rand_index = torch.randint(low=0, high=fixed_frames_number_set.__len__(), size=[2])
+for i in rand_index:
+    frame, label = fixed_frames_number_set[i]
+    print(f'frame[{i}].shape=[T, C, H, W]={frame.shape}')
+
+# frame[308].shape=[T, C, H, W]=(20, 2, 128, 128)
+# frame[453].shape=[T, C, H, W]=(20, 2, 128, 128)
+
+fixed_duration_frame_set = DVS128Gesture(root_dir, data_type='frame', duration=1000000, train=True)
+for i in range(5):
+    x, y = fixed_duration_frame_set[i]
+    print(f'x[{i}].shape=[T, C, H, W]={x.shape}')
+
+# x[0].shape=[T, C, H, W]=(6, 2, 128, 128)
+# x[1].shape=[T, C, H, W]=(6, 2, 128, 128)
+# x[2].shape=[T, C, H, W]=(5, 2, 128, 128)
+# x[3].shape=[T, C, H, W]=(5, 2, 128, 128)
+# x[4].shape=[T, C, H, W]=(7, 2, 128, 128)
+
+train_data_loader = DataLoader(fixed_duration_frame_set, collate_fn=pad_sequence_collate, batch_size=5)
+for x, y, x_len in train_data_loader:
+    print(f'x.shape=[N, T, C, H, W]={tuple(x.shape)}')
+    print(f'x_len={x_len}')
+    mask = padded_sequence_mask(x_len)  # mask.shape = [T, N]
+    print(f'mask=\n{mask.t().int()}')
+    break
+
+# x.shape=[N, T, C, H, W]=(5, 7, 2, 128, 128)
+# x_len=tensor([6, 6, 5, 5, 7])
+# mask=
+# tensor([[1, 1, 1, 1, 1, 1, 0],
+#         [1, 1, 1, 1, 1, 1, 0],
+#         [1, 1, 1, 1, 1, 0, 0],
+#         [1, 1, 1, 1, 1, 0, 0],
+#         [1, 1, 1, 1, 1, 1, 1]], dtype=torch.int32)
```
More datasets will be included in the future.
diff --git a/README_cn.md b/README_cn.md
index b9895c4..eaa480c 100644
--- a/README_cn.md
+++ b/README_cn.md
@@ -26,7 +26,7 @@ SpikingJelly的文档使用中英双语编写: https://spikingjelly.readthedoc
奇数版本是开发版,随着GitHub/OpenI不断更新。偶数版本是稳定版,可以从PyPI获取。
-**从** [**PyPI**](https://pypi.org/project/spikingjelly/) **安装最新的稳定版本(0.0.0.0.8)**:
+**从** [**PyPI**](https://pypi.org/project/spikingjelly/) **安装最新的稳定版本**:
```bash
pip install spikingjelly
```
@@ -40,7 +40,7 @@ git clone https://github.com/fangwei123456/spikingjelly.git
cd spikingjelly
python setup.py install
```
-通过[OpenI](https://git.openi.org.cn/OpenI/spikingjelly):
+通过[OpenI](https://git.openi.org.cn/OpenI/spikingjelly):
```bash
git clone https://git.openi.org.cn/OpenI/spikingjelly.git
cd spikingjelly
python setup.py install
```
@@ -148,19 +148,69 @@ SpikingJelly 已经将下列数据集纳入:
| 数据集 | 来源 |
| -------------- | ------------------------------------------------------------ |
-| ASL-DVS | Graph-based Object Classification for Neuromorphic Vision Sensing |
-| CIFAR10-DVS | CIFAR10-DVS: An Event-Stream Dataset for Object Classification |
-| DVS128 Gesture | A Low Power, Fully Event-Based Gesture Recognition System |
-| N-Caltech101 | Converting Static Image Datasets to Spiking Neuromorphic Datasets Using Saccades |
-| N-MNIST | Converting Static Image Datasets to Spiking Neuromorphic Datasets Using Saccades |
+| ASL-DVS | [Graph-based Object Classification for Neuromorphic Vision Sensing](https://openaccess.thecvf.com/content_ICCV_2019/html/Bi_Graph-Based_Object_Classification_for_Neuromorphic_Vision_Sensing_ICCV_2019_paper.html) |
+| CIFAR10-DVS | [CIFAR10-DVS: An Event-Stream Dataset for Object Classification](https://internal-journal.frontiersin.org/articles/10.3389/fnins.2017.00309/full) |
+| DVS128 Gesture | [A Low Power, Fully Event-Based Gesture Recognition System](https://openaccess.thecvf.com/content_cvpr_2017/html/Amir_A_Low_Power_CVPR_2017_paper.html) |
+| ES-ImageNet | [ES-ImageNet: A Million Event-Stream Classification Dataset for Spiking Neural Networks](https://www.frontiersin.org/articles/10.3389/fnins.2021.726582/full) |
+| N-Caltech101 | [Converting Static Image Datasets to Spiking Neuromorphic Datasets Using Saccades](https://www.frontiersin.org/articles/10.3389/fnins.2015.00437/full) |
+| N-MNIST | [Converting Static Image Datasets to Spiking Neuromorphic Datasets Using Saccades](https://www.frontiersin.org/articles/10.3389/fnins.2015.00437/full) |
+| Nav Gesture | [Event-Based Gesture Recognition With Dynamic Background Suppression Using Smartphone Computational Capabilities](https://www.frontiersin.org/articles/10.3389/fnins.2020.00275/full) |
用户可以轻松使用事件数据,或由SpikingJelly积分生成的帧数据:
```python
+import torch
+from torch.utils.data import DataLoader
+from spikingjelly.datasets import pad_sequence_collate, padded_sequence_mask
from spikingjelly.datasets.dvs128_gesture import DVS128Gesture
root_dir = 'D:/datasets/DVS128Gesture'
event_set = DVS128Gesture(root_dir, train=True, data_type='event')
-frame_set = DVS128Gesture(root_dir, train=True, data_type='frame', frames_number=20, split_by='number')
+event, label = event_set[0]
+for k in event.keys():
+    print(k, event[k])
+
+# t [80048267 80048277 80048278 ... 85092406 85092538 85092700]
+# x [49 55 55 ... 60 85 45]
+# y [82 92 92 ... 96 86 90]
+# p [1 0 0 ... 1 0 0]
+# label 0
+
+fixed_frames_number_set = DVS128Gesture(root_dir, train=True, data_type='frame', frames_number=20, split_by='number')
+rand_index = torch.randint(low=0, high=fixed_frames_number_set.__len__(), size=[2])
+for i in rand_index:
+    frame, label = fixed_frames_number_set[i]
+    print(f'frame[{i}].shape=[T, C, H, W]={frame.shape}')
+
+# frame[308].shape=[T, C, H, W]=(20, 2, 128, 128)
+# frame[453].shape=[T, C, H, W]=(20, 2, 128, 128)
+
+fixed_duration_frame_set = DVS128Gesture(root_dir, data_type='frame', duration=1000000, train=True)
+for i in range(5):
+    x, y = fixed_duration_frame_set[i]
+    print(f'x[{i}].shape=[T, C, H, W]={x.shape}')
+
+# x[0].shape=[T, C, H, W]=(6, 2, 128, 128)
+# x[1].shape=[T, C, H, W]=(6, 2, 128, 128)
+# x[2].shape=[T, C, H, W]=(5, 2, 128, 128)
+# x[3].shape=[T, C, H, W]=(5, 2, 128, 128)
+# x[4].shape=[T, C, H, W]=(7, 2, 128, 128)
+
+train_data_loader = DataLoader(fixed_duration_frame_set, collate_fn=pad_sequence_collate, batch_size=5)
+for x, y, x_len in train_data_loader:
+    print(f'x.shape=[N, T, C, H, W]={tuple(x.shape)}')
+    print(f'x_len={x_len}')
+    mask = padded_sequence_mask(x_len)  # mask.shape = [T, N]
+    print(f'mask=\n{mask.t().int()}')
+    break
+
+# x.shape=[N, T, C, H, W]=(5, 7, 2, 128, 128)
+# x_len=tensor([6, 6, 5, 5, 7])
+# mask=
+# tensor([[1, 1, 1, 1, 1, 1, 0],
+#         [1, 1, 1, 1, 1, 1, 0],
+#         [1, 1, 1, 1, 1, 0, 0],
+#         [1, 1, 1, 1, 1, 0, 0],
+#         [1, 1, 1, 1, 1, 1, 1]], dtype=torch.int32)
```
未来将会纳入更多数据集。
diff --git a/bugs.md b/bugs.md
index 6a2c024..26f054b 100644
--- a/bugs.md
+++ b/bugs.md
@@ -11,4 +11,6 @@ Some fatal bugs and when the bug is fixed are shown in this table. Note that the
| Bug: Cupy backend for spiking neurons, https://github.com/fangwei123456/spikingjelly/issues/106. This bug makes spiking neurons with cupy backend output wrong spikes and voltages. This bug has no influence on release 0.0.0.0.4, which does not use cupy. | 2021-09-16 |
| **Release: 0.0.0.0.8** | 2021-11-21 |
| Bug: MultiStepParametricLIFNode, https://github.com/fangwei123456/spikingjelly/issues/151. This bug makes the gradient of the learnable parameter in MultiStepParametricLIFNode incomplete when backend is cupy. | 2021-12-10 |
+| **Release: 0.0.0.0.10** | |
+| Bug: When using CuPy with `version >= 10`, CuPy will change `torch.cuda.current_device()` to 0, https://github.com/cupy/cupy/issues/6569. This bug will break training when using Distributed Data Parallel (DDP). | 2022-03-22 |
diff --git a/docs/source/_static/API/clock_driven/lava_exchange/step_quantize.pdf b/docs/source/_static/API/clock_driven/lava_exchange/step_quantize.pdf
new file mode 100644
index 0000000000000000000000000000000000000000..93180be751df8228c5956e33570bab3c0ed5401e
GIT binary patch literal 99537
(binary patch data for the new step_quantize.pdf omitted)
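The CuPy entry added to bugs.md above records the failure mode but no mitigation. Below is a minimal sketch (not part of this diff, and not SpikingJelly's official fix) of one defensive pattern, assuming a torchrun-style launch that sets `LOCAL_RANK` per process: re-pin the CUDA device for each DDP rank after CuPy has been imported, so that a silent reset to device 0 cannot break the process-to-GPU mapping.

```python
# A minimal sketch, assuming a torchrun-style launch (LOCAL_RANK set per process).
# It guards against the CuPy >= 10 behaviour referenced above, where using CuPy can
# reset torch.cuda.current_device() to 0 and break DDP's process-to-GPU mapping.
import os
import torch
import torch.distributed as dist

def setup_ddp_worker() -> int:
    local_rank = int(os.environ.get("LOCAL_RANK", "0"))
    torch.cuda.set_device(local_rank)              # pin this process to its own GPU
    dist.init_process_group(backend="nccl")

    import cupy  # noqa: F401  # with CuPy >= 10 this may reset the current device to 0

    if torch.cuda.current_device() != local_rank:  # re-pin the device if CuPy moved us
        torch.cuda.set_device(local_rank)
    return local_rank

if __name__ == "__main__":
    setup_ddp_worker()
```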
zfRvP@=j{pnUFw5o)h7osc$;VL^&l*sxu^-?^2-SB#nOi!fVTBr4SC+D2^wa*Sp^_v z-x7ZS8c3FBu?v=gqn>V`X8Y?3_IYpEXW^q?;gd(=ONTs+LqqKYf%l_3)CU|*J)m;^ z3))Yyz8#vc~KP{460qvvDlD2|yhkef`G?07rPh z1}_PuE~&xY*TQTz>_P~41JIf;vL$veTt2vcbm~jCwCEGM2LE9M;r`|4wIANZ!`nsp z3ffv4rz1WT2YAnPZ|^iVsC_lP77%ZVJFuh`wk*!~ zmIXbAFim#Op1fZGz5pZ~!%Qn_ylFVMVtBZ3G;mNv^DCnemf&tLI%pKWBhv_oV9pJJ z>%}iyteCzdAke9CIo$}4;D4r97YFy7d0~goDl)Krru^=v3UD}m0yF_g3~B;uQI1&> z-)Qb}DO0`%zf$Q7c)lDz2nF!QwqP}iMG_xq=AU!heC&8xSNTaw5DI2_yL;gCz5?-x z_#P*9Q3XU?EI5iojNR_-J`d`FR?xoUp#!Uchvy;p505SPpU?VRaIs?}UKa*{@U1|} zQHSsFydGE}cPt)jTJVxz+JXC)VChx(NOF8|{A`&U8|&(vK8n9(1O#X^EP)UaX#VBt z$;sZu3*1^bl{4M9*u5)gRaRW=FTaM%sEK0a43!y##0FBz_45Q@ZsDB~{%Epu`-2Z{#cS*n$|*u2FeDNlOE+R@WqNdM z`1XUZgl)|iH))e*q@L%#6cdg9%rc{=dlwGH;(IDbEWJ}P&f$&mM627s)HhO^Ex-&RZ9)u*oMT;Jdf!H(IUAw z;1vYoQvE=W6M^FAyPxqlq<&-#!zYj6QIFLm#-S38ug0YkwT_SZNrYP}0u>)|pAR}B zbU5R`Pw-~SXCLR$fY&4g8ko41U6F>Hn^-^=|A`BZ;DBt zfDyxxr(%wlhClgOlifaiAGmu{+zzgxYpiQ@dj0@-%09m8h0_llK?4acfv?}%^&9W$ z#{S|2%C!V3HJIjst@(ubVtCKhw;IRMOu{Sn$|fg9D)N^zLo( zPMx6c-d^vXYSYn#i|_q$+qVS{a?MBG!flD`Ff7Yp$NWj00N%T;YkC_J?gfH-?|;#4 zfGZ5Y&w}5tgZKEg5SrR=bHw&Z2`}=d!#lY(cEW-ic>2X$k^gbk4*Z;h^0`|3WzSs_ zlOll{y`w2aP<#y|UIbPj+^iOvu+`(YrV-&jYS}~mM?{6Ahy&mIJ2>lS09VDkk+@Y-A6sJuaGt` zw81Yb8(gqEpOdu^gb$j?w=ba`?(vJSq?Vp9i9PPfyDx!#Al%@$ZYAn#z0t4YaZql_ z_pim)o{*cbY<|?|8pzL5M?W~so$i9~1h)6$MNoT&uWbGA8s^VJXFs^|z3%<*8mD)O zD|R6K?CN_A$1&bQb+D-V|Yapktl2*H?dz|bzBfmSc;o27koAmy;7na67bk1K;E)VFbO=`}29(0CFvSADP&7diTZq$0 z`C)q>WGg|m^ZTvM*>SaJS6|0i#s%_Btjlr9Bl>$uiJud$^k_wAw|g?HJ5-kPAy2M0 z;B`f1f2pYN$kmgjH)MVy&i4#zOQi)lFcXV>^>L&Zvmo&fEmo2Hs_L!|dz z#;IMsQOuh)62FG{tS4u9?Xays<9fs8h&ZbG_;T+iG){Dvp4(0!UdykzCLZMZGM@|3o@+RV;%zQb<`?Tk`Fj?KT{XShb3--GG6tg2{T z#Fki;oyf6kkZ34F+m_I@Ai~8=cBp0UhXvMKT!m{7XWbgNRK3!T0P+PA9OQ%rbm~)} z7jNsr?c=6+yh18H=-_dZNk?iWbbG#t{^Z(e!I^u}S3n7nx>vdm)F~0(9Kl_ecY-Yj zrg_S171WBb0>rdqOUt3WkWL+=_ok0OGpxsKgr*)@jeXU8c=8AhUpBHotL+E$TLVI` z2kLeBj}D5Wg4=!oLx*(#)m)WuO~TevLw*{bw?hHWNm=a2;JS)$e7MhRmVY^S0Kr5X zyo7j`cS`~lQ1YusjwrG_Ty+GTi4PXf#o#V-i<8VUI3*+PyU<8)yHMvD^0!|EokCZZrlZB=L{xp`|Ho+SbM4 zhPFl_zYD0Oqn#&K3BgH0w2+@`lhbCyTd6$hIW7qqG96E z7OtK_&7vsvbaEP`IqVTQJq?ER)y581jTbz1Jm5X%w97h7-NekxOT3a?6)I2yFCA;1J1tN~I!f0-WYgFo~y@9yO1;#N3+w z((hj*Y9235m6g4PNfCOyTgKLL9WcI=1TR=d_j<~Wk0$KzlNZ#j;g?5Q5?{&GDwYjt zr=pYj9I_r7BqpSq&B#`;IlR05n6pUpy;=(9z?JO_s3Ab&%#?If#bKJ|t{6c@??xXm1M*2X_Vs@+C!IpYgQ^4In* z^jRSRN8qSYnQJTOs?=fIiQ95ZxUtJjY-=&eIYp%Mf*Y!@Iqw{;Hy@O%i2rnSn4)X3 zx?dz5^HQ9wM&mjo61+UUtgFy1~1rSh!xx`q`*e7$^AAeW={n z+}?Kn+u`W_V)CUv`Z$1#bBOlbixctm=w?G%oi1v}K*U(pE#vMHN`e36B2n6E)WNa4 z$?};{uAY3&G>^r6Si86(H&N4s9+4>iZ1TBv>2SlPRFJc0vX#{WK z5i8HLyk_A?CX&$Q(Vkw;h@s@)7yeE*Ax6T~^BCmhVn_mnO*JO<7t;KtR#*OPLS%y5 z5J|Du&3xo`3y{*Wcxge;qvW$#9*mS0D3$QjJfJtXnORyI>dr-#s;kz@D4d8WW2cdy zLbRhBL;gRVg)0gj_ZbhaOCy{1$hd{6GpgPe^Bx;=ap>!J@n(Dc-emLy$-~}uRDf~M z7d;3QvPuC~$*&*_W|%Be1+*pgQjK{+fRN2L)z!T6cx1C8fBNYC{`xH%9?$P@#~%O_s%id&fZ9znos^**=Ea-^IaY_2F28964y}@J~M?8gvx_L zzZrrNktJ-_57zIetv!lgZjV;(TLFeZ0h8_=BQE$BlQ~bFsF-MME)cn>)OhKlf?7Sg z8fLt`%Hln2B`;TX&CO5VI>DGAofKw+r|>a>U@kbzrs{(Vym8Vgb?)k~Vfzcfo9E(I zW72eOD7#|2PworzX6?bb6ewQ(4$xoJM>y#3A5K$~<04U!x4`0}Z&xLQXt0UHPRe@L z?2)%^1w)Xj{AZ3uC>em!EUoH|M=v zfWEW^Fp~zoK9O{*4>I9Y@9jguw8?cSkDQ_-$PIQ!4WKc3-?}F2usKubALsxF^24eOy_hh$_4T0f60gl?Kb$b zqb;V!h<#|i*bK`kWD^v`O;L=r=(zaSX2H_` zqA}SCS-0dn;x-i8me^8UPt??9yLrjuV?M!h8CGr-F(oiFOt<^m9JA3DYN-U@j z-E0DSN+#}QE z`>ay=mz6nFGt(V;-aL=p`vLQZA9<>IpAo+y*t)T& zpQN;(2QYo2Ls?oSy_C}XCh}Zdc4VGmtz%26`$WGG?hj?HDx`hyt=B}8`%}gB>X~Zo zAX^3set4hmx#yOihY-)rUntT1MJ~>$%4(n+O$Xvayc!pujw%@9Z9;elO8ENo)g4M%1(Yih@shveRgm(Js9{<6dwg3q9i 
zmd9<1M+$@#rK4)8}?5`svAFG6k~yCnJto)Jy4-tPF*dHFfZfU`KebA z_rzG*fe}xf3V;6< zzu?AQ`-uIoF%nGf8d@Jog6>>K8SiqGmo}|V5HE>y{?;Vl;&JAhM=dsbk~2K6*q$^Z ztl%8MXj;QT`UK#jFkxfLBVAQ8!IK5Ja}B}9=TqmcymwS@DIA6!+(RWE^2~Z@q z_NV)7c?I3())E$vC33}n8?2>PeOWq6`NyZm_(W8-qx5v?>%w`{B@C#uX5kluD~1j} z+8!De@am(Z++Rz|m)t_Bt=J|_jBTnp(8kaTW^IT%#n2vI1Jq=fZ}#v>qaRWB(RRn{ z=FBFpKZ~Fv@V&`CjPmRg=JbeG$+?{IrK={Hxco5BjHXKu`>59d2Vf5|zk!hz;IeVj z0YtkdbFD-IlvnS1Q@^Nj_f?mnVkYzct;Vg=L_R9R)xO0p+#511QpO#}B1+~CGkA0v zhB()+k-{r>!B>eW1(Z&ES2&bJ&8<^d%C_F4^~BsKY=US>7y?yM!uf;muSb(Za^Elk z7?5N?ypQ$)90{6u!mGjaOws$-GXkYNVl@V{fv5)yvv7a6f&Jem%5GzAMJLs??VTLn zEso4PQq*!0dz~hBYEvHZFXZB~%YC}-9&*b+QNPzh)m+8?_?Z{Op)kVR=Ieh2Iq`OB zx3p29vAcWms}{k(Q92-U-d9B*8`Ni=oc{XSH^LOg(i)D4lV zL1ak4&~I;*2V4zWLmSdQ2T=Bpl$$g(r|gOwc~=rKf{6iq7!uh3HAVvhEi@BgWFKib z(b5}N+W)}1&yjO1Bg@Gp4S|6pV`<&={6-K$;zVB??EOnC(bF~!XmUwvCIjK26%Ug5 zMs@J<+%oc^sK?Rdiptj*5%{`zJyux#Bi?nelnn|!okd@>*qnb6%l}?+Gm4?6t*S_N zp3xz1`KiVk1+h2b%QMTXJo4c~X=5rdzkEVwLFRgin_ob~iYDCKy*#Y;o>@5T(3@Kq z7(;n*>nF2#KB2~dw0Lw}elK0o5#onsurfK>@$dcLZvJ#E#3Cw4MjT}t(~CgOyV4tI zpxR&3U1IU$+Yd%#KUY8gfTE!IXw+{IRSHiiQbg!7W{lYwf65ISsepe+WA~BorDDd?;i*-{a+l zx(2S;am5L2v--K=OAI!f92B@vke0rfl3M&-q(8fmni38Qg9hXDbsS6&X_#hA!5No`0fCd#UMq3UVLpy zllx;}^Kvo^^^e<xjg^vk>G ztQed@l+WSeZPPNKt_O={k9Y^fl?J)4F_k8KDbg8#U#N4YBu`R4YJKSCCz)njh78Tu zG>Pg3z49J({D{Y4CMQz8*dClU&G^zh|HPCQADOg?ObJ8!YH~1jsSZU-mw@FE^X?km zj>aymAhG?p?#9to(A9Q3{vQX?#<_PeD{zLxM@IgUr?j{-lUub~JE`rMxO#ni60f@j zJo{O#TSiu0W;V$r7{?6Vw+N=y#|$+|sN2PY>vBj#t|O#qE)@ zsv(2PGIk#492JiFMS3#oB4CaLXuzPU8LkAo;)+-vq5E=>g5XF zNEy_h5UIqiPg2)iX~IhcQ$x})poHWH4o)}&o8ee-SAf0;Ky@q_6wOO)NB(!KWi0IW z)ADhU6v9KEr`15eeO2Nlq+KiX$dLM1WLeg!AhhzmcrM)&z2#L5b#`Hh$8enA?BrBX z0#zy-RyxxBXDliEBwvBb!+aJ*XMDOfPS7sX|Dff^0f{AP zx_KblDvZzNzdm>=*mMx~8fmOvHoL6+TvPS7zyi^R)|9`AuS>5qgxVkuYXM{HWd}3f zg28YTn;{+o;||Iqvug_`gR)mn5(S;jhA-XC?2-y$C)<`x(f_$_@CUF7N}h?@=2bT* z+~E4rQhzm>CxAzcedsAh#mYJ@yzX&^y(cwYk4Bn|xk8lC2JGS0OHdvOiw4LY5Pi@i z%aA)Ym4Yn%il>~4CCoJ(e}bc_i!ArFB)YUYT&a}nTAUsN)epL7TKBFH*7WUNluV5M zZBuQO%sqAzYsLaHZWHaoVE2xM+cDlV^)Zrc=c1E+uxMTBObJj@*uL;t6Xf$|d8Ij> zPbPQ8*;fTQA5v@KOa|=D(u|{Cw`)?^3r_V?X|PC}T&1!djfpZ*%Z;G7F)+vYS2#X5 z#of!s-IOF2G>xfK*VvV2bI1(y!Qk~ZqiP45ohGVT+8l(z(yfoNvgCrd6(&I>Zk%C3 zpP#E0V!~XO|I{O$MqjW2_)ysOABc`&jSLongEv79EN90V&ud|obTa4Vv00dhIqYcQ zsnDgami@kYWHA;61EBgU$%hK{3k_@T)m$jOFv1$?@U{J4>v$83&Sx%~&ja@Mo>9uV zh4U=(o(*&sssdfgjgq@!g5jk4iYAyq4t2`Xm_e;x!4{4`!wCrOZrq8X@HAM@62Cq}{%t4_Eu#UYiDejEIe ztW;(NOM0KZ7zyi!kXrc;y;@%4g`F`lDDIh=8cnop%hJXhLJ%riOq2$wC9P8gdrAqq zgp?eH1xme zAd`X?xY8kC3!g_1jeYT#j$IPK4cQ@(_%h&~H=6kT7@6klwS*!kFYSvpZhOIAbNu8J z&B0}Ps{-ZBZdEIyfp*-BZF%I|+AM-|YcYtkEsdhdY(n%28r92VD;CveUE2YM_UyId zKzmU}*o_vP8exNriO0Yl^%bOtYfxwyWddaXASv!enbAFrD*GYD>ah4}m$aw20TC8L zvC+pKfSiG&DOpjulU9$p8N3wR!Wd9!|=F5WPb&V&YSIUpm?7G8>L}+iI zP)_PwQ!@7k1^Xxz2R;$T5D^!RarsrssD8iAoK2$bcX^TTxWY1K59(vylS*8DeO?kU z|477wU5vuw&|0PZ6o}$i6&H8oLz6C+4GvRJguk>wa>8~$cP;tYK=)q1SY3^(p7MI> zwxrD^@kKNkD%UGilNTdCHR<<8p|)E~4C z4PCvtGN67=qc5d%8(D_n<2JRV)~Q2k13X!U@jQHBR`H&)skg*7Yg@Q0^S8|syl*?n zd>Z96FBwAH9d?Xl%$d`k@)`6wWfazE(O&@u)Ow#v%o!IR*^prq4@A7erB!#_ln#<> z1Z|lXF!>7K`ky6237-uEn9tbIN=s5VNFWAxBSTW+W#@w7BNfGV|KRerqRU&lysh_v zdCW)O{N*kRWyziuds%Eo>HZ+g304-2=cXdF*bpE`!h3@*v8?H^FVQ?&vFh7XJoQ5Q z<8G!%%WKwWJH^MQ{}X7RjvCzP`HwJA#AYhVWG8B`#B465YsXp3@h8+7)GDK!BT`b}@cmCLbrv6b?KkGn- z5`g$Ia_J^6;9$*VaL+9zpA-eo0?_`MoydO})tCZOX_-Rwfl4jNMqcKL`c(2g;6fRL zCsbK0hUKN>($FB7Z8j&z^bjZv=XvDb0JQk$l{Er^8^1ZHyDv9#3d-<4R#VhE0X|vLfg`{N-@%M? 
zTKFw4$+aO?cYCab$*hg4J&r|sW-&O!hSt;H{Fz>@2%3@r_VCNBMrMK^(E2*G(j#Rm z?4+Dp`{UdFAW!qM(q;x~O-xFSWWzu1+^Vmoo_D@AkST>ZUH19S zYjoPZQkc1MRd_Pl$tHK7>Kd<;KKLkP+~n7M>M8wBGhd~8DjL_Pu#t?mN(Vs~r>53g zJI(DK*VWAGwv8EoQdMf}D8cT&G-_1gQXOZ|2d|QgVzKeo%MnPNEBH196cFggd}5@o zWF8uG&7%cZMM-T;_aHtFAd0E%7(J2d^`j(Wi&Y$C&!|+bxA!U*`k?pJ8_KLcDgz&JLu`NKOJQuCpT`-DJBt?254W~Cbd8F3jeGDM8pcS@7Zuna~X|T$SUd8 z@r>)--5@wduHLN#vVQN<6(yF(i8(fR+L}tGfE#oabKbo7ac*$)F{!XXDalkMjK*DQ zD_)QDwFw>62vJFWzoL0omNtUx4D+6*X3r3})=1V*E0gV-m(Uf4aa~`?LnBAYkkN`z zg5F#&ejujJHD;VEdvh@)@pA_{yRMq2ZmYTq4plEIIGa4YuD8re=um%rcqXu)!qMyI#!zRtXo(N0?9#adSf-I%m^h;Nz-KjOv3tB5qG~Er72fRt!sMH`;$Cu{ukf?yQn8ex=R13Amy(e0Pc?P%EbUIR^EX} zgE8|^y+6#!F2HZ|IsEBHDVtTdSRwP?8)Sxr47Fp22Fnl|7o-Ff;{{92o&*LCBTfUJ zBMT?UP--`#AW>tKjupN($cR42O}Ys?bXI;n|X)v+4L?nC&`Fr1w1}C5!U2RyK?J4M*><}l3@@zP+&$- zzmo%#RKN&0BMCLGBY&P{Or$?0c8HDTqE(y9j~sT>7MT*hlky|dA4EJbm7o$$V(ETs zhUIbE?>AZ)+r8(aIkJaAVEXNj!O^L%FWt~Jx8nr8cV!Vb3{uC*w7~fFEDW3fI1 z)ldD7I2ec=^O9m83v8D~y|zul>>>LEzPi1musn5QpwBDgg29%e6`lAg09F~i+ka^% zIxjzM2Ut1@fMB9w`Bx03ZrX%7x%kB!v6zah2 z=$EljPmdX8OS*Eau}vD+w)-#fafT4@>y=A2{luaRJzrul^Dk_=GEPk zNyTmxvzu!_l&A?>T;@Ce0H%XAas33wqy5JBvn{&}v^>|N@#M*O%OrlQ{OEg?=|NGv z56kylB<`}c@GMWsZuHU0HPVX}c)jnvaqNVhpvCoFFn7~ruM}`U6koBV(q2uIfQw5A zX0m^|SAzwJvypm8WOaR1yfW@WF}I=~|8kIVQ3}gzVE;93G{OWTGL{oBV<^OF?Xs{K zCc^#GM`Cw=ZzS55r2Y@9x3S@2#6p@vMuPw6gOCcsRxgHAj{odQqgBY44>H8Efbq{IM?0Ui6*4y(YUm9~4$ zZ|QnimL$fzg6;z^@?_hDXNLiW#Fsmq-_s;Kj^#{Fp_|4-$K;c;XcH!>m*fJ_Jg5S3 zD{V*Q%OFTbPi7MCawTaQ#@VMU;B1n=kCTyT+CZF6gqvgqM7?zZYB>2vKW*sOs+&9V9^~F!BDHKg0U7#N_pcK=*tx-V$erhkG zF0r}^hmCn@|B#rcw$3#u;7c|ciho`-LYtnP5MtyGBRmlxo5aYtk-2|XBu8)>(VJ`$ zC)4oFR0JjhVWB}w&8!#F3d&E{!!W%{!yc!COwc&m3Ql!zi^#BktssdP3K^9n6RX;Z zto6Au7qqGxDEo0ak&?OKc;g&QIwX)MMdoqg%r}HLV%mPW1$UQ4)^en@gTM5>98(o) zi9;rSx}Xvp(VrrYl46hx9J5yHK%BUSWHcLpqeMuZYzr|D#BGt24@AQ!2iTUVb zH8pL&_czEZ`_Iah!o1l0n;qCfm04^L8^LgGaf(Q`^A`vbD(wrPtL7Mnw z&p5jgV*-OvsqM1p1r4s2Rp^E{h>q@x%2Ku61FOYd&r^97rB(-;45rrkoZH3W;5j}N zY$+^(vLR<>dYfrnyOWUUyul(82*!F%MfrkI3x}~_lRh}DexWok5}otdEaXQRwmF($ z%|DC$Yez(1MrUUiSEVPOGCO<8>xJUxsLklZmzbvoXqS?yMUAs19GmqsSj=9sN;ufn4b|n#ZEX&7B9j+giWu z)m`=*2VIC!{Gs6I(f%Y;o1~&2Q9h6-&SEL7u}#|%J0{W@&V3rffT0_i7-UNb3r^E5 zTg6T#!O{0{D8`*}M}{7}Kti{d{lPmWI!r(MgS~%X3Y)RDL%%jF>}H-6mYs=dMM@^+ z_XO(nFJmsm(4JaBD>*YSAD1l8lLdTOqT`iK2BLE$0m`?6j(7Z2{)*0xONPYzki=;k z?k<1FTh`S4?XG5K#&M`A1JKNwzE=t0SPk@<%(qlR!4No$Sl&ofc|og-h9`qSld6rW=Ja?t&^$_nN;m) zP6QD{)o8nFjO12^NA!mAHs*;uJ4tqc`eaaEa2(>So#(m;T*$)>jMu&tlbh^4#)x=Lois+oY~1KI3#-& zE2MfU-fI&bx_9Q2?ni6cv)yvZ!>Of<{Bg~AD?!ddP4Xu;+Z-9eKzaMT+XK``knKB_ zqo!Z67X(|yp|c6UFY*$PrTP=-zA2|>s>2Wp*dZVz`M5xBtejcjmT`y~fy93k3%x>mwYH;av*)5cUxwC1k)SWPT%%AZqiEfOM{VHfvor1?j z;wglK@)V{}nhGlXE5z4CdzEcx6C$aG>fdlM8z{?_*8_3sxN4G&T5jE;ZSaf)I)hLF z{=Kk1UxM@yx3|qYXl)pdv{zmX$64noR+5zQpN581m>(l`? z;ECE>!|FJ0+ti}yeK&FYDxwut3;K@#G zQsp=6RTfnq#>>pLu;Fj(xwDgUfsd`SY#@L^nJV0`XtWeBip;kGGL=Kzir_p*j@_MCJ7+R<~;?nPaEWDjPcC6yhu>Q($x z0Gykvb<3h*6JK-x0HlSh+B?P?tB%$r$e;u2%e>#YQn5-h1YU#$t(tcmivLfs8eFR8_LD+s{hl z0Ii6ihdrCa~CRR2{U9G!gORB zKYY{*z;WfvsqWNFN)I&Wl{0?YC{?WU2}=*X%Tlyx;K;}nyhkiwzhd19v=g zQvu7mxzeD1L@-=>~*6V48TUL(;Y(Bb=qa8bc?l6=h-wWfEnV8QCaiID221O&Q z+oOaVUL9|ATGq*pQP8h0oC*gt)1z4W>mx~RQpifslr4WeNI`{;Zo*j}YQIp~LX}|0 zPQzeO4rl)frPf?}Iy2vZT=kjdp8Q0&o|wewC+So9;Xuyk3dywlEUE6~edezP8;)

6YOLmq>IA6&DGM9$%FgB(uE?|n#jnY_V#;AXi&8xdsoYe#beWSAwLRU zhb<*c3ISrf3YC1Q-+DNJBP`0MmHXx`6eUptj*=^89fH$W@AULm6SJkHs=!Q(p)oTZ zfvk&p$MCehv*nz!7~g(*8N`Q}B{ZZ|jir|>0OsdX`{X}5VIfy zELlQAgc)iGH?S_aBNp$-Vr4uo$xg&`HgXc5(Vx#i7Zqu^Pcn=DDRSyT4@28wit>i!nVt(q)i~GhIZ% z7!-#oF5C9$u6nflz=*dRRv(`ByN0LdyEycYyk&WdU(BrKhwY0uNbOjbxRw6}$Zmd5 z*FkT(QB!9HzE`Ww=Nd-wsTLn+b$_ft06oBcrj5vW1C7P1&>saTD8qbnMrPAiZ9F!u z*Dlpl&#xIvK_DyORNEu6DbgeWz4*E9Rhr*2lbS#>*9yotGwFI+a9w9S5u-CU3~8Ct z3WDd)d((HM9K%zypXfcv- zlK|%!KNe_0D0j-+q(HEPCk2DhN3&mKaF(1nFBWQ^aAJsvD#YJ9daBzMX!t!dQ|_36 z+x$FtII46nPzu?93y=&5bKqw6r!LxiepF`}O_aoc zoLf2e=OkbKI@3xfeYr|pTR=6~k6Jc)ITTLs9yx8T#WSQkjI>NXBintar4w?;=&{F9 zft@0GE-KPpe?`InlIYh^MZb%*%hXn`L$97tyTZlY~46DhAi}%k4Mjl0Le1&S&UpJxq;;BUK+qMb0@U zC|}L%BKlzII(lYtZ4erxu=1v0mSZ&f7n((`kf1|1V*VvkM-Q2hf?2b^p!NJLgtJ`2 zT)cc9JViY1p(~}*1K@fiKQQ6^adk!<+W<{AsZaZG_HL?Efl9Wmg4h5#DV2fa{8z&* zp!sRk>m2E`lGD8^cnkfH5h#YG846pb&`ls{ zItNFRlCe0YOE0m7V6|?h`0jv*4G?Ga!qdypKK;XONRP-fx8UUyEA%FNjX_kZ290t# zI#F|y;9hvFJ(ph5mrn70KHk*;Y}HiJDur&>4g4(5ajC!%?w2TnjAsqjE{MDdR|If` zLOX`Z@}-^w`NGmvc=bq?P*=y!u)~J10cK}{9|YNPmFH6tD~o9u6i;fiYacbI1|g)= z=2$5d26n5*XAw_#OS-0=x8vs#>L->FOzFg6CqZ6@0BY0TuVl4Z!(e<)h~k=_Tr9RH zuH1_s4A2vLTJ}!UqDU^lo0mZcAd{FJ z&>N5;tM|1(Lozyw6?>3!E8_!w)D(9D1{GyGbLnZOk{i#X3(DrRDjvN5ug=afK9_D; z@JV*EW81cE+qRP(+qR7z+t{&f+qP|c@}7I=+;iSDGasfuJx_I2t?IR^@$1#A{slFh z!;`s`*!GNX6R6JsBG?7@ z4&5EBg3V}hkNLLrk9F&HB9@|0`wXEl*)W^P7xYGw+Ip6<*Z@RVC7Y#pK+P(I8Q5d> zW>5F9+`M4Y5rH6+WFtXxiCxWt2$n1iV}H4jsELvD7|M}k#RGbrQ|UWZ&t#@QO)_uG?45_RLws>OrSNH^&Xt35jC##&moV=Q1m4PmDt$ z`+qwUC*b^Mw}VuD``oMN?F3B}A7#XPFM7{hHKCDjG^|&Wig&YmDzqEPX&`4I-*t}0 z4|-V~G$t$BZ^Exmf-hVH2ES|3XoAp|L#fb^`9!(k=S~hkq$~~C7_wn&z8>(gf2?Mt zz6qkF+e4?TY?*k-w-^6 zn6%J0SakGmN)jBr1>>^(GyAOamZ6CBnZd~^AyL+BI?t<_+_!B;Q7$3c;I*jZPaY&C z?6sHOJGg^kNeT%}w!2QkcEw)sPvDu8N3- zlxZdg|+s0Cpwdw`-wMwf)E%87ON9eW?i|K3Oh4`=UF5q^$=ac+UywZdHs>$d`13XVG366|(0202`qIfe+t6DytNQmtv|w)JFxuEApBU%)J6k?Z(> z^?R_Y5`TSsOuRJg5WxEs|0x^)^{aizgaINMs~Z@172=X84pEb$krF1ZEF4wZb!DIm;Z6a5uG0m{-oBg}F&P&MD#vVx7Na@Ymhp9jr_$@0 z8cbs)j<&ufRR3Pj+70)KRe*L7*X&KLu7bp_lFbGesBdRat`$#JKd|2Fc@Qw_%h3l_F|_= zhr%g09iu=Lv|bfa80vb@Ztj}3oT4M-^;a>aew{6B{w($mA-fSELwW+t+N~i)nqE*% z2K>`?J-9%dHvmFS?{gyfrIe~qB*zN_HerPum>NRu ztLe>`Xm(04?c!%`O719n4Qeayg5ylIOUNW|=O2+%WST|Vn@yP*LPRPetuA4%EbEX2 zX@H2bFfeR0zlWvdr+Zf@QeHLn!wuWq(h_!`?J<^D9HsO7Xq9%4xEx z?32gr@26zT@A*sUn3Y5+qlPkCs+&ros`6f}Tc7HL^$!G&FItAtIN()fzE+yJE4-%) zH+F$G0_y5a2OIEu;*nBPZAKjNuPaYAdTs<@N>4Fsi}8rCkjx$s7y3Bv`%Aa`Mpc!- zwnV@-u!K+B1}?QE4=HF23zn)*{l*K$N3MKiUUJq0nS`qC+8;_X`F=prU4`Ic%t>>o z#*zd(sLS|9wUb7|oDsRUb_Q|VaW2Yp9B2uw2b5POAWF4-?8n;`zzQkeiCm@0gee$G%excLQkWHQgfl3ve@G+y<{sk<8NsEXV;AY|y+V`Y z3H){Psgbj#o=k;XXTGT{8l=*mo^MSoz3d+kCum@L%&gWZqW5Z4?f^9 zGT^w=j)O}5gw8rf01cum?<$~+`Uv!$x!$I+=)yiiC8OJ8(RY;C7>71{If1NWQaCzE zHO5{WA5yQ1WaG~p8|CmGyxBs#=BJm~3Ax&3o<9$F;ojv@QN=G=0CwaS0ukM^6WQzI-v8Ba*ES4!7?Hci)3DM2Whu_A8mRypZ+HxY=tvBrV z_gU1|TCLfl3u>+1R&ObYm|e%8hkgtD#jHR$lBk+kE!NWw)$r|$>?2M#k#toRt?~v1 zQ899{{bnUQSI;~uh~l16OC28STzf)LB0xSM`ZXa>x*42M{;7oYKa>tif4ruM^rRMm zBtdjLz=sYS*6TIkwj5Ca4y|nZfTk0_eH9m$qRxqn0KJY zBbq0n9TzzLv&tDe%D&CwSdK2lrKu%uDG|UR>rWEjOxQ_~9g9&tPVo{cSiM8BJiLL8^mdA%XUw+w1_RM*ujdK8%9CB*wjKU)HR@Pq@0ivx|RD> z%R+sD&I3;u!%Jrcb*e7!!8je2Vq*v;W1bWM}1l=#~I11;wkXzK{2)-;l5QU z=C=YYHGf;*go3M1zu6s>hh zj_;Xjnj{2uE|hwxp3RQk36!3Slbggj!=~3_bG}hvpmE`-xyLeA;z-tDmjV> zNE=XHNA=C~vNwMpXDENZ<>&|UdooF>))tvy`Hlp6J&-2?(;m(wGLx$aH};)s7a}It zXLk9Q#OA?dx#(mjZ=DgF4@dPL&kRr9PR_z9%gaEr5h*<7NitIkU z->f3HVBo#oaSlMA5bQq2{tsA}4dC}Vwn~&9%nz&7^|IPh0B`Hfvs zLCr4Wc0HN+AtyUyzE&iuEeMWd=3Lg{S_!y(=(Man*Cz6$3Tu~hnz;>A&o4zgh44#t 
zEUM1D6*G&trx>D@?+LS3KYfk}+AS!lWS5f&fw-dgDom;CRgwZ2rjC%b9O56rs}CF2 zWO-EL@0wvize%ok#Gs4KI*j&(;_!^->!`H>I`#Xu?lnN~l+GG7*W0IeNwuE9>-zjh zW}P&MX5`*kgdY!+z0LU%oWq|Kd z@ajX_9SoW$@yqV`(miDuQLD%Y+OyjAJsmpfkq$;QU$fv5_EN^y6g3IVEh;33Y8(;R zxc||k%g)&jy91@)gOy@&n$H{ggFMa~qg2&@HP}&x_3{1fu)#nyw#fCfxV&pIiAH0K z_0g65)J>ku00F%*VRjhhnFo!USi`hq=a2KyFxGMw>MWRc`If~D#-4VrHkuXBt2fBT zQL(0A_-9Zj23AKba=qYq4#9rpOIgolx$+%f6r3 zK=8*O9e3S0BHnCw&g_Po{d2jq>I(BzW_Z9tABl+Q}#mF z-{pT#5ec{HXW1%wNyPv*-E#H`8$~xQ2&^&m~CCWYKwJF(VlKgyvIzDU^3w@Iu- zdr={Tgnh5NRO5EUA(F$9k-neVu$)P$kMvPIMeJN}AXi$%3xp|VI%(YU$f20c$2&k& z@(;s9Ps;^vjC&35+)2g#J;0wlHP@OEhb|2si5nx{?rx20X0W}{M{%&}J1lp`5LmIO z^lqOQte_C^n0j}^5a!PUk;Lf9BCL2iE0kGLi=zc((NaK zsw38!jYi7Fg|M{KqZ35^-tDC~@8=JnqjW?udd598nKAQ)e_*jD0jIci=ejDW>QVAAgzP2hVBuX8Bi1;I4Oba9`J9pHC z2>$90XOp1yj<1ng!;HUK>@-_tP=5W_0y+!*aiqCzo2#CAy7$N=vrIi)VKBl^)N~^T zcoMGqOnFMYD?*Qn?Y5Y(JV0)+3LnQaGw|R;a9e{|N+|_SrlXW%p!l%Ce(oJow87&G z#DbQPehJ)KO|u>RHfG8dHZz@HDcBQhEt%V;QTS{c0EX1u8Y{~t0Smz;YC{RnuCkv`B~hp{!qw>hHJ@O@HW26uDH zZ#0B1Y%bhAzwkoL6|>TSr?hI>GN%@SZ`J6X91iXcjM}7QlMH_gH(9sOZj*!JGf`|g zeW`qPL(@@DI5|%NB8*(`!H5eE!-HXCDh08+G_?|C*#43%)-2sRFTOn2Idu_*j~0LO*Lj~JVki56J{cpDa&is zM<3%#+Kn-jaI1;8PLT<=9}+d#UqB8ti9EXn6F2X&-lmx|OS2}53+jy-4tx)fa|*mu z8~-C916>~qsrGZdXK}Sor4#nRcx)#Sj5h43cL=76LZ0xagdbX5nuv1$VY|1Y;-5Fc z@^fR{iC%D5GzuFJSz9aFfW@IHvFlde-ap4zMr_Y-qkhr>o(x1cPk?^@4d$ zqz%Fs%$uQe9m3D5{s#vb83c&EYewa3(T=3D9wolHtM(wT)!xBHa^mR`#EBW$_d*;1 zyd)?^{>DSCC@juC8(q9|^9%;j3g{jCMp3IHfv0N*FKfZfcKfT2vwTFo;;6&|PT(Th zip2Xw^K@Jf*Iiji=U!VIDxaY5t6xlTQ?n?NW} zTHp?!Q#X?)R-NH93yA@;h8nct?xRIw0lD&>laj@G#KChx5dEX!|OPDFe`!OxsCF&5%j0&B!qI6d=*Vvb5x&i)(7X=}{`q8NAh=!Wg52jgyvPJN73izUT{`Har&Jg_W)807>b+rnf*PiPdGc zz1HsOANFQNVw|z7DT2-~d|}H^6%f-<`If_wKoAg7_U8Z=9vTDuFp{IM$TDGFXT||~ zwG=aH{CyJ(2Xp~J?H&_OtD?2S6oAX?wPI<3y>c_cI z813--7U-PVC*5*-9HBh-zCsa;!A_1>u;$t``jwOdd9Y#+_f49_0Qn!A^Y0|B3bNZA_H|(6R&z3%An3Ij^TxB=&HBN zfj__BMu5hiuIF_HguKTyI9tUWQMRTt{1Wz~Q>cx?OqMmkXR^Z7>r-*fx+JBTCw0i+qF8B)NV7YxNE0s;sCb5>&qL(bpn`qCN%hS zl=TvhaT1bj`MTu+jO+iVMyWg4HDQM45s*>aj`m)j;wvZvcqh>Zpf4Lw^yGunVwrMqz)DraH8uCMlANjCA`}5i8u*b?szf z=;vZ$UR_X(I+`&iEKP`QLP`UhK*cb{wVR*tE2UtaAzdPF+F5Xc)^25pAR++}6O z14g;Rf_far!)38l;Hj(b){v}xb!UEa6U#aPP6;YM8d@TKc4SR|)kGBKuDzV*HEEpo^@~0Dv$cMk zPpN}=dB-#7lN5KS3Gj&Di5er1PApVJrbx)i zS*7@1$Y@iW`<0oIb1X(nvqF&TtL1J4)N>BCz3H{Wo%sW59f{bP4(!1}qiU|RwKMn1 zT7TP5o=aW%DZ~hx4FhDl`RDALD7|D7jh!bO+2*_v=f*0_)-EDOkyAiN;u4tsOdM4( z^T14KJ@u?E@`25C)9|V|pelfel|O&}SN`x`4ahFz<120f3xvvk(afe-7xGJKZ3;3! 
z`8;oFGhQTLA-YPslq)A`NMIP@{gd|r@%&4kPH9Z3#Tg)MIUZP>$p$d+(6DirXQQ}J|(^r%Yo*VzU5 z0glz>N3<0q$g{MAtN&)j^VZB58JU*0tb2Cmr+UZ3p7HiG$1x-?Iw!#($k=0_VV*w` zoffK$Nbi6k2FSr6JsB-0Nu|Qn0}|W~9m%~K7$VgFU`x-WYVzO$`c@t3_citzoE7C8 z5v@OkjA%HQn#XDuuL4@i^adSXA5>~Ge!=d>ANwyom}M~>_vH*72o9w{fx;csVF1lH zf0T>1pml_w4A8M`%I%*Dmtv5&@}l|+M-UUF=Pcxs(Z}@0&gqzL@bllZFQz4wrGsto zvd=#T7}Oy#Utr=sS0bpM0$eNkqT~=AkVOZ?&K4;zEBYwwQIpycd-LePNC%!!=gSrA zu zv#Z9Tr)7xB?BJs>aed{Tw0YAoHUSkK7m9{NIr8Yk^0n9}_kpNXMFzZD3BbtG;JvEE z){@gpnAZxkWEEYjfdcN+{2TCT_Mmp#e)t$fWK2x!Wbz< z9Q{!*gduyFWcF7ER;1rfNXXC+3`A>OkwM!be3klxugW+Ou8BX?q@=d zixz{a%U)~X4+pAWkp5rnqNrE%fj^WXYynFB3-%j0ud!eo^PiloUqxtMe;();Q~A6R z)omVsUom#h=%_YLoXONh{H<4~MqbsmyzHsB)r-DR8*^P`>y-^tLgB$?*~M`SDuNVP zOO)IT)qk$-`#-d}xeW)k8ST-qCB!Q1eq`B6+oPjkm�Z4a9X=Ul&G7J&YmtBq;jf zzWL3|&QAEH`tX#b$H45l1>@`t%>{O0*krVnJt%$*VQyI7(_7t;=f!GudV!mERkKUXw5wq%*21R)v@bm?Bg zp#nLsW{W7Vi0Jt+W~|3RE96@L6FghIg$W7AV8I@Z7k#nD)9EX(7x(5Va`k*2-Cs<- zoc~_#l9JdXLQuoc6zQDUK^gpA@e7y7E^iB=y11gnx_h|aA^0uUl1YzF@KWZpyr~hc zYPH}dPE?T+|M%uha=P8f46!i{a1f?Me3rU?@coe|rFd4MfEB6K5liASpd{CLQt zpE^<*U`Ta~=mP8}y@6veksA5VsYZf&HjMT3nyX?(W#ZD|gufC6QUVs9PjbQtflSo< zJk#a{j^I&uvt^a-Q>6(bNSNdq1Y-$bfAXAwB)Aux(STP*c>xO+?dcLar{z^oE-Xpy z7M;BEN`AZ+3kXPW1|O)PTW#D#Frl)6Dv8CP#c2HOm0LlO7yW}pKsQ%Z4xJj89b45k zRiE`o-s;(T&7(zL$X<}6+4~;HjFHV+83$NQN4#P;!+2IF%TjrL+shnpJ@2{&14olb zUG4gedEUYK{4$yf?nhgmPJNzzOFN)dhonQEy$RAS(YtZ!2cAem zzI?C;?6!Qove5638Ablg*MM{>{G&xeG9Aaw7Z8}h83qR)$zRcgpsJj!WnqZ3c#Yvv z$~ZK}ijOt2XU|Sbe4>09b;hLtg;BYE9z_NUS7?rPsgBosaKUYON!W;KLkk`;w9(Ov z>eoe)=jxHngdYA7V$=rG3xi~gsUkV+rX^jyhkDyS@Nw3O7{gTi+TPOiv#+|`TFdxP zbTDlZDoy)^)KkAOG)deQHzB79>R!ltD%Zp3?y7EK2KEap_B3>H81Rtt-ZIgj=4n%< za3oKKX^3-T58{(+U|t_s^A-Y^Enfg$f_u*plOTQDqY$}iMEn5}DpK*qvqw=-*8~4l zp7U|9mxZ#PndM1^@;S1;BO5W7F!g44TV0yz?l?$KhGAYUBL;`}Ua;n*Wsr9wae+|m zvaR}3Y5vA$(5VOT$yJtku{zB6AAgzPQ#7pu0$;wb3Q|S6E)H;=DL|%%E8IhmQjL4Z z@{%}gpV6#ryHwPh!gl2us?ijD;lf)7Z_GhylgI2{lagJ>y%z`*^2(PQpnLC0iP(&X zlhfL^OKP~KedmKtL}pD62-zZbG6>*3I5=(StY6(WP{G0bZq^=qQ6BACTk^5TCAARa}BYP%vu<;)>K3NEh_XP!;6>66i) zKhZ#*hxFA`vPoU!BeW~f|K(hkA~@)ST)RD#_5~;r1+S%myX$1%^2Wa1tqK=v{QT3+ z&kV_Uhu5?`%98m7g2_W37J_aQyd~-$Av>#AP}3*NKpnge1cVd6e7I?vtDI7xQzvBc zQZW7)$Xl3=Cw-Cf2ITc69|kI}43|rJOg>fHbB(k)0T`&!BfT~gQ(bwOkY~two_SsBhRx)nU$R!w*63}@a-iPCn#1b{eIpB$Jvjz< zR$^ezJ`uvK)S|-dbP>kR>sooQlP@rvA{yZyV!&OOQw5bvac0P1wBZ{^=ZTjmXV?}w zzKlMn|A}c8@O};od(7s5F0b~~1JC~O14MFmf_p;PvTIYjrByU{L>}I~fGBR;S{L49 zgFvxyvnC6vl#elGifH*>PK?4${Z0ajuIq!AX5gm!S; z?&Esh@@!<$k-TW~XOW9gd_{%`B2i0?TDaP8M*174= zzI}qkoZ-Q*#NfZ?wF_23%qLlwct%zRxM1`rlU5@w5!yL8+xa#m6WwqG^9))${SE<1 zKrP*w(%#GBsGy-aS?wxUWHn16Q-3uHa}G6<{Sov9C1NW&D);1D7^tEFrD^_E2YdFE z3=^O#%Z%H6^E0u+SVYh$k?oMcSV6_5Yb{G1fZHudqb&YN0n7(6+HGK#%FG3rx9sAQ zmIwpNRV&bADqZZ&3QL(<50rLuW`UK|3H7Q}*f&@O=`nB% zAI>Dc3>yG8ZBZv^3Nuc`n_I~Cp&o?)@i&1v&e*evmrYq@ch*YgJU?xoHE^1ASO;20i}n@@B4K}Z1dfWOAOWUK+Wm6 zQC`2f_#1b78EDB%j*qNKuIP#0RWjhp$Z(8YtNZKVBL$eQ55thU)L{R%a=PndgbL!VnXAHJLiUEoBLRfWI zOoaGY?wpHLC&;cey6mm>D~0&|jj`*kay?SDCMqHE31`MpxT&`|1lFcj+ZL^aY z02iXQB98z&ZU^D9&H^5ndXI(x>E+D=5DsU*zOR{wx1ze1_-(_yVAduBJkAbU{(GCF z(L=(KLaJiDGNfsGu~V+?=w(%dp-Pw3I=zSYt7muKW@0<%WqHA%(UF(gp9|UGtObYE zm|%>>6Fo*KleLpKaur}zPKh|JG2Li{oqq@kNx{-n0oYS<=-GrZTkIRu%jFRGGpG#> zdd!tz&H3u3dZ?KbICET5<-|U9DgBm~wQ+KfMeO%^7#WGlvRp&IxH7zr=v0mn0Dq<1 z2FTW|7Wr@k_6|>b=o*3@Vu4y^drdq7FDg_PT4~FlGRNFjP!1Y2jW*Lh>lvPg^~?Lf#ccuD&XNJLyKwWAxBvI^@=goVZ_K%oWF&o zkdm&JG2P})_@X={4q?xvxfxExv+KT35e22fhbT=1A}K{AZ=tnJeTbuQm~5ezZ|h6q z2Qa2^v7*#fnRt(6d6a?IqO{~HxQ}lj`p9k1c`q`>LFQ^*i$^~4 zylMU>s7n-u)Mf2PT2!Ur;=O^yC-bc;cXmi@V-K^sw*&=@D|u2bL< 
zd^KO9B+GaAe=SR40hoHE8j}Ey)830a)36Kp(vjtnRx8!pPovBvbUe2z1U^>EZH&<( zC$7>BbOk(`eWgMOKGg-$ZJH%0LT1HApW`9vQ1pgjpfjAt!u>QvhsKs}T)0sQQ42k3p2nOaLk6j>>xanlE zeTZ!5Tqf9YP}p=onhR zXJc^hK@lYkF$HG!i%qT5l8s991IM3;xDcaO+t6rIr*St}8d&E^uM?wQeW3s2h%F7XE-)_~c=A+1oW7tSR0ydp2=J+e=o z$Z{83j$=@$BkHerxuXldqlWN{%g7TeAMn>5`JPAsL%O=gFN7I|O1W7sbRQAyQy$5O z)5pFmTiEfAC$=g~8Y!}xLZo)|{689-J-#AKa`C;VnL|)kDYNIq6F|(wOunBLUHp0s zA&G*|133 zOFS)*aF&0;Bzv)EjH#U)aXsTAsE3{V-4aN!u!-7cnYZX(U4gKoV6#lp;<6y|LNC^^ zr_*=Syq_(bF(>ME9D3mE4oBXBR{Gs#x-p6L-Idb%?c+#E43ff37taqS1HDOEmX~_YST!{+TWD58YH-9T@4Hs6mC?cT~6 z?+Q0JQBhQ$+Zh#_(uY5QwA>qaTtj>>bE=RjhNJ&epace%Kajjn*PE%(&Y*&DrNj7s)%v%O?awg*+3iC zgaUV)INE3D!^jLfKRbR_Ca>8a-!Tt;7%~^P{PJRJrk5mhq=>Tou>9eMILZsxh8p7~(XM7bl_uxsuMK)$YF`^<&XV(=vq^Oo?4dzxDL z@gjgVuFSDG^LWN4_wm58VTVi-hX`r2=?@U3%yOkrm-XR3<1q!aKH;+6dX03oj|lx? zHiqrqy1V{9Q@2P=84w#PLuYM^TG5}eI;;UEA9T|a!*NSg13mB>(Y~iyXIYWw1m_>b zBp;C2q2hj3(7m=%7@i)DO3|yaOM@OOoW$lV^U0bcBS(%a+|&r%&3uGaG_M3t&+*Lu z^MVrQA0tr#m~F?|aNd-Ux5r&|NWHMj#;}Dj*O8W9oAST#5bHA%uQdv&tRAhi`o76%A9XA>82 zbJ}afaCP=&cB6I`-9ffk?ixe)KDX!aaWsQhiX~ezwEi5%qKOGdFi1-gl!tKP_tP#$ zP`cY(W2oi8iR3Pirp+NE{Up5?`?0(J@XJ4>>`7+Ga$7L*3~}Y;1+EdP*1TL1 z3Nms{%0FPN*mPXG(j@~0zsZrlrXKX(DZJ9;GkEM#Kbbm>nfdkJnsGGJN z|I`^fXC=w9N`K`lb(;9ulZ{JQ?XVF26B+jgXTt6lSh%++7NTWBX?}PvP|M~N3`XEd z4PUw&kW`J<+^QB_ZB=^wPMA_-5(FtJ-t*k^N}!u0G7)vXU`C`9QW9Y40M5aOtP%+{ zH40oij9K!P!Qs1NVZqkSIN3W2st?ki=x1iAKGombI4&Q(W=U*0vu5WdXZR!1Aci!Y zg71u#st?a#-5}WnH*f7Ffmd-$LI)C2jSD!X*8$6EAa(;s%i1lHCCXrAM;iJ`j--q~ z*`k};&SK0u@pv*~Nl%$aA|*Ig2y$@+1%O*~!0tMOhysU-u)U74R27Z~0fyJYto0&> zJ1X*>y{0(KR~h>Ko5yxM_)P-J0R06&t~AF2^T@@OwzP4@S#zCJtJz!3B=8Kq-oq>6 z^&M82jc5&wlgb;pBbUoqMr*~hh4Md(!&BQhZ)^%_@=Q#qOI@a@HQsT=lf1emNOnrZ zKk=4HIv))(7S~mxkG*8-)-!tJmdUFSR@+6}eLMnqH&*{WJr&9yha z=lgFhAoWe^M@PZVt6PuxOlxIuPD5l50&-!SCUw+M;|P^VU8P_!Zz9zsA8r{T-tl{_ zQ@U>Glnl0yKY(U9W}q2=VC%&TF{&~87DOZ)nf3U$8Cs{zWv3di1<|Q>0+kJ&xDNH+ zYsWwrD?MP<2}=-@AE+SGA%diwm@;W!A{2e)!bUO)nz1FyEy2-y*_ zBW^n5$`VALvec`?Ju)HlG)}K>uOgm;p(Z%3hJL15i#>a{h(QroqZE=cWG#NBiq|#q zkT@nMDS|!AqRd2TZsfH#?!H|GVf9h-(7G9HmhuojXZ7|39N_3R)=~Qe~C_%R^dbbJAPnWKl0@y z4((*T_&L3HOO!(HcV}cD0&lx>1I&?~kyTJ}!2aG)TVmtFVZ#h|B$+)CB=?c4n27eqO`gwi+&+{!*J9F z*5@pXn++D*TvY?3)UDEVyDefWL%3$tbb*t@Mnb-|pm#nT$1qIP8W&1VCv z@Rr9oNVF3<{Wy_ffeIn+t-VAMQ>WER(kj2)~ct@l<<0bdp3SF{yD>gHY zs9pJ>)=j*xC%VIfx~k*|+Dkzsr|UBdHkv$3A7@A0`Gp;@#g+-cvP>QQww~a3EA$1T z+ViNaDre(`VKOZy0h+S=&VZZ)n5m2N01{$;)+SgBJg??RmLG>eZkZHRz zW>96)J@u4MBc)i3R`4vY@h@7YU|>Ruz&f($Skls`(bCjPJ&NgAhzR+_M$|iiIe~)$ zIB@k+=>QoX@c4eJ1k=v3%o-ADQ44bEO0@oQ_p24~NgGyINS&(kDhFg6kr9WEo=_LfzGVoYpyNbyTzHheZf@vwiuf1eSwX3^?baD zY7o@|loEwWXIw;^OW13`%o2{@lxYe?AZ9G}yO+P+y;ioYf}jiXKf$!$DAfOyu|1UD_KY*eCKV({lzd*GAN~Zl+@~8+s-algcivY@iNB_TJpp1CmK-Pb^`d5Z0 zKRq57*Egz^|67Mv$<5XXk5<^#QB=`U-|_pWp*|k{Um#pseujUl(R^b){~wBs|67s% z|E~5gY+dI6>BaAMjI4kC1yW`CKMDV$^8Pnv^?x+~br1w?oU9$clhxdZZP!H_~@Bv#}QX>$u5;I2h;{80eT7=;)c58JXEB z>DWl==t#*S{!7os9*;~~-_h36#?jKu0MD6`hM9(*3eVKh(bj>3me%T@yJ&3eO}>rf z>}`HI85;e&mF+KMJOh10i|=FpZx7`E5}d2Okuk(KsFnqS?w{K?Yxg@CJY&3n%Dx$~ z|JaLX{ok^0aPBu=_wO<$mhVLQYx@^HX8P|A{hN%1`Trne{Z601>#={cmH$ol{fFhd zJO3tQWd0tXf0r@RG5&{)j)Cs)ZO}2&{WW}lxBU-)jNcUD|A>W=`5PbnciX>E!~ZU0 zWTj*OdmAj@Wa7X3V`OFdMiBqqmhC@cVPT`E`@28(Z@Tc`$Ag89 + + + + + + + 2022-04-20T15:53:49.407171 + image/svg+xml + + + Matplotlib v3.5.1, https://matplotlib.org/ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/docs/source/_static/API/clock_driven/surrogate/QPseudoSpike.pdf b/docs/source/_static/API/clock_driven/surrogate/QPseudoSpike.pdf new file mode 100644 index 0000000000000000000000000000000000000000..f7f031de37e4f7558b36597c0dde0b2b440aa99d GIT binary patch literal 135370 zcmagEV|1in_68c;?6_jvcE`4pj%}-BcWft}bkecev2B|j+q#vR`OSaU{c!JxllR@; z`#Gnw)_JPP6~(^NGc&Nkk(aLl%39%=0ZahqQWNK&Z`OoD4_6HVXlruH4G!n7*0OtKH#>&abz{bwR$<5Bp z$;Jg>;oxFm<78!LWo2Pyshzot1%UZ;zmz`%on4$vjcnmOvetEEV)2_$Cp@ny zpQ%8Z#j!R)ovP!+A{hPX<3zWDh%1s^>6RCI-(Q}7x0-8`c@v>a(F%d7FFmD{$Ch(0 z^V>aLZKWJW>G}0dUPS5SY#lyz@61L8O)dvpILACbCcaCSrp5W*5;mU+z8^X~ZeRX& z4)=_)R6v4Oph!+L)-Flg!5^r(gs>!iRe_eU{wA;^7akgNy14(im?k|Urw`Dzhp^X&#klqRzT#j}5CG_p>@^R<#?}c4yF@<#Z ziuSD2Rd2vhaGVd!2Bo$xbBA)oG*o74lzAAmzels~Yj`@__!vc4LeQjPFM|h@ZO|JV zSDJR#LwteE>Xx7ZZMtDDzD)pWi*0psw|^Uo-bPrAGt;yJh0@;^jt;pk!(F{%J5cu~ zN4U`Am)e>aLyjL0w#J7ESI%pekGiy*3{KTI0$QEnZ8|Hlm&*%n zYI_!c^lJT3%{60Vc{uwwC{@Y$#S;&NrZ1l;Ggrmr%`^FdR`;eH_xUNgng>|dy0*c; 
z*o%aP7bbsn#D6TTlAmBZhTbKxo)<5*h20$d_9OGdqvU6s)7eeNopY#wgbcGPR|XS+ z%Gl4HQu4#w7`DFz2o~&@CBB9tUC%qv8*~S~1{+Ic>~XrpRfSqV!4d9Zv&_q1 z^fvfFB43w@W~BX}bzdW~hwdZz$Mx!^yk8LX(*&0Bx(}A+Wmg`hi#cjYEIx*8PLNPH zX;{XTBbCqIB_XPyGn>brB03Ke&WIpxgevnSO2 zlbZ^I-KAgo8?N3<9Zbm%{>LjfIQlim4S@uG5nxiAEr0Bkq%KL4ae#xL2Y7`w{f z3J_KJiyVPS#I4I>`U4p<7T-!_F#j?hJUb(l|Z_ccl)0_q!Q={kBXN=`4lera75fRI8Eq8h=km8!9PT|!NZ!qa1MqA zLL>3#TWewpI0{|k6T#(htpw*3Zj*+MhW>A)!+G`p)r`dth|c|=_PUz+fO-B4QOsxS z|ADAniBlH@%f*yfLLuU}PC^Km~=BX>|vS}w4*`&Tflk7T_OcQ@C8vo^L zo`zUNIq1!T=gU{et%3Scw~pagx$PD22u&!*Nk?+ZxP!2Jbt9j7G3>QXs*Mbp*GF)| z{%UjLO0tW@r&VPjwq(%9y$}^~ad1B9b+oG$o4eWaM`k=U_iaFdlI6T>cHF8nyt*^o zxm*Jsso}xo&oycVb~!9>Ok~ewqyDEy;@z`&@SAv+5&t#U!rj>jKVPEu{3Zr!WNC#K zr=6GtyUVUhU#%Nu)*dPAGS{&EU#d>L&AnK3`D^rNC&B-f;WILL{#Ul{f3ir}1bz}Y zJBEoU>q(4hv+PI|FI7!iabmt@%y5-t?3w+5QovtF=x3I<#3%GmN6g=fD9p^~8la8= zr)=ZH``isqQO9T%TxhQ1Ac;RuG5O_dZaUzYgX1MsI`)rUQ|6NzMC4WhCu5*&>&G=+GSuu)nfw-r= zo5}zzUy=@+aZk>X0o9W$N>8@NSZx*KaCw1rxU&173Jfw((-Ai+4(d^(L$-J$$u@h9+}fN%V+c>d-nIXbq?Y;eYwsc#+;5h)oTdN=loy`Dw^lSF?y63EmcG$NI89++q~qGd^;g~yR1wR(yYP*K0ubhcVGq-xKG18 zbV%l2wHRf_Usgp^kBzmQ4)C6|Yr7zIVBP+3hij@vB0$F?uTj$XU%|(MzK=Nwn|>c^ zHZ;JBV1&W{bH#+hp_>q)8i+ZPR?{VGz25*zlQ=zvvlbSj>R}?iXOfCW zn+PMN8!c_hg;x-_;Chq7f7(Nin$n2+4&Yk*6Bp*NP#k6_&cy*;K;ytN>`j+&1y*?2 zH{t}NL?x)9^etj3tjvi@qCC(=O^76P)U0{v3bSrfL0NYv zng)Dc>T=?Yn$Y^4X(8{jb8c2r=HxRBJlYU7fqNtcRDCX2ge^2~hU-y~0w^w~dfh(F zK8B-y+iKp0@Z_m2O;dAJzAw_3XPB`A+-TgdzrVPUS&>5cXo5#a8GP3VYa{m@L>2+( z7@njp-GTwME2<-vwQL1&JlTjNubJ*?iXrJXDWgu3HM>il_&xmrheK0>cnHNs`xcBC zwwF{a=7RXKcDx7*c}Rkrk19NoUi7 z!H*x}$VHFUD=)+Goih^S6h$HtK;_eD7+l@6_$?6awFj9uUROProR0Wck7eLlgtG*S z`HhW>g^0sWy)xOVhvQb?2=O2?u`;TDFjQlpT5HG|f<79(heu8s4Dp`+AoWj0Gjh^N zVR%Jny}ox?&%#?+{Ya+}&#JIZoehZz#)@LwkaIh61~L`V(jn!cmsg$`V}&3UKr&gx z$51Re1M#+@#H{njN z>m7)ZC9!5t8Y>=OXHP5%K7b&xZa+?jB#2-k4A3%mrGv*%UIcfA{cULkq9^4t0Ns8} zo&|*`?t9WGIl@}7B<5EO{MN0$z6*3MznvCLWWVifQw#<|dA@$nj^-YVIgCe>VQW8g z!GItqJ(6JS9kM?u7>%^ZDSO8YG@)SdVSC@wQRRkU)C?`q4zud90GUSEooX53_t8dl zZ9@=*hmduKf9R0IcTBkvMq*j()8b#-YRbgT z9W@tQC24mMQR`Sq#Ke|8#ZkuNk@uVVVVvtB4#I_vFqC3jm1!z&NaNe!{m`)9x3DLh z(w2_D0nu2%AF?lE48tf23(FEIYOc7=)_>OUCDBR(L;FL#fw1?~fRT56NJc2aMeDo| zQ=i-y{4V0e4vH#Xg186;-*pS)DMVowpYsp}0UY#(O+cGFzblCXKy3s6~>Q62d*^+Bn`dpla34Jmv)0br zk6y)VpmhPa7*L9O(PM9hAHQ)6KE@XdUzygZs?^v0)ADS)vap}qNPi|EBf@d8>njFE zwYtAaROOo%{1qxDLW2EzHC0~t6&r%jzZXuABs5=z7QRz^+WM-zE^dZoC0AET1bWlr zlGg(DjU@u5sGqW`!{5#I$w_%96EAsa9hQd+KRV$9ApQD$Dfn1Wg8xZC{O8j9A1dNs zm*LM_GbNI#+@IpEkhxBW$zwGRs zO#g~u=Vbo3%m$RX*#R6Zz{SoCd;)!eb=bL?0bDFxaG&{EKl@>4128i&G5x>)uyAq$ zJ{LRC^#8Nq{@4Cb9Or*x|9P^rG66W*KNlB(lbH?J11Er!l?%YZ2E>Jn<6n5Nv2w$) z1N-4(;|6eWF#))NGjTEf2R9&QoZO%BKpQg<11>h856i!B;0EFZoC)~O^e-JN7Z(7y zfZM_KKVz{m1E&Xiv9WLi5ducCFaZ$(qRR!8KM@5k)=$KN+xcl@;pPHxaRNJJVuE7> zR$~Wt!p6n{U<0Db#sRFr0rUgz*r$yXxWoTK9@yb$PF5h=te+jS0`me9{6t0XzbE69 zr22P)?t0-v81~(M^!5^Q#L2vXbtSgHU8* zRkR>N?(S0V?qM!4l2Q>a?uE)TMF}*ooQFBy-Od}#9rj$Ir{)uwlUdf4=Ho={ zA+@-L;psuJnls%S$*4npJ@6X_}gv@uj8n6}q zQ3alM`40{R%OWZ^)jK`KijH@$&4ceDKnWo5*H$48y2uCu9 z{B*nz|Max$zLH^R2pOWkBCESNZp@qc!?&H|9*OW?;6waC{od>V*)gq!o5gxbhr1=f zwC{ruoS9f~g|4d-1PRNIfx`|7WC<4Jo~^;eZ%Jr<6!kR~{n0QKV*7CS?-mSGSRd$p z9SgMRJK&}H`&u`MhB;?-?xf0KE({I~M4DE5E0}B-miqlrY?64K}9mZ=5lxcv9LGbMpN_sC|Fle1uwj-SqSOEo@fZt2}kdP=Pxw) z7pL`iK6_Yi73jJ!@8OYdSP*XBp!PUobA(_ar0>XkjC;So^MBCzfUqY138FPY13_ye z>RRi@dD@=R0vS^V@bXRt-7$irbkQL-y85zJgQd;WcPk*+fS7s4Wk)O*h7fd>+ujNG zff}m&iDiM9vC~FT;yDP5xCKjskimbjJBpfV3dnB<2&GEElq#8#^H%qx|3FCq=tzuG zV!_BH{vspxny|StH$+J>^B02(W`g_(gS(uDAwGQ@lt$_CH-yny6hs3HMrm~N$Y6mn z#)~W5MTZZrIm?wY4pD4$_6|nrxmN%LR~#B6X%edvBK$^wxO1uOh*& 
zL!Kfvel$U&JsBK7*&~xVd6#|AqDb!jh-2eUb49H6^Lgc=;xLmKFy1axAM zj?IAEE_em%RSALm<4?UeP=5?eD!z*<+-xC00)t%v3@&&BI=K)+67+~a0QG6BfrQq% z&?19`d&dU|mdNks-dx-=ado?e`}tpN*R$lh8j%QN)qi|w3R57R$Flc>6I;FHy4_4e z<-ceu@}aa+#Qx3S%HiWe=`aJH2c}0@pJoY#tzMZM2oVcyfdCXgg8DQSonOIMTarke}xb*IfM0?Dg(NYoWTZHJNX0M32%^n zJWlw4XN2#lDN46koY$Wb^+h(K*0n`h6J?%TS0*qg6pw2 zu^cduc_jWEB3;nbPv*1`K8i`d#!)VXRo^Yb20=~OsKhhdl; zZzX?eUd14XG*+Ug(6heuc*Tt6XGiZ<(0301S%&vJ^=Lnn)^Auv`2LHCP6-kO2dUlh z`RwHaGsx4zwddWu=8QeMHDr&Wl6}jl$706?`sg7k!g5Golp$PQ+&#Sf&Q+Vv$=mQ% z5V@#P>~WrP2aa-s^NV>cLs8h}hg4Bii=bGShSQr_kCH7Uv;h@M~Zrr{Ws~CNPdi@(1f~`qAg%nbz{0 z->Ka6WawOveOn$IB9P~fk0k;(bbF*575fPi^ZTS)qz!COcYb+?=4s7c#jRb#%A@iW zFY`&fz@?1LMAnPNK+~v7=%ygw9dY5`q8H0JLzAuWhlI1nlTyoL4jYAh-5^hYVXw_n9RTDD2pNHdBPUrIZ&2%7+|9o@bT+WW+2+EUuoa_M7JhLRAUX?V2q>U{=!8-o{WH>I zbC>#Dz;)@CAa*SsG~|-%JnNlUYwz0l67EmvopcK?`}Ddv&~x-U-8vFJ4~|n6{!mvo zV4sCc?P$mly@0`$SVeV2HQV2TaPe$~u7idW>YY;V_*9zh!)-LK40Fry6nb+`rf9>C z&0gF+EwykVlhOJvZ=c&{`!flw?5hK0U|l=x>p;Ew#L`m10CM{C5?X*8Wo)V8udE%d z*;t1sG1#)g_d8mrgvXgW&(Mv8a$a=HNDbC@%|}}6cc14R&pEuIPINp(-mqJm*^4py zZx@;<$I6{8OJ(lh{CHm|hT=n1%$*4cCPQYwkPis`Xj_)frD9`(r#ib>>li*tb9AFY zl>vYZB0uLg{GG93&wVkSjNbxZHjE7Ws z@R#jt8!@4gMon4k1Z%e3Gqwvpqg+S7kvR>BLcUZ#efjvoYm1t$K2(SlZ8+Yj3(gvd za?y`0hhIt;3=n20-?sjmEg-XKfG-;a|83xIAZBW~Wxq>+F)F^KXtXB<4YVIIn1e@j z;T~3fJ5FGKhjI0JXhT&WfE%u?*}3f7O?i26a-bZ@Dea!H$l?4A8b7FwcN4`ZO!Pgx zl;RShoR$csX)KFNufJzm02^%YJ}Y8^?ec7A-2KJE4r#+WQTwVCHGFpIItRAh26nzx z8GB)Qr5{bp*S&~alva1y*PAQPKzb@nTQIejyiZqsV4}%&ofhC`O8M);t(TV48N#6Z z9+5Hf!kUY2S~-KpACX8J6%AG22Ww_hA+x+1;hr^h7E8y3LJo0G3C?vnFEWv)0E3*x zI}G;QV^!*##5=WLx?Tj2>@0i%b8*z19q(P4Mb!&-!N>uLY+`z*EQ~mnV;g3&9%`)3 z75k=r;zozb;oL-6QDNMaH_^!H{IlJ+B(<2*vzb1Rwic=VKazM046SY<_jaXPVVIOy z<9CBqWLd149xwbWWV7dmkgo^7>gFjOV58UN!@@A8FUD?QcwRgsz#kX5I+q`pO94XCZdo5z}$Kn{H=PU zJh4EfJ%ypP$o+_N3zeju{_cd(?$5on{5%bM*v$Q2ia4@;m9okldd6b}yA&71#>0#5 zYbN7T3j6ZAo#m2HCKUgj=IyUv$VyPqM}}WyZM?>eif{B?2<71yq~pV|slGw+4|3b8 zySwNHm^ku;Rb8#O-#iVu9tA$pwlLLQmQ+`GCI3Dc(GA}C85_pH{42mIsbk)x*zR@l zP-7C=Svo#Y+P1bA-8kmLi1#b+f$dL5u`q)m>==3mT50K3&(?qq)U)rqj+R$MW>OGs zVby!bJT{r?2bGUn?~`lKU;N*c8*0zLE_phC-Sw|nKkEr&{X*$nB@XpXLz}PPGSYt; z{Vw}-k=beQ(utEkVGeejR^a_T-cS%by^8HSh`sj3*%=p`{Br9S^Dmgu8x3k*JC#I)I~hSVE{bQi@fhY8kqTM8JPkb6!rfAt6IKxetZ=O$pTBM|WLomY)ufv3OJgf^4?Ase}gIsFoJ z1xB$lif(s<#?il|TB`dJLGB{Y)tOW6vB^q~S6N!($593f8_$96>Z+?RlE)RXdSaVOzApgJ_=dVpr*C71`Awih>~%E<80O$hwSk*u zQNG-d0mgHdjDisb2lGt6w2+T|4BuoCa2_;+^zn3wM2_bc~>^w6?PICX*!#App={L|OcoG7G z5SbFhO29k$JAl7vwc4+!6dY7gRrg_Ex)5vDlFsVD13b3nD|lT-W!(?TbW;^q7WSr_ z#jb6pNVv`cEx#qI+a9BY?j@JE=jj)M5JEdsby284#AmHHzC$9tIM7B6NOuE2JO;c5 z>i?8h*Ax6%z+8FLblctF*x)}hVpImW>QHrcbkwhx*`_v?>XjPKemnvpKMW_0$!$|j5C@Qg9-ksytyCBSr zw(d&D+rCjOKmye9oF7VBfb&g+ccQotMs!RX zz1iMo+c8li*@3A&9enY3nDA!O$Sk+cpKBRXWMS`>a+#R?W;5IT^Sx81!HxAbEa1Yo zk1DHGG+mr3+6(S%y51%{(;A|O`ai~XE?THN2@DP}TeZ<5tJ8>B=s95Hr2!^{hD4VB z88PapWkRw#ft%NYhu^waV6s?1p$=DBLaNUUmOO)KY}m;lUpfgLwT}pVH|ZIkbnTx} zsmtrlaaCZxPys5{LJrm_C3M+*??*A|3+0)r!ua+cdb*JrEos`KEsW}uCSTYjWQhu~ z)YDq}#AV~q)Sq1+%`tOF;j{TyHR|eye_-Kc;#R&6+_CT~sl^|ze1pVvO+lU*!@Z&Z z82Bw>cey_5*~;bagy`0JWiIJbhc+PEu`w*M7*dyJ)DKo=XV*Bmy_sN)@;rFAGxc!Y z91FFPou`E*Xnf(iSvj7ZKJ@I{C*+;1Dq3Hp(Ibi|yuIat3>MFO!Xm|Uf=ciZ?L(wtDFJ8%lep^>( zhGyD82ye%u*`N_)wd4XLyi@%vymWWV;@#LEYPhp4FE}2f+4=XTQz_~GcMX$@zryUV zwZU44&r^Qs%TM9LcAsaP9?chZHFo!uRWJ9C!uA*3~x$JY(ymDy|m(S>+~vv3f^6K$uTNEpC!5s&M*n-hoYbgMP<5Nd(nvx|X+z z9T>m{)y8x+Yd8L*R-7P9hmVN2(>*?C&$6tyA4ePLU6dvf;(c+;hJv4Kw{5R{<;bWj>(05XOxbTn z9DEj2jy^=TlM{hoQjV8l+Z|bvV`%HYwd-C}9Up~fTP|4yPddpN6xmNlmxr9WGkZxLXTK?gF!w7-p-B?p 
zGAPZM$Hv;LX`MFxKnaIGC^W*wQx{MPu6O!#Xca3u##>mq%C3uCbFI`s;iWw{(#zg8 zFw-V9o+}IQEWxTc<@XA=pfy%^U_l+F3U!zlciG!5va5A_JDq9wv{K;!~AfkYiZCpO!(^&DA(DThZ$a$D;l#4jZT5ua@aZ+Ftoh0fp-`m0`ishlY0a*tbu}%@c z@XN29(dh<5>g{SaqiD44-B|X+NPe1MwT@o*tEazz z+w3rF%qM<=9Y;UGZ_v7rSqe2@6^Raa8iM>5YBC~Ox&b-UAz*0KccU(l`;I;xi~uc* zUKNQ!7p6gkcP6nxlW@di;ds;j&VmhBva>vy6|a9qx?fv#vShDhkrw0pWuyp3GQvff z9qGb|4C_~#WxwBjKFp1oDWMO_s$!o;d#&jB*XI}+p1AkC#v%o4O$kzmT`0HwN$Pjo za3Oi+Ed`(b#7VftSw{~gW2V-ILlZx36)>L??;@189n3w=2&zKIlE5>H`_ z086dBWd(L_twdGT{33!uqJWQ1bxBTKD3aP=MPZ0wWxWS2cp1g3F#gGIE`)e);fj_EA)9Y!3%|?=eo;}d?M=~d-z)MDfTZj z1N<2nwbJ^Q=mS%QN){7e6PB%SrN`ZR=))bBYqo18wQ^L}neSgDX9t#k@*-@2%5atm zzVby?-9+*2jCPae!BW(rb!oU1eD!FoDo>XULf37;JI>fSFl#uqwh&(rcU?!~Sfxe( zft77?D3_7I>3M2bd;Ja0J0HgGHw;n4h4!Fv>?VF!DylNpVC++kFTZZI{NR_AJX!xNE(-$%pM(F>safO7UNk18L!=7Tq*i$u!E4dQ5wFN_~H1R7OtH8<9rf*i8%+BbQqe7 zA7IGWrN6WM=60!pg44VSzJ6CCaSL|h>RD>6v&1PF)T=2j{_FwixT0t~fmW_uV4P3DW056t7%US1H?=X|TT>tETWtiUW|d{=1DJO^HhtR) zLWTXQ6ES5Eds5L{%bn%ZA%P(gx!&+--^I(eW!qYGOT2wnFa`3>scUIlmiRVK&;(VXP*`_sgv zEB4<_oBGH<2v@!T?rW&Q()uBX4{6qO(4xwd^fVV=aT+gNbyWaLB7m2ubg$qSxG}yJ zV!L~%IO0pRQfO$hYIn`j%lgp#bD4Q^p;q%)(VIazH%Vv|Q4m5o(Kwhsm{#DU1%G&d z9_lnV{tf2IKtN?ub`bZ?X5CoxW35M4nk_<0^v*VG8zE&x@se9uDZ=*H@T1#iMfL0X zK?e9TWna2BnmedfE0JdmIImh0XSPHC=-X~R@j_)FxUptc9RoKey_jeYDBR$a&S|aW z+%82idPI_;yC3#q&+r^pOYRmXGzjOQi)DPY;m(zHX zi;h9Ozjz!L?Ult!Q%#%pbfqkQ0dtfMZzrI?V$yQC2+@sxVQ%Y%W1y(tSGfVRKFjG> zCMD(kT3m#0d2oXJK}f-U&5&D*4rcQ})aB)_U1m^CET|Iz{)CT)jnbbiTbu=Md{)1j zZVG0=#CqdWgH?V*=qOH#v^$?PlOX4%@RWDUFf!k#LiZE)Z)6im)n8AtbmpZ=ipIo& zDaDroY{bl955F|}IFC19-wu|xIvDojv%RQPS+M@6p@w+z^?Dd_NLx;=_ganbQ{jQh zUqsJhD$7mC9U2wmm2dMRnmHTCBtPI4Oq^EG$@2+wCg_vzet?(swgk!)R;e-FwzY3r z@`cWIaFr4BEH5fz4B#KeC;HSnn1Go!aFL+(wzp(u%cJ@Y+LYiFGhc?9XIzT8*ETIP zA1B!_23|1U6<8A7yGO5H)oZF$;rozUY*E*P3vlU9h;P*asfVdtSa@XD^C9M%%@>nBn^6HR|G2Eg}EE7wx_Q4-D* z2Ma~GjHXg0(}r8v>+OK14V$g;t*@_1>Ry=IT7dcDu}twZuZcgHI-2X`Q=EM5HQV9ix~98;d=i71 z=(vY3{b5^2I=9^5?7BHJB{FDh*zd)R@ko4>WOPyEC29ix8YIUCj-y9lP=6yo`pkBX z*eox)IS%HM_MgpGgvkQ>zS7d0N{3aQod&5X>pT7Q_@XUx=?Yh4M4yf#agFmvQkkH8 zZ;Bfa0aiRfFPpPM`Sl>ul;U#hyuKjQ6x7C(&Q-XzFnB#H#pEllS^2jk@?D*(ZPI3P z&YM#2y5E+`=930m%d#dwB;Cy}(4Z~?GN4r)R3l*wy_4!tGTlTVj6 z$D@_Iwp(S4D$5yf8FdKg$_vvQWZ11f>f6$m{Fvf*pRG{#@hRKY9!ecIfM$r4 zO4a379==gb2)6b(q3v9BXL)!?F~xGV43pfb8Mf%hPjO~o%hRGkL=s(UV+E zB-V@}Me1?+6MX61fgK!XLTT7jzrdt@!p31YlzJ_C5d+;w@-noPR1u9xqw(!2#a}Vl z-!$uJkCliJ?)I&M#x%E$B<<)>G3U|l%vPWRb$2zJV_j5hF#KpYTWY)im{;q5OT-}s zD`B=-{@GQ17oine;nG>2+jgxD?Xj?hK{pC*XS=4Lh0D&&k;v|Z07vwy+KY$M=meEh zi*Uu;WHuh1xKT>wzQ;<~Oha8`v+!vqR%>^b=2YzTr>Lyx3ynB7{({{Si)NFFRxi`j zC3No87qy_sF~n)dQPZI|(a60rjyJS|w-vP(pEH%RC*58&kjuB#5+~K;9UYiS&rC9U zwoP^p91oAYWU(!ddg4myKzsD!qU@?s9P%_g3o&J6duF@VQr9~E$Jp6x^el0)%Cr>7GuTQS51yE;N0Iv}j{g&06TZ)cemp zTmzY0k5o8eG5VW~j~7GRIw|FvFLhbA(brNE8Xu)Yhur30C#leWSQ8e}Mkrsb?zlsl=#GN2Wvow#6p$(1i49{@I2Tws^c)^2LAr`;`xObWeMoxsQE<($z8ly*^l8>jG%q7@K zqUmh(^M+em!B%m$H9+gz{$cyOC9S)M*|~r0-ACv?r(hiX0HQ=?Cn+{Hrd5rKg#@wQ zz#Zpzrn>2=feq4OOX8R5*6_fDQCx~Trbhe}mG8vL zGK%x1n-`kd(hxzT&*YL=R)QeAly9@r2hE9>NL3_$GyQx_k%@3+KI`%+x4< z<$?OM6{ZerF)JnT4|s@f>oHPpt{g(5U)nKh=VYpb|0c`*xK@V#q( zlxOeMjilEELd3ViA(kqM-%Gf{N&{(uIt=-02dcPPQCq8m$|nYNeN__;2V~;U+s*?G zBW(q@;R&0wSfmIrO|$pelFyTkok34C^Ur1MkXd};xZ+oS>jS%ZN2RCloa_k6zCpDM zPMG(ja|=ywn;cUT+I^e;SjX?o8|ra?&rdSZM@zLdZ6!|%?xon3%MVo5)ce*d{c#80 z2J3QeoQ+ps$`FXvn7#Gx6Z=ZfS`p+<5twhygbf8F*2XpdyykBo*4U#PEl`V#p*(Je9Dsp8FAzzHUExA(seP?ts1Y0hq@qINUB1a{I!RZD>3gG*Syu=)`PQP|vpUHqyg2X>P_&v(J^G z59u+PfIcq5$y3c=|H2mBJLbM9wi$DTx8D?d+A8y$fIl?5l=J?gmUAygiA+e8wodeX zW`s+uaS)a!A4A0PtTr?hg@!VH;id2Bt=WsO%*Ehf)d{Cn3L{w#{3>A&m%waoE?4F< 
z%6Pl0)5@%9Z$Pt-0ft(2S>rTcy4MDBS-HRX))Uog0pW1DF1;VjvF<h)P;CaZHr>g+)=smNd%7 z1TSN!evEo#{{C$vAwT|ySONEO_QHGga6Wd%8Ofz}vfl*e?{Rk3L0V<{H55C=c4R^% zT@EL+wb!LykenZityp&`^9v*r??ij9ihiM9@erdx8Ckt97%rP zH#R)obYdGrf*QScaVHxE{BcB){` zqc9a@>iOr6=Wo(?>{ZlGy1tvaBv<5#XNKJl73$kvzU;^k7AAwKp=#fGy%~uc2wE=| zloS9Hc!bA(v%eptexCmEWqc+#??^SO5_>*>!2#s^?A_y;GZW z2ZSlhnZv|QL1Z-2U`e{l*I+DSbML%|1Zpj;{$9sbDL|5UjCyeTVZbuhS19*8=rnWa z$AOhCUrl`F0%*GTFg2_Xq}*V|PqG6C0X}!?nOk9p>5AFyeBzv-y={P&~OJI&UQi{{XZT7WV zEru`n)@LwP*uO|UZWp+|G?JJQKI{4={dG*^<-yq@9|5-`DugjSTeR&tSuYDE-)u}4 z%?8=G%_;-*<%nvc@C)k|BaGGf75oPDutFRy!IBU~yyBx-zJqKCU>7~!W~pOeDU6v2 zXOx@;aQ0AZZ*Q*43jfFqDV}^#wLSQoiWDD_Z&$?gWS?Ijj>elK#eyXf@pFj7pPuz; zykqfuNjg(XIRJ@ExoEiQMxWdGHYFwLx?Em{t7rdvezQ(U7^_9XfU`hXb9wQ1S|25T z;$1i&dpY7D)RT?1j$ChZGMStudoy{6=F^B!;cQ&({>^k;mEjNAiuCTjL{nhZxp& zI;CdTp>>(KM9|ld;E%F|C1K-%wrfBjbt!VFYL{H>?mHpK-F%x*$tw?EcSI|~{k<66 zb0T3@ASvyPdpDxy{7MDzF_SWi8O0;p&z@?&iUhF1}b zVHfr3^nEIGpp6}u5FAVwTgUUsjh^WgNfn@wla;0brz2QID9}S)XrGTiWDwhaWGJN| zGWbPgnSs|b1H>yJVxCB&m0YpW+BW}^aH4QzUl=X>O8@ED3#q@hjUh91hO{xi{I75R z03^-0(O9s^~7Nnxv2Qm%zaJxaK;FISUM}8p;&>#rvdfmM>Q;GADBR z(*wns0!>~E1UWZONN;&|C!RH1GQlZ-!Gtudn2AHi9zlQZN}>}+5ZVdR_r?1Nuo4VIR@lVr* z-yNadKPqPWK<#9|OKCS0n1cepi}(M*8wF_+h%4G^&n((jb*q!yAw1ma)Tx8vwwpQ8 z53^xu^z8KngKu=>Uin_&E>eMeVc*?1!HxP2Kku$y<8*@BJLkIZU8%4lqX-LAMjXA+ z=&>WP*a)sWM57dO{2htBP0<|O$>C?!bTr7OgCKw%dj9@jpQJtKIVNIgh|29>7-*n zDZn$-uc&R@kZCE}aGivTos%8bDXqrpW)#ccaFakthRon7C*x!&aP`(D{@{D?iv5wW zj5}KD=i0B#484cSL0>|b_f90H@$f1D1n+g1{xzs3mgE$L^U)G0;b)&A$ zX1(4od7TrY9}gu~5gh8Pa*#i>m{ilBE=ihrTQcqm`sf+{1Jd=%^w0>yrGDtz!7u8vJ*&IqRrcKT|xVia#cAf9)$aN-G%JhzG2N7pkM2lS$W%OMs zeK0@S@96i{tJUnAh#*roU*!}-zfzl6?kCLMQx^W8EQ7 zHMXm2EDH;1H#j`^g!weN9=OjkGWELZ1DwRo*Nd-enawGq``TTqE{RC&%NNmGWQ%En zx;h0tHUUlx6B~rz+#f-VJmeyi`5`%On_~cR_OS-$zxj+CJ$4Y2?oL0 zHE2h#QMk>JoD#=&q(gTNln@igVaGJS4Uoe(2?&e`#Dc*`!4V^_c)MDzT&ARGKyFoW z`+6ppX1iZvt3&u!ZvItrWiD1B?lB9FJe;St#SP6GGOo$x3$mcIv*jVKs63bxr0*JZ zGH5FusxM-&r|KP#iR`yZvwiK>G`TD?1gc)_A5bgJ9E*w93>+{88GH{$<&u}T|6+uc z{5wqPvP+=r*5jfpB}cAxEfuSjg7^Z|0i)~Ud>EdYmqNJUbf?zmNsajlPL}vS-GaF4 z2VK1?Q|L!%1kL6SDt!cT!8qC}Sjvw{YtE9ibRr^~pt09gGR=CSIE2hTY9&|bvLc`M z%FL-;*yKV{tBXg}#h2_aPpJXUztUmn6qBal5^o2^t zJbSed!}z;_<^fh=m;>hfcP~4CdiyhFuX)=@7R<7)iH4dcseXT7%;A>6igFExjl|?~ znqyfS%;33_UG$-y?PVE@V>Bw#*JgBPXB%XlL|3af@3-+$_*q27>_B1eGjOFq=)Po6 zG4Mia?-YI zL6yUXnf=U=%DFEK*cSo5iwrCh2d2x-(u>Ug7e0n*;T0STlF{)t94e@4HLXipZZmYjMOP}q6uYQy zAhyQSiS*Ej!c=?pna@~~E08=VKEXvRc2$7Wd-FY)QMgx*X zSjY(0`&U|f(7+PRQt~8|bL$EA^8GZX1g^|kSI%tG9bcjS;hr2HwjZ?uFdif=X+gI# zVI|yHk_FUAOKxf}KGGq6>}|rrPs^5L+ssv?woC3s383q`^NgWIqRi=EcJNIHff?)S zH6kT{+o-I4adrK{lu-Gtt-3252W{*$UV#=cMj!dB(XZoE?cfhtG8ZbeoN3Yy-6Za? zIA?)I9MGQ3>CCVvG?!3AP>2_AIpZdC1*@{$y!P;%gM`XW#c~gJZ!wGtA)Oxn;#v3G zGSZA#ESV2xr1tI~Lh`?F^-t?j7JsRqUrbBj8@47yz*S_W5mE{5U_MH|W(Y9-LWLSb zNP7@>DvsigDI6R;!EOp4K|CGRR>1L%dp(PLxv02gLU%^YqmBZ9nNYLmxTv@T<^{I5`%KF zKB_zMlf1QB+{(U`$)=EMf#%KRimlNj=0EzZRoG@N2k;OlzZ)$~Qed?<1I;ewAg|hF zCedkz?}$176wjsu49nMD@-a81V?m*bZ&~8H2fY+8zok3Xk*4IjUc1*huL-II>HRR? 
zlr*6}^UgjCnU#JXV>{;smr)g|$cBcmt)t)jBZ(pRx%u9&%W_F%mUO@Bj6jNE{7sEJ zbaH=m6XDmf7z)9d7D@hn7HY$)&%(Qg`RN&&|y zkUZH@Gj3jA>8m4IA<8_x-lVlu=dS`{0EX`v(+C37#~S>BTYQM?BjM}ZrY$pp;g`WG zzhw@EULPcBxoUkPl%NmSl*QKijGuPJiaTc+;W%kY)R?io5!Tp&wcMd-nXp!3~;T&(aUpkw`ox_ z^z5&K%78bRc}O12Uvx0l-)1xz>(h)-n;P%17gSZSoRM!P)aP`7=b1D*^iM-MkjNbb33F#ji#M3yJsQv5c_SS8$kF&g$Q*i?JMVo6LD|>k87& zaY>bp!4l{j4Q>^-1%cT<9PUP!4ZOL??*#C7KpU*&&Z?# zxjj#!sWnEH{a*w+j2qtDI|{No^jSBMO;~(It%%Hh({NNiT+1H!0vF5i<&5&Z1|J^<)-Xl?56k42sYrpU0NeKJGxVA!0z1pLwOf&$ zoB{%RN0s;<$s8`8*U60fu`UEu*wcJxLLn@NpMYVEqJ>&qj2qX67lwM7vZbhKBEcpb zW%Ff6Hc_dL89K^oqKko;K1Nw@xREX1hi&)^k>k={1UncM_Y-b13W>j8w|ayAzT!Ud z^ATHg#mo;JdL)0e^DmyyR%H?VxjuczUdG8f3S$2@UJMYTR zEDT%pGgH@|`5yN@hN*K4ddlynF65s8AvPNmAGi98dN&V0 zRbtuZAtL7CBtF~uO+;>^utL!`1Xl#;Qg5SL9!wa6x^+h6(`PT#r>6amV$k?$$C~Jn zsPc2h!+uL)EPhzT06ynK$?W5p9;<>V)Y|5N~MNDh30s;R_}kc6h3|sKeqqJk;4bj2V^i5JFZ)6RySrGW;^T z0I2y>@#(jCcK-7uf#@qd_gWl7LLoL>)>XD>waU<47JMbPYu$W_8TQGBStsKquQyxl z_)YT=H@$o0ho>%YB4inERQz@~raP~lH$qU*$jZK~FR7u~FlQl}rQnDv zB(FWQmcxF|+Ca)Y-_<0opF@gC*m2~FP$V5*C{9M;ssLkd#JOc(UD`JOJvC;dBPj({ z2afY=?(+^f0oJfhS$8L*FPoVZ{OW>iVt&<{?lE_=gs>9zfL6|UVd!%aflgO`cOz`% z9`>l$N{*HXGjPf7sc@5#B;x`ce-pWFOz-JcF}Jxg9YNB zqdiPZ$tCTx8Sqeg*tS4(7?rIBPH{LqEOH9A-R~Pn641nQjcNw+4LkQ^<+ew2Yxr)& z*qx@Hol-OWG`)|5{Z=}{Gfp9(Z}6pzU^g%A{_Is?c6-(8Go7Cn>0u_r(x3Y2gp!8B z;ZY+E+S=F>zrZ^s-Jb++{`NJ;^W!*N&F_zheP3d<0|zd7t1h%2IhtcA{oI$oK8VU} z-@(itLGo<)WnX5G(VN=_F~vB?piNQ#%&F!Ti;CPuq=EMxwf!Ee%U#xS1%ynQK( zp0E-yY}p1czcBdANS8drc=5JJN`NOm zfj5wJy`8yoGf)Cy&x=$rfFcD`4>`T0Ncx+T^C@1L{tQcxVsWqkGuKnz*e1u{`TFGb zC51h*2Sizoo{X)NwNnQj(3f+KmjiBYN-wi2(bOAahJK)j+)3LACRf3J=YvnUET*2_1*4Rhkz;=Q*x^e#g6b2H$9L zL-a6=RyRCWmN>-ZC!|@5UOQuBPZ(YyE^_4B7`RcWbPKO+cf!Pg44Gml6n+T`C>0wv ze^C-ksmn%D2t8#nz@DRPTtHa-8p?)X->1I?B=q zERIHhZ#v0_B0h^My8Jf`GN3XsAgB_@s|Hv=Ft@nH!dFa+Kxb}V05H+GIb>vh18blE z!+FZukvAy-v)aA;AkljItQC|ux_^J7_3)oMh3aStF!)vcq4ngpYX4)>+{?%mr} z;azBjF$&R!_U12T)3WJi9ycVN;_Thulmuhu*9=zJ+xcEJ6#;$?ST3WbF-==-WuFWg zH48PlEE$Izql?U{M?;!BZ?8^=!CSi z&@t1=1cOv1vOECuENs83pI}Pc;doedPPb&&rm<0glYDpLAgQGB_}j&7@pZd=$;D-v z17EO?#n=#fD)86ZRQg&NK-Wp=C9Av6)`cn?LK(kUr}N$6i2B+FgD^4i$|r4iLmqqX z@n+PWd05|+_M*b{kayEoN2d##`^#XK*ogP-F{u%?(Ut*wL7IbwsB*UOvM$>JAw;D` zXZrjyIA5tfGbDJgJnK!_ss@TrJ7k_0^au;#2Y!C7p0mLGTx2aF<_lna*K(1MU;?J+ z;!|2Q4AmYNkOiIInnH$70f?~|e3G@6<|g&xo$FGNd$Un7@GYJM>HEN{%zT8ldf7L| zz=MvC?fWT?>q$a_cw2Zug%P;A=Q}RskxXRX!!LHNP|2$9ymeL0m!du2(2RZRtuEUq z%bOHc9~+6J9;kn|F+UDxY$}qUmy4-O#usKsK#D7d8E_8E(bN}e?S7x~y2+DVc0mYQ z7t4=w)ijY>yWz+ETJ+)1$=v@J^gj#0MA{=mi5ON1`RFYX%P z#WbJ2W4HJSn(JS}t)eEwS0gh~VTZpu^kDwm>|lOYM;mGTVPi8CHW1G1@&pdHlQ&R7 zT&NC(koAKchdPolPVrh8`9Y50R7t#E-9twJ=b1iVDH&3v*mmYd1(l?%-ckPh7JQJX zKS|1WmKI~Lm0s|EtsuprG59Wtz}np&3anR`xV4$E%@*uhJz;kLZVtaDZ*d(UIq)YNA<`Y=UaK9Y^$OeiwdVor=Ns`9LaD@lm0B0p~ z+fz}#qi6K$GBpwW>e3Jz+)%}7WnX9*Hx>6YN`LJjRJ+y%XCBO9c z5-{js58rjX33X6jSHb@@=$4_v@PYpu0AYesbY0?73o%$l)=zRiJ>MS(xC*0bocnph~^9xR_jc9Uh?&0CG_b%F@q zb4M3Flk4W=xD5rGBabZu+5Oplj8nx99Fvomvxo1!TsNJ^fA8O} z$%NWQLjY1ZKd&h`v2dYKHZo9&6b8hHAtYvUc7@4-1SlWeN@0{WHYk^F^ZA;umKFjo zRW_%_dtwz{>bfM@LnJ zNY{Cg3PDzr2Pt!Ab+=j-rOLW9O>e06&RM%eFKPTwfd~6pqoh?05ujE<%Bb}hB^ELJ zFWg3$EEvtF*%0tJOq>Rq)FjvhG~&>f>GZf)gYJCg`YmHg&IILaj#ShrB=OyBJ{GcCL9@SuQzAD#M>aLC*`>9Ne+b7eZk z#-4<-3pI%n`meg2N+#1LM8<%2i_rA>!c*Z#NbP^(z?BpPlJf(3O%~IN1k7M41p1sG z;W0YO4=FdN(+n{(Azd=M!XQbz-@-a=7~yPkNE$ykFJ#mMR#CFr5a@(IX)kA*RP#Gy z|JxmS6(KhvMheb`RAyX!)%%IZ{NXkW_NE?5U@jca?3AI3B?Z!547ptr!bi{2E=G2E zX)L<13$a}zRmLq3c{;NyI%TNMlQHnBc}uO>#Uji15whj_XfOF0NH%=6Ku!&re;R`Q8mBFX2P$g{;ua^UEW}7} zbw7mkz#rIj(VGL*+fSW-KZw>BBFqf;=mkmmEav1Y4%8e 
zP*F19;sx!T1PYptRTuZcL)a>r0+PvV<(*xjB4*-bkXp~5srm;&@bp4bkke`6H3rX; z-{wBWSAmdu-knobKjX`5uUY5fDX_J03*kes@)aY2Y5!FqN)_$kCet5yKjU3m?HE$k z`QsLeF`hOZ)u6JWd5A_@Qt2ZyD@B-% zGhdPSy-FAyPw$|l%yJEcfUiyh^U_BeSk^;Yw0qJhqAiWhRBA1MC z<|LwUDBw1 zzWi>9v7*s@hahggN1LWOTS6*4GrGh(>9sHQrBq%ZGyc;4VAzx5nI4&e^Fq9_;P{#i zSb;^J8~$tK`eiHi_0%%W5x{3kBll0Qzh9R?s2`PbusEnMS^dy>LsPyN!)k;+g&Kho4w7p_Rt|2Bf#zVanxeLFno}#iK$b?@v*)?GE9$OjCkZ@AT zvneW}@=9*Z`^cZJtl0dn*@MM=HXVV)AOk{G>|rwdWL?649#D1FHLEk{D>Qs>qV=ZX z?W~Q9KtpW@r!v0pZxWv5reOWa1v;=^uPH0V2WDbh3i>-j)#j!d(&N{oH%-1e`y0=Z zBFj`#VP(?gkC@pxIjICWXRi!T2ZyTu! zju^Twz?aGjb~K_DO_Z^`$wxfXH@rb0DRWI43@<5%B6P_nS%gj(LKRRRb_a$^df!T_ zu_9$b#~l3lTI!cXfJfYj7p|D>QAVtAJ4nx}R<^2)ecZ!AWRu+WpSYuPEO4<$QS5Dl zbTAwgmN%d*&%x=kk3?9@sNc;#u00#Ty^1>eLu4q!sx7-SPGOPijY?WaQ~VzlVhj>N zEIGJeePbMVO!lNB*O`g>tZB4sOd^%LjN%FfaqB z6gC9S1#cA#66Q2u;h%3E{gCGSPyl&hwTaJm3i&Dom{hc&LxRNu2$xE@2(s?|Lz>2( zXtAU#Y5eb(=_?{~zOg#*C)JMX0>9{JebTBOrG%UZdP=-gsr?PPv&G>{b4gho)rks1 z*7qeR=9#t%)i+M;QSJ_fI)K%;6t$a_g6CMaR8=zSq&iGF_(tUz)zIJ5kj)scqJTYdRXA6Nm=nm-Df+}_xgGaEf(roQi1o~!sX3ziV^dtu z9uu&&t?B6|p{#jjB|B~SSXUh4`ql+?Oq)ik^}vk-u7d?i;Ime^pEpY|an;GHV&sRl zlVi1$PYF}|n;;&gB@l_CR2P)a69MvpfAu<5M4GTHyMegTy6j=-P*Cv;rvI)Qj3Z?G z#!@b@F&)&u@}sp|z4WPZ_YcVGYTl&NU?St*H9Pxqa%f}bi+sE|33TRn1VZaeVG`u~ zW|{>4HndzG1a5fb91IKuzjI7*AI;}0%k;cqjuD&5cZhq-P<Z4`vKFt20J6F4}?*}T?J7o)sOItT=ZBGHA(9&@#&HB3+oU6`F7wW zwHA1}TKCS7NZBj_4(pKDyy&u2G#tF-dkTB3V$aOsZiuYsD4HY=b5#)Pm%*^gNEjPmg6-0U5?EOeZhq~+Sq(XbeRO&(!1hvbZ{#f^|TUAt*f$&Z%R~V$$W0q)f2$LfVH0t+i?`?miezqXKLLuQEe;8v%{v z_HuYp8p1>DEhV~vOFtwcliGJi!O+bKFmR6U8v)=_5rOK!9LB73?CY`I!ZBnGS7v08 zDT%KSdQqx}O+Ti_c-J6J{ph3W8keO|V>`^&$XSRn4%T0IO%nu0*coozCI^tRlyhM; zon(L>_1l2sbKesLRo9N_p-Dgdl1EAXn~%$7a&QMNh28*W13h1#iaC{Wtt~h3G}hhA z8{%@ZH|<@YH9hK)f5qEaQF{&Mk4AF?iZwF|CnTNJ^-Szsq!>HkZo8n7rz!KC8G>tpaH_sg)gLToZCr+afre~vXy^5H zF(|a`Bp#(@(}x7)9j#*O%Q4`1-aEkH;X)D8aL3GMMC;IpwQ6M9aU4G-Vxi>4s`GJx zxL%fnbl0}9S;N)a03T<*BqWNNV$3Tm`~JzF^kQe)Tcv-S<-LgH7MFUHXxDBKcB%VV zCO}EeY}8VfdzqVa2!K+dLWSpl$0-a$sM2rMM<(=>r93(=iO3f7jifUbIb^D67B0B zraF%OuUyWqC#kSSywNSQgb_@dIb5gTXuG%ESPik*jA0*eSSNq;U|k@_9{4>Lt{KdG zU_C>aAyWta$TWcSE!3)e`P633igq5%(b^2a4P3M!L{6*NRa}cCB^3-@z*0FHzP)D* znry)Eh5HAVHfSD;c80*@oyNQvJNzHuzYPR_!;+1tY3>i7>Z`O37$R=Z&^?$d;hGAa z+8N@!3cC9shnc*RXxVhG(z;h;1R`i;)w}LBaJ^WytX9`Ktd{gCB@bi`+9-XlTrIoG zb8~N*8~!XrdVc%XO>*Y_ux)&p_d~OAa|JGQK&d_qZTp*xn{^Aw&inO5T<|4Y+8x+W z30VVNdWx5ED5ffga_HW*lPaW`qcvCkW19J}b44u3$7==$TN3NxXl4`{wRmUa`z9Sp z#P;d-bt1^KhbT^ZxwAN1RL3oqd+^rW%FGdXHDv?g%&@ah5vfSRjEn`8RCS2DE zXb3&p^x_Lk9a*bMsz26p$)kp6CU-1n*5IW|Vy^9>qmO#;?j~DM2MOE_Yn;Iwzhd8o0>i!r$4J^M9j%mbKl+)C2J# z&9G>LIbDG3BREpes_Lq>Sd0Q2q48oqS>}t7;&7Z7viJQS$2(wVRG~%T3|u-+X?;?l zahG9N?HM%!Sln_JCtNf%jsOZ~O3yl7Kop&VgDQhMqs%kuyTAL%6WY6Vx8k5Cx&UZ; z-M+rf&gbOSrYyJ`NWm&AhV3v)F3t?BrI`TKMEGr+16h_MPrL9bJa_atABrdH_gw&@ znsp$rce@Nl6NNz)pKQ8AeBw7C+a4~Lz972JzZ}aKtapbtP7|?C#HSsyP|L$A39ByAD4)1*7wEs3@kSiRGUj$8&=W4JKy%slu+$ zJRHznp1OWlMb`xcQNyP*&(*fwHJ38z)aPjk$}?pw3#CHMYR<3e1{=q62A0hkavv*a z7&fA+P$8_gnr5$$SPGr3HTp3GYWA-)ARd24qlpg>kL5&#wdfkReaI@=4W|+bNm-B0 zkD(p3E7JcA**h6>2k@@p|D>D+^uLj;Pt(fKSHj1T_Ld5nu=|y(V|Xc&{=X{2(U97cr4lodpXs%lc3N} z>fuNMy9}Wk$5@$uee%u>2SD=Dqr80w{TC)QalL!-!&k!Ww3i7k5YFJLD4Ce)a#tY-Iz_W0s!aCb(c*50Bva=k1fL1jI}>ri`W;( z1z(P|{vw)GXaQD?>>KC5|3Ckdty>TMQRVHBb_QjH0Qc4b!iHM3GBM>IW&PlFwv*se zOQv|W52KRkDPL}wB(jwEq=yhIK+VVF04pyUAx3dBm zONF%~QC`7Vi131+MtToaAC6hTWV=BeUU!v`eJvmvKVJ(v0k9iTm8)!2(Cl{NLjsDX zQ>XYzAH}>|b5eRj$LnRjUq%>ZqP*19nb%PVp@_E5MpkzCgFi{PUG12cg{G#FjoC-h2-psYtNQRp;&8oGK05usa3*?n} zMK5%MieBn{+u(2btZ{+eP+L;)byT7a>5ly$lUtsVzO2D4OTH<MtW8S(@ 
zRZa{e8VGan!%?h65TA4B;kRqKl^Eh$#KpuwJaMCM?#l(^RkS1r98x3G$#BEx-jeiz zghTF>n*#l;6{=W0G|JvfYm41+pjyxIydb28x_*@`11Y783Mo1L{`96fBu=fY2=z!81Vn z^afxKTDB}2@q}va1Br24LYJEJzO4_43bcAjGNlkkgr|@~+pDnX>Rh){k0q27b7*OC{QO z=>?||s(8aOwL8NE#HgMCvT%L)J@A0l)M4ik^_cw8Furg67oZ|v1HhGySo=u9B7c|9 zs4#b^3w^RIj}sx_BO|i2TFh{l3$kQAUP5_asgF^Ac4cx>h_Qve(dLPLfkejEh|5`U@(05)oRJ>AsEn1ZHrLQ+rSb^J5Tsd1!1;iq^9V(6Gv7b}5psbLdd0DT z%|Z;?Q#cN>;&ZBcV=zP{xVcL8TPlf%w90f8v%3rTL`4y0vn43MG6`io23LcRc1IXA zgGqn#%hFv-Ge^pPDcvdU<5mBv-yfpo=dFd}@^v1rRj#gasYwSS_EcAL-uMpY5Wo|9 z-X-czDcql>xy+N4)D2O841(i4-ep_z5iA9Q>Dr|&v_eq?DR?o7`F4-deQUDyt({0^ zJq%BxO?_g#iz&rUU}YKhh`_}CtSpl*%ymj(OH8W&2KIx82T^=M6VYXPdvYq}7!HL7 zN+O23VCuz04n3>(=b$EJ8ci?krnaFaT+i!LKfNrz!m zkr!~Z4PVRx=v;XBd3lwr$wT@fSqAIhP67$F)I-!A700j{qn)8T?^$BJ`waY z?~-F)4evoEL$%fc8lQ4a_UL0~%z!nUP0f8dJITpgi67s~Qzh!yk5FIinAukZU2f=K zo6F}tYS=p63#!+0{HwmFHH%z=rI(#r%!?nQZzs~*rpSYM8^rc=9yJ8?2$G%i70V9! zoo-7EADamCv1^Lzi(D1lUV6Mjmk6eaaC?dG^>;iFuE!svz;kd`CMFrtfg)>~3s9PK z;l~eE0Et0^4RMo#Ni{-dSo~2pdOPhv(h#pfCJbw?KZY^86z|tN-uxi}iu-{M#OGd= zYBm;u^wdMm@s65=ivx2(JApTmA=;)G#NJ?Hmrd@}hIhc6dJ5mG7(5MyNcLzqEBkUI zH_ss8@tn}gq&HGY=e+d{vr&;0fxL5}$AH+afRr`<^aB1dw`!Z|C+TFH0>F2zDvIMhRDKMk_RYp4C2o5M_V2Ug8R=fbwK>ow9OW9{og63vX&Cx#uPSch|9ZQNkF^QhbG-4?xy?_I3B zUWn98Hd(bt@r*@JHM))0sp@59{c>%b3wyaV3Pdqh1&powH^GpXAi}A5$n-JcUh*kc zaXI-Si;*rR@*#|@SxN}c>J=9qa`|}hI!UzSu6Sm-mJfNq9dET@k5MUwIR94p;PeB@ zWQ}kY${*`YRs=%+HR$Ej9As+-H~H_s%O!0rbQ&W-J&5VxNNlz&`3mX^!cL{-QJ>VP zYeDFe#Yr*B1{zG0f)r7LS9Z)Mvp}7TF)+q0hbl;>bzr^(PahVS@N5fO5rB-lF_QP* zCaY2n@B^IuSiVlb8Y=9_g3T#Ahj$YOqd}!F zwOYva0Y{PL6imvN)6$unJ!m~CD!6aO(9WGzc0HZjU)Dm4QbIcXBCh;@U*Vy+c%XHR z)a9Sc37z))mGYjXThH)A*9X;bx$_p1k|w?yRI)!PNLro9$O){V2WVvhouVO{sr+za zfjRiNy=1&8!5Qn#BSAfNlM-V7Q)W)w**&uz%O#?$2@3?N>Wue;J~Yw&!fQVMWx$3b zIvN@rAJussA2om(_t2Q?a_%Prbxy9URCB)*G)R^b~Z6YKg3pwU%h4zDVt zAWwI!#H)kC1>FT%-L!A8yyjaLSBNmZ^Lsy4iOT0N`f3sG=;pf95?@*V?!uL&^59%9 zJ8a=+SfGH#wugsP#~2_`;+C`h)JEIk51;uSgNsW`paV`5NGJJ0>^VHVR1p>@xiuZ6 zVA7`5*cxwPMBG|dIPGv36_k}~gN9)veFxXt>fYrV(D0XW+kzsQCR)OhulK~APSirC zg%EEgd5rUq#Eo9DKY|)@R>)AI4xHE*mjcNfoAf2~aP>!=>2NUfY2z&^FlCH>PHs*3 zdQRHRW~`&L=1?!3(1zKe4VuTfY=L@qLw%ZEWj$qewENqzWf19>pxy62FcRtcNTg)V z!iqI7nVrtO*gJFsl`-^cdqeN30W2Wx4eGUN^N?HJ=@+`o{@>`1=IMhk1Pz4u%8KIU z8Kpj{h@-s>v28jslEQwa8_qw3&r@-y%n(s6#m+C?zjRYJvZq@GZ@;%=Srmc3U2D&u zzWEgK-5L~g_{gUwm;6fBYSxp$qQ z$u{WN#1e}^i^*8)-{r~zkRPyAun1yhAg*p??}x~<)u%gynvU@mmK%w$Dsjfp;Yt)5 zQV{tNtNR>&gUgEZvh>R8H&jS8^NjAb=~ZMA{7I|OHuxBl@uPG#aopYLtTYxz3swn z_ger?cu62J%TyNs>CNP?Fb~S0j*S0nyRHoP;@&_m^#i+X%jf;235N)TV6jk_CU&a) z#m1Aj)0jG7`!HaX6ET;ykCAR&zB?-J8~axZqsqH$L=`^ht&wr|u0Z;V_<|Z?e;i`o zIF=nu@TzKH6z1=yT~&tYJAcb;Ia5XTYmaQ-2a}yj~}6 z_Ek?YxSUNXFTLzlt}s6;V&EcWMDFx}+BM!h#s=JRc$<|t!3vrhhwt41C*ni%2Hz#~ zQ>rzgNvf=IwjI@5Fw6ZbSj5ROFm`O5!=ACMzbl4LPo9;muvcqkt)zvCybBTjgLM*Hu^kdHOk`e-U6l7Pi#Y@A#h)V!> z$#RFWL!hMnp274gCtWS=AcXtX0&NR*W+D*eEG1G--I(8xH$isqO4r08k5ZfUcP|lB zg!i(SVI6PB5wE^O7-lf;gHbB(onh!oB{#s#$YSKTr|dNbl0 z2sj`yx*UWcs29b;STz%4G6tgGbBvza87LF6tPqR@;8!&xL38SS6wxlG5quoy@8f6P zb+?taiX%07e(WylDWqkDl&nNJ54^=|o(9Z1mBK_L&qdZ(V3E3I?AyjoGOzHZ2@p8OA(!@*FHbL}K<2 z=9E)6SNJKm^?d+%L(3u|e81`WDbz9HkLzL{MGCGB$qW#q#bc|8!2&<3i^@3;C6^ga zDc@8H8QC}^hw=dR9YXn;D9HtSV|RN!r!OYS7X?a|PUc*J)h-DXr{KSJdpifYKg;yF zqnzOvXgCL!TcP=l=^~;g)nv%gok*h99e(U!bY!N{^;aarJ~hb&_Z9`XBgdxGS-;z= z#)pW`ySjIz8dAXkHpUKUKaB$x*vR94B=qP6V7<|AI3PBAXd|mdxnS43csK;gd+MSmK?7T<|)<5)w_5boIOQs-OTp3(_Q($1rvS@7E z6WewswryJz+qP}nb~3STPHdYmv(G;7-FH7~b$6+&YE}QNUR@orxF=6=KvOoht?VIv zwRUA?>?I65w@9{h{AsfeA#M6gm1YskNnKr)A+7|o@v*_n@HPN$U%l;)=*n7`@|pg; zrA0p9!(t3^B?ZetiMeir3e^`S5~C}9&UijkAvSxZ$eCewIX+`K-?)ro=}4QTjdLA7 
zn9YgD?Kt>e{{F{M;%t5%%`#E1FcO4Ve^oSF7tR~|Wk$FP7CU+a?AZ!$J3`OFj+^f@ zS^qbt(P#m?QKr>37njd zQl1RvL*Jdyp7LmpTJdc^gwr&(LN<{$uSd_|U`dP{)0A72GvNN#x|x`#_N53Ut2W=d znLkOoE3ZKmSF$Cv5X?V0PIU@@IMK`xSyrKT5(2~xUizCoo@U#aNemV{O_=_$te_vX zvb2!!SxC~gB?p6@XnKdm1Ozwa#n~9lEq+K+D6)CXA&YGs~>xqI* zB|&>}M)}B>73Jw81;89jrk(Bw8KWob&>E7!bdhu8EuK;UZ|Rdc;e6Nt^$@mNQ*dC> zuNhSV#GNxxg~=Y(bCGbvmy;T5hqoMmoTH;-WUa(#b@I1w?4CIG^q-R zl9Ri%E1Hs~AvC9&n2lCFZj!NPNqj9bPHoSRR6qjkpHV!Ns|rgy&S;d zBZL6RJeimq9&F=jb2{C2Orf%$IDdFok@??&c19{8j|*K{hSyw_bgtbr^*Cm7<+x=p z{tkYOjsl2zDUUe$a^9aCFckQv(QqPf6G@=Su}wyz=^umOq{E~Xtq4N z!zat8ZxZ86t5ce3m#4-zzb*BNr6XrWb?`EmOy4x_oQJ#9d-FeOKMXWd!Mv*>Ouk^C z#uuICi)svp-;~n~e$^WrSrlTHg$($-SnimC3i0sd!N^{HO{ksdS5l7mcZ3@>9>#?b~Zg_&O2z$ zw$MM3y22VkeA;pI@D&91KNV2KvCVTbZCaLDg9!)x0GVb6TC|i|`Pi3IB<8%kbi`oZ z0g#0)VU!*<d$SZ1Gm zl;g?J3b8n4pa{4tyIO$OAIz!&vvyayWh;+BvFWR z)$F}un%sK#b@cSUKHYOm|wu=dQd``yA&|dDT~gtr~iL_A1hL zx!9U%H6M6~UCzudyC~ufK&I~bsHmpiR?4Y}w%^LtxzTL6SZF(3pm+!SmL)cG* zK;!#DuR7ACaF0k!=OP;9d-4m0}pD` zEOaxX?$VgxvmCl6H3@%YigO3@O5sbGG<=qs{3|QFRGXxo%hg5>XpVxDQwcA(BMR2s zE>S`xbV36xP!nPXAt|`n$s3?46WA1=>T2yr?)tE0z(VF5$&>h=zB|@|LhfPDo>0N~ z^mONG=LiH)hR;GfDz;G%bcX^P3P69Qm;q}1&>8YF_MvPJX!GheIwZPZ3IQ8;#SHQa0z#=KYOKI* z-3UKwm{K0dci#;?eM&n|MP^9eHG8@&f~Pwb#aYm?hrwwnt5Ps)5sVKA6M9V31jpyM z7AQ&vGc4Q!v5(Ij3PZ}c(W)V3UE+sEXdPr{Ry;b)>RWCqiLAF4^?g>#h=vCyUpWI$ zme`XotJoTiJjs}I!sz&*DT51hdduAmbk_IAMZz^l#EV< zt$A98OlMi`KeKE4eY2pv1OCKGkenM#9~o=4r5zQ zatM-!gnc4NpQP&rWW}ta9o%+v_m$%vZqZfDAe|M{EApYoPRU-TZjc>rUXO&z+MYU7 zxRL5b&i1Og;by>lV~*fKpIY$TOF#cetr+A~%_I>{;XZh@lpOTIq81A(r_Nr_BDJ|n zOPBhQ;1KIx4THx)`XoI1V-??_MqRklf1ao;MMLej_=^TC6Nh*eFuI zNq$wz8)&{htHPQz>{m;F{(dLV`;+4Bu@7=MJ*&QH*wCVVurS}hMUo}Kc`Xpe;Fryi zv=_fGt(@^ge}PdhigLRxWnMiwEtlWeO^b8gd@!F4&|?f#T-9(}^PV80z?zc@clr?6 zqB4p{A;!@-6*XH@2tic1hh5>6GIc6k()P+rRnh>(qjiz_yAGEA?k1KQ+N;uVN6<{C zF;fv$-TnS&aS-V2RId{r0gDjQ6Lk}b_94DGnDZpQx|umRw%BrOAg?wxfc5Y1FMv1_ z4LfzS)@TFyznnG7f!AjfppkLPM^o;x-^%(4p;PDe5g`>fy8a{a zSMaNfw3296$%deoUSn6}ztAiuv^2xcdTZ_MJ)c!ufTDw^=?~;R*OQEzb+84iVUc;Q zvD^&EN&gBGmNfRU z+wq=l7BF0h)k36%iC4ly3(>VFZkiTj^l3Nxy3ky^N>HxJcmP6M0(XHg$zuA9ZBKLc z0dSeko;kqu)a41ta*X8>9aSreU_BucJ{C?|wL=vOYv5Gl7i-+PO0h6GQGRx&S7;LA zcRA5JwXtRg+8zMfIJ^gyEGVna=X*o2^(WWHZZ{yVPS<5R#{_&~byylx>Vb!}G6;`{ zkxsd7i39{uye&gd&2`U&y38sy0+F(9j2GjUUqEm+B+n?ZZ?SW72EH#CEHzkut41;1 zC~>2tRarBBTe;IQMaZ6|1p!NEWR1?AHpY=>&NV>;*2-{|jD_;BO#a+e^h8h0kXo?o zHu8|mYu}H_kMf#)4vAnTwApxW$Pq7R>BOo9v>+Ng9_hLo4wz=y=W_<7u5NTfXEaf$ z*j&KO}V;(voz4FX|Bz%dLZoDM(A~^ zMmsNRAe%6A-ri<+mfbmeM}U%W;p?V7{nKMM6`NEAU)X5us3YqUU{NxdwIh6?o0Ya% z;_jJbm~_$ujh5beC|>w(k|axO%uKhXNL(@qcBSGAapthBs`Jp2T)^J=+{{``CMUK0WV7W8?$;5%%eqG6qARNxPyi|MDZ1 zYYm|nDz_>3nsy-0;f>kmWH=&c;YNo%#voP+q<%NVc2JnQ4y0`Ze}!n?nQKUGBfTT9 zM5UtGKG-Cq7T41;`!Dx1F3{*a&ygG8dO}J8%jnMLPwxToAm500qwS5RjLaaxv$qKf z4Rc|C&EExPIB0l?KS-_rsJYm@L3o;7_fSh!Fomi$QExHSVlE zO&j$4oG5dhllBg(r{k)Nx5&;C(F9nWqMF>bVJpKTpgnhfS{~s1{B0Tdh6MbU{RlG7 z-9avzn@J1uxbq?iq1SH4))y|6lapM8S0PZBU=?BjQH6PP!F_B~ zn#P&MW&mFWOR6+R9f6d06g`}?0!Zp3&dXYomXhbk#7wuqICF>b`m1=NLLog z@HP`d0#7nYx3eb7Y=s=-Wsa9^y=;DkstVShHx=n9GSIE}oYyM{;Ibs5^+Cx@w|rP# zfHG?hg1N@s*|$531tC~{w}+!c`=cJ=<}V!Z5CkNm6C9MN{OVsB-kG+l)~IQlqh6Qu zsSF7)zb4NmMUT_^lVG#(3Pz=_>U@2NIh`McQcxJL>1)@xS|GO^|}HL*iw zdk8kt$t{V|d|zsVm&)+qcW6}ASHVja7_4btnQ&SPkX!wN_Or%V@r8yZVkzY4j;TX@ zQaR0a+1E}ExT=7{*U0-i$9A(Mh2AgrDQR0+kHGQ=@*j>UX#uI9tvTE3(GT^@#pn$V z%RRV-?mpQQ>Iwzvc6FQ8YlYg40a>gi#Q>uDcpGlvE2}l`LB9Ghdu0~DSTLT|M{j1+ z@~6A$Vd=9<2*vR=Qaw`xTmlmD-IOI&UlS*srG24B&UGb}G6{k8O&LX17y-s2_Jd{I zzXi;8<>`9tvoP)ZJ{E@z3%qNPkjeOS5^)s@}I+8~-N=A`Kx5Ockx)(uI`ZyEZ9 
zK>9E(_s#LS&T~+wXORN=f#{G*G1DGFd*;9e6rIXm8_pB^GsObr6|Swyg}i3_`APBfG;vf?9Qd(pYYX@ioe0!(h5I-_f;*fd zPuPEals*l1@^v=VEa^@R*hJ->iGRv=0pAmSF(09#F~I(aS5>8C=K+w>-}dDyWzm@a z>X*9EMYCO{d5!5>MC}d{r?V{8yFsXFHeUGI^rdNOmwZo86b;h-yy|31(l5PJP zj@?k8+A}Io^gY&?NGF5jiD4N5rsXazgMnMjN1}E@7}sem5#(|`iOecNMWh&9drLvs z)ul>eg0#;m6SP0YNl{p+`3F-$_ zqtq}inG#Y69WUBP8Ojq5R=$-nww_Bf1!zZKm?D34`Nt3}fVv* zXGq=$oRpeiTc=$Cwtp}!FD_D35yikaidI7z+ZZ`G+8gUz|LxftSU@o_;nU;)?QnDB z(+TK17>n52IO5X@N=u6~(BspInAHo{Zz{mzgr|4wh==RGPpHAuf$?%`%Kce__s^&(HW)AoajQL5|1Uq& zzZ3uO3IEsl|1;{p^Zaf9>S1PM{0IK_|8ba}{Xb5#GU2nb)8n(Udb@8ugFi z-;w<%TK~4+QT>PKZ^Zxd{WJd7_qYEyK;QWPIQaLCeB+ro7`}n|Kf(GJ_t%)ew7>A* zH^D&vJ@NmHl=lBk3-*80;%{#9I~W>&XXalw=mhnDi5Z)lnmK-Z^es&HH!1%TX&C=f zMXdBq9q^f<{>fJX0b5smO?(<=`u|YB1vvguqWzzw6w$Xfw{pWL7nD|Dpr`m3XKrPT z&%p6FN&llnM&J57q5omQrxSP7w=y^6w=w-@`kqbE(b!t$+y4KV>Hlw$K+&oFYw2GP z|3}9E)&AdQCt_~pXl(zF;f}^a#)h^=-?K>>+n73<;WIJOGqV48#DLGt$jEg20<<+ z<|5WIr#(k=^J9wV$@l1U>9u=MV}kpw%CzTdYkEsiyts0R+~86VD&Dcy#TbOK?iy$S zMahpE01b_m6b(&tFcaBeBnj7tdaPGEi z0I--e0AfQzBL37I03dS3GObJu;N|M=T7cFEWSW^6K+8GM0|#jFe60Ze?6*BQJslaJ zIKRA{GBP;&y}x5cAvO;13%o-EP>G)w|7aQCp*LU{Kp2^8#z)IgARnYu(@*=;P}QOK zmlK;E7$82FW-aE(7HIwN$t|3H7~f1dxr8`yat{CVFDCU*K4c64A6+COH;)66T9xY4&(#J!1WN9mH9Q-qBoeW=VX(pbX8rnLnX&6D)LPtD z0I|pb`}!t>@3md+B}B(H{5t`dyZ$vT1p!38K)awL=vILKp7?GZshwbeT3MSvySHS) zKZV48;`icM9YWEAs{3EW;kksQ@}qtA#U8nUc>vt2;m?AD)N;Rn+)V-?=^es7HfP7o zf4T4UXnzcoQ=aYsUYi)10@Byp*8=#$-wcAj?7khn`fz{MEc>V@N%#6RxYrj%b_D_Z z9`WrHv^=%|0~{{&a1cBMzQBMed=ylPNPx_Zp`e9&iMmT-5OMo(HTG{FS*I}~{v%u>{ zM{luDA8LehQj^$Hw@w-ebtdE^vZ*?Ok#x?0>`H^6>n0|KH$$YrhP!^QPeb2Y)&QaO zs*8jaU-}jhj&**c(S@ahSs1(C{n1N=Q#e`tz{Y@xs<4Q~0?6MQLc2FSKQ5mj1Fw}| z9RRTdYa=oW3ho4{KMe!rfx zxtkU?-j7Dh8Y2>F8r^IYQ(e!;9o#=ptsnAicyM9{>V6-Tbw{_W7a|&bj-dP5DefFf zrh{)9A__bJDYkc=%8l??eHpWdaAM^Ay{EvFc7t#MXA{FVbc1lULv}L=n0E$LM)jo! 
zp^h#Fz-u2A3d(dG6S3nf*96W-^7~`wZl3yk18`;Ndm}n2;h;xf500WC#stDPHX&ow zFx#%4mm7WLfdBF{bL^4*+AQ!{?^Y}ZdDnl0$bSaiO9dSJyq zk-uD`yAXo=+&5nQ5RrdmgqIKky-gJ458+oRz7`4|VqnBBk^e7w_XTbU^n-7F{_Qtj z>Bl#I?JwR(2!R&sTUhuGk-yc9=Ue#VFVh`)D*|nM%rRXbBSQmV+h?2rcV*1##}U9f zAN1!0ZFWc}AXi4)s``pQ-d8obS#u8N%yN`kEf?@Wc>*;)MF(Hh`aA5orP|snq z@QbHDl(5(QhFUn%6WP>0GbuhSwD3T8re{E&vB`WH=sN+m?-gEvyrts_U|`R=bYrp% z%RLzwI{_2#9Z}!t$$1g5XMA>_Mn?GbJ%c~5HFJQT2tPdLecGOL`M*v*T35cTJn;&T z%`NZnMPxNIz&TzEp4{XL`Op)mQ%F@bo-*ZsmSfWLhZ<^Gy2&1mBgbcbK=@mAzlIO%z=(Vf7L1s{PicOL7D zq#Jp4klQmZg2?0AdyVa7KWJYP)eyN-KO$|hR06co+q<~>b5B2nFFz){W^HMDj}x02Ud#5F@oAS zIRTrk+#YVJ$fG22IHfv1XIrp%NtuVhE#vCI9{dnvA?bdY}z%iD*tA z&plT1a3QFY-pC~~D)6ccannIWuLvol#iP7XG@z{^3gU`eY6aea2O} zDY>nqm1~rk3$)NPjJw2yQ485a6YIpBh==T@xhz5)Zes%^5^^+@c8!i~FG+Ls(F5}2 zgJnZKAQKNsV3Xq*tgeV27TS{Zd*}Wd!SEr-6mG^u1GP9>U`Qly%AXqg` z{sR_5!~VVSh?Ul9v$WO*x<~2E?+dlMw>xU8C*7eCM9VjFgFTk2S{aPK@1O+IxH){x zX-n<-xDnqwCx`QM_oHhtD|9>GDP)+a*x;a_qtX&EO7`@Wf~B(FTEv!3>R=UQN)Slb zO{DV?CE8baHT_z5l@IrP#i-yvhg_C(M(N>|aAY2+%rWtBZDTgL7zGf>d%JVtR3`8F z4FTqvS69NpElw)9`JSYG5;sC+aFo)-*)-!u+P0K-e6Yh5`(5AH*umk#!EVT6fydHZ zz}*T`;9{fEHkFDWbESEk{KcQzyEesEg@Q}SWqsRjB8fvhRwNY5l6D>qKEVm;?Gg)( z+MOdMX`JP-n!o@sy|XP+5vAK5I}=(VySqZuw*}*&E9)Rj=<9%H4kwJ1ZrGQki+C|g zkT+G(zLj8@Xa)ZIvEHl=ga%yM2&)06997I$)4?6eV?X-N)iy?JFTwhAxO=^~6ce%1ZhmHc^X zWoFa}4JYfTkR$fODBeOvzgSdvjR1-dWPQQFCah3zFX1%;$7O?(!o z&vFOQ+Ei9v#^tx?5<1iNw86pyFJ;K_;V%Bd0}dX{DB*kxh4>Gt1>)^e=OtaunDqj$ zAHz$~%5szut||-+F!3-3KtrN|eOIARo+0W;=|rWvcZ#v}Kv$XcoRtTJ_#C3$b1+`g z{zK$Zs9uwzle3+jd0U!t;o-(5a_$c-H&$d$-q7GZ^h!k4*TjqOA0IW@GCri8aO*o> zxVAl(zN*U^nB07ouSVPbF~z?$_yxR2A=%OgqOt7zrq^4&bzjk6gT(U7VBNlKq$GQ9 z-WdI!FUGE$?oMTw#;@*4Xa|~6*=_DZLa>T2fzq@j-DugGZWXKjbT2&0F=>UR9!l+X z`CCr$+!J!`gYmFUDY_&mwNaQ`?P=!|1vo2JA_S6gyY=yvJho8y!+1rQJS2cF2+SVC z%=SHee?0O!?VxRelg5#^NO;F@L#5X{D?Q1_nH@P4rz^0v&z@jL-Dh&Gcij<2qH!ye<bg0gP}llEo5aCvfSj2CvItSusfs90bM#s z%9^aj;^8o)h{_1Dr?-_Qa&|Aj4d#nBZ=S=^DmTDUd_ZJ+No}U}6CFS&Wtu?N_1vXS zOTSPt{pmFfyI7>G*mIQs`FMx*tLuEf#L+BO!+F?rd&fmnh{`NgCSnHD9HwGUA*ZL< zsvr5ntcm`%W=bCM%*@KuJj`=?yn6U)Ga~5aVSFg=v1XhM0#HU+WZ%)<^MooRWG*1{ zj_s$zI5ju4#roN?scTVxlD2`36ga1e$egMa`bP*3Po`&v0bDv%pagEW0>`vZ$9%Tv>+j`Hla(*Ty z#dUt0;*99|Dc+G~oHDsvtq$YzL)?e40tULz%!Ycm?C_m22cPMp*u+uBaE&@nFg!|i zekPs04tQSm41S0;Zhz}i#$L%9o^nrEbOH6;kV#h?C(DFh3|032^+CDCk?|a^yeBJ- zX5d)e2?6TQ=bNY)g<%5K?PhP@!~-2_8Tt8X{E11SHc}YWG9L4$NdH4ph0C2z`ByD_ z8&;R}%;^02!t~D~1nd4N-9TU5yg7zzA#1T=-teEN-K;2eK-K3b$tl~@CT@ThO6Ax1 zN^Ts;mBn-P@KPO!5kDBFm7pk7p@@NWyp!EB7{)8rVvWeS^9<1$ok|6l_ULd6rL%pj5FR0f^|Fe?;EaX9 zxfiN8#kJ2rmE)Mcc3du~zC9vcg$=zAi^HBMZDIy4R=#&)=z4cwvxS4KeQh;%>M%MW z*p^#J716FvWZDLW>6sV7W3EOIVBWenELzdXjixUsmsQx)J#bg6eBSbKZFf=&mTh3C z%PQb4N|6!cnJed*g~L_Qt0t7VwG{+$hS9G3NY~h$Cp7&;uZyE#V3Zdf;zTO$Y#2*= zIF_S?+;SzAnEL9X;x_I-PTOSlL&`=fFC$Su!oS*FzBoTsKs55POnYAxL#$TL7)bJ1 zplg#h&{B&Qx4gceFLlFTRYQ8hcGfU6&oBh64C5$DzA2y=ijWY0nFZf< zn$s10Iudq+^4^Pp_bXKnvY)iuQf-<=Y1I~1Z=M!~&`&kGns?JZ7P>Q8z(60T5V4GL z7CMv>p<5|C$|=sIzA=9FIvfZhLug=8 z%*yn7mfARquHl~}-zCk^*gCEg?uA_0!}pB=kSweDDsFcZ!-fV`a4A7DB8mXz5;`Xa zo&93GjEE$aI|t6@{0=mJ`C$Q7rubYYenlWIm&1zlk>$>-C095c%E~{Kko4*d;CLwY+kyv~2oaTR{9Y+qG&QW_ zv~>}gm1;|MvUxR7$Osg(yRb)_YL4A{ zhei`cCE<9fZc}#0EueRCLZe7dQi_zuA&RuV9BT+NAKpYp`N;XL{N7jtp<0DVwQgxo8Q9P~a=PZC9JQFFCU0Wnz|;4W%R%xFRtKm+tYj=- z%u7Jlb`d?4rev=B^Sy$M?RQJT%$oyj!v(t=a5bOXW(k<~PDypU0Jpl#(iQ8n_`|QwvSw>5(xh)^~Ib9=X57B47|k!`AFrtUvlAzSC|fPP4{9d zilCbt(W3yIK4axX-9Wg(Y(f{JO281c1eaow>1mfcG;j6?S33LSY<$gbLZKwe9csbG zoOLu}7JfK^(&pWWa8H%igL(jf~!P^8h-2 z`p2;*BZ52AyYkcO{zBQ+%?Dd@Vzqf4DR9@Qlh z;cgMLHSANNR{G^jbcU(|fI?Egl8)!LRh5mI8c{nKUI+N|j%82blgbEbUjxp;y9|&F 
z92%oRa-p(m4Ia>cLtw$wwf0I%T@OG1xKFH!f2uHbF_1ki38+}!nq2=Ob=uB0I4;u^ z(m`ICdlFbi+)H4J`V*X((KFgM!U|{+>_NY2DnSnsKr))EACnrTL*o}K!>}*~qCz=` z_(4}S%W^!p6}WiDnD8khO?dcmaRNwBma;KexdH+KHV#cg*a69@xV7-t*v}atM5*ZG z^FWvv6)ciw6eg9ZZ2iRiy`+ts$x&ImA`kb3P8Ho6r%w~?kX<6H>j_VQd27P73SbGz zcaDUulr4+ISG8{Kq{Qf(xn+R%yP{)s0|zb4M8ES$E8NOXcRk6h9GbwsqmAF41{hj- zNiUYF`l_0?K^CTd-G?Mnf(}uNYz0~wfkC@3GiMC~j;#IgQ_fUnhsg(5 zSu$)Z{C%xgT}D0V=de`p;LWjeNWE+)}1@l}~^Kxc# zc)A~0<8*;%n};N$k=YYcO^v7ld&Ia64CN4o{Bz=MA5jXu(2HInCEskVQs%O3OvM#3mM!aBND}mZ#%m>tWN%^0W?_ zv}i*wy!IoO*=C{?e#ie$fqZ_wcanv5Aq7dypC@m6 z?SgG{JKb8;Y*CSDw4V|6+!^4%GxFDj!=T(A)EHWFrZ|JHnpKUTmlEMo=yy$|QA}h# zR|}in6>ZV(jbbN4%PD52v|D<=%Ox;GctDLOjsqBiwUewPns~;TkHSakbmw84ha2%yg zk;g8-1v$sX@Nystel6d#{nJ87(0CfW);Km?Uj=uuVH68dhR2xnO4Y!JC8%d=&98_ZS8WGOvRv8Lco|f)$-N$BH)rI)K2~$v#1N;xFcIk zjrD%RI^?w8wUW3q;y>2$c2o$cqsS(LkzJeQpic5L#WKz2_5J-b2j^%GZw8J0NdUR# zgTN)dHi7(lRD*xdqH|yM4pDAc5s@jsUc_=_6Cl~yI{$GwRJ=!dEWU!4mmI9wYbJML z;fjVpMn@3(yOa%_I{Op}Qxsoi!gAb1OE4u4Ek3Df)UO+wjz$Z*7OmsWTDms5NIH8p ziXm@L#*>zpE>H?N-CMOIes&ejfP#K2gCYq=vapPN=9NkY>qB7{#VX+vz1cprS^Qh6 zC`zA(Y5PBS;Xvj-HGPSDF~Q7W4bW6>k~~{#+m2KHirCS5`A&bLddvhXUVQig!jIA7 zFcQGh9<{oAe_^$?Z;nTOIg()`@U*Johf{IQW3o8iMvxBgkgc7O_amY^m;nXM0sZbw zv?4CS*^B=@`VxOlzmrm{aY-n%n5af4^?ALjj6LHS!_=~3Q28P3o;V%V3#dEbTAsT* zhseTt>NFO}S|xKY?k`q5+fs21Pzqst2UXbWLdjAh7rEuHfY&2+TG7xQW>lkRLp2a# zI4)Uediku7ukKMuHbM~k3upB=G@LotkY_!Upi-W3JgQ(h+$;>kOqSC@clN|m#|D-_9NbHQKV=KHTSJ#`az9)kUe70(t9 zKUL!t(unOm>p{kWY-D0-!f<-g*4E}Rl?}O##NIQ`gVZ7A>*6?i!AgO<1@*st0L>z} ziILKbFp?AHjNNR`@Te>Z&>29e&`;;DwUvJJW}N!O-4n0iGDL=EUZgC!D23b*NiWi7 zFqYv__ne`&vq_Np)4j(UiAX9t#+up^Y`XfI%r%?XMKd3ia66<=F#IiW9D z0NO9gK&i6mU2tI4p44i^!VmmVE&w|>TX=eVyK8{L=OZ_`N)Fp*%CCtv_(E}o+SIDF zl5~A1f#r7jRJKet3X*ZVbi6HkAp?8^KHuqxOT2HGgu1WOIS7g!sGH$J%U+ur=M zsGWyg-ijFK*u736z7f@{?Ah;bzQ3%0pkBSOQ+LT;1>5?!(a)Rf1EU^>t*ln}?+d zy^yp1o?0MHM60>!rK3Xq)%IwlO0o!k4X57>Ct|T~UYz7-02)GJPqt-!aa^M9oasmW z_Tdz=rnJ!r4*2LjoG87QqRLhp45N`UA3y3EdKf$td|gS#z%=Au)MhNldw~8Ui4b8T zd|0gz1_=*H-_`tlpw_M1ah$q4Ll*ybbxHIH0|DA)o;z z(GBUtAt$@l^d7GPLO4Y0#>idOkOkT7RJN)cEekCG=$!*9puH|TwRaue$>NE@dTyO0iOPty{qoFtg36SYp%r?e_OtqZ)+#&z@Y!t3%pHE;hb6E5QzaS(0 zz=lM=T7fN#a9W=nW5c=~4ddtRoc2 z(`k0oh@fkmUbgCSE>sfOg~%V{_e!jzw4N{PP(f*uqk!GqOMl*qAn4Xc8#K3dyt|0 z<2A`lpL<&Wqtbiz{v>YQ)quBJ|A&yh zh&>-3%VJ8K{KpOqg%$eZ0glX9;3G8oTeAb0xfD`0d1e}j9ITfD#tI>c>D>Y818BrV z_=ZO_LQVj)kLyB>8;FD^0hVe#G0J#Z_KW|r6Y18xu7c;g}sUqcQNRyw5km*sSN>-sV+v2RiLB?#ECTw_ATsrxM zwK8+|!@cN$M7jQs9R;1f{sq=R1ULGXnazOc6A$T+=<}!hCspTXd*{Kr24H@RHEvnL zj*9BVIM{=_o;2cRiLFLWi?TH}RvT3h8OfflE@Gnc;PQKXSFsZ3RXHlKio&vAn+RC+ zB9YLlEl!Umbf@Z*7uUiLfi&Ecw2aZ>k^+3nLQe6ynvA0LDodfcfA0AGe9Rp1s^SuC zQ&2`-x2JjJnaJzV8qFn3WMyN0q-=@p6n@Yb!!n zyU_cF*lwJ(dLhH-d)gsN4ozdRLG{BkBNg2qM61`MCNZ2U)vaHitj*>pXQP(|$wxgj<;@zVr!pPmfk|ypi^#{z!|x zLcU*U-q+en{kdLf#MZY>G*0rzV!@Iz{)q-_-438HtV~DU^y=gYh#0x+DL;qe-+j&B z$nE)ZAt}^k_#V0){ejyt3Mi=Tk`(afxCq%17RKV`8lpB@nFkjl+KK(6v&n~R|AiAo z5S~R`<9%*ikPivo{&`7AOUR2tD(!BR{K69F{(yf}v~a!Xv@xEUsMLi<0h+ahT!rIm z`JNNRRF|QRR7wgCR}=1Y2ZiX|8cCmQMY=4ih`J$EMNE}w_3aYkJa`Zr8GMx`StjP9 zl)|*Lbl^tHfEOUTlUm7{l>o290;O;Z#=0Z4Bi8SNzr_wh@M0k1Cz+jZ zWiEz&?c++EWwQ5o92zY$(vTcmC)r(+W)0ccpEGvT%2^=rJ(BHT;dE$`qgn?kgRqM6 zE&G>5%Nu9=^2)Z)(ktuE1-XiDbWLbb$_Ym(@ZK1Ze!|{Px1`Q4y9M3+pL4xrwiH>3 zkL^)9A8L-Xki_NM>^MSehRXMcx@BYJVFj6d#>5Kjrc;AL&yv)XP!zA7!qZ8=ba1;j z%jeobdNNS^tKX8_sliI7RP$maUOE(gD8#51xfEn%0CzaI;R+-jfUfb~XC41yxv#VRpUnO3p z(@OXxD*!M#&TP_ZLX}x$QYx2l&P@m62Mxgh+1I!uF17CSb=&w=6PY_$#+l&6k3j?; zj;NHS*`npD?7dbt1*#>JENbl(@pW3Bo(cBD^)Gc*ckbRVmfi|=2%SXGdM;Z8FvSH0Z60ZB8TGzYg^EnEx!PzVy!crCOo32mnQi6d?(%)K`Xim4 
zv<{s8e}Yj{@s%2?SZ7X4U8c1k%4fwTlk{95H50-$B}77wZ;d9Br6lCw_Q)oKu{8=B z@weRr>B=N&7#{J0FX)UZYf$NI{DfW0RdCTVB_}~hb|BK;@DR6w;!-w24f;gy*+bbWxS3$%#%Et{p%oo$ZmI%(`nUY{_j7jUAS|we}`T; zLkT*JKRT*{JIzT7t^y-%(dc||`>+_mzIvcQDwYPKn4uF}8!l}Dp8P<=knEXDG2+7c z6*(Ob#8(U2lJ5$miFJIIYTpC134blcCCY>NnJHQ2Qla_ z({GK^y4iq3+UwLZS-Mkfj+oAfMuSrF0{tHL*@r4(IEqC`HHlokycx=Q&lxY1s-ZpCv zI?=`SxX^7XCJU^BjH@czRbEnABYvi^nZP$qcrdb5^B2J$vBE(p-SV z17*154p*Mk0Z+;8eN~n9!c&VEul=25k{O6O^Usr*M+4K`%RUrY-xq~0B)zn==Os@w zjEA<$kGVa_dkg*_08T)$zfs&oDfKCDA*T3H;y{aRzUQVg=ksZ}wFXFkt9>&m;Ca+g zX1zMOR+luXXBikvqAk5N%SzHut?N;6i1bRpe=%<+Zk|66UhN0DL2PGKN`{hVZQFSp zGVgupi}Gp@4R?Db#JQfcP`x)XbM>Z)Rq&@v6?7Lp3biSv(4C4rrbOZK?i|Gw=Gttw zhMC47J7zcU7mORMjGJ-%U}Lp01y;pQ6xHL10{22~W5#|rZ0Tl9W|N@1k`}M~!PPnE z#owz1>8ck+AD;h=hFi4Dfgu$uRRTEAS0dtj52{TgHSMSG_Q$#YIlM;_2=#N4xl8ErXHmrj ztUt7SFFCm32Jm^cGLQYME>R{*(-({4d@J}|5?T|tC5-L@YL^P=@?N`1sF#Fi3`xG8 zxj$@G1$*59Z40M$vSh19L9DRMXBcgKYHrU^wKZIVfKc^c-j5zIZwYE*whXTSJftVG zXX1qE2B#$$&2r`+kqF1MQ<}~nb@4Ia5An$h?bmOnU8w&+3%*_!tX#yRGHDvaI3@OhnlW+s=H0-T| z-?^8U2YHKgO4rJ4&%bgelvfz5DTS31{gA>pG@f zVpWm6-G!EB8D{7Mjjxn~qEcci*Eh=-h^Ygh*RVNq#v3VA2DV|sMz)lzlmj(dp08Dq z8wpz1rgN>Hs#;_VUk-&pBfwNK1ol~)cfRAu|ELOnw#VmjmE$6sL$f4cceUfj<^77Y z2!|+YM@TMzn6AE`+3gjsg!Md^c($@XJprx6#7K~0vWD(u``-5&z?Sh0tg(&Ymvrw0 zZ7s7;N5%$+D4ImY!XGJY({%A>D`ONc*b9#{H`bhIQwn@#su`@g>wP8r<6 z8)QFD3=L9LSzIjM%(2pCR4%acHhwnJzjw`VhHCW;I6AKw!yPF>*{ai!p;^?5y^$=Y z{?b&LNk-QUPHr)*TAY?6M?V%27V*>Gcve4HwGy-8ToZU~Lg=&v4CtAd=%vN0u6RYi zDW4gpq?l9UA1H0c@(aO2jh zlQdnp&K>LTCx82^C4iW3AWMjBgmN@ z563z`UtA9`Fzw%0e`U4f94opovKeg9tn~UW{$*Q>tYUmZddM&zs4%W6(~&xcEfO2c zZI%D3y(z?#x~W0TeHfFgHcm=Tyda zbu^_LWYhRz^3(c%u&@kK zbGz7CMS$N$c<~(7=8eA$Zk2dVZQr5ifdr-3Ek9nPtBtVHjqpfX zbeO3gP@*1_935TbeU6JU7-pzlz<$>CZgDC>JF3uT>!=k0qw5f4Qzxixx$Bu=Xwp$M zRTuM{AeYzPnmoIPXBr!q7W+;h!2L4B2-oN8i?X0UV_Yh4VGj4{Xux?8p2=r>g`a9k zvzpIj6VEj?(Q?H&!(&^0pOJ1KV9XnKq1qD;2bAXpe+eey;Q_ZTJ(C7`?+GDz#XWnI zGHZyh`@$q=eYhtdW65vQJ|J4v{>$$vDq`3jvR?r=J^d1nU(1eHEBfp{u?Hq7b%+do5d=0U9x+PEx*g@B|l9I906&a zg8Qik)I1@1Wi1glTaD%in0t!&y>YdYJhXIJ_l5Kem>8{J-SUgSl{U#; zVvSJW>~Y4+o$c6Zpd5^$9L|gLc3jrKTO~_F3^Jtws3=4PXd+)Ws7QDef3<&2PfZmM z4&vI?C>NMX_+%bmmZbFCa1DxYuG(a7koe-8^zB4DQOO4=)cn$G)Ni38> zW!2bDR#^NlE?1*SmPcR~kCb2Cw)wE%&ocZ~u8QTPFPZyaDT(nZ$j?#@Ty2vP+{)6D zw>g|7tDS)MHYKRthhm(tV@WeFs2a(mGY7KA@!uHWa^UBCiSu!fkX1DLs24Q&P$j5U z+-|JjM#)z;e^_t^+iJ4CW7<{4X5lnzqy11JVt`IZCKo*Yrf@!YV7=)uo8b%aeInj9 z?D*nT097I&1k?pA| zp7vSq7ZsdRu`&{e*P$ppQOya(%cH&>4Rew2h`Pkqs|D&Xah#d?#8r03-K1!v8qr1* zaCBx9J0~nJy;0NL<)jmgy@d;nsrd-QIe6zVkk#S+!Ys4j357OMqrn@p(m(Svp3x}n z-W5LbeMN>{dxze>EPPz;KAt<0jTHxu;pb=}kW71Ll~usva#<{?{(~MJzkrZOrk^$_ zo2i<;K}?%l7%tLX0)E`WW^3@rVWd&=ppSXQy<404P^V=N%HENVfdaG?(gG8V92Y#J zqVvvq!)7-x%+!o7POf}@$8v|L)ca>~1OBu04eGm1`=A{-J@dQd9U z=c8WNe9TScH@F!n)eg&LH33?t7SFNIAxtdzYKu2Ss8yPUg1v7HBEFR|r2>OZ-~>-a8i1eJ=H z!e`daSV3uRzR1H>+g3#TjwNH)%gk#N?>yMX)=F6X@%B`J%I-dQ$wZ_((9Z6xyeQ+h z&cabaY^lnNe4EkM&%9B9FTy3yJnH>D^Gq?_Z zaijV6F5NKNsu@^kj7*Rzdfw4Bu$)B064YkyA%xV4iz^lpDS8)4>1O-yZ*#o1kDU;| z{)WWLJt{ksyhO;4#`!@4O|2 zCy8I|qWFH?wHyMH>MBv@UBlb@Vp+(o3lL9Pk6p56M)vEyYRB3bLp?6!qy(W$g~o0o zc&Pkv9L`Ztx;rU?%J8*E9PcN?d`t54t^zq>zCfRh>Cn!3!(D4~CUsI2T~_GC}S}eE0GUGOKWU-w*RnZgGuOr z;rj#zM%3>%1d>Z~gBn@(2$N6mXfp=I)s$28{S|GKEnY8CE@W2RBHQvme_49h$@x_| z-a*=ud2Ed2^W}cGLbYCWlyz>2Vw#A%`-f>6r*|v0FBKFs`+LnYrscg^$OC`oT$@`4 zp!20Pv;<$G#Vl!E9TPy2VSia6;L{v^(PDXY085;abfvnsq3Yi;`ixrWWDI)qwKx0P zt^C=tFoSW#w`hz?Cy%nZnS~TGnxDH@E6rV@OVbSmR;IwBL4)K3KSfCrpXazQaR_85X$2%y?B8POvl}C2#>DT#&)T=J_~UlEb+ua42+txOl8c zv{fOWc^BnHpZFE7pCiL-5{2}$q?OP`l3hlVSQ4qP(T4E;T*#X|cDTh6hN^LrV9b(s 
zUwMZ*zn=2OgPE~QGUOIh)`>{`>-Z0!@(Kx-+5@EHYz5s#pmFj$D=Dy*^kQDW0BO`p zL1Ta!vYaF?E}ZuBv{=bx*|)USxI+VX8dxsklvSWIAB9mBjTxaG)6j9#Of~(=c$fNrF2ikP~LfX4VjQwj5MW+pnDn8H-4d*dH>DWwmPP5 z1s*z`B4a}HxMzvTkaO3yROR~JYyrB#&A`8d-ght3@SK{VORLPFm zvAPe&mPAPgxSA5nCnM5TMndvN`Wf1K3a*2NOzPUo&s}0uS%x%~$hIeRbv35$?pFMK z)S|icJP*V$iU^?XUoxm2v4zXrNed=eX3WjWn)$n{k6?Omi7{zkI;3Xz$M>A@6t@#P}$@%75&>j z-XKdorwXt$#~c3lg~jM)5{Tz*E5#!F%R!?ADJ6}^{8n1$!60W8M`hvFP z1J;%ErIXy_8fQ^0kUm7DpR#|s6BbXB#*&zI;fpGV>VQA&KXIKkLr0)YQ%R z6lRi|!`^?v;nZS4Z7xMtwSuQgprm|r6H%!AEjfq46 zi+ncHt;*1%Ek5PSScH77$iim3v>cVioy^1!wIn_W5z?tWcq^Z+A93 z7_Qa(h={|Z;H|?=ieU>ZFyM~D&B367GS5Slg5(nIrln4bT&a5^R-GP+@!q-+dqe}Tff zw;9&vla2f`qa4#Y4U?d=R+AEYbehZs8GSBO8lO+!&VDl6ouKf1Uj@5mVw7 zQI>*PQbTNi7tHs} zjswGGQ4Pzzfx9zTXbBV8zX<`b)-FONA_7vvnW1~la+PH`RMb( zA;;%W=`}-E&R-kUnOb>f$zXpbyZ~V6sMhPbX^;^&-y&*p<(UvZe{I(%{hk~jtBBdm z1Iw<}3q>yAKn6X?o~C(UZHX`i1F58rh<`Ovou>G7gl%Ko^xkb9%ChQo*s%LWaU?L- zccrJ?u&4tmwIPs9Xf^wmr#q zJ70)J!P$b)s^!AlDDGOFp!x`tmE8qqDlw-}|2btUNAt*i;@kR~k5M&csG}qp5m0Yu zoSuKLkR*$FBaS4*it<6m0YxDq(+n7@_66Qg_|p*{YZJb-c4P!J!7{imY9#8FWY}6 zye2Xgxq#>H((TY~B>RKrqLTsR%XXtjGok3jWaf(=URbA+sA^AAq%h)uo}V!t^bBPk zLP9^?m-1`pxsr!g-~`h)D9qE=+>_my={s|TU^y4^#_{uO3XY|K5I7&3Q;It5Vqtrkt*jakKrP-2TtG4Jv^T=jR)UHASj+2BxH zTO7Y+WHgL&x!^~ObmgiZVkMfOcj6_H%vqEM@5y_{wrmUXxU>Lg+ftTk%*52*<@kxwLE=T+) zt*YHKgKRtlyu<&SPql^r zVLiK)*hHz;xvxYM8jVw04sJFrmx;!hZ?-7^czpZydJ=ylfJqDe~?vj zdbf^XQ+IqqE(eZa*URmLH%*xM>JV%|DLDnAWUMRPD`#)LV+Ok&c(m&b0Ngj&sA344>1h) z!8{IAS?-J>P<<|s%0Lu-goNP00`1)RA6tsQPT7?mRSGb-x?qgJ_-rc*0b;iCh#p6f z2{mm|&>l9lWKkRx!OgxJZ+HR$lB+e2foF%87r|d5=+}n5>8Xeh*AsU&_5mxc9FQXlUZ3WRz6P z(phW(K?dwx0-ir|Gyh8BmH`DR_oTZw=7R^Q^s+gaaGAMW9^6$7N^r$clBdn=FBdug zWfRcxdiZYFgp$J?OS)8Xgqewu2rKCI-^-Mj}R!iXHx)txjCsKv{Ov3drV=7Wdsg88y0^+ z)@BsRE=p8r!Tw{5K)9qarT2$;Qn)P~5r86Z;Ah>7?3nc;Xlo%Mc11-*IKOV0wtpolfEMp6f+I)wDuF44rd#(6OSc7Ox#^W%cY zuTf>I&P>K{N^Te~0PogUWaG=BiMxeu<6B%a@(%*}X!BsCVF-K%i(TBX>*k3@NxU#p z6xqltmyu`t2NXQ52AyCcNl$`fOWew~y$nYIoZv~Kl$r-TWFqlI1}3Q3@_y23xrS^7 z-n&?K{Fn=K@;A!-X}&s3Mr4JBQZv#F`lV<8=*{(jU@d~KOyk;9gkpElXS^+UlYN@~ z*!DNebiR>zJudU4i!u$1&B_;m z1M=UOyY`R>r&Y;f(;{<{>~I@r!Q~KPlde+V3J;h}D5rVUHz7@1$SQril&%NPn{0WX zm#{cW$ACG&ifrwyK_VU=&PFJ<*CpU z`2?NGXcJk`btG26E?pIc2L~J`*Hogqj_GP26Rbml@iatvLjRFd`p$SPtN;b)+Jv(wP9pPq!~QA^gvi*y zSox~FL?>(0!SG|+Mh#Yp--VZVKe23Du%lf@pyy{8C6?KdSy`>0cA4R59tVB|G1&R2-c4mGz>z=C}H`ra1*$#qh zOI%mneqW4=6ZZ!u4;h?T?-Ax7V8oMH=dgLA{Msl)kT&Q;NQ;DdnM7aBgLx?*Q2cHe zZ)a{KU@V)EzvQqriI>sxsbw!>kRlieVS8O;u5KJs<)WprMzzIuI1U2N@S zj*JRx&tw(NeaoA==t`3O;y0iP{dG3ni{s%G<+~}RxxM03OGi??Z5o5eyWGZCj_pjS z1n!+#{{f4akRb_^oSLh2PeR7}4Sq+qY?S!|pV0Co;Q8sn3Rwu}+>pB=IP`XviVV`D zSa&n5UcCDkCy+nX>V%r31@5%qVZdySmO2F|2|d@@t0N^w`S^>43{WKLKnmx2D@#>D}|?oi5Jj~Sdegd~0;x6ME$29h=^ zgr+bQvpIuW&3VR*mnV-%)`c@ej9G?HiL%ZNlgw0~PfiwuoNa@`a<3fTzt&lGvKtbf z1W+>tXWdcYiD>==LOoCWeH~AbD!L)`z#k*IfgD1#n2$)l;AAGxe2y4R1808A8YgY} ztl3O4>08e|gGZuI+tCb;s~5GGypfmYyVz027^u0X_IJHfj;K z=Z=X4$-iMDREw2r{@}|>KQ9jBgYp-0{x%yfy2cX8?#nZ|aZ13R_*Z2205HJ1GQTEv ze8Ssz-x*=lOkq3+^RpRVkHyk8Hw(0ty&e5m^`8vLfk^*MCId>TwICLs%qqWFKhov+hF_$NyM$;>H6-^x^ zuj?@JAoqv^2GM^~sVYlW?cT8XZ`w6wBK zt-3#dwf+41oNTmdrw>(_ zt$&hFWlG8qL_epmW-{c5|1AvKTGq+X+Vx&jTKw{hl|d3Ay95;>XC&M56{A0`GFE4x zx7Cla+y+ot9acP0^2Gs$?=OB$A$(F&nEQI8_Cv>dr6#TQJ5xSN$+R0b0(`?{=T9)4 z7&6M2c08h>e?x=Nb3Cq>O>L1o0A(7j!h z_TW70i-H^zK*(n1m#+bdG?Vdmgl@kU!S8W-b+?(qG z5IR*+a~;_l{r?%k*n#J~s?r6PGEi=X8U;9k68vrwmd-T;{l{*_0X7YI4OrGtesXUA*U%^=Z4zU|LUn*!ec7>00OXL;U7+*4VWR1{Y$!EkcwY@br(p7#3@{EmRf-8$tx z8lZ*%*V`0b*LlbMsQu9|{n;})J3AubH;q(lG3^!_1P|~^pxXt!kfb0AJ&4uQGnuXH zdqL_`&|p#@{zRHc$i(Nk5^1 
zhTftO#L`g&L^0TC=lmiEh$r$qC$C5mHNArOFy}_;hl@#dm^23TddR-#HMHe9SraUk zlSvu!OU%68rKHO+B@n`$RhiErXf4sStG+Z;i=i#@8P}f^u)JcY$$#W8CL>DBu5X>M zUz+m0EhkcGEVpZ^mq4JZ%Q0eQcxrhUuvha=9WB^*;U;8BzZ!V}0o(-GbngQ?=z9iX zXzO)z3ke8zsNuK&BGG{11(l#qpH~zPh-TsP_}r#-Fu4gp^iUwZhWdMMs8vE@TC_FL z9DaR9 zU4RHxji#(_|zQ!nM=?My^!o z5!EyZzJ<=J$z18B{v$W#whu| z#VbGvC&C3I-JzOGW@&5vaHI{Ue9Nu<4@9wIN5dUhBu))Y8bfB zg&20x7!(W#VZ-@4wtirN7p}l$Xg>HaHrE!(dLU{5! za3!A(S|Z_N6A3BC5-Qs!NbT0&6}q~1BOh$2TQ%#$2$WDnz5GlYWJqU4N7?ADp0Ww* z)AG}92;M6Cc5d^GQh*nG%}{6}9*{$7|GP0__VP~?UQ{j2`&O-9@8lpz?Yc2w4g>9l z?^@{^^!4gPh^b<1lS3ix71C$YuXUVX=`vKN2v&EDwI`fn-W`qg`%QPe$D4&=b#1=) zNTC0a(f4jrAQoD7F(*k-Z&zn)Sdg0_mG1zhB^C2Z9-*HLaTEPeVdGZTu0BB%6HXTm zgA-`qEzL4|)G>9Ts%86bk~=0RzcQtrB(+X)fryBzU_3zsyRB z=O4$PF>TgcxFEeFpX6gMX)tlyhf`5jg89*zn5_rQ!ThJ{|0hI*6lTB@jCFSgeyofx zg~D;X*vMD|j>JX#uN&Q!O9sx|O!5ij`xX4M!liCWpK+IDx@!{6ytIg0gF0@$B6hs{=OwjWPwZR^QDe&~3@D&K1+MvJgWj|!*Y$p73mY7XU_oOyEf^MiHg%vWb zQc;k~XdNxsHBKXRsxKTv_T8u*n(UC1`xJp{=NvDrZwkIlSyX}Uj+;x_-q@BC3#5pT zjn`Kn*tu~kI*HUc0mG}S78RE4K}=tUo&dbwO($QK69ufoqShTfd!*TTHky{M*?IaX z`-EOf&jkfVf>qFG`IQq%x$oG)urhQJxV8TSL%qX)mnr9WIUo^rUdW9VF@YVf;XMxW z#xC#O%T25ZUASjD$FsQb%`Y8rqg;X;ePIQoy|^)G4Ab#}lfhzmPe-vo8d*L4SRmda z_6Do*L7upsQgXy~#K8(qFa_-M(cN($_=-zc+&Qlp_X;e^&=J;9H@}gBvN~^!RrAkH zHb%|2z_1uOgDQn2jO@S%fhRLjvEd#Onv%b7q%y}D!(J*%)^JM4rP18`1<2oFkROSp zF%=;7d*Uw){DS8RWPjGy ze8+9CjxgByUG-Wy;ntuPmHHTUBQRVM;$hLq`NtNmi#Ga?k@#hFnTW-9V>og7ybVAy zgwRW+SbY)m#!p5sRFRpyT3$Btc2=|^>XjW4eAXftLf`v~oS$nJRwNPktSw?1FSh;g z(Nmot&xr%xfG2{npw_*yKg|LSZU=|!gE&B)`U}#Vx-N@rBH+{x(n!B;cvZNBlregq zO`?GYFKU+JJm&4sC1*t){B&-stSk4y5`n)I)hom7>)t>%!dp#}# z;-3PxNKwCJALzU6y^kls9d7?t#ScMoy_C}1e$?Q6Frc3vP`eCwxlsa)?cy@kJpAY& zN^aPdG7nPtali9WKX?QdhyoY;pOg6O`qy}p{{%HYX(GS ziT`+I5sd6*em!vzz3QxD5Cf!jQNi{{Hw0D<#K&hDo&N>LVSZK$aZS7|8|qL}npTfy zr*kFTaty9vtl^_}m?8@&ulxQtRSPRnZf*OTSicx0Uw=xn8$V#ymNagtKazs@1g>hY z+XHM^I(ca0m>9o${WumTa@UV|bsYmLYA(=(Bzg;Rv`)*_t^wA*O@$K6%^x$az#p)W zeU**;T?^~;^X^mI0O>q&#gF?at?|zOlq{~%DVr-BRv_PcFb;jXW7RTMdwKy*NizeH zXcLrro-1c^k4>igDWJ!7{BKDZ6H@t;H7XPiBW(ig(kRI*Zeph`lVU_14DS~d`SjU| zH=Xsn10>c?23=IA5B`oYuQ!@;nZu%o;=0TOaKU_XOfEx#I-@BzLm9D!TkKdK2ObvS ziu4J23{WCrHdWZz@>fP^1K;wj_+LGsjvDh>FpOnKvjZQ9zv-ST`-fp1tNCCJAS>Ap zVNIQQD{7N7FQI$Em^n6VCn2~r`Yv|TQ=bp&o{VV<_@UW33j3Gd^9jxhJTNPbT`2qJ zT(d)91*qBh1#0o~Kf_4Mp+>C3dxuI!W7SEd0qKkU0o7s!+~-p(5Qz@d%OI0Krg{}o z;R9i?Y0g!XAbe6EJ1e)B$iE%^&J2Vre24;j}Ea zH-aK?tUSgt5t}jTGI%u|N4POM}Lj6d5xZpRVYYIh0Vg;E_T-BSO6jTSOyl76(yYMGP%({I-l1mX*Ltq zpBa_$)-Z{vdB8XiuXLLMYplvZA57a+E+!H6z+k{o_cmAklSr-fJ1R8okmrQ-9KSYA z?#&W9Cc7O02einPCC|d#>aPI`zdRMYa#Y+C>O}9M0lY;eO?D8zS}F)EWU1fOAVpQZ zOhs7?*L)`nBvI;lcl_zWXXcT_l2#3gc!S7rch18h+v{s!$m3uwP%$7(@iL5@LMO`C z0H1|?##OFt)TuT9W&o#af@v{FzwWHQEpZ=Cvgy(7=Fa6$pa0 zyo0)vh=}UBCtiaYFd$gF_}^Nya9f`>q`->}W@xe4=(WsCu>3(h!fL9TM#j{|v4y?v zbkCxM;!ewYpZs3tUuO47B?T_b|m`=HYd z*?2NC7$u^9t3E=;5%V4CMifn23r25&(ujh<5-Wj_xhKhCkBDxneA>$CP%_nSZNUhz z+}nnITBLT9L38Bij{_W-CVQAoPw`XBW^exraWl-`)PJ4vj@s3?17Txi3^QgTErfmq z@Wb?sF?+0v@uIjZXhuSpbdF%lQzoIiYnlM>fCJzK2S}pv;Z~~R7fWajiAyq7lu1P+%s;H(N7nPFH{?S&s z{r$Ol>2bB(h3lg$M2kB^69(iZ?N}3y?$=X3DPG}C*5Wj!a5Fd6Pjis0K(h)yXj#<@ z9t}JfUv@_o?_Tl7VnS+yI(<7&q^F+G@!>gC8y{zFOx+{AB|OZny|jFQhSz0%$S6dJ z6ijR?VQeIx6IJLt@TPKy_ao+5JR3-y))L-;xColIGhV5&KjjFP69@U!Rw5$9Zn$F8 z&fd?}v#T~%nDTKhmWXQ889^An8Ewm&AM{ieARfldNt?%geSys2mG@)4Tu5KiYYB~C z#MegFx>9K1WMp%~M>%2^F#@y9V@jv@Ic$s13pkqzTj%^M`F{Oo8>v&^UA)#QyG@S- zXdc5w6xJjhr;ZYyxpu?v=_*9B&hPz5FonyHgL>dN{*1?Au?RHJhC% z4_B{{wH_%o0mSEU!V-gPM=oL6g!1E!pdGCp4W!a7R*;co*m+1c!|fI8J(ne?(L=$x 
z9;4_K{T4V>LZc9#>My{@^Sf#a(D0^kae6TZk7`_4oMLjI1y#V?p=AI2Ljb$sFFuyVsk*CaDL%Z>OXX|xu}LT4yVW!pbpBkV?P(GM zF4v_6Fn#W!MBt+W{8;6T?Fc5y;`I*ohW-InjG-%#n}t-G!4`=j){f~0CowvdZEknd zR>&NC1Ve97b~!XI?6xobP%vJJ($z!|4f8!2q+*?ztUHeQBue5#yCVEUHuq81IVlQ`b+6k_d z!}-c~JP>UpvBrrC+mV~97MB?Bc0~Kh1jRdEX%@fsvO#_CyZ%O@t&R~ogQj)e##XAo zGP)c9I!#(&na|mqS&7YenMU!J?U!n^nr3McqYy+a*XCU9gz*O~*9K-44nj!>g_dz` z@hgU=WK+H3a(e&NZ*X$>$pz*9`iwUNG<+0N^KFw!7fxLZ^u~y)V)hr%!V@MGljJvl2q=#SAUlmRW%rhdyg#P!|BN$t-l8LPRl6#0=ALy!_#&*ItL{753i5c9& zv{4^Qm2*%!Q*8uytM3d2&?r4Sx?O9pLEc48`9iww3po1l!Vm3zD09tqB zqg#IE{8Icj_ve|bUgK+B`pvP+pv#uk`b$&c_UWgYN$LCz z%(TE1d>6#H^Q|UQO|1E*q4#Zm-%EiB+H{I_dBu7Ywf#{)w`n%yIX{K*-XUldsAeZJ z2sEC>5|7TS#XXbWgk)WH0j}3Q5Rqi?7OS*ekmk|H=j;>1t0Mb9KK9`mtFOo^<>AwD z;*XRXl9=?z_IbR^!)cSa?$PyJ-IJSyWc-*egx9#LORYy}mjhtw$&i!z$@TIhYOhfQ z@IyJg`_U=xbclssp6&b)$dSG&=R#9m%J5yj78uiR+g*=9`&v|f+(rLCL^fGaQF@RW zevgDNnXL2Zvqc9CRp}Y}yEyEO+PjGGC-w8#cTc9=6q-#x#Z zB6^xl(!PW5B1RWt59fBp+1zhb1b;8CT0Rx($UWww*qTjVS)|ni@L|&<57xfYda((< zPWle+nhz+Q6C1i5?Z7TEnl~fv>5sTe+T0_smE?yZ3Wd%{l{cG;eOYjd9obFbDoXx7 zTi_ONu51ri27!AVvfbC_ibt8g;Uvhf=-tQHulLiGzqd=1l=W1HUT=fxkxNa87SV2E zstj2@0S9@Ex*Wijv}5duj#Cd^B|nr>WLF4$`RsO(4WAKXeo8~n{qqMt6pwA#Dhk!4 zmM*e4ZFICg*GF|LpXM*0;tJL(F6J1Q)ny%RVbi8f&RN(?KQ@Az9ti#?$%eK`gw%2X zjqxRfVt3btPff2L_poNrNrI1Nj%OTqKr2){G@M67XMQoFPuF4jYd)lMA6MCc#6tNE z$t#!=a}s?99YH=f`i%(C^m7OR1-p<38JUnYKEhMpgFwrIK|l-iKE8weFMR`_z4?qf z<*$}{dCoB%9;LAz0ms#U8qRGd7+ptZY!gDArHe>YaV2q^Ke->n_bGZ?*(AeEhyD!! zelfskan<|{9P0>-Odniw8Fg8E5Nv`Nua;10#Wv1Fq*%rE?-r%QECZgACY{{Y7baHU zl=C_61 zWO*zx;Pp7mt*`M1kZLK8Llw;p zQ>{X0^mqB25_@)`S4#13h9b|MJ}@~|ZAu1Ew25Tn{Sy+_HJWJn-pmf3Q2NxZP9+{L zlsEDeP1YGpqZ}ht)9s_sVN$zk#eTi<_TGuYH!+!o|o{B9-K51|wX8X^Q*6g+hbaduKr0+Vj zAHDx9)_&Zne*>udfH0{E$jE>xo76zNnu8c>nl}(p8j(cljp&pMOxHp|#ToV*Wry55AR&=GY@P0(_V}%qrs#fOrMdesG+S{FS{x+O zN7D~6ruPB$ry|KKuSsmWZN9MjIXZ{4X+$|e6|xl;ufc$L;rz$%wm$MU;EcO&bXy=G zS}?dQrS|at{OWQtO_Bo>HuS;%tF}-l-chCjYY%IHg+o>$=7UVTILc(eTfUIK=q>s*Yg#^5tS!GHsMr zajBdW5)4?Qui0)eYP%e6k`PTE%nqq}Dy0L3FwM|I866ktaKpi7VhT`d$ni$FZo`IM zi>^OljBXTSsQxO6p$(T>5%%YIWhoB&`y7Md{9D*iN|i2YVyv{9j#K7VRns*PUbu{b z!0#;9$eVlWg|XO?v;zga)IJthED#}^08q% zREJ(A@Zj89rU6;T)N$|NV-8#HLX^|1ILS9NeS(U~d*=YDh-W0Jh~JHr2>C?6ee%DG zj~SEnf?j><;|Ct{d4~(gp=__^SY*Lf!_t4iTjJR29$R!*7+m93&yITavWGgUb!}3R znu^(3X`a#lRyoOTx@adqSO3PcJ!@U`7Nc|yoGD&n86-*(Accc<_Pk8)1^me zER@?9y3V7XQZRkmcmJ1uH$g8O*3?Mp;k7EHSCj0B0md3oEvRWB9eo5Nx)aH1pIwb((R%qFRR83}jV^H*1A=yo9-CiK)YS z>}-KzBtT3gMYJ7F$ulHx*yZ<0#@ie1;?jkl%P0`$p%*c=Dct@#TlCzL66md-=I6I- z04bFA+IpTFYzMZ*3+y6yK*gL~RoQG(Bhf>s)YnaqE?ZSzy74uE4s?Do^BFL9@y3eG%IPbvyJ|$(NFbDskwGC&{q~1VSP$|bEoRc?{CfntiGJj2LmvKePeiL zO|x#Si7|2V#F*H&t%+^hb|w?sp4hf++qUhKdB5-7XJ6;f`BCZWyU^9uwVqrn=o0E8 zqvK)=??RSo%u0S~Lo+ezD}ngYTPXCpsf>@0AruUf3Oh^daw;hd`UlC^H)UUH_fd6c zD3DSoBjXQw?bp?c#Sk!R7$Ip;$O|qx4%N@m`e!|F>!2wOWPnWl`1zbcpEK7WaNl_+ zrN-aLm|j|LWFaUt9@h!cN?@j;U+;xH9s$h^B-l)4`$iaz&k^AvC)&@O+y<8wl__W@ zfFv*3L`bQiuFaY{PPjwu=2;BpHqrf%CJ286&U}A6FMC4l7!^S_K)K~EAM*zfvmb_Q zEIU=1f$!@BSW9G89x3adR1|K!l{O}HS?4L!B1s?yyKpVH|6R+E810sq0%wfpw9$(9 zsIZl^;04<_x7(!pLR#}b2iF*U;~q}jq0LCik@e!sI&uO5Tv?77mI_{zgNvTG-y3}# zE~A0me=OrOK5-F9!efAgiV#x6$+w+sYo7WGW3%>CziTu%y{+J^yZku?%KgrNQ#Uq3 zh{I9hvx|j9!A!nxk^)txRw#7#u=N}2wE$xBTKqxqOxN2i#Q9n^mXpwm8&)!g8wNYR zp}e|D+PJ9NNF=-UX=^pj9DN>!s^GvICWvW5ropWE)`Ufk)Ig&+n68J2N*Qi)2%FB_ zwlZzfNu{kb^kvFDDMy-vHV~!kpUF<)=2Kq zL6gHlGxhroIykr+GkNnOy%7Ell*ximojr5mZu~M)V57QpguUu1(bnAnv$d6>>eC?=2oTzY7HMN zPOFM1@g7!;C5lXqu;IYBs7&ck5|}$9iqv{nX~tL1e2YI!mfDz{NG;a^jvL1jhR0rt z_9M~(VLBnA!U=lH*&;JVxk#4dTXMocA=j${WCJmbYPD6d2a|Y>Ge+jiLT&WES==+U 
z7QR3YKewlJu(H7`?KbZew`bBeFmd*AS;xWaq3{d_1M-=;k0{>h$&02o8;Mo=UW%x& zcN_F?zz&oj)CS_iJ9;C6+Vj>q6HaQY_}h3Dc8n&bil3YJZ%l(f-+?58a1QL0`m~jt zl4t2lr}iYoNMYqr7*AD<5ZF+HQo%6lUwo}CrF>Dr@tIZ;&OFZV2xT$_C|oBBxZ0M+Kde< z=LT>;RP^+;G)wiA%~U{?1-dQwb*+V_3h$|a`GC=!%p%7jRCr@m`#6Zb9MOK{k)GnH zM35|=iI6D4o9$orv9sCq`dvg+NnI24KglkiT)h_~?4l@6UhFHPuyC-^@WYy&+4A4c zg2rdaN{pXNUgxwl@O8i;)$t(g!0&UVmKO7!UwA}35D!RsTd<2SeC)=b;M$t7e&u-xGhIaxg(D|u^$i)whYN&&-C*EFdL&ZMd480oTWMbb zx7Oy{UEQJfJq^$DYNv0Xo1V}@3mu4zh$^y4--`_@8g5iwRd_#$Y{AU#(uo-6`= z6bV`8_Fb4(BVNLopl%%6d~e0W`HEaH6ub`oGpP(O`O*A~UBhFYbqhpA(Rk#n_jc1}v*wGN92ZSAE3f%E*{{ zLYu2%DDttQ_uS1%F02=Qo|sbymi>fFnFzt?iPyqRqydpmYm!R}D1{m<=E#(44WQ3i z+G?KtwDlc{6T34~4@-)%SbjxjUQ~%WgFBuKI}R&W79ya~!6f25{OQXRVy&XK|ZW& zSEeX)2cu4UeT3EY@-tTFdJxPvgC>QKu!IwpT~f;!fa&U+lffW|hKCbSpwBP`GM9M= z2w>Qwr0l!I4%JJP8B04&9-p9{rC(S9g3&VzX$F2+ouilVM5h(Pm>t*z3$N;%&D$8c znQ=8d`YqwP-F`sXb($JPFX{Omr+o8lDS07D*>}bme`(*{!4bT5-JJV=0^q6CBJ&H- z&8yVDl|tKPJ(Qj^S8-JsFrW{FP#WEh#?3LpXsE;kP>-3qf8Ot@zWnl@R0jWyD^?fj zlSsP@(On_`1qFlzdC;;>@ti>27EnHuAKvN&w7j$7-@eH(sHt3kH|-`?O00)4^2BpZ zaMLq1hQA`1{mR7vbH@)-Nu_j$wK#&xUxoP%E4;KKxH{+Q-QAKst#XzO9!YB`Pi(Kr zD=pWVI$>|(_?Pc_TZ@WA2ZUO)ya7D-&i1V`R|jqg}2X19Q-!!ub9m}J{{y=HD{M%q0Odit#;V`~nUn?hZz z1aKptv1rtR!QgJ!OvF(UqjoEGG7HR%Bx4kZ>n%XWpqfU+7(_9oi+?qjiMK1-H0ZWH zd<9R`5e)chWNoeb;ok|v6oO@HBZU*=<(DGR7MG#A&*^R{f!M^k-Ud}&uC|bDX za0yk*YARt2<5t*ON6V&VzqbVo0GMD9d^{tl7PEI*>b|%?xnc=p)`cfVzXt$zP1ARm z4vh{|aoln+)5|`Ve#(TvacIV>Lku7la>UoHWTN4nvOIS}*q@#^Lu0SWJ<@P`{@ikS ziQ-QX{Nuq9WC{Hu@mN+b=4jTJT#}SRB#y$P*#0doiyZd^H#}F74tQeU36ZG3y#_jp z3xVAT!@+RYAzwonZ}hrvGDw#9#5?%w_C?Nkl=e)|+2fYTPxRRha1+1HZi59n4m=nL zv@w+}t?SH_*5B0rf;pH5FADU5#XXXF)g|TmMdam7&%7*PX)f7#X&9@rTIa>4QRnfT zCIX;ADlaC10*OTjgdiCtMl4s2p-!gDU%y?olckD{h1p*2Rv<9$7;N0*izcZ0___3& zYFEO62GN@2j8+=SJS|mTGUA3ER!Ni-Tq9n)#;&fdC1sB^IclRxqu(UHX0K?KSr7s^ zeJx%NtM}#cRX)%Zt)`8g2z+_urG&Z`UL{R%DVMvD&yDRcG#-$>%@-5HfYTo$_9*}* zXSxO8CPAztMcZekkqugVB9K>)f=Ldibz?$(azl&?cjsWVMwnU7V9tmjM+S z71vh5!3!dhFNTVG4*SRJjg}Nt{WU)goEGSEpQ?Lbf(qPE7kZrIdk2k8NvwCyubbBB zz^{7e%vNE0|4?7oZA%e9y@ya=@MtE!-bt_!fW)p9z@*;097OAS2WbF|lznI`qg!k)f$LbX5 z>3bDqT2C#ZRO)l4Vn+T+MaF>iESb|1+kpTyuJ|zP?}$D|@{2El-wI-H`f->ip`YB` zof22}u`!fPPWJPvBagHD%hYsbLDX*kxES9LsP2TXtakn@Q(8*Hh>PwXdHQP8f5 zb6DaFQ|KsENE>Ke2md1iGngB)zjWMfAkiCvbX@ee+H52#?L4NiC^9#Tom_NS0#S#` zLW&r50?z@a%l#&43YD)WO@3_Vh>|#%zy#F}{`Ol*VQ~pfRKPSKr;JnKcWWL-l=anj z=~|Z*&QNhS#*tBwTgGBxNq#|3o)PVLfeF2GD3NDV;iHB4{ASZz^mE__#S;|y3)t~x zi_bapK$?9K)(_m^K+C*_l+3_)P$tBmL~3UZdTnqEg)>W*^W!48Itd9isx68TIn_zQ zKR_sJ$Ly#s^}>$G*nFi18JhJskZqz-toeY4tqR>S)%St5HYS&|PD&#*gtfg)+zP+F z@P{mOJrM+w(Ux2;SDnBis#VTdi_6uoKEHd=VmQ-!(#4NWiNu}f`|&3lvsEom9I`E9 z+(;kNa0*&|7NHNY+pknRwG>scqQQgY+q)%5HlfL+UlnjTZg@8HKTK5sYh4xwQJ_I> ziZT;Ztt7_iaG1>5p#z_GO^oEEiM#|P6hQ4Mh#o+nHMGOG3!G3RFo;ZG5O}6y4+32y z=7)Rq%y%Nc*9jE|FeG$TIK7My{oGK(K4m7NX@sC!{f7S>D^-VL$cp@@9Id1QtgwL% z-xrS)y(30(I?h>R%?~r{^0M7&tCyEOQmxzdKA1g7c(w84Z0ea#RE5bj*&S=#zq5w6 zNx`<3D?*%SP>Wg8p)+*oC(`qh#mS&^J_b=f%XQw{V6b}IW)QU%baJe62DCdjXRYE; zQkDNMW=td7(l3fEec%R_hw#-%jIlB?>nKh_pj0P|M@m3r{)~iF3_1GRfDE zCb0VgVM7|O#qOEuc!~-uAoNII`XMk!62_yFP7vevU!6;X`KDJ9qFPUievbWV%7LbK z5hE}VDgm_W4 z$DX&hY8B-ON;)r_n$<&llVIf+WkmHM$4ZYOWb|OCNNK_M^c+!ZufqLY8+Mff-A>&i zzIFBfTxzH_K(!rldJ1;7(?G|7H&wHeSqb7kQkwhTI+u4j&vSz26FoG$l8zn7M4Pxm zvHYhTA+!j?OYWB22*pz8i)Qi<>yV*ZDSJ8R5GGZVmN zZ>^j}I6WV3KaE_Bqe6i1KzW(JLv+&pl8?aP*+oJ_(FPeyH%XlcFVoS~M7%P(YskVU z>TazE$KjxMQ&&mPWustTkrkNpBQ)VVwJHPxH!V}PFYcQa$4_n&6tC*2875PEPvrCE-7 zVuRs}g@nu=knwt-Iem0WTS}Xl)du2{_m~MvG$WpJvlKbCOKilQeVR(Dri4! 
ze;#zGY+0)dDGLvU`oXsvU=f0oqk$>%)luiT@UM*kv7~J4fS~B8n8sdByQ2QUhL?G& z9i41!Df~uFx)2)g0LAbmNV;v8uT|#LOja6@zJ;HvqUJY znuLW8vC#HUz!0Q%Z*Z1%j;%`@!Yto-1uS8iFzw}U$nVwM^8|_*5;t_m#6@!ydkZ_U1-T)!r9Rc)xiaJ#32% zFEaIc6l6_(lMcRj@vdxkR1#)+K9uS~!U0q|!z|RFZ?|PNx?JhTAs`C^F73ydSp~Q7 zUNpyxvV`DL9|(%M&&+m<&5!`C|dk0k_;u<8mJqix4_x&`Z8E_B>GsOHp<)Y>c|Wj1ItgbfLkmDyaGf<<_y z7ailo-~K77dX5a=gTzs4&!^nS2uOmeF#8DW7nDYXi@;c{_c{ElBglbCZbG*qe`IsY*D1)5^ z2I@z->zaO@;q=Z)87{P3k5u~%;P<7S1bPbyjcFshtw!MspKG77LsbBD%VY|--PWSI z(|hvXhl*@(gUU7QLTuo&xN;7kn7?rtL8qn$+;7=QqH_b*(&Z7NRGG&`nh#**J&}&xzrg~&H!ccd1 z4s_n>JXpur?8_Hua|~)r_}%7;+rHjyIY4Og^QBCkF%j2QJE=s-^iM1(es~l}7$a*# z2SW7MX7&z_f~I=*UrMxp zB|=6H2KHvQjyCoHdbTf60YEFI_g@uy238naMJIhnH(MhBtD{n zIsoWd{xRg?`8Ngzy8p(Y0brr02e8pI16Wz;0W7TS02XEj04pOCfR&E*AMAgR%&hbP zW+uje=&#LRV)()s*x6y2*w_L8Fa7ny#K`;){;!|EI&5r=0Cx7Tb-vd8$NtO5Klqow zuMy_oc>c;5+35bEzw&?e85vmqvHOetf9QYW`gfhb<6l2aEDZn9U;b~LEbNT`*)tO> zfQ5wxz{1G(rT0Z&GDhaFJ<~D5{MG#@U+iDG`J1D^^Do_he0=5fZ*KnC=U>?u(bLg= z3I97`W@r01Xqdl2{0;oyp#JiIjsGU=-|;J{|LFXk|D*RcXZ||Lzx>|>{iFAltiR)5 z{6C5M&-gF)H<-V&zx3}xuzwx$|4y3L{}ctoe~aR8Ve&f|7=4u`EAv+w3hLR48JU@w zI{w|z*Zglu{uNR)Fn`sXh^3y11AqzUpK28luyF-w0I0wA{Ew8K?JI*{KD7Q*ks^9l zUl;ZQ$p02Q1I2$aGfN`?1LNNc{f`eBJ*%&3{znBsEAFUgX=cE0ZStk`wX~w6k(Kh- zS^V!Z|G!TN46W*a17-e-^M955U+@1uaw2Ayjz;$X#O-J#WMp7t__dmpk+q4VDS+{7 ze+>W5zK)22jpg4N!&l5KT0NW)*sJ{+GO}YL zfW^iJR$?MUGh?s;RHc1=u)2nZIVOgN!DM7A^bU1k?^eNNDxk}p$;+w^ylDVS1P~5? zD?&ceD=sx`3*Y!S65mkpuD*%!zV7iMC|zU2y_ZWNY!7Q+sHx>KD1anFbWJq`dsp1I z)uoo@w3K8H|ET9n6`*_qG~dv$uz-ikeIWiH(d{V&Q&@m+j!fWcezHtp>LBq}u;YA@ z);^ZN*?Y`Rj&=LSy0&(9Igsk?Q>#m9SMt*!nqxT9z!Ct+=yt{-sH-D-K6q2`mne(7 zt34nzMaGcNr39;RTrJ-R0ML@InK4w2Gf<^%4GYLJXjdU2HbQ2Y|% zS+?%^OtMo*rhvG%X%F~(ld}ndpdPMl9&upL$1QPjk8sDR@U;wG*C0H{a~DK6L58p>y zAdUc_$Ojn!ZPX#C51}h5sc%e54pA47p}sNPXC3DfEW-x^1aD`1$`8IYBpfd(pn<_o zpN@7&$mE!a`ux?|yH30(8p&3h7TBU!gg{G@w7%65U;p^9%K(DW=claPC3>m@K?)QA zH8gWQ+y$#EM-R@;SOtV(=&KANBKut#C?I&5_ZM7O_Eq2#Fzkpi=_LFXkca8F3Hz$q zPi8xRm1G|3Z4ki_vLUboavN|4w*e%nnWZOae+n$#Gz3yp%V#h(d@^2aV~@Vcfo|9c z%Pynt?dKGnPmJ-baNSJOhp6VyC~zq>Q>e()!c~wsX0R1rhe%ARg)Y21OEkvhO0=n~ zC~@71hp5Y^DB!UbL>#lk#L2HWt~D@28*pV3P{|N7#wFmb+V$=^t4w}6Gi*r7?11Aq z=rMJ#K24IR1kclKyC&OT*R)>bWP$0LjQ9y5u zAO+UvhDKN91f;UowQo+3y7=c=Qo1h<01w*?w+5^;&?y*8C)YBHPu1oRp7Y5()$WTW z|I&sjr&2If}pqE|3r7orN@l2F$&%0;0#2PzpYXhT)!i0?L zPCZ?)iW*sCI-BSWlq-tKr4;|MYm}eT3k= zj(677C4#xTCSBYklk~BHt z^e^`Cg*EdAL`m3K-DwUcR+rN~CjpZHM1&E^fTErGygQTxm#`7qN&f!vy$_S}(E`>@bTMfKcNJ_H@hYh^Oi zNkzTnUg0>rwT9}s$EWx-(o02|-?qi*xud4@s;7vK+tE3{sT}LNqvBfx_%uHi@qeCp zu&I7pdgA3Dxmnzji%e>yiL$@sJ-R9V;Z4mFm4Dg1IFwKShdqvh>pM1g6q4ZR`}fWNk!eC{_`@7OOrsZ9d4zbS16&-OIpKGLB$LQ@AkIcL!ozsHvLUPEX zF9=i6f*t(wb~|HL>V;tSRHVb_1xpYJ-+2-(ei`!F!IOeSq z*XEc(>XOgHZ_Sm`)|AOM34bSz{hj0Y$ni^e_k&pWwNy=lx7?v^iWZOGP~rBJVV2%; zS+z!Ip>;03iX&*zaL3EYpBH69f(A4FGO!MTwPkvr3ks=6=BKgU6dBCUJZ<2bt^;4C zN5=)(R(}q#Cf-ifNi@R`6;A7CC`&*m5+;GRq@`(PWp}hF7I#%r{ZQuRjm8$Ks>n;- z%){sRi>AA#GmzB4W05RY%Hc~x1Q|4W{aZBzb<(O!FbK*XQ}gL>;dLTnT?f0FldVd6 zoLoxCtVvQczf(d<1rW^kNK~6}K)z+H?&(reY)HaH^PYu?te(VOV;URC&K(~pAzX$b zXc8pHdmA`*BL>WkbN7UVPNrJ01llm}ERN)Ine8IR6(Y-|f*S(qCWFp29N0;y1Hr)U z3gDN5TJ&UKLlU`gWS_H*zVD$gmeA?D`2R-O40gHcM5C2VB=Ab+I(aoJS1rKWAybf7 z!H|vGiHBbcL|n3X=hC3A)kY?39=HN=TVW?C=-?psCZk>@jiX7eYRlty6CVqc?pA z;(Oq7@kuWF~N)K&u~A*x9IT_HaW=y)JTCsN)REpoi^HjyD!9=>TMPWv8^JGxZoD z+728o=Y+yezGzv(^Voz^L9dLiCctf?(*IoyXF_qKeOe(?B#Zv>ne|y$I!!;6i|1&8 zFTc9PyGM~4AzYm0UYPZ&FOhKe&ss+i&sN0+)y3QUdUR%s1`Ksrg$El$&L*ubqTx-1vMlTDWjgKla{9$_WVG+=1ydskmej%E*ej8Ri>>Bb0vaxrzEk);aF^%QfvQ2O->O5W46*qzj`H!u`y|u&IR}@6a_Tc9_^^Qk#nX*IzfsBrHQT`Vl?v+$aJdYY=!xa!-&qb*Q)Q$_GU 
z(`&|*`%Qjk5gTPcgDFk!0xn9MPg{=UVbx9Ma8$PD6wVqoSqEx`>2+3ei^ z=*T}Ixi1-vna-otd(e%@$TT9HN(@R8Z1`6!Ht+sWcTG(t?C}k;_AMh9#Q!;Gb_pcG z6O)v5UahPEobo9<>1irWmei1!?o1q{l}lH2bwp{W&R$YnMGagDmwd&fp~3G2E_D$B z6U2(cDc>e~T?ECY!paf0S~K-)lgMH2T#ufg)=q<7*?aqPGD$CX>r5mAA7;c<553vEN`zTWHQpkn_puP+Vik^9zvuo@>KY5Zc#Ut1v+*jxbjv4SyG>tfA4Jg@BhI z5=d=nqakXTbTl(ezk8&%bk_<+29k}lH_!TeeIMr^EJWRQy+mD;M1`bHqAB*>Ou?99 z!O2k_6X#2#VQ&80d-?DV{~UuXrge*~u_!!(ynWP&+=16HkIE8P$$J6j1j2BQ2=|Vm zA?rCMCW()Q)Wym29JP@@a=CPdXkxqxgQKVvip0qqV@LBhNwAMPaI5)k0iOF+22z5Y z%A1RE#xr9b1>kUyUd?Q)mJ`IcNRrsdC?Isx@ z;*NJwJ^jGJKpKxZ4Tfh|wm>Vo3)#X;QYom_7bVaQO|Ib746HMeLz~}O%s>f6tSUz_ zA#3Q_Y$OrGN4C%#W6H(_x_Fi^<-YhgVwfG?DXX7jnMLs(PZcBJ}rl~Y81eyTOoinPGkkl9{WTibUW8?wcUbJH70kbjohaOx9OWmp{>()$i?x7{f`BFlN$pSM!kKTyeVD6Y; zTAmk%=QPvFczs@>d|qlI1htG4j!hfpH2_bKUI2pZ_>)o z4!1e^2)nxy=LBWQEO}-!K#H=xrwu}L;{-b_BIzU#o|<#A~Px-RjShGOZXhwcIjTcdC7J_<2 zrHeno*{0;BepY_!3zUvcRRclxsZEKSN$$Tem5Spm!e%-I-7SL#Tze2-8Ps0o3dHU9 zYW_$x{<^5U)0@a<1{+#bP$56F6x>-mVF*!PuXXcZ(-L#ZB4NS&osac8f^dNe_Lk(m zz5`RT{2V5FGME+lt20>5sfrBMf60~IA@o{~c9xKNgTX;7(@Ux^dA&pH&hZzk&9t3$*^+(SUVP9R zg5d-3y}(Wy;J5O-G>5}4*n=Xu{J zF^@L-zdM{J>lf@PPa7I94~*}u<$kX@(38c24QI-d56rD%IcGqKORwM>5txM2A?$F9 ztaX#G#I!@r7^7e-#16Y;rSuj-q|1L-hGZtI6eS`Kh99Khi zHCXO!327yhd$0JFHunVFa?J=`ecD|)@a=FP`&ET&3|a>Q%L584!XMt>rNF-4H8B(* zCO)V|lEM3V5&1^QbQi<7$8*+Ef~G7`b2odOAb7tmxDUGXBhPFtxZl{l;LrJO9Yx^k zU4Y|vhqw-xnR^k6`7|{@;EA=kxCW7>LszxmrCh(8v&Vj={@(NWWlrY*zBs3M&cBHP zW@i<&0$!V*OKPx9K_d_P)a&p-PBGT1Oh)gsBexFh9Wk5C-o_p}43&$fdhdc;J?>)i z)iTAV!`D?xZMUv`MX+>M-2;2eMydtL=+0xbvAB8q(?PQq%7w))(dC?U;*vm;hxmPX zI0(f0Ga`4ZCPHnmdQ{~nXSEY}8de+fm}A~@i+YhO2U(&h%QB|o9q;_-5sxH@^i{_xW)kRTX~z!hnh zwPgv8MriggwTYHqptH^&jt+b)Le#cq=hvK z$#Z>saSW@Buj4GTrgM3nWeZ=lo(mPf&4J1Pl1rOYU_EV<=hqF`%;8$ZD3n@DV^O2- zTT?IH5~)GA)I@8US0=jjoN@s>@EL6WopphR0|Uj7!#s1%p1uNon6Ux8RdS(iRgt`q z(5~;o(;zkQz@WFlrskIoi~Fme=&-v=d`=|jdm+be3Ax3FsVZlWcn_l?IS+c3RogK-kT@pdVE(_AV(*_|=fFM+J9`RO7i9O!IsCh&@vp z-!>3pVKr`P2B{&oQ!a{&3&ErflMBOv=4u>EFS-`EZoW^`aXo~#2B5|#7WdIsq$}tJ zTZ;`79JJ7iEoGF>Ama?WR)8k&4ZIr76zVKbKab_7{q|1obRTci&}BM_zl2-7l@x!~ zbp^sUy+S@9q%P@`@>en0C){a2J0XSWy+K(F8FP)ry(Nso zlGITd_h3GvT(Gb z-=#S?=3f@=c(T-BM!f~bng~q0{^}J>U1+OPMl`P}W^%q#qg^ywI7hc*4lN&+4J=p| z`!JqLFt`D0lwMib)VV&(L#AL7}1^z}AfTu-+8?6UpIJ<(#u29F6o&y`8B$KZJG zW&dSRZS{|fnXttMl|8Yk>+goq`i)@<7y~G=L9dN|>6oaZb#=uDS6VYx{TKPL0}h6~ zJL(_`^Y1obu+1xr(;O#gC_7vEnmm6L0vV!`9!E=#KJ>quR8$0mJwVTKM4ey{k`9t4 zTHq@B5@Rgb>a%v ztDt*WDLpZ+ffXz3E}5_=QT>Yt*cCrO)5h+K?&oD_l&SrTZ|)7Q^&3L(&xwE*pL zttMW@)YjFJ3T0b4L9({m1_a=FK)5xl82kDoD*Iu`8ZxteBEri@U{UaPLGi&;Noi{F zPYO9IxxC^diFMJ{ME>fGd>`=}yn76CTR~tPC4fI63&<;KU`$0@E&Hj13-T{F)AU#G z64=(7n+JC&ujvK@#B1~tiM1J>gJe{-5J<9`$x^Qm8NJxJH>=gF;`iUP`6NB?&9cYx ziWrm+7rt>EtiAu5fS|*QoEM6vepl9fb7!$hQYWB?_w!=V?q)c+--gWLZ|>WEQNKb5F|)B}y;ZC}?-2$6I8*sn{Sdp{$ydV;jCRTg!G zVO#XU^79Pwwrr!+mH-Z~R2X;0(g=&yV4o__X6H=tNtteJqpzjtC50eTqPd>mOSntt zIxf9mv`Hp}IP2_%x#?w;FFSJ>BgJRJBC~yQ!GVP`fJLS*I|naZBCQ9d0$9xUG- zzKinn&D2yAkZuo^LY!D9!^kY0l&dg5fo|Lfe1qQ(p7;F}irboi2+K)8#SSCF24>kX zc!+ChY#UBKWR~uY>Y(53C>eo@h4msgkuM%T`DoY13E2|P?XiT(ka8v1pxft5Q875`*R(Q1w}cnJM<|5_FY@@X;@YsV20a)!y?nz7QX!$9HBaM>&yd z?RVZZ{zaK$=oZ;ZK#AIb1%*OkV5ACL>vIkqOIm!<<4Kc59AQw>WyO&P&5gL=j86z! 
zH*GuWCNHhaN~G8i3V3AZaKa8rA!_YNJr%yMj)^#KM3U;M{h?`R+yxtYI_nFOc2-}V zEr~5>e;ufa8Z>CzQD_foX)gTOVW2D%rh8v_vwF&0<9Q)JPYK!z!9Bjgk37}Bq}N1Z zq4Y0pDW7jDgUj(=J53PpZmDcYB8>V7w`i*@zgDjyPTOzr8m(arn=oLA61mP*sq#0O ztNE569Nt;FTWL)DZa{0ZN_*69wbfV>=)A@P*ogE-W!sZ*(u}^sZE|ieYf& zefVs0ZjXOV-s96x|_m~7dc0iTt+Vjl^mdT}7$(Gawkj>g_q$j8X&7n-k= zV&9p$71>E#l0NrXz6MAW+xYL}o;D;E0ou9nNli z?h*ScV?$ikh)CTc+q%o5;XLW(KshF+Q$HgST{DPFG255y_0)1kod|(Wr!qx0&#U05 zfe}2M5g|(^DDf-eJi{JXxPEu-R~=@QH@*1!-sq-ZCY`-tErkSI&lqM5IS#>4E2Me0 z6+dukPCy};OrX=boKC54zlatVr8*eLDN|NTBr=D^fus41N}Sx$mV@h37>yG<{M;uo z|2eE1W_HoUJ1^3^#{7qapA_IxH?9v>SM?;>79C-ZINYC#HJ4$w(mz&P1R=M=H8~}1 zye=FUk{-zM<~I);;VUiq`4TWK>MCS1O3(0LwCgQ)Sxhy!dh!kGM2z@mtglT)Cku&bw zxS4pAJC5|}P;bu7{pd2st#gX;5f0ABkOkBW`o{j|=!Ux$tr^LjwwFE$w5h>`WsLW^ zv3?4D`h)y|N-_te%-4<9A*@1j!05w0Sim|-^m;YTvqPRUm6eAgJa^bc;a06O!ryvV z10!qds!x~tZ*7~~JnxMme+wj~FQ?8~GEAdGbF7B>1NPt#UbwSo7Xw_ttKZQ(yWr@2 zx60u_;oJ~&nbI|wK7Lv{e|3@E4;()K!O><1pA$4&)!XZ-sg}m$1-Zy?V4U4H?}DrP zyAr!swdG-;4xL8j>XCP(faJ#0i4F$ziRa2&CZO;E6xb>2#Vx*=J0+o4s%L5m}?gI7w6$<51|?~N`+nb!-*B83iDI`}qrwHE9h>S2(TtbO#wBXY>wT_cLVeWj zmFln3Z)JC5PKa>g%LVmbA-MN6ba^^Y;aFJxs&Ww}*a7w>t?2Oe^+IUYaR4sN`f5r4 z)EC})zFtDh8*?dFN{h7AsB(r1RpiByX~it*nVXyAl0f|LN&N9@)gC{8{4zhT4ob@c z=>6tsL9QPpV}*l%BPSM54;Q`THZ`y2MN-^+RN`dJU+0u#(uCK;eq^USs6(l#={y$m z3OVbhEX%958D@+|O2Os02%qE%BM1|}s+zHX>R@LQJ-h1DLBf1DksvVNU=^0it=!N^ zY)sx@9dJ~qH!P=pJ#a0Pp?myJ>o2&YEko(EAn@I}^hv)<4O5}AFU=ZQdGDQbOdcQG z;RA{U5D+1ySIK&TPTuUX&*s;MfatNzTa@Nm62Ze^f+>*z!0U|}mwQ-6m`DhwpRqWk zsLHe-kJ;`SwZZ7YH90*eR4&;_La~AFpKJj8&9Rh256Q0wAzs5IFZf;0A|8>%Y7gpG z7Kg<8t8oBoNWr!&Ide9!sK(FnI!F`#oJO4$gT~qM0oj2ykFf1~GmoN;fsWngZ@a4{ zha)dy-3O;bCwXzDWvP896XwB@*ye{I_e7IjuBDvQG5Ca$)w_QD*5TL{uVtl1^<=f; ztS1gD9Vo0G7YfB?g!#}OS(+b?&hOJZstyKhvA9B^28^@cQoBoC&TSjKI~quqL`W|5oKf>3+g5yym%t|9ju*9gPxAtW-} zObW2XzRzl?uYVSn0X=O*!*bY3w*0Q#81X(MsKo8)O*xBGa282N@Eb00^{a!IiO^Ojtooz3z2RD(wI0c8jcY!+&Np^1VM9J<$ll+oV+5V^hC7mU zzAii`<=-QZ`w=g^aUVD_LyzE*4J55-1PA<&S1AK!n+oTu)swB{@Mu)&2&{`A7gJW} z(Gm@&p>01ndo%4kdr1H|^jix`fOfI5?-KSx$zwSVXUB)7hJjLG$#R|+Ah^4Po$AyrjQx41q~*pXLE^$(Tsoj>Hz+S}?APH(9l zi(y|*QpUnop4ARRrpRK`t9cHKOF{4$CQ#nE67x@F(^hWwrA|9m=ef=c;CPC_DR71jpLro25 z;DHRucM9WNm7Nf1g2+g^oo+%lhz4u$r<*BC(-awaks_($J+D25jxbYmJVSQDALox- zPNa31Ap~kdl4PccCEG_*oTee0pbn+taPRcJ4~?gGqIkR=Tk@3PqdH2`rJ!`4VM5iVOLzzwbOoNAZM zvx%D~{Vh0swyYHT*z~4eH!5cMx+=71S(|>0_;4U;WyEKpxaZbjrcQK&zvvt^XSWH> z{vtd=yGM+=X&`NuWu-|{Kj+?nZBDKzzi(zGCWh5qR)ViaMAp2)MkdxYzNAQ$f>0n) zM(l^9%Tb$^$e0B->F*YIlP0G`IzY-$*v1EUheU3~(jz|^bzcp!C}~BgzeM-%!gHcM z6XI6zZ$oIF`La0`E%a9IdWts8a0pWio6I;Uj8d~Pk~uC_^sP57hfR_``-AT=E; z%&!H2Ku2AygBGEnJNlZG~(Z$j&)liq@m_OX`xO@SA@E6e5kFl_&nb z&7FLPyst^}bNXCtQT>4eipdkiakN5Uwr&5oa<~Zq`>Tg~kJq0^q`(s9z&v%V-}kTx zlZ33EDXnC5{ra@~8Xg-Lqv$0`ZA~mkssbIxhWk{Ex7x!z3j?TRSS-<0Il|6J?+sbC zVHaNM8Cy~ka)J^}-tnjVVvAzp7H@{b45nr=5fDc-5!r?#e!knyAaYL`+XSQ1ryOKF z1(2Lo!iX90WYpeUA1eWcuT!o_X>?Fm>W5&H z*6#Oh451ddb~*$zGfE-@2@H(|bnN}0j=P4w%_r7$G6~tn^zG<-?@1*Dm@{B5T@enf z1ep|1&6tBM~(auC*z*MU$AMJ3FsP zVI>MjUGzP?wpq{74wu4@`iE~ACb^%k?PyD7{X>gzCCeGYq$h5NPy=Ql2!gR(t@)@c zTGQ#5#G2sIvc(n35n4Sy!=ctN<=2wGnUvEB3f-QW6nK>@D=%+WUoH@{+IK9T-|~({ zyBVu8MZwpxv|uu)IGv{m@1)V9$M?sHw&k)i-Yr0@LYMx2pg8r)S8!knfNC9Z%<%gq zx`&*TrO;CH&r7J(I*Z@CED!P?DN7-K0f@6P!IjuaMZs4c^#r8){tVJ_s%dE+h%y1c zuEcf?4iO!Od$SMFMS($RWfC563$@?A*05&Wz(*_meo+JSizB_y2JiK~{dxT>r2vr5 z0H$t?pJoh*ip8|drn)QFnLwhoL0OsFwC=}fp-KgDJ<4vY{Z9%*GBqj-h7Y}?`o(bl z8HQ#ddQ)J5c0`PKXVVjy6KrNdUm4ll)?0mc8{tn4Rn*o4@0P7OA75> zKLE9@(Ffmed_pdyk9ymc`0A|_MV$}MXEJw1pns)BkuG6g2P65~4Eo-0Zr4V`9XrM&CnG5i z)#TVERQVe;bZdt~G$&1@a7XY)3Zd?~|ENNSf}ONhQgZfC(kGOQT=2>gB$M>Y{t7B? 
zzNk^-*uL9D{nXKPjlu%A>Egip3u8y$_xdW z$lE`#mrHct?3LN>`b=4EYtryZvDsW7t}KBNPKp_-qO+To{ghzYLX0Y|~i~f-q$9 z@Q_^nC5Yp3lStRW%kCe%DHg796IdYBvT|M48KRwfQksFqXkzm%O_L!e_4CvbjuTA> zu9?t#yOvKJ9a1tJoz}yS&0Vh6g;1eNX*55IER$U?B~m21`k7dfM@tx3-7#hQgLN&f zyCu~sH+>c@-C{+!PaVQ;e;0P-e*=ZfKv2`jS%uy=jvrK)Exs_;Q4aQ@qrt|nGE^w9J1&mROfSRxXaY3AhX}UkHCHR?M~rB z#-{-1+sE6e3FsOaEB1o2Y zv=6?^U92U?7>uUQP;TiFm;9>!)#YuYP_I^Q)Xdv$u{=Z$eAfu!cSP>CxO^3{C96CJ zLik}?FY>UmsR}1$JM$&OTv!#>^!-ufGE^RAq43)p9^G%8DD3lMd z-+yrOkI9b~!vZRCZ_SRL*_g)dGkKYFeSppF7 zI0I$HfUgap2K_vP6~91*M}rpGzx|TP&w7cWq?Ts>f3LQF3))`4=XXx2XvjFMjv02|6H`x$z>@xj@8J+=U?O;hk=Y~q5;!rr*&p#8^& zF6;sGcl*+~l+KmWZaWH{<;G&m>cy;^n@@bu)Mj~`1onqyb?~Bi8uQlzb|i{!TxWZ? zwR{X=Q^;aT2eX|9h1yiDa|3F^beSvSSB`wH1)xWVx2Pp?+?d=R?NzoqZxrV8r8Dkpfv{l>T}Lfb%!=(JC_?;mZ_E_m99nGkkZ&Rj*JLiQM z&`eCWmMMX66{7sqDNK8(=5F7=+`)p}u$Jy+{yH8rV7XVJ--$!wCHop3=^`sT z(4yb_Vlcsi{^NN?jQERt)$Hp%ScHyZbnEyI7Qq--`9WLGuhOrpzQtVG6X_>?h%9e9 z!^gx|MukYAiwR$IA zLCZoD&BjmYPbuZr!0}H1K)-JkkCmpmh#-S*0ZVd3f#uaaFm|O&p6~kiDov8UUh)p~ zshYEO+_nfM*J&!6sg9j^^26G|o~_vK>o%=dt5jGuB=m*N6uQOsAvmdICE zB4K)#uUObM(J1yZVVLvm-aQnZEjk$XZpYfDO4lSUeG2muAkt*<(;!Hi9iDD(7LGm9 zvRyye{O%kSJS>jeESZgZJYxs7M!V{*q=Zzdw)}o=18}|rfW%0Srhfmzb7q~ub?j*T z)U?wY*F6c=-G0fqlze}+@V7t-6;bT5h0fF+v0|94^fW_U#9|+5_TCcCfogEcAf^F2 zLthFRQMdM+S2hR*?h`AJt11Vyh$PLNYCpAhm=9536JNJ_$@MoQ+`c@Go1>GLG(|hN z&z-y={BW4vJOspV;fA0IuAQ)zO85PyU|Q6jPKPAIw2~~eEy0jc6~dj~*-BF1RVq#|pC z5~^aN?pC8>wKpzlXJdY2TQmZX?zOrOu^L zmTwd%mr=33yMJRoO^vF?f`(NYHp!fd@F`ft(Hp7k02+!9|GHcShf0q~-H{32{P4p^ zqFWgGu@F;@>56X>gfF(cYn#V^5fVMLMaBch)OfmIh`u8fo%)Df;;Q2K-H3JW-OlUu z!f(3);t|Lrma@~1js*z%)o3a`f{PWud^cFnZD%X-=VayYG-;E|5qjd_o#l47#_hHJ zAf-OLi%?Pri#jNbCI9;FHc!wU-N%&7&(H(in}5Zs_?}1%&eIv^!6*f-C-fE4YgtjW zr1LO`w%`5_gP(fYk@Fv5RQc6Mp?fI<)X>5YzY_!mnNZ+-ot7w>ugq{lNb(;n1I4FE zjSTfTA|2cYYtrVxW{BjgAw!buF*F=gT^xtXm4~H3Scp%5yXjXjYP_;=vM0yXG`7{R zOEo$2@p0>)CvJ|MfFzKk$HjJ57(H!fceEThbTi)zGM1o9iL@C!R>g?;+7)tTy)~oT zExxew0sT1rv(JYdpMzHk{IAltu%ak1szfnebA zff?lq`mM&>D5n0M!Y4J~yFpJ@R`Mb*&|@G(-Op$o3m@Km=ewGE1Dfm4N@8L{=+V)f zM)@=8n}~(=qu+APcPF$&D^uk?h~L0B(^kbuw^req2QtH)ZI zT&a;7qS;-#{msh{fRkZhBY}-=LzZSvb-5*4`Us!h$g&0x*=rlCnCRJx-t0L>>KY3} zv3*BJ8t2hOGO_%8_T;OTF>ItUYqZ^+`f5o|2vj-^S>mB@JhwM&Q%x7L9H+*oH--Y1 zujF`j3C)`9LSvv)_bzluH1+G?p^`p_$$QA%wd-EVj z@avB%U`2W{TwxJJmJ#<<_sv68)X3pR+?FM`_{B(c<#7pTwE==3F}ZInvfcsG&TQB$ zUS`tOhq_iKoF2%DSl0NQ>3%rWsDFo6V;pp%taoI+-|M5LNV|=Oxq1dnAYHFZcdlBX zz-~vtlQu%G*xwO&7u+{T8_k_JoeA;a_N5>>szGy;eK>zQ@dsUeU2N2OcZ+Q5wPw^> z3yGMTFVqVex`IG;_u(N(iWxO6D)H|~ea#InhackBBDX2I_=t2TuXSgxY4DEHOcx(2 zG;r#z@ssqoM+psOEpVENc9hsfGNG)h_1*-LoTzSAuG_-#*I9Y4?*sNV@h#8@(rK|FSy&ZO1!VbC}@c4dAMl zX;|b+k;{{EB+~$QRFMmQTx2`UxzxPMa;xh&T>i3-ATn9|jt1gJzdWq!B(6&tXZby$ zAqqj&mT#+we54&S$(DUi&HZu|wDU|+T(4|1VtJCViYGqlONkj9&NWmTAH2<*iK5;5 zu-x5xkSnaD*|?`3zvq0`st4rBHv7tfmPKy1QuTMQn&VW2o04m{>SfLZHA!u{uh}Ry zj;(PDq5A}v#v(|YZ+Kz(FBGtckw1BGcqkn8<&N_^n}t#DrG;?7nDJF@3FhOAjLrv; z7^aTXT=`$Abu7-Nm8SsJ-y2QP`JeH|Oek28F5^{;t1^D01}ajkuBdh3QB^f4!W?%p z8WI#{=&rmGQb7>^&3vDl3E!m;gJDMtnTeV>0DT4DfwQ}jI-GZZ!!hkE9ay0tX z&g6O@#O_+H!K>hYp)cvB=_bb|5 zsZ*$SL^sV&kLH}cgwZbj>(d`>C$g63jzZ`@d=J-WFZKk3=9c#wmA5T}x3-2IO(a!5 zjI2!6>2pZ+-_hPd@HaLybJzFqyjtH_$mVSLInl^t#Nwx^J`$aeRa5>#Vn?aLn)e_*Y^! 
zFfPZa8Qu%SB4He6#*(qQe0jYj8w)4oh8=bkAI<#sH<9BlElE)5<`iZ?1vD~(3|9uj z$c&fMhzufJ|9h8G?S1@l3o?s$mvYh;MtI|Ij<|d=->v8`pZbfeqkY=vF!j)H+)=JK zg@Ro(b44c_LC1a*MR0U)oVMj%6`Y{F)9uht_y%D&7W#@zxQQaE;5=%*i82S+2|C-v+)UCVZbdpu^zztd58>WF9wHhMVyRErLetgn=GtUz0rz&|=;Z z>PeAFHSJnlZYB;|E6&2dma%Px&#K>&fDiMrK2+?zRm&@PJG}P7mSQ5fL*W!OeZJTaQBEBuO4ygLQ$~ zcXu{o3I!=S0^y|1phdpOO9RBhhGBaT^x_#HxJ)O-V!xXHdPakYr}5^Ki`4fqQSZde zP>*A4Pg+9j_Lvl{kF0Dd_qsW*1zrJupAr-|n8;O)Ppn;CN@2gaqgndyaA5A!3ij&y zf7E-KFa6uHYSNR1Jzui*`lWjPDgYgcx|Nc3H6&^7>iup(E}guU_#3sXwe-6O{<_y( zd{{EAcbBJ|Dt6*c2d-6ZHu8dXwPJ%AZ44ClW3mw)$HpRI8uUNUSy!$_?$Ln%e?(bN|-V| zo0FL6cdSNF zrY{uDvWTVmPvnfdmNKWTF++SMW8+F`il48LX-jEBf zVAL})m+OS|FRaPu@brX}nFV2WX7*buUnVe1R4?`n++MY+U9RtQsEB@tiG&J)gKx-l zV6d<+-G9YfMEP127o!s~{o@;YJHwM1JDTb7yvy56qx-<~_CVq%!{e%Q3 zj^bY}aUzRTR3)~|o13>_H$$^b$c%t`_i1|2<3P_pYtU(=L3G6`-6AkWc!sX!EIMln!De&Fo5TgaH9@8qP6<9nyn?}YSo!n03B zRCYM3hb>+o&{APRQV&OgQ!xChk5a4+dy%Sr>jS0_9gHIFsZcw^*RSE)(Rkc(7XWAC z_Mw_K{`TMtaZp)NUXz=Lz%E054(O>sVlfw1`pLT&YiGwal_w{nZETBbK_8; zWi;4?zrrK0Uldgdmy{1#1YuEPD)8eSma6?Vf9UnkT_dSf8gTbM z6ZN!tqnTV9{m_?W-)QX@%kMxAs%A!h3eh>c)) z2kVG}&#|zg+bDormye4kTUGG|!ALRiV4*b1>{o1blpjULN=ytXW@Z!@(a$2U-|hRW zK0?KC@GVS(@yJP8=*ScW!VErH@bX*4#=;X~iY`;%OOlW!iwCw#d+pH1S%gVn!7pa=vznJ&Z_-dy%~5Z&@U z4nfI>)u|&Ue@vJyZlmJoRWbkVoHP{+>3!4WC$!tu6_}2#cRnmeXZq#K31Ka`Y#X%TyFb(AKY#rOT`3Gf`doW!>Ht#u*Y~;ofFcLC|SF^if<0xu2z3;;Xn9M z^k_!@tU|&KIuOQhej9D?R&V1C1FZ?voF1RX*Wd_}pD99d%+U>L$h>ECdG6=+LEoZF z9u9*FM_d;JD=F(>b^u~`EZTHkLEiMnop@l)9mZUo9wmu2p!GcE*FLBVL)EE7s}i?+ zv|_8gDH{iUzrK8Y@Or67Z5|y_z0M0J!2a=}8isw!I*`8L)YGAln|)-Mpzy<0Gk_++ z?Ra8)yFV>p$4mF=Q#PJ?>x#V+Y0YN0ta?9_z?F51T(w2&+{3lXY zph1z0n;RaeSP z#xM@Ty2D8YCsg2cWeYW;gO}~;n+5*oZ%oBdg&=hmVE4|27BFbG3MrP70RO;Y z+ywyUgF&L!pG1Rck$f%~;J$x})ZJ1*=BKD++3aZ8!@i26ck6%5#p{@uqC}u(Md|h( z@?p*f#5C|sxGt;^*7^63MCK^>yLjw$pOWZ2uzb~mu$S8dCIOc2LyKF|IRd6pv&{z( zIl#u+vB-4TYyQ|5at6u^@pf9(pa9m#dac$+45NVmZGkY^4#0ro+KZO-%X?A(ufDj_ z!jtKXrr!)Z;V8tuvI!UGET-vW^sNj2SR5V7<{ z+yAmq9zdqTm#Q&W!C6=R^SNYG2AgaI;s%?x<0k7^TGjy4PIVcOZ zD8)i2*6pA6s$|J#uO2T#O(mhEJ$L`Df*O6W>e~0iW}8*6$Gr}8P`UpN6Q+Gsu~42|mNIU1owXNrle3yS;ajbc?T@!^(4w`>u>2 zRy~kDo~3G5v+rUR4up0&lciN?5Pk6Ha z>IRQ{D|KZk`E2XZ=2|!PTq$43Mg?SruuxE#N#V% zzj0p=+iBD_UH73oYe+8-h$8gos+b{+>f+~VHBiJmSur~Fo=K9mN`z(zcK3n?pfLuW zVh-3Jvy_X(EPi7h3=L*{&PNxw2Mbvv@V*!(nc5=&1cc z_8iq`N}>IAfOcp9W>hww*8R&cg)P8ne=P2cFmCnYC5wlB2JlRNX_T?iNpccCAHN~T zfu=+0%Y+R2f72;idhM~I{oC&afl|d6xl-Uh8hc-#aLQv-LOQA^@fJ8@>a&zAp>Tme zW)Sb7SRkK2u*EFh-$^v>=n7mwu68KMSMh8i;?SFRJsD;*w0s>}2pcMRuelXeTwCR= zhrP@4G2ocXsOj$Xw*(P!TdSD5Zj=}ojhG7v#b$UIrw2!hJO4}ufcioS(_P5<9?TH@##sLyBy$S##hMX# zBoKlO3NU8p|EDX2jEQ&^@2HIBTun``qb2zB-YJMRB*sc0RIs30uLtMv zsQ_3V3-}PT9~NPs>!`SB!fRv0uL=B@Gvd`GJr(3H*Q1k~oBF_GVvBPqn1SuPhY%Ke;&t^LZ zIide19Z%XG^kw1NUh{tBBw>%CJ8ac{Z-J>| zAUL>O60JAarLPF(w*}V>6R<*Bo!UVE1NOHYOlM=Uxpe2=mf3%SS(5#2QpJvUM)&aa z-;hVuo5@^w$fA>IW+;;9WbSq8U*|R|!M-c!#;KP*}Q^Tzxkgh1U?m>CLgu+ym}b=q{JL&sZ>*KeF0(S6I^^Qs8~;a?%E|LI`Upx!IK zmTnZRZ<6tcegtM=1B~*?JMS{9p6OyKh(%{R3hSl1SUks}&bMii9om!64ShSl1+hL} z3Il5}o(n8Bum6FTEolld>w(4ht^f(rM-M-X16it!2Hd;u2}qzE=FZMg`#d1;08U!s2^yB-iAsy4S4 zY*sV>)F>HAx?5zpbiE7xr7(M?gloNP3~!d`8PHpo^US++#r1n_?U08h%%{Cd|F@@z z$RKCc*H)sLtB&B&Wk8bogl}p1T(ptGNO|`mPoTQKPqDGxl0XEDs4*0fNJ6lcI#!Ju zfj>5VdEjt)_E3r5i|BnQh{CNGdj=SyF7U&T;P@TDmYuP)D3f<^MMxg)b&+9=^T-7L z3+{kbmcfE9EmNaTDcoPIw^2ugV(?=~>xT^*_Y&&;K`KeiJW-k>AKtRYLPoF3VZ_8e z5#fg)0=8r8b~9x8v?KIpBT!-h?|jzQUyi72k#0gw=2AENBFO#G-*N-Zo6#yCO?}-i zjDJ~lR~yya3EQMW;9DSoG^-qfb8!UmD5#c&_^Cdxmt8jf>)c|eetEq$x$6cLpN-GL z=_9g`ZF%R(WZ?~#ex0SJCB8j6zV~%wtvZlAzM(Dd-8~nK6n;a$-*vzFYo+P{yC?hp 
z14)fghylo?)3X?RxI~Y_`Ms5^_l9oSwk;y~mhoRp+Y4Os|9OW1!K8~#0>kX_ewzyd zNlAs17$S~mX=gAnQy>kGelJaWG1vP?P!o31BTiYjB4zW!;$#SLs!sfkYK?v$_3BIg9iAAM%{`&6z+~e z6N$d<`Lu~LPY!(7nF+-Y3&2{{0pjhSh#Hfgf=Y1PqAz^~o;|X#b=E>b(jIJQz(9|p z=y<-QyqV5mB_a$epZac8wg2b*jQTvq}Jd_ge;P>>x5y9tNI{kX^jCk&NfFSlo)DRe^pHC4+@iGY%GH-#J zrUaeiOhtSOP(x|_>3Y6{3}C%j4>xSFO=^VIANt=@zD|o}H{v9ii#!@)CIN)}8Yz@C zWAcrw0RCkT174mdHy3Zcn+y4l!Be1P@ggbDH~K<5!^pyHk@hh1%EvH^W@{|YcJhDH zYi=+(cn9k#f3=*0LT8Qsp{0F-(za9d5NN8Z?a>h5B24Kj-KIB0$w?;n(%+3=tGWo0 zo%G*Y7UM+RZF@#0)im0F>vOTjo}H4d4A9*uRB|?}?C|-X7=+^ap%aDhrg0|ngNTLo z8AMI*^Z697{-Mwy>qG#l#b5avvq2 zGkQ_)xdp8*EG)cSEyrW*AB0V|?2W9V*#&v@_KGVM^{Qe+)uQ}0Y#S|8{BMMQ)=MdI zvE{?JQirB=&zVJ3@F^+=IeIKNdxw$G&hAI}ms0yjuaa9gZy+7D5_q}0GVlvbhJu7p zSof==`l&vC_n!|K@W$XVktBHWYYsPwUstjd71gmqD*S*fmE$shU!XGO5P6iJ?*s|^ zv$izuB#A$bRNl6L@(U-~GEjR&Yh7$rYJ2PT= z*K`#YApWI~o7XRR9=@$Pr{T0(3cy+q0~uNdW3R?4ZKIkRcrAGWmMNu*kmGCt%m=YC zIpQN;3@=#*W`6ZsKY*r*i+O$U^p+`6h2hrT$)a8r`@=cdC9j?~S=x1|iz0!hL3w+8 z3982LFZ7#mM*b3FUTlvqKHhB{#SPV)k5%vn*EXVI?l2ntKGWG3 zWqMGZzbxc2jNXy~(>?alF@p^3&%gSz`aajJanjoIA9>&bJ(4cXgpj$(MG>?1Urq*# zx*ZgUAt2Uvsbcg!El9R`jHH2zm#?WFjaJs)0$c`TX2bMf2^m0^APU+ht+{vXqqGf9 zYn?{r0pwV*%Z-QrJz*YppJ9e>5g(9?VPpGh2?$C@Zd`tSx0x7fz8jqamCu~7<}X&h z`eg^~T2%diE!%pj89l**ZTEVYFJeP(skbt{pYc9g)6HYq)$-qat=W+LR5FTr^H{JU z=j!5)vjmxUF^m<(R|jH%%A9?FpIDrzGdhuF>VzC9jMGWo$-zwMPDOAGD-dR5hr?^M z;HI*N#YRRrd|avWSx>#!fua!(eUFIYwkLCNiy(jxVgE&x@lo=m1Bz@-$bfzIJfWXr zMUt2tQe&R4tKyEMb*Ci0v>r+2>O z!n${tA@wJ${Tq-^nzz7CgsLj|yoJRqH*zHIy1hIw0^$4B?6TaQi#uwiD`T>*xXk)< z8Ai}AynXpWI7AzLS40ytdk@61NBk;K%$mmey33!f#$^XWY&rPW;i{yLJpS`-0wa5Z zc5*!_6Mbiq2XyDE3w};+3QO%c$IjS5%5s;h6wP&POrvthFCZ11g9JHzLt{+`CM%NP zNB#BBMBm;4F@R)YC3dGVi|xy-v~~%%iy1>Pn0JS`05sLlK*+?sptX3|L|KPz74FLq zH%mP?PtMDDA|iwd_2_D-=bJzU>XnNwJm9NOy1zVmk^3tzB}KUFvr4{JZ0v9?l2LAG z_~-ABkrh{7QIfP>XndTRPIbgcp&evDxgQGF4{G?+W#$s1j>N#YZ3?znZv)(VW-8z4QDczX|l zxT?L#BqRiHD6xeeMD+fb?+c35X8lD4K zA0oihW|Ov}7x2Xu3n?7(d4Ju3D`LN9THBZvP0l%CV%&g=ZKiLbuaBNaGbciH&JG1s zlCpaZ@oURJna~RIdK&~!G}m!m2C(R|ULoIoKq z>P!%0lO{wBQyE*yZSKudumZeSX&GxgYW9@o(AOj#ZuZli{m{9x_Ik?YdWnXK&IzYq zY0OfO|Kw6h3r6%GjZR#!G~Cqev0}B{_Vg?K#c<7iTW1#_dLvC$STXq+makaCnq*He z7pMYI*%$j|-T~VCGg7G!*wvJFJy=?bYaL?u3W3Q||e zR%XzfB9pcbPoi{OB6Z3k>*zemxgz&!G1XQM0%34wLF_qnQk~ur9Y(ta_Xlh^I#x=p zeY9A48eWv-C0ge$bLxJxgeh9q=Abht2gL&uZC5=Eo-A#ykeG6uUOLwaDK4X7*rwui zNPIBrrJU~1-bC2NEq<9%Q?;|fYsiz6@g$4nu*}F6?~aqyr68JQ7-3sJP(9}Vv&_Pr zk=pav5}Y!LU_>5?i;_UAtqq>ZejCbt=3XJmuc0-zm0%)VHhf|8PRx2P;xmTjwqvYb zoW%h4`ai{`Kh%ku4-cRj8dg9&ywcjoEs!PjgU$8=(h1W<;XXLsQi>wPzeGmSL2|RQ zcTu^S>lRjC?gj5t2Z2bJ$?PPrnVdMtF|D=ljx;`s^wNjI^RYFg-cX}O6G7#A_EvV4 z6LD>h*@qEN&^%bJN-9mG)c+UCcksn2}ey~H0T+cealO)1MKdGc@W zlbt3sj9)J{4d1d1ne)4Zeoh1q$Os~e9z~DBnIdvT>-4DYY*rHCjm}(*ZNa^THY2bO zkA|F1fAeSdw9YYDn~-6S3#dm(PEoUJi~KDGP!Y+|R}0~g@DW(!%D`Y76<$>^mcfWk zR&u_JbEFaJM&CtLQ($a^$yU!biliGbRLwcWEhM(B10|B81pwig;G}mE%T^EEWKrVG zpK>_HIWIXFk_#nsYLA2ikRVd6abNj6kVx1t33;>-e`n^77_e5sHy8dlU|G!CTzfWtjLt~F zF_r`-i2byn`LbnkKv`2rd)8SO#q>62Lox5(JrWiTTf1iB$14Cp&`7Yu-W`+bzygBl zf#f~6U#Toe)P}n+Hz}$Al6%_XqOdf-X03Zt15G9br%_9x;H6=A)1Hm14Lr9^i}6-> z;I5O%3u9ajjgYoOGKuL+WCB}{Kgjkxnn5;zBaaxFqWkG)8pX@zy`(VddCh7_*4LT< z*xydp{DyCgvFF#2HX2;Q*vNmAAeIPDfJg9@BLZC$^IWdYl{ey;W+^c#1)$ zKBj82I$sA6HRB`wn!sBuxKo#QP?@-l$JP)aFrq5ZCj}5Xu50pa>qc2hiC(7D&2Sqf zkR#?cq)X*IZDqx*|s8Qi8 zlc!ES?=rJppngKmWrJWGSza(+kN#tY{QV#Q`za5HPN4*HjFS{W@SVM37hiKmj0mwb z{YFc4>YC{F@l*ecsi%?mvb@H+HQM|6>}a86I+c#!TMoyEyo9&!1BaFleQvhypvqL$ z82Ni+14aFVx&u5|w^i%R3w|~gYz9lTf{ch;2heD=ww_ohMRtFsjKt{}{yaca5&tdi zDmFXyl}8v$Yp<~{JFES5rI;egYzMuw)2vK)SpxEj2(~6-n&QmAg7+#*&~($si=a^8 
zvJXMF6vRKKn&v;wuNDAkF-0hf>IDKSBZ+qqaN{Ea>DBxr3)7>~;y#bqV8v3>=h9le z8kh_eu6&G9x|s+XPMHymc}<;e_u-CzZ-2oIG#o%++*$Fy<`k%EoF~o)UiUUvV@>XF zIy5Y1)A3?dH6CTwAD(l>QktKwy1sp--tY2$j%a-b&e=V#At*Q=wv(}Ng`!!hRSE`l zM!d`~lO_>kF^WCabtC>L@p#nD z@S-gp;{q0#H`rJN=(LtlxiE&(FgHT~kw3_9ahul>RBj~j*oO<%H-?g{Ug%dM%W>`$ z<$>q0n^sFwX&etDHZ&8rvPnWoqu9wNk*Bd&rT`M8u0-ge5Zi25a{t8+96cAj-WSPd zs?)idZ6OzZ=GyL?BM65aI0M_h2Hy4E8Uk(pd(NjmaM0u;%>c>U&;=m!c}^vxh0_L7 z*HB_}ZDSwBt2V_;)?||)At}*;(g+X{k3^sA`Wp`5MKOw|Q(fN!kNTxVv$`95YRJB( zx+iv&vcZcqj4#_}N*>hP6c3864F{x&Z*aV2^_mjNv}K_mfg4Amc)WB|`-4h0El$Zi@O^ zY!j890|v7GZW|-;3prwjjDT{hxF`a2jRANS^NUgP{W8AzZCBUh4bt=W*zw08LP_w#l6D(Bk|b8KjolyvsbzUXQJXn3K-HFV#rsZ z?vCXW#>Tr4b}9mL%Yda->0eIe00a!>-TNHCr1^ugA0oZIt@I`(T$}hm$s_ zyL@utpuK%KYCj8z76Fbej?RB7GxP%m8=!|9=P`FGS(u!MPVAx$F}k`4_5Z18*QG#y z3!-CV0q+@kxiU6PPBTI489>HK^*(N z=h2+RH>FbtWh@#dXSM0SS`6{rJ=6E2o?-7j4?2mYdz~=$ew%xA!FBSkKkW~)Li)Og z^6t#^&^OW_0_fj;SrW!4Xpt2#S=M-x_-Y*Y&ROSE18#A-D~k<0vj#xwxT-;@W!3b| zR{E|%q)Kz{4{hte-Y`AckB`UK5K-!Q&{lMvo6e_x;$2?mc>273zx1t{KjmmU7PJ{# z^23^!yX|pQ?4Su zZ4%3mfiDnOUJSey>()hOhr?rV4Pr61N8F_^UXu*O_RLcRWm609eJ;Yd?Y@Qcf_}v) zuO8U0W3JrPDfEdO{V~-W)FYrUB}+HJsiPrbaa#5g0b1z1v-?3a;a48&Mh`ggQN{+L zv4d?QOvf_-)sIiI48I8;3RbtdZrNji>t=Ci_izc(sSX;2c ztx-js)@L;3K_46sOiVcAnzrj9`fXbc%Jx^mDoJT)PI6w2(bl7+D$gMpJ126t8&Ly} z=RNT~louQ0vB7+^;UM1z&Mq2c90QIy>`E751ZH=lD!8MYUyUZ9(yP!Gp@@n?D6EPC zD8B1P&LiHNByF;zQT@2D(7zYqtRl5!krSPo^Kk>>m_s9XQaV|mnu zm4(hnj7zy4JkQDq@X$fE6spRM{Tua-$W&Bg8Y^3z3X}tUB4-Ar{KOdZ$3c^5+a>W8 zDB~4U>Ojz+;&l3*`~;JC=JE-OIVj?rcLn;@nS7a>|*d;-5IOwCD z${eJsd3;g6b&3r*D{2ko{prSKn`!l#afF{(emOPPV6ECVv!^8MccGK-p8RkU{?oFV zP-*ir*_n0vs?7J)8=l&o^DCISQBo-v5E&eAs%>}^+RrUC+>@pWEYCjpLbNXY>t$wqq{8_X!M zMDj;UDJZ}W!f@kP=PSq*|_fO+P;6<;82aYH5Y^Fh((zj!STSt9}gQlF_ zQ+Q>;x+v^89VZ>zw(WFm+qP}nw%M_5+qOEk^{3Za>#ViU+5f&c7vDtHH)>YZyclyn zHAcO2mx$?%pucP7MGju!BM_1df{Ac97|bnHLzu$f{p*fz?wQGhiAi(#RUI4>5<5Lo zO>8(y-vP*8^LVs}mIH|WL>~uPJqGu~4&PU#xNo3m0^JMi6x@9o=gbagEsN<@^EgwN zRcp=OiY>ejhRMJ&hQGA3zOH7JPesEF=fcy?a6D6SXpM;ta#+WcA=B+lWj9D*v0p+Q zt>_99vUcz6fb^L)po^dLtCL6AywQqhauC%Wq zOS~tDf&|`l3%d9qiVm&fi#R9ks`ZqgLM|9+V^o5*lTvbqn9#d_CY3|xCFx;}#A9e4 z(p9X~HgeOzH%t34?#OJKtx=7xA6;=43bFrT^}jt^k++q2aUer+A2vYkQDSzeowX%K zag4kB(oK@ZOxbZNyQ`h>8KS|FqY7t;S~ikr(-m7x-wzVCZRpy%c@vgF*8!+oAD$W4 z>pBgTTOS7LMNyi_3~ELU=ZlN6rO)n*1M`R_vB?1J&`L9=F!x+tT%YU5zlxuklJx)MLznmU1m-k4VR{P@1$KCKL%#{)79~MNR7k z-PIeI`4IE*mO-qt!Fg~eO9GP%bPXB!4I+^%61|N|`|3&^Lq|s`lvwjkj=Mo4)Hpfu znG9TL_U})40;45u?6Pw(KkKwn!I+qg$=>>A@j_XHbF>m$;gjk7l5o4wh)HcZCLWMv z>8}&H%j8PgY}={J-cUGe)^?En?flsb9?QPynrn~E*t>$_4{zQ=xCW8LK+?l*&w786 zR~55Eq(!O7_)DUwE_L(XDgVZ`_&J2Fh^>iMqU2xao#l=1()gF z&L~u(S22STs;;7BEI)e|KMtKkUG@5A9TvX=)Bz3yD7)7^QK?cFe2rZ>k?bs@$wFC8 zbmBnwj}K!Cj9Scf(L&3Iw=h}kW1BC@uy%r)^vjh1s zSr%gkI|*NxHlEI}dZZD#_|yQFaO4m4_&9J5Qeh48H#fyQdYjvHsk8P zS13NHnM-_?lQ!E``KB73%x);j{sw)G7))~BewMtxgKsN-|1B86A1M0Rjb0zZm3SOD zQOJNh{L{~o1E-inDM<}R?cMzvs>AP{>caYgKOaZ* z-AaLj9B|S3Z`lOjgSEP2a`A;TV8k|D``kT$qhVS!+HR!>w_Ys7;4#^1b83YcKUiOj z>=DuVqfIv6el-QebZWzYK!Fn(^JwqK$U9K_*keS0Lqgq#=~af+Q-*-l6(h@uw^t4j zRQeArBtsi3J6u8$Tk?W~C2V_+C+ug>McySj`6O>YrdtyZh)W^%T>LmfB#G~8p8_!& zf5~pF(K0J=npbBzN0;P*1J(E2O+A02n?2#6?zKxvxynRIa7T+Vc{~zufO&TFTjsWJ zB-6G?Y!SsROa`s0Z+OH%nuH321f|+9w}rLOwNP$EEt|pzPBoE9gt1AMS;oxwQ`rQZ z@FkEU#%BBdXd|EebGy2%GzIJ=g}87|@`0zcE^3N5s)e&a(F`(-LuW!;qj%@TjS;x$ z`MYXh8`-t$yzrJ2xLU{YMDI^@5FTpd+w;%*v=3~wnW|f93W@z)3yHSlNBdck0G6e( zzTI24`)b(6U!yeSEb7J3i3kV)rEp6MhbwY1v-Gv$3%oJGzzwh}VjX!!&J!?juGSVN z8cVv~LY15MX-~?1z9ygIqw&cmiQZFTRQNd-lwRsmHQAvYp8Uajwy)g`XrKd3&Iy7B z);9YSRxt6uY!CR?BC*4eh!-e*PiZo@%D353#-~s=hblNkh}HXjZ@ldsV8Ay&0QmOz zc1>1i;3^Yf_yiD%>O$ 
[GIT binary patch data omitted]

diff --git a/docs/source/_static/API/clock_driven/surrogate/QPseudoSpike.svg b/docs/source/_static/API/clock_driven/surrogate/QPseudoSpike.svg
new file mode 100644
index 0000000..91a2200
--- /dev/null
+++ b/docs/source/_static/API/clock_driven/surrogate/QPseudoSpike.svg
@@ -0,0 +1,2194 @@
[2194 lines of SVG markup omitted: a plot image (image/svg+xml) generated by Matplotlib v3.3.1, dated 2022-02-09T19:41:33]

diff --git a/docs/source/_static/API/clock_driven/surrogate/S2NN.pdf b/docs/source/_static/API/clock_driven/surrogate/S2NN.pdf
new file mode 100644
index 0000000000000000000000000000000000000000..55809a4df3b27f9923e61faafe6d6f9e6706a9f8
GIT binary patch
literal 135493
[binary PDF data omitted]
z{0K|5J)sS|7?K967a?*E45TzzbeY%o&M{+Z`U}`zVe^i+=J+ zBz)7O7;_!3#iWVBXW~?E>xe>$0;+kRQF!#p507B0aP$eQlrLmCaQ*>16w%yG`Q1xng&Ar{w385ZE zquy!IoMzI_FYlb$L^`rbX_9QhWN2}rrGp3= z3JaT+^2X<@hY(El@`n*tv$}8CD|HHV80*8c9s}~gGRWy@{^D=5S!sWI)K*m!$;UMb zpo02tL>3szX9T7mEReR^<2tE(p2+M(47nB_|2X$bQN{EuiAgK3zz=LU)9T=7mA8x- zm>1snmU|#+-?j|8+A6n|Qb3M`h3k*r)iW-ge@1L#R!sx@EG78cFl0sACCxCcWDMK) zZC1Sr0)bM~HHf_!~7FNKEBEGk@6o8cD>0~#_Ien&T=yN2uwVY%Z_pypVF=So5; z-q$*dxtm~hipsC$zC)J9!Mt^!gc?z*RT8ZQY%f9%i}_(&>b}kcSfq^UYSw+EoeD8< zh!Al{F2#=TFkh=o@)u@TeNdPrdCtBE_5EZF0FO7Ut|TszgQg0@_w_+~7Pl0zvqzlm zwMS8(aQ}&>&eD>MEA3&N0E=77-E2T(Lp{88^mCj1+iqCDLF-OF>U)S`iLMG)tIX_3 zZIG0L*$O^q>h2}vGOJ2F8Rv#`N-o{@r&}|*v-Z?xs1tNulZG#@(J}7I#IhY=X7`6k2N8b1|gGHAaYUx001!LDtGJ!`J0Rs)YakLJa#;%rtyULbBS}mGr(t zibsz`mL>*^A;NGM7dS$OR3LAA<%ax|8fH%JuvqISIT?zd2cdGs{X(s>*;9_H3P@tg7RsiTb?a02BfN_g; ziTZi_R7k2lG)N`UP03;oPc^5I4`;gZm6B;USu1+T-TLgCF1b>kbA-sJV+8!;wZ(dv z106Q1iZcAmNL+c7*hUg(y2nq3c`RS|+w@XP$&SX*_IY1tdX#*8($UQrEu;PfUzhc8 z&KNJ(#&Q%r}J*vS1u2aDPPH*iyF~Guu4$+3$uattZv|Ao1eS7dlQcHSk+j?^k z2+p!Q968Kus*0MF4>%P0+9xNSKpllIsLzFe&W8Pccu18P-Hg}nwaT0=@cUcjvJWFC zSqOwML*K-4bZz5KxTmf0kMw{fS~+Czb}7ZwMO`s-BUi+b*mC(tB==hgLm6gW3Y-S1 zn~)yU;Ozou%IJfoKZ){2b8Z>hsf;B!UMGoL)0EEkdlV3Q(%EZ5ZOOxpm-xw${xVP7 z^`f8dvtqV#65;c*09oEbf=vS@L=vbvXD;0 zd^v=PEWi?$2y`Nf{r)4uziV55N|HwMD3K?B)ut6Ogw;SdPlofQZqgG&8o16?(|5eg z?eHDr+gwz2;lL}@vjT&3H?vprf#AR@3t(cBmwYtL^@fKTe9vqf7|nn}Lj91F&%FOSQpq~(dnzERZj8-`rY<6%#=dd;KmCqZpwr02=|mH)NuT!Ff1=; zI#F#6#_$0zCZ2(Hm0D|jMb~4vz+|JrpuS!x+;q@GS-aX9 z!+`tXKK}E({T~+OU(ewm5A5u`9IXFPBdj0P$%j%Oz{Sc5;9+M6aC5N%xOh1L+#GBm znH9jz%?seTV_^g7+5ctd2s3Wpk9d%c6@&o~2grx*UpVlB za01N)x@P&8j-7`GM9lCYe4PI~8wV?BdXN_f8!reUP$U}*2oVsvJRtc4QP9EufjDS6 zKWuEgJOCbUP=_om2pphloS;rPIJf{DAT&9+Koz(^exMcmuyKP{_+Q9_I{e7V4nmv# zqeFI3UJ!yGs2Kj=&iLTR{^55--ONAup$}+4{F<|jrLDD%`@g#e@lyXbGO~SGK@5>dw zzc{!LFC|ll4_@zIJ|MmgMDN*}i#l3^yck(Im{>SKgqXXfgBECpf0(e3sE@u}ZJphn zToIUc{?qZtl>bA-ebo48#Q!%6i=Ca5h36ka>tpA#vHv515M%|d1;T&Wt`8#df0?dt zEzlbCs|!L3GF~!6FccD>XV=%)n}QX+ykxz+BHZC*WTV`@3e{$c6X{;L4|Du_+%{M{ zop`>USx;b1X4zF+kCSkQ*W(vPW`x4)%=B!epbz)=BBH6Q#Dmv^>F6w_>*(ktl$Xy} z894O3H<7E`g=8C65}x_wenWGE2{j<)a*3z6$O{P-6j>O@2D5+!hv?hM)kZ3+q>l$kvn|TA0?55?)Mik^2sWn#$;e0`u{1#RdLAL92u0 z*Utz!14jU8%hwWFNUO&)ij4QnA)KAu4Pk)s1CpLqj2|4Qy8X62!KvrpKbbX2wAMaV4hulMg5kYF!DlzfH8UDoW z$6m`0Rxc=`C`FvN2iN>QY4Y@rWCHe@O#~JX|Jbz;%z3MgPkA#=5-lk~@r(udG1L1Y zerZc##Lc6io!mP){sRWKoGFL|wcO4lauB*Es?`j@9RAW(77CU~MhAvQ(ciV}#_G}R z>h59cT2~(sneb$aCzC^YHr`KocGi7g%``lW3N=ub)l(8b=EwRT@Qdpnh4^0RUGhNd z-s%9=CB2Q8&2CAbw=LMR|D70;l~j0zp}QIk1>2E{%Mk@^2_Ed8qsbz0No0K#{WT5q z(c~-C_Tlc)792}NKlptE8>~1G@Y4Eatp`lon!7f4Qhg{F4i640T{oj0LZKU5>wY+{ zLVV2R=G{aFnn&LU;#LO$4F^>#@cOdJ3@P(HgLi4wR}kg>^xHeZ>3d~C0ZI-Si4>O> z7&$#GBG_>61oBqM>*b4E@OwCLv-Z87F0>L@a%Ull=jR67-G_~OOX~B&4+YlnvZ*Qh z@R02T|7ea1)=&Uh^rHu?eQ0OVdfS9G{1x`~JpdZ`<2ncn*hlxC+w{&Xm+08s9E6VU?Fi`*MR~~@P6vB4H14Le|q4-c=Phee=78j84|6V0j1d^fTI>7eV(yL8Oa{Z z$|pWMYPm3+sJqhPPPiZ3L@Q7t3(Sg>K8BjWSyaq3Oa_bs@txB}+)77CX**aXO&YFT z)ryk8b^tRLEfJtEJxYxYC!bVGLFzkUe`RfgmTDCw0Tado{T_jEISog8_BJGk)*EC3 zr@ttS0TG7Q?CSl44bGe(zHk>4F|6)9SJpgSrP<9d46XNG84y-=XpW*oszH=10w-%N zA0dsjeRM6Is!O61l2Zx0eRLn!*dS`dCp5xO5LR@oj%0fDxCdAQd?Et!z;WHbTV75N zUU>x)yZsq_>IT9eJjVbrAs;qhbJM+nWDhnb5w+nOt)UT&OKbD<;I7Zj2XRVUn59vn zr@2ZCUD-iV^IawNf_FeWrk714d05!YhyfBoLS=#Xgr4^{z?B#W$t?i={nA;`7ijSP zCI2joh|qrAz~DFF&$Tct1!{cruCZq%1xttTtDk^GDCw~m%sUA?z?Q|!*$vnNnF#Ji z|L|@O^U_!64O~TmUxz+JX?|~k#dtD4d2&LfaP_Nrr$>|7i;d^tO&9zDU={a`rQRQd zk6O4F#ecW{5_W8c=vwFBDAL>DZL@R^9triC-ru>X98aX0B21mav4Yf`75D75Y zBS8;~=_LB139@9 zLlgB%K7jP;t0Ba8xv*kG#CsP3vR-r1W7v0v0|lW-;MR_g|reEY9Kmrz$}1 
zf6n2=a8Q9#M16oIv6R$59-W&oFGb5-cA?=}Ur3|x0 zl^umHx3ApZPa(IX&PBDWatLSoULJ1GA%timW5|bZN77-DpQLYY0AxYU!rh?j25<p7iT*9qur9 zgT)v$KbF4=mwnk*26hc>O3Aon?f3FyxDFKM`v;Z7nsQ7J;FQQ!9axgt)i*+nfe*`$ z6i(M>T^1B+YYhNyS)JCdupIgOHnMz!ZkuqrIXO(qT}HRR5o%Q52hT(?*cy(s3kNTY z9Ufbg&qPD@nMNoHR`QqT)s1t=r1d1^b5;t?piYa{|Lx(oX6`kpqj>x()4z;Mo6~LBphDNQ5}07nHjl8Qy9Mz7w4hj272Sr(9fK}h{G6Wu8B9hOc*+8`D6?wE)07PCa&4VHY(cpZl;H`t=F zbVk!xt2{o8$iAPv?(WOmX_oI;YuQEfHXcJlS)@6G%I7wd&voBtAV;_2I{2F60=ne> zRJSC#^tJu6JbfO?)&L+BcZUCgMxZ2H8E?|;U);^VICL|%#oOk=o^X(0R1`6V)l*f04OQQ_%zaj^IJ zb-MjeP`AGu4q7ZmT@0$2H)8O-UBe9Yn(>7Y0{g=)oy&cg1Imc&? zV7LpD0GU7HmTvZ9jM3~u2kk_yt8J;m3sR5(mufg6T;1A@h-fl=7MF5RB(`H&E|-Rb z1(D|bVy$!JINimQ4qYAqHiY_|+f+msmbgunG!^j3(cy&Dbo^;{p2l)0#FUjZ{qFY` zGNb)83vj-_T-dW9F>-TKJl`uldxVe4DM~lVH(VWGD?z5sMI4;k3#=Tx_98osxlJu_1 z(IaP6#O|LQ%K6l zRDhGDap}NHF+`8EzvA3z8`2qEW{t$!FtVN;Z|Bix&`;_;6iEZGgqByUrC?CZ4HyOg%Fs69K!+YetTZwf#nfqWh7hKs7!i}J z+(Z>J0a)8F!^~;5CzZDV1{>=xDnnk_jVtr*m6cimD0&du)QMVDCF-T5@CLPOOBu zAeRthV=Y~jKeQFFEa>*i_L<2dArzKx~fvaGhsH|6KSAA_)sqPPeq z*3w|tAecFRJ z(9gf@y4YTkSjj?lMAYt`@Y!c-9aKN+0w>p=af9B}n(F^bE%~@f?FLn?pZ7+v<5IiT zNWz$D>j@0lMh7ip-esRHvbyeFx^gon&cTn<3ju)%Cc-!wH5^~Sob)cv&v`hMmfN>j zOW{Uuv}p|-*+#sg5)5X4Uc^sSOJ!R~=yBEzYc-1)EK1sr5UuaF6)?A;_MT#u4ut5# zW_iLH?Q!(9@>W^&*5eo>;@M&e=*FZGIr05=9T^%ycuUezx1HJ55lXQ1HVSeodkUzJYQJ z`b^G;Y}6Lkj7#tpIF;%chTRQ1m!Ps**`7-z#fv--H*U?xmd}d(YI4%C$3M_G_znza zPk9n<;S7zB{M^JI*iDvRuFZW#HAI|gYp!r*PO9PzB{o$9UI0D`O$}YHW@9G#E#SkP z4Rr^ftRa``LpCd70(c*T&F5^Hg`)}%=2-$L|F+b~TMzyfd&cqN9$URYlUcT-XEFG( zt6iXM;uoZE^Qky9Hqk{54!pyC5C3;!F99N6%r0luw zfQ%kiO$-@o;#tHA9Sm3U{x4hcw#)V3wr(9*^TTrA$V=8`4Xx&4Ws;J=w+#OAoyE7L zB#jWd2TuWp`x=+6tdCEmh{QBVQ%;T{3C+vlrN|uFeRgi6rCE%PQ*X?Ggz>AwKlN091>hu!{A|Gt`K|Hmf`b(H)LF!6lBAX z>V{bA3`=znHqMrt#qMpEXoRjo-M}T9+g{Veo+bCU=jj)saAHSGEpeFNr04B70mEW_ zc(A5SD0hRg-h;k_jlaul8;O1|V6D9Ac~I~&(49MW1Ua*~b% zcq`CsGl*j}bjC8l(S02!=l$nb5#@U2!7W2cmFFGSb8`oxrgw_6&LB*ysvY(*m&{I# zV340XL!Fe;^EL6VLp2%m9>YyshF%1<8|O096zx`jG*qRVd@jrDPm{2rVbCz^G&Q&t ziAkze^5VMnD~Pb7Z@AL;b840dmbm(Aj1|J`IA0iRhfjZTH!L=>vi@t%;}JV@E)Xp} z*zG3DFG)3@*Gj8=P_9VaBs6bF`042UIP;38Yeqq97_iq?L}QbYi&6*I8}t!B0b z=liBC!Q=g+VwD^Ytu>C z7`fmR7z-6(4!yK-%h1Z@NFZqPh*>h4rzjP71 z=p7RUY%(%E88|(o(^fWG8-m&qkY9<`6 zm_cKCq@qrY;omU65B?N$yj&mkY3K2BMfU8vvX*ggz!((o+!&Ew3~xv`9e}8DbZj2l z-b^${dmg&mnR>Wxjf2_9&eO#fHox%LtR7EE_Rr3#{FLw0Pw^H_SB6q8e4>AM$Q-j$ zF`p#{^;k=j&C$giGPCqG^v!m&n)~W=HI2CP%h4}eKzfEr>WMY{0j>yhWZ*W0E9@h8 z$#7w(m=%||^g^wY)?Q6@MQvugDbh17qxr$vaSd44bK(N}T???H<|?<&ba(rrw@9Lw zsrckmDZfm?fP;q{Q!9N4l%LD-Z0H|z&6EODf-|Elf($R)lHIsiP5jxm7d-FL?EHJn zsnm?1yQWE%(g>$(J&5*^zp150N>ljoJ%6(;kLQcKn|pdIYL^E_;Rg!ta%hyps3t#! 
z_Gk>&{=)R!StjRL&xEfT*j$xec*fCBQ&}ziy~<1a@)?MJfHa?kUeY2{T;=-iwgaEx z2@51RO#(M;xmL1|8yv&|*TeF#`epuGvm{Z0fe@Ksr)PZ5iEY_%Kb}6s*%7X8o+4rn zHvHAt1d$K;>#E#U0zT0oVi3)%GEQC2VbHDOE40;I#BQeeZ?-t^)7cTQ?+ij@Uk@fG zOL4)94`<~+rhkLC^f*)N;#K^wAp}Po>tm$oC~}8_IzmJq~?J z)qh4k*mwRa$XEPi#v^32&(CJ{iQNN6RqrDn$GKaN0s$ch=>U=Le-*V3)deTq%L6!Y)6<@T6)*bKyc!bcPi~j$+QJ@qU)W39D0?i&ha*Oo{H;Y zkKE6i;D~ZQ8yOWqZLIVO?dR&kJ6nhvZl(Rl zjdKS2zK*2xqV^cRm+e<`xG4ETpOA$Rs`HsY?!m*nSLBFZeW&l<@#>8HI(ZqrlwQ5r zW~bWpx%Epvept39q=x2Y5lf(Ee;rIFHfKQeh|?N_>PtMdOsA{yZD_@^_+$~kny%5seN0%lG0Cx!@Z5`uH-4Z6f*J{y;tUqCh-gtDFG$*csUEAsvN;?pH3 zRh#thZn%Gn;bfxR)i_ZuOewHS(`^R=@AKhqtSpKB(NLv z`^q_phQbf25tW1(9mB~S<|G62FqzoVzX*y#8n1yBW-@U;f2I3evzRIG6njZRc_u)j z#>bqN)}$k4m?HKXw*iA4rNCY7yk9kTcjeDIEV)LG0S7QY@)ZD&`lDNt7jNv?BtTqx-04|(-(L6?;4ts~VVY+r2`064ry6=@O zqyT$4E4hghR+d0zjs#Duw)Gj}ubnDQP3w!;Co*M1OqzRUe!)|UAk#wE`GD6{XbuF5 zCet0Xv>ahP{@)g8^xZE+P{q--LF&rGL@VpP7-7q3zJ&=-j&tFpa|>7WJV@CBJ6nVm zrihCw!X0nYU0*3W1SpC1*ILR>p)gaGK?fZQl+om8;n-0xa0J%*s!C3^7MKZPy&u%IXzq>@tD4 zGP8qAMf^w`;PTuR!mk1`H8(K=JEJ}1dGJ&X7~R_L1ybJ4HI*3(p_m3u1Sdas4y>Ba z>}(|0BR$qpxK`;gW3jU>4i$eSa{HV))?b?;_~pYn{)8imy3iXkkJ}{dPD5A29*TRa z3lKE;rZj|`nn!QU*_~rsY>{507?C}_razZZHg|*CB{d9~1-LxWV^vgi-Jk2{aHbn7 zl=ZPG#5kKWscoNggprlHLTUpf{M0Opk|M0`ubaAvejbn1ep>+F2UK!ydDr+ev>v>f zUL;XRpR|q@p(mKe*Z-B~vqsQUU2!0Mb0&%%P~hU+G@O?R!DSR!vFH|wM(RB`>@Ei| z@#XTc^p<7iyvTHMYx=eS_UO;3L>+c$uTELMC6iH88)L%lRUNsPj&*rXk5tQz{X2=d z#F_(7K231(*XUrt2w)=6ZM3ue=6R`&hS$0YxqeqBeG75w;Zts|zr-yZ+NVs+Bh=w~ zDCaLrl0a&Pgv5P4BRN1RZlS+gV0}M zT2MsS===^@4NtK0742dT;p_qVxQh5MBHdiYka+*#$6{O52zV4hGtDvnTT6f1TRkL} zR`q4r12~`~hmk`SvGV@Zsf3!h6S;VWruo9TY`cb%-p8= zC6{Emqi_hjgYWoP^P;)RqBReny~JdH27YCH2w3|QJ^>~FxuqrLZ)53PMc|OZ?NwM$ zV0EH=DBWqEU=ZE+43&YqX-j{l2hkef&;F)5Y~5HzLTIbrgEkGmN+eoQ@R4#c% zRihkEOx}C!S2U#l9{hk@rtZ(s!|($C+)m>29g<(Og*)4MVDxRbk#wOt1kzlmrh$nU zi%~*62OMGON&l=~W^R|N1T!kx#48YIv3Fz+yDfJM3l@yK=>t96ory3Nb~c?)eK7rV;1%s_d4v#8)6qpa+KZqWMvD4m;C64Y+*T2*NSM^IYD0{l7=|Hgvo39PauC-E}&8>kPhfYO6)Gg_jngz;t!CNqjCj2r)Uij<^ZGKkUJiwKm?ms>H(q^Mm*Gen zyrU!gRjcinp^~a-ytTLVYIq=2AK0U!rN6G!YqFLvmYyzPzGN&@^4xdgH(tz-RLUPw-Z9QiVNj5Qo!33CF905t_e%y^lmP>?p+ ziLuM*F$By}^n1V6t|^D@MGx1(T=M?2)ru%ZaK98iqorI#&G}iVrkas!kvFcM*rf+T zohf4mn)Ef^8(DRt!M!DZ0u)5aAfrOg3boWhv?bN$*5Ag0OiOTk9|jN6_QJ6BtW*mr ze5*>cW6E9qnr-q{O75F-zlNW-Db|z5b4%ayYOq)Dn=^zK3rYw~hEAjDZ4rXuRh>0M z=c!ZH3Z47P>?WTs?N3IlcOAAWnAMkmy!~iE!mLbHzqjMH-Mcz-3K^)k*EKL1p}QA-z-(i@ZG<+@3E_K%tUfNOi616r>NvpSiq|ber)q)2I~UGxAI2*O(Hg zvGT4{XI^0#vJ90D=aKgGpY(UR%MYgH7drD&7YX^x-sCsPbbDb%GWgsY6BhzOIhj?f z{OAXtqnxGh*JXIL!ZN-141araKOK_u`I`AJkEXuO%%hShqwx>N)kh--`m)$5L9f{= zHGlt#UA^J7absAfXxTIaUbT@MmBcVR?^F7&MK898hg3^!58DWtjk*z=0m4)_Cgvah zp;S{R?T-^^&)-q@;#GyHBv@N`GdJ+2U8*SkTd5ZP-QXDgCV<8Mqd7tcvvsgdnF$dY z%MrB!Qa6(~{87CrB_tB9pJd6sFN+|T{yK9)!cC}-_!t#f{F<u3mx`H+yM=KW+ z$5SFa@vB;2K5ElbbZ%YZ6+esF1WeLqS@rwg&!Se^TH2e1Pcw14yR&p>5@$u?3gRzx zk~oA5j!SGhEf%_cEKirPxl_2Bq0wW=(=MZy!yV$$dlg)77zJ-Dnr;5)>J?80eHdVu zZ>wdl8YesYaFafn6pS33oLqR`-gzk!TU?E#)p8+Dm?g#8HKTZx=>#?sYN$@Ej_u_h z4T6txv)7ne(hZe?s-&zg-C^%@tu|es^W3E5q`Nt{zk`Lt0%^ki6?DZdrzPd!)T>7% zO)yCq0=~$K5fZ(TI8k|#mpDI$TWP< zOsMi09qpCP?8p2$IJ^{%dRKG`=F(0*SVA{MrYiwNDk_r;K7uNVU_QXT>qHo85<+b; zMqbSrBkGh=0>fl3(M~d5S972r!qN(kx|@SBM*nuK!_T(#o?ceBfia-J$bC-1IOG9T znfgw0T-x_`O&T^bSTWBSA3Rvc3V@?M-V^fd@hx(ji!KObQ1-Frjo3q&DNN_E)_t`Sflg(YBPc!q+6`asn0+INVSApw; zy9CGOXFzUFq?CZr`UO|4`_Z|D7SBztDQUg_%^>U(AnS%!{L$ajOw7@8T^$FR(}H_h zPPNJd4Na~7wdx@J!MCA?oEtau)t3q+QcYGr!~3NE^7D2i#WN(L1W7vM7^i&pUh2M_@I6v^9kP@Nix*W3 zFbJ*2Z`?cE!exd-ITn-)&Z(3^c3Fe_gjH?`WCs{TCl3_$zQI1kZ|aNRTkSUsmol;W zUag;lA3-0cCj*jQuO3IVd<%!r=P9(PW^E9JQ+R(3 
zybjV|jr%p#k{JZSnYNKQaqqJ37E=b#M_OfvPWSj^Zp zs?v3L8%|mExLp{sxU_+YBtw=<&?A(8pDsYo{6>#9!}^ss0Ip&~`@y)GR)k9b`&Je0 zYzKccBSMSLX52LATm|N^A&Uk0<066*&HQyKj_}?I??s9ImKB@LWM7q*<60T>>;o%qzwCM|9BNsoNK0-AfCg;yx2)gBP zGL@iL348cNR%>&)@|Q8@+udECt&01#qHj|A@LmR`wD=kLe#yk&PUd@a6__RP(u$e7 z__fJNIK&Wt(L~_j5EwsKxYhR@fNb{0DWaQ3<1lZ~Daej%p}R-0>52g+xPNNKaMW9n{L3dt;QSk zg5x2;Jgw#7ha<|rY>RDKT$I#D**#6xG&=z*)-(ezx0VnRE%IV3OjLXeAKed4Z&? zO$X0HRZvtMO=3yAVodllZYp-vJM-sHdugTd-=xa;kFytkqlfcxGj1sE?UMr*a7QQE zH3#X{8P_nJRNK*s(F{4CAq7rZW}D}5w&aU6-g z*9owfHFyno=i2~w*+0#)2!fh0;ib5;@ugl*zxjW7pT33UB&0Ml8%A|9X9_K2p?d||h)CU`j zp|r1>U-u=@Ip+n4P*b#qi=Tq}(M*ReNyb=z%5E(F$Ru+%wmWgoW-JYJf~EQp^C0+Wwi()s!Db}0WqI+7H^ znVc^=^q#I)e5KrMP7%)r+jq#S0QBdG>!1mW8kQi9)dd#(1oW~)9WTL?kwm=`V%P$~ zHiU4BA8)g?aITccEJS~lodtcK4>@`9HpToMCCgc^F2A`H%4Oc z=g6{QOGgzAa|JQ7KaF=TeksdfNv#B+@Te7!wA>i+n%}0TCSO-7$@BE?f5~sv507BC zNgQ+&>Tazp`9klnDoDDE;P0eJ8j60pvDTUEXH6lWvqUKEVUWi&FjH_B+U&C;$wMS- z&tU46-lxYRS0XKcPI!{mV3{Gb2_Dl%j8R7R3@c+JXWt@!omd zUB6}CSb4SIQ+7z<9i~(3W}VxYNy~%-0*QXBN!t=PALw}mhtQUze%0t!jN1hggWb({ z_?Nx%3G_s@Bi-LiAUr1#XN9n1c`=3DJvojYjAo?%*eqVnH37L;?ZJ?U7`J`J% zol3ZPpS;b6d$l5es+d1LSduB!;=4eUbK{EgmgjitQ@14_mU;vsqWw35sYf(>YX<(* zC;?($FL0uGDzwv(bnZ#f)j#bNxx8QJDfp8q`0c{|^C1nZuo>NO1EOI*lrz<}WV}b* z7v`#~Pv;^O`He*Tc1n^>dI2$gx%$ZR+GDTOCnpmAES0Q;7lM&ryh_p%Q0B>P)!0d& zLI$xJu7)|n>F9%Tc{5nlzBa#vR<<(g`E+hf9pi;PCrpfWL158GQO92jry$EDKIu1% zfxdl`TCUnOxE4uxnlAj}0_*i&HPa97sPIKruc^Qq9P}5W%4u)s^K3je~Xr+B*=Q>1e$p|R0BT-#0Xx#8FK z&2?)@r)4&3Gw9!HXKnc3RE)4hx;q#`d+F&DU3?Rs%nGBz-5%3G%HA3)|jY7Q2U8B7ehE8Yu<(2UjHp zH&cO!p8@H+z=LmGY~nKhXnCMV3#ZuS0kkmmll zS>^_L7J;y$A07KfOOL~DeL&_qC)_9jMxrY0Yk>Me{>)-h1(-wvv&EZ=@f*~}-z>*v z-oanq{|^8!K+wN2ly=Aaq1etVCcB>Y`{Cj^QV$7zwqR5z>oX-p%7OY{RmLxBcR$I# zD=g*q=?y4oou;iruWFgiDWm(^ zU8*jLNbJiO(OYDTX@a^s9dW{^9<^_Dby_^)N-EWXucRXU*|4&|Z*+yqiw561Q92{E z=Cj0nVn!AHRn zBd&P6TCQBCq-a2HRdM@zCYENqUt+66_*QQIRdQu6Rw3>&3ywUTr?$ln%^EVU$>j^O zptG~(A+D%Am=mP$8gw#fD;=sYVz8&`9gm6Zw@R~p?bbB8EHVVDUhE%GE6p5>iPsDq zFa;TW4@TvZm$(07gp~X{OzE;qpzGG-qAMjwu5~RHtCWKH0@MMc>*9PEo|%_IxZrfB z*5^r$`3X*z_&(i&xatR8y(&}aM`#4i<_;=-1aZMQ+9_Ddk4bCJlC*RpBAcMG*Htpj zdZ9Rk%sy%*SLm`LpZ3bksa)9PLQ$)WN7Tl)PJVV4^@xlGb5U>3cVxGI=X)Ecvx9K* zll)+(X35?|*j6d~rxz*i=rwKOE8FPqeube#>kR0h71~uay+3$;lKelF%Y=S3QQAK_ zM4S>0C7cwsl=^(u@;WAmpy+h><6d=hTxyt2%)OzZrq6lpda-+o`ARl4V!HUt-R=qJ zVx$BTi>eL~ryBJU4MLD~xDFhY$_#U0qArcc7z1s`FRvb-aMCb#lfHPnN7J;HelxJ`nAT-L700 z(ujH9LBrDpEh0asqLpcn68l6o`(yUB?h+=zfxJ!1pB`Iml*z-nGK71Uq_Jh1y!~v9 z9DS@l{K%p<56AR{O2<5VwGYGiyMg8bR$-U}=KFUqJAZonGi9%N+ejA7vaX4SnkK1! 
ze_zbumcWW~4Tg=x_xbY^E8WSvA;t2ghr@lp6$ zM8)htVeT_EZXhH2py914=r z@irVPsA@H_xf{@7BSaG>(ik@V0mCzPugUVVFpXOcQ?AwDc&~O&Kux!Aym|(f5ib~u zB5rOobiqYeD#8@IsBR#(#?y)P(1^lRd-a*mSduG{JSRZx+4<#Q4~ZD*kc{puVt6dd z&oT6@sI63kNaB$IRN3HI{+G^PZu%vo2?Y|mSfw@Rim~`?nMcp>$>xd zp+%z1>0fs6O$UJ)>*_TkC4bwftbK8H{lSz_`K_(GD;x)H>@;417BEI1`K!^d<5TV6 z4_Pu7Dzuzw(hl7u?yxv#fkqtAp3LdYuqQN^P(x6N7jHS^CUXU=vfRA(@SKB$%1y;` z4|Z=cj0z#09{%E4_uDekj94t04`!tH?jJ((zi;(V>robesh?j=OW+%}CPcthWTg>O z3GHA$O1@?YF#SS>8be5X5O*q$;*KdC96aSf(t4b8{zmil#}TG*J(}$OtY`Gu$u?O# z#{muBk4T{V(N*ZQ9rI?Mi-h)0Qflj=aEc8!i8JIsX*TrUMMi*In{4H>G1ip^0x<;<^XD6feJ}JJpe< zV!*RdE1!I&0F{(Tl|!>Z$xBML` zv?8x7bk{fa++I~(lV^X$!ovw!JZ*i)&jOD<&wq>ik{}e_`CGb*)#a5qG=4Wc)|I-| zt@XsnZ1K@=8fr=b$0?9J*-)fU-GlAik!79IH4uxJHBx$*7eIk^g57(5%*7}T}cEyT2XBpu*X-U+WUH^6^ z_GRX>K*Tj)us_32tmL&6VO9U-(l3*_ctuwhBfVYL&b^T@(CRKQ^Msqnq}9)T??2wR ztojUat-{gEb8@$7Q8M)GuY$^eH<)=y9?V~KFxB5?G#KmCj8K~z@_yv<3TW%R7`|9( zq_jr6JqoAdgf%JVN-lTf4bb2fwS>*y^;H|`?=!hw@qi-6rx)isE=g3^)JUt29-s>o z`F+LRVz96zQUukc+)nsy+GBg{&?@9idg#WBW)oqRf%W+_M2{;0`tIGlAOUnzgR;h| z;kuM#A?9?@AZOmYgJ(TMN}?5m866x8KTY#8ld^PrJCDV$MgZwU9B`Y=d2s6r(#~;7m5sp?=o<}g6}AO|**+ZZMxs`Gew5SevIpGDFab@;PfvN!8 z_UbeAnkE7}%eJ*!k)50Z0(wW4_#Vj|E}qxPjQX)I1XS45d}l%-EQgyt zZZZmqzhAd{gZ{qaKJoJrTXec6)#zKki9$a%FiqeTl6zi*Pi(v_dSNGa|?RP@1`!~p8z2{XKAiJjpM2DK<@+S z?<$o2^->?V`ipuu4?k66+2$c4=HVnh+xbmIZlkb5(KZBE1n5$4qgozJ7=yZXMH zFVv@|{f=VL_-V(Q=#Z%LbH?NtzQWPE$TE@jMjzFI^(aQxWFDW4CgGyKu8_liOJU?p zi5ar*u+4{@vKY8G{n5m91u%#S8n3*9UA}iAa@vQsM@n=oajbj@)g}_bO0RsWkcm6f z7{Hg0be`wS8y@{ZVAtv$ar+T%mfq@w=}r)DXxS*TkQBv^AI<^d*p|wE^i`a8E#bkb~mOwubnqSP}E)9< zzN{~)q1iBJA)2M&h$e$LuJ$~@oIB(0xAib>dU%Ia#~Jv3pVQ6 znZYlUebRQ(Z!Uuc;-8~EOiRfn?Xwy1PIwfA)s&YrHo)VFYW&9RbX~|)#@{y zpBCw1Cd1O7`ssv{hQi@dBMsWx*b=|MJ0;zp1aJQKHOKSgI9$!|kBEI=VzdJXE_tgi zv>rK{V<`RHm%l!U%52}k%pO7VZ1`ngW{=UE+XgYkILDw(QUA=T<`s*I+(o2;_Z_wU z9;?e;)^P=dOvs|npAq*5{7T2VmA=RD*lN(B?3Z0p3kA?}zHF1_`(`-?_m*j4H9N2M z4Lq-7DUS9hHDRyZd+yPOF zUU8rt+1EsMzt_p);9#Dx5-@Dp1~0!b_{&I_Ji~bLwta89o5Q6$d!1*DCdbwZ-y1Ku zNm?Aj*BjQ&4g;0bIlF@WgbCGd%%7HLv{E)P&N|-z?%XEiksYJw+C4?N4M*tCCKQ@K z8Y$wyGU}o%IZH? 
z-We-+lCk|2)znIWCq98UkaWGBxpFg50%6aKR4{-d1yc_>y`)I`o0Ib?UYY(3OOIl4 zum3aGQ{LDn$KUz-&8L}=8mkE%hpSbZ z7eVJau6};UyLblQXmLaIFpO3=JXV%C#N;QWS&CjeV`EPkULh`WqibA1SpKC3FF~dh(C(0Xwfxz3Y0mEW zR<{GqMJHSG1MKW1U2rUpMt^TQ$%Z06iz>SOHw-eMGBF^i6343sSU@nhxW&R(Oo~8f zZe9Q|(YQHeWPSr{pZ~*o%G!}PDFCzDz55{1ditytlsCG6f1>1Harf0~M1FA(AV}4% zqlcOVqlpgprTXsO+g0IRXoWEf(T4WsFJ;rR>1G}`B%R{y-QSc1W9HWkR@mG5UNjW} zehpYIqopxTTWw{Z3>h^GHMuMqha01d%%X;JK5J`ejnhC!CD_!}zv`-9vDCW4I7Qlv zS=#)WrM_HXyv1i6tAE1k%Q`(4L#P!89@y}S$$0fT^)2vTqf;}Ol6Q{zqI!;r#J>E> zfoQUd#3NXnda}DwO0~sn$A)2KmW3Iw@?OE^8v72NkK_`CKHU#|tgL$?JLB!HHYk@9 z;2ZvK5{@V$u_42L&>{xVe36O4TpYP0)oBQWeQO+hraax!j?ErdJ_g+!6mzT2W*{1_ z+AkuiV_T}N+~~xwzgVa9-QkG( z+6IF#G4aYLZFfT+d+zaO)SY=)-<0;E!t{`L(^f~P3!3}OV3yd3_w6yM5w+2l0eeB3 zgN3Maw(znp+W{d&rA24@{4zLSsXa3!c&|L`P1&jjicdRao)`283*iTTeyyIf!2Dcf zEg|L$V0_ndk&j>krsv{QS~LvR9v6@Wo!**4hE4&9u@`)jwU*{4_2QlDQjmMIQ8Dl> zo&@Rpz^crAgtmIwH^#t&j*jj7DURz&LV|c(ctM2`xVq;%F65C+WZuIscCAp!s_wjX zRn3>8J>Srbed?_)+b7GL6jdJ^iKHH=f3`6{4rgpClAo80sY}KeW=BAZD~1_x4$INh z7i#T(pYpoNlU#N|2wE4*k8;&Cky^Xq$NgIL;m^t3{}=Q>3&2F$BSVXscq?NOYDd-m zAE=BFHQh^uF2^tK8sNn=pS@$Z_z0ToU&5`TCc{@FGf`oOzdH0_{@d(eepW{tY5QSg zGZZ!u&g=374z`mwP(fU%4uz2QgB*uCk}yv3S{V63j^I>Dyk6ZyM*!!UK3^#rQl!{+ z=0*jTq^#ai{`(eukf=XN%6FC)W3QE7@P4fz#i23yE{VX}-5v_8SC_c8nXt_k@#^GI zG5k3W5RyIa_ga8INw?Rjy%?kLGT*sK(Alrq3hYtGM>}TJ(d~f87}4591*1iLR(h@> z?V4}!DJ1|{VZe}>Te?V19Mi03pX$9dV~ByJOinYH9UgQ_TQb$BM4o0TUxKgaJP z+ric^8UqczmM7bp@Mem;kN+7{?@Y|ZLTUX@pKn)3@&q7F^=F$G=w*s7Q$9G$*J3jL z>%DMH6UwtPCBQy*iJ}Ja4l2i*W?C34$SQNE*R^y)G-5&Y`X5E|T2#c5?mN{CU}wWWv+u-}YR z9k39f&2X-5INp#jmv{wstMfZ| z=@OwLlWC{SL9jN$q4`c`bp_TZE?llcB`+DqoCV(~FgD{Is*j7fVs6RFK+aY@NhX`3 z`>{Ttht$d!6G|@4oXs%*)yT7oO3aC#)3n}LV;!@yK;h?jIB3|hysIgy%_l+GJ7BX| zR2m~TxnTR6s#HivOY9;)Ovl%8YpP8*Ef^f)V(q0+RMIOZy2PuMkxFahk^gt#ub&A1 zGEB3F@4Z|%oyULg->%7o+D1bFQaC@aDLAokp-?t5P>B=<#D^gyW^#6g$$q)F>qJ-E2M2w3@Eobj>H%d-%p%Hu2BjJ1 zkx85jv`3erg?ayFmCGS8skvovbO2xbrjR#n%-U(WJnyU~ipxW3OblL}V5kP^$2ayE zOEz~MLHowtU>z6JkF2fUjeYsq&{ScFIEokE5)ye@E4ixcvNGwQ?F+`9gt7}Yi4yv+x|~WT(FooFCyaI?4|zH>cAKF)|@tGP=SbNxR>|I&B!?Y;s5%KQ}LA)B{#g zvf2>ngg9M0^Np^7C1 z(p?O>T@u1a&(bbNc6ezly08neT_aV-Ef0A*vno1esLhix@Tz%Dn2;S*B+mZZ1}=Xy zgD|<-UHZP|6UMSNYY=c-E2dEL9;eegzJ0|a%l8qo<@#tZ`58zye6>JM4Vix$g8dq& zEr$mxYYO5PC#WpMNN;sNg!I54*mTjG1Jv73oqj)v)t_1ZJodTcr{h10Ru6u_)t3&n z4#1Qv0r);6qEk}+%Uc<*&|N23vmgaf5l^67?-#7(#40s#fi8ls$)yDLtS*bQ8}m&iNeHKGCX0D;Eb=7La}%2dc$KViBD&85)M z2}8f^Z?JPHBI5aB5#A!A zfR5t89B`H39T;i$NPkdKGT-6_?VJP(nvPW$_rXKhDwzV3$!q1EU7;do;$)Co&!4IK z2SM=kLQ;^^Y2q~o&ywHfKE+poka^ynQ&m6X%WJP$=i(`_wQvjJL$LA{BY|oERUk?g z?cgTUA9z3GU0UrJQq}q67Kt&TfH@2@El)gZNgzT2jEK49tDD8qV5Jdf@fm#H0cfR} z703QmhCcbr6n?DYzQj4nr5K{lGS7&vTL`~j3tES-tT`0Q@#;Ym(fZoWsGra4CG&yga_R8nDO(bbwg^-o!5PE7T)va)ViTc|u~|LDNJHuY?f-Tf#n zc$ti?T{KmtF4%7ysS1u5x-P(%$_jQgq7_Y)vAoGgJkvM4K_Dq}O&Sa@DTg9-$tGEZ zP8UKIP#$&%hDv(hN~*CUWkJUr{P&#G3os*HWy!$D+| z-1VQhqjD^8u}4wtZG&_$92Ax}pe)b9>9UVRSj(v2%|5O@8^FDaI{HIoD8s5PyE9H< zk?M^~T1Qj-9~EK@5BRw4g(R z#R3SIO1KEJ?)^iW#-3=gq$_Fs@0aN-B67a5I`1ddj_Lxx=xBY?svV_-oCkVJyi=+D z4Y{+$;Y)K#Ssc}g3PRTRB`4;YwhGlZPV7Q zq}u0}yh2rmSvD1~nU)BxgVnM3Enz%H(@u%&*23c1sNUumVQ z`r(O+HO!n-u=k@5@c5C8dg;3Tycc)42l&X7piECCMdkk`EEvQ#u2yySZdd#qy5%;9c`tmotw z0y)#-G6t?M$}KgzK;=>hBGbC%uVCU}6vD!zWa${%vE>?M$CA`KJJ%7{Bbu@rHZL17 zQyR{adKRA3*4;=4@W!lntOYIh$rUwS@CQjF5brj0sJL1UG)J~| zil}Xt;GD==qG5g94C;qQpe-4BU-Ngi8a58;xQP&4&Ag^RN90Ji3urd=LWI`aMgpij zmgw_R{G6m-U+fs{3}2Rl*Vzoo88)au@iGk+ILZ;^duhzf?l~+PgMd&N}hLK5S;^;RSq%pN(d=(14d9wPs3*y3K}+I{?sm4B_1ItJrB;QW6fgH<&UIHu=7INhX$>+YAxcAYv 
ztaI$^vE0HjWDQqlWRNL|uMc`rs)tQKrp9>JAWi+~qv{%$rBGu#%+|VKkj&fFAYRfa7!D69iS)j_9FDKm3wMN&TCT%VlzK2Q7u(0A>R{ zU!RIOm2s^tH}Evp-OC%|aXCoN+gMS14d#zVa|4Pt=g`v6iWec#Xuwk> zWeYP^C|CT6JK%1+ppmC3^PCxiYk_d8zEIU4 zEM{$7hJJyDY;S1i^>r~QwCp4vrDfBH1mqp9V(QB=;CS9Uz~JFR5z=tS%w|OE(1*2Z zWZ7{XKP6(JmQ~%j{UD(&aNk^utmJlEwh9XOqw}dr{8G1x7=6_vDl1ZA8=SF zfAe5nAjTf}Jr=GR%zI!xLzp2`2mQ!2fb%WXs(bm=X3dIr9?a3&48RRsv>-%ItJqas zizFo#3|zocIU2sbXAGKb!0?6p2bMNy9*cH{z~r6Aycj$DAK+;=BsF`yhvzypm|ybgt66S7QVsXk^v9?lo||ShcKH*Ey_~ z^eH6|WDVLVeXd+ByUKHOZ)~i-6dAR6XXE=O9ZAIY>GpLZ$g_tiPI|esI9pW5EtPxl*4)a>5qLFa1L4fDy;%l` z2O6^6-o7nkL7paD*9m9{J@BZtZ@6xKZW1BzarJ#RtW)z(!BZIIUgomP>`eji_P;H#36`-_W!V6ihIn3T^=lO*CUTIFA9r##7_e3_C6)whp+py&b=W?ec;W7sTc+7+W2c`=0;R}n@KG87Sh}@urR?h5>8Hz3;{E|%NCCSHp&G|nnSOop&I|`Y^3tQceFyy)CNy%id(O8+A=gKp zj1-|y!F=`3(46-d)sB-SjG^sg8`-w~MG>YzR+BSwoDvA8Em4YVBC(V|`cO446ck{b zaQnlU1_l~Is2$-RM8a0a1IXVsAhTt#dmxWecMn0)DVEghoW$LjO|b$1=gf7NO#lFG zX&#R)!q$wnKVFO27smx(j;8IJbc(o6slISU4ZkQypl=q~E5Gz2<$K&K->dwcG%Kx8&qqt5P zu2ih=h20KTqgS`H0vJn$wIfkp!B~j!f}ciu4^$tHS-@nwK^$Ipm5+TbAQ?Yj3pxR? z8&H+2Y*f(fcH%<ndyjycpdP2wRWxiiV7-gco)YO^RQ3s)jw$4UYcKCxo zN#`jB9|jFTE@p)#J{$UpFMk>Vo2kpmMJfoQRl+b71!%(Iz7A|enF3#?=H4jYRl&PL z_LE&3PWf#mcuXAkA>ih%;P66t435gA))+Qy_C_}6BHc03?>GFS;sBfqKgQn7wbe+5 zn={R-wU7Wc87m9qm3T!jbb^Xr>V4bbZ}_Znf!$DBQt)+Dq7CVe{U4KCo{_$+!7NL@ zDZr)~T0qlx_?!qH=S+Jsa%1{)c5szAB7~tQC(l22dLa(ag~4wr**iTrpCTFsrf*X#KF0RCgx(ECTNf8&B6D7B zC%+8)a1TKNO?XSWx3?y~w7@Q+BxmG;Q*Wf}ll(M0&C?u|311clr~h)>mbvFjHHMic zyO=->4lH9D5yYgo3hOV!!`jPkx*6k$)Nl7C5={n%J$e8182E0AFN@0bmDJI*68=)H z)<(}`InK}V6AJGCiO2#>M?Or-RoOef9z<@yoC#d*!5-p>y_;v``w_O6XSDD$tWZxY zoi_dIuK4Cl{bs+uYs~q2Z8jqyBBQHMP<^hs_RpMAqi6f|JRxjU4{%YP!b{}jTS`?0 zNjpXgG9x(<0+wUmx=>Y43?mu{bMV7atV9r>bLZi=Yq^ye;#$PT#6Uc8qi^oZ1>;q; zBnKQ)Bh$%n!{^?T^nrv!?v$GX{jH{2ipC!_ZB*#y+cX;B`JQ)!bMpi_=nTWazY-_9 znST$9{Ts5NENe-nE?VNaKiysUD~hUJvbBs4 zx;}V++oGPro1aBEL-_N&Xo>6oR|sR0bp5j3j^qWhC*)rHoNe-ful*Jh^lAg@ zR>Tx6nQ(y6Efm2sK>PFtU=CWgEE@5IYV8Awaa%%{u9;r3N3XVi`LbBY{2dKW1q)Zv zkk%&1qGIofU-ySvM95hkm3eoIpO!D{Wk!C?m517?+ox}i?Q5TwzRy}H7bpC{lR+Sp ztfBB7z1H=V0Go`)WuaZ#d8gGMBbU8vPNa7GHTWz@4%YSZ&ut*Ysr8W4nW^%w*OQsD z@I%&*T5|?|xUx$n+IQ&%rxB`n!!flx!vw^ro&d6NefT}_fYsDt=MeRn{LwJJZ~Paa zB3}c*m5o^YNWmh1m(Qp$cc=?}vMi4iA>bn;va?#uaF+|RWIbL&d0(lIQGa%2a#Dz~ zg}u?{iG6`Y#}V$5Ijnsf+3<+VS#a_P!!ewZ9=)iHm8~|{&}XIc2*(hlSxCV7fTQyW zMQ=0TKmZYPfe?Ddv4G7&4BAsT4zc2Ms(NEEL?pPmO7&YRiHEexbQH6@3-?4t5oNO_ zD8DiZWjqE~gO7Gc7&L=PfAY)HT}v}Z%6=)`DedD`|Ek{~qUGnUh2rvc9P{)#pQX9Xla$mAQGX1A<2>GFTk;Vs1%m0?r7g5VQ3NS? zF^TzhkI{W=vh}T%*J-ZO&uqDglPuZ)J#qD~#Z8rCCHCSUmq11!f%B!{`6*J}cV23Wzf^4~_IL?-M${h= z3v861Wi9#JzV(IQ-}iGiy_0pnnO)X19Fu6RAT(*>5*@-6q0vjoj%aT=`6Dcshc5p) z$_}P(va2TN_GC)Vo>_;&a+zLDAdfjZ*x~wD5X3X)Izs7f{I{@prnU?l*(ArNct4A@ zAkc7=I%&#BtK$fV?b)j1xG`xJ~SLiMBA9={39PEwO@)e;cgeLl-fyz808P=3`{ z($61jG!ss27ese^*Xfa^?90HPzKZs>gyI)0=7Y)h%%n})Sg=fhaOTXGs8gIN2!|JC zv}|owN~A^OcgkLb&Mr8egXqNQ_Vztm!GYDch{*%n%mN0E3C2SqJ00iS9zErcg{f_` zxIRh?a5HnZ>%F*g{ojwg3a>cHJp!2GC-NXe-@CS2zpXxC zMxU0S-%U=1L(xVjQ#LL}_T%%9Yf^EAh-puo$Q{B~8CwXPFItO=MgaddANIOltm*(bfIYfSW(Omujy9)7qP>8fVLgVmI>|9B5hfpA z$CCZ>!Kz&ge!D^U+kFKR|H*d=wF-5=RIoJI^7GZ*K+);zNa;dT!N*Som$L`AEIw3(%YuUgLoUn z_H!OJ1oQ}!o%0pT4*8vKOAH^I2=lRPit3A872IBWyh4`BCL(TDynuLo3b3r?SH<2OQrWnNDU}Bd|?$m~Nz?^yt z->Vor4TVVdXg4eSaw9j-AmH(w(8{DYQc2|D#oD%X%W56})0{u?0dXaJ*W%Pydcx(% z57uh;;=ju81s5!G-i0&P8jCx3$|L8z^$fF7kraWvbD_t8*sXw+HU9Jh{xP>|sBliC zquC^GCKbj>)AMZ3t{iNPW&+yXwma$PyneC-vs)=Jqx@AyI^YNnC}v=aEAdvm0MS7H z!>~))XHbIXW>Mt$t?yHzN~sd+b6{=U zV7c?C+w9#Iy^8N$th-)_)J!&6wMX%cMNc)ljn}E_Wn}$wZJY~xxiktyF;)ePt@<~? 
zke49Bsd&irG2veFDOPbg`67#vE+z6IjI3Eo2+!&j7ael>cr7SzLjE=A<Z7g&eBS1Zf>EK9g zwk-Jy>I%Y6rR7nd)TnDg=#s@rG0FxSOp}5XQG!=?%qFuyor^Ir#w~{`NTzjQz64Jn z7MJjB3tAC?jJq+C_ueL}QVs9}ocvh6PQMx|?8t)6DLaRE69(hX7+G?N=t#j)s^igTl-R)}0C*TwUji=jq3nLANefkO8^?!gMsqG< z^%R7P8E#*_K|-TJr7yKw$n^n7k>wOj%9hj8nVUUmJt-=&+uUJ#~{3V*XQRPTbi&vmMJNqO1uE1gYwb z_k%t((fz_}KK^CEh9f!}8XO$e5$@>b zy3-P0S^niNmvP&I zBAF&y!jiA|#GOvmLZ*cfZzXw*^N+-hUa>!d8gW+0P@)c;*cX=q$s3#WCG&9gN1W+! zF!O2SEh#W%jDAjTP563F+RSFGqqF8vFPzYZ*`W=Z$GL2QdUiv7nq6f*Wp%Xs+puL2 z>6W0~?>;aR>G?>cWX-~gH7}W+&b-(=bOV(!^lE!U@2LSSAngt6wQ2K^Tixjwy379G z=#J*;gD(UPg!jsd;^i5oKB>Nex)1EKZMUyai`1>Q7y&JFWtX%Q#P`v zTLo{ww_{lpfxca9&!4{e6!F~}6m$57}J3HTW$y8eop9W^o!(R-K_(Oz;X*Bz9yKK`R@@zP48={cb$E;^A>OxFJ*iRa< zjZdT+e2NHJCo=|faUg!LGx0q9O{yPU!Eaq!k&!3MUPPnekr_wE#Qlcw0omYJVj=V$ zt;Hub&>~Mgww6?=#J0@(n@o*vlD!5*VMte?Z__Wc*?>WLP64Bxh(L9zM*j7G$mM3f zCu{71_rJYgRA1{q=&;E)=-9*(i$ROYSnJ>A$^wuduvD-JVr3w%Ze;I=$g|a_JA;~z z@fDUEiLWYg#?awP6dFq~# z)F9!FGqlM@WMfEx8pJMYiLn=V3*)e7=(4%3pJ_I~-gAr9q*jQC*20=!@05hcHMeHm zW&kelX`hj+q8_Msf{=-yGhKO4X&D)c$pjn0sn~Vh`5UUP<{B&ExB>(HeREdoF8_0a z^E|r;)>iSUK`mHjLq{q`%6eIB7HSN;3)L=CC>iY`SPt*0$ysGaqIPN3a|A z4+|=~&}+h1Kij_#CTA`1U76PhpkfU+XP+H<>Uyf8~QtHZQS%e!vs zquCRE^eClc2J@i6_n(GkHCs%#8W=@0+&wkw6jZ5{%@5O z9-YNol@>!?g=)R+!fp3k08V&GATi5S7XRtZUZZr9KO-dUtWCbd5X(XwgOR|$Zj zI$e5=NMbTI#X0V8$W}}6Zt!p#;;73ByFYTThBWaW!f$hf-{`k03B$uR#tqT@P&`C- z`wh5jxLCvSNYXskpC76V*c6Ky)iIj+aU0g0atA4O$q7;7u)jp$>$irQ>_cm$0`o5O z%H!8!Pb{#S2~?=40=|!f{3*|@J_;4BH=#WU{yMb4f&rtjVB#)Ic738g0AoZt7Up%s z=s0J{YHXKPqD2B4tGG^nltm-aN(z+;MoDl{mo) zni_}i-2o@!L-PjTCG%6NHK9qWtZ}v-)mt#j{VQ0+$uTf?Y@EZMv8=xvdqTS&d2pNVK^L;aZK&1NDrR$ugH%E+tR3`Y&hU~F z0(TT-SFy!Q!@r120C&l9hp|JTr2U@3^eQJ^E$twL`_%$%3w3595acW+Qcm5N-;Xyz zcJE5p#37GToAq}u5mSWsvX@~UZ^sd@zC##hFz$mEU^(#1PvB#(tlzS97U7t6(g>-1!MA@RQ=L3njIWL5Rq4elk3rf9KW@+Lc{AnI$*l;|h(;MRO_1XdR|DDcEuF1Nv z#J!1P6^Qb(rl2k;Y_us0iCf>@%TY0FB#r4rTz+iy(Tnz(6fJLEXf6r{pEr%!sVp`v z99|j5Ja_UOEI&kI_7CQiQ#V)mDYo@}0C+>oA|ZUg>G>(tG2xHvVje{bt_{fy5TnIo ztBAn@KdOt$ISwV48BQtRR0$c`I3tJh0QDV0`I#um1$tw5dp)NwCdn5CN|sLMT!Gat z2^6Q`zjb>%2f07X^tq#);TLE)2bNo*`Hkr!q9)a3$kClhqSYOK>|k_crqT6RB*Q*6 z$p!Zo1-K)}rqfx!+p5Ngh|asZccdCp!2mYK4ro7(0~XlG<9;Oc=mcQB(Qh~)HhX9z zvwtOlkwPK8SqNNxQ($1rvS@7E6WewswryJz+sVYXZQGjIwr$%lv(G;7-FH7~b$6+& zYE}QNUR_PnjjbUDmE&?RQE8S)i`^}kx1_Zny3P;*Ltx@9gJQyBeFglJv`LF>3rQ22 zP$c)6GkHke|3UTMAyw}|z&4_X-}XnUC&yyRkn!7UW-`cmhZyuC-+kc2*DZx)5x%At zX86LMJi!4?>FAcShxFCzm6frVF!1aG+2Zl1%^HNX=`U591uQ3ZbybG%3zQok>%9zb z1K{@6TknXjtaT}$>Cc;+-8D^H^ z(wFj#ODPtQv`Jby*WiD%Iq|q1|Gt;M|1p>_lb1`gMARdU1R>T}8O7F#^TvLe9;SlD zj$RLYw#?gx(0#D&=KIX}{*x?4myA{T#0s;wSZ2DsM_Ef_ORPRtbct>?0j6ZO!Njy` z3cG#tC6sP!i<5mEC;Ov>C!P7wcRQrJEQ+H>e9I5vG?lG@O{CT9(ew9jNsJrQFB4n#c(96R^Qs`okZQ`*C2{3*=h(T{6u+ zjgjhFXS9N-K|s23rH+l7moJvzAp=EZMcLgG6?R|-;b`kZ4}=ep0wEF|b4hBacxpuISwe8kJL^3;(6V74XGcGrW9(UWyZHA!Ha$hq+*PYHmx z^vSGn9&CVmFk6i&II!r~w5kB&_8F+cM7Qd>NSNWvNj0^@+omvLcZW%hV^s_rqDw;b z&&Pa#Od~{fmr>9gN)S%|GV|r_X=VY7cRLtD{afPn0lrEb ze$VWN*(vkOlyJ;i@AKsN@{&%1q5lYBoLRXey z)fdGbt2d3^ju~9pZdnU|em_P<0z|)*g`a#m?^Qa@l`g1~bA$j|hJw&_>(hk2+IEREZSXQjm0MnV#6Y zHW?Cpg;hsOF@b)$D%NNZ#zOmK#+K0X(;Tcx9@7fB=wwojmfzCE)zIZC&rSJQ9%#`4 zl3_su>`|N<`b3|c#89gVHjx<#OT$D+wh%`GG9Ts-o5=r`c zrz!ehPou~Ut{TQNsElLHh|&4>XY+VT-0@YqnIiHJ#t^;7+4;g^Pt*yPGVC&G5asV} zTwSR6BwC9^rzM8%Dba?S8;OEV;PJebJ*Nvhn7K$o6GNh)93OCTw!jmNEAyYDA6zz{ z{#;z3tx|XD)hq0(+O!UAX#FRQJtFeZu#fo1KDHdKnGR7VbL`x`^6c14`nb?SxdqT8 zz&{m*k?KGD$M(R*BV0I1nEen`0SGKK_r~`Hwt+Bn^7*OAq=$`Q`j6B<$cdKjqx1!1 z4?xqfkgTE5NI%z?*e4$4c+$0kElwFI!t#PawLFP$VJ79fYb60;9Owdbxg>BaRB>|? 
zew(djxmuLoZ{%!{C`7ty_FOSdY`*(CdU{`>8Xl=O)`=mq6&U5z8 z6vpW^^epXC3xf`}d2OZDCW`1UeUbb#?p zQnN~iK?9mP>grDi9@M6p=w?J+CDFgnvgsPtB>a&n&K<}rgfC%I@mZ$xuB_}*Y!bIG zR~k5=Ir2|VCA{2@C|GkkMG2A62@SA7O^6wUq~KyEZh$6DV3U2Ssq3_R3z%;t zPvW|J?^yc_xQ9HuLj>c}(wwK9!x2CkJ`3!q*hW0i9rA4`0R5Gs`>F9mrpZg$2lI?P z9S>gD>TM{Sb-3bkJ;0v%MSPzTnXcD&2PR2>#m-47T`Qd(-+f-ck^dm9O668!Ys1N?|#a#n3-eSGE+7*fXd77Z!uVm~xO>mWO`qLCq1-!fB4 zWWCkM@3T@yG(0eQ%ISE*RJbbqT@ohJ!sN*6h!D#AFRIRTiZ(J4Q}rujmnA*BmDE_G z`_#coA0=V%!+oMp9lW>)okXFz%T{fQGHIoFv!(pPIr+RKf%^JM9} zGW9^h-ziEM{j8c61mA|`5F`x=dqt2wN!RkpidaS4x$S1}E5_R0qAHm|I?AV(?NxKa%z*bs9l?V>wcxoIf9^;v8)R2bClXHLK6o@2 zAN0ba76~e+%v{ePwYo}6m-vz35bIv#B1kvQ6|HK}S5few%!$14-fOF*s>{ZXG!-m*413vvcj-Z$2BK$7rb7 z%AwY#JwZf)RVNefv_Y^1WfYGBjH59sYPQ5+g2*rryMjq&>J+%dt>u@>#D0oL>q7H) z9W4Fb4J`{rb4RP`~AP8anLws{E=LvjuGjnik zv89$kUTtas>p$OL0C6Phw`*st(fad#Ict;wug%0mBjc2fB;RAdmG%)rC(|u76RXj- zO(YHsw$lZuu0_o$P9vNT4g9{H)vhL5-i!RMGKkAYG7syHc&F3+p-TA$E@Rr)Q-uFw zZ!PttsLKE1D;6YYs|}2}C(a(Z97-inf}qwX+4sC(*5nk!VD11f6fPMG`^gr1YP!y; zmY2xt1#m&mD_7L2KUtj8jS(j-(G{hI zw=DF^YUaZ2z$Xo+r>a9;&aW!cLZVqI8;n|Vja{DiLbDLx+yp!0t+l=Pd{$)viVmKt z-=F(jM>1m8&h}dki_B}4-IjA&(VrPEhx=^Q8;Ds*`d5&!q_K}J_vlt{ToVgCF%q$v zC6OBuf76D$qdu&zCEA$FV+@bzh+1Jd>j{zYv2fyw9jZ`hJ*OJK zSi|;JvW3Zs^0PC&LZcAB%Zc8pjWs*aRzJ}C;XSBierZ)6-y4FhKe;w`n*niEnl95h zCg2OJ!{Vq?H$0@3L0BA&bnl|2v`~;YgE>hF^)WQjtLsDR=TTX z43vju(&w(CCwfA<)Vy7nk%wGv+kSLjq}RlAa5yWW&H8hFws;v!2UZQB1<~m7aOc%f zz!b|qpED?RRf7{cqlrQd-;mf@)tk~Ql@83^;k0|&Ss$Rx?pCaEw;>5%7Z3bx^3@HT zrGZ{`Q%$6&T3XAYc)`0#qAaa3Gu@^laq$4ym5MLKnZuH* z&O>ujkt;v)?idG1yL{vKKNpaChlQgSk$q3c0F?Ij{36bEY_r5|qwisP^t|g05fAW3 z*r#907!188ZHm79OOI5p)r4NC+@{>C+JQKSH)b0XVThas>+SLw16ak7`dtuPL80n8 zkhb;w<)XP~uE8}8^p3m|6^df}U=xg5Tu;aBzueEbK%;U!hi`!E2q^_DqdJ;Cz5B(3 ze8b<3w$>ZdGlB%q-o`03%!U0m|KywDpy46@Ahq66bFq1Y@HEYfcNpqg|07_(Ul(ow zIpO*`zC=+{cr_T=!FaElI^g#?Ug|n4?HyD{$5k6=k(DW;39v9pHL+{MR*FSHd+z+S z)X(|($1?B@3HUAR5oC zl}d38aEH+1#M1M@eRM;b#+k*YA72Gasw7$+fs}UyJ&dy)O%t`^PX+i3#9q^#S4jJj zZ#62Sbl9MFA|mu~XC}$e7861|PZCL&vnI++xg6tVwwG<4Y+kvl3f9h>igY9y=;nL& z>y-m=X(G|ufMkYS9;_}vskH{dY{Tx%+nvR{5Ujr2!_lGrQMYi@7Y=wZ0us>)4oYNR z)vt8#3|m!e)YOd;ugkdHmY+LeP$sQyezXjj=9gpF`=OJp?Pms;ngGCcSb5?T3`|56DCYnodsoSF>eRyVKx ztT9@2p<#(w0y(m6>JXP$Msr>IwVe&FDxmN+{JzGq)g(!w_ltc}+7{L$u26wR+KduHQCziD_hdhpfJ9suWpU-#_z7o8Z-|j|Z84=x zd|+K;dSN9-fU$`EKq>bh0dw`*oiRFpD1HD+$$r(POpzJ1SDM3aT= zRDAF+(FYEQY_+O7H3e1GFG*Y!2w?D8(PK$zX zEvE(0j=IaIsw>CJIm1`G+nYN?C$maHgS%AF4v{ku~S8YE(DSn!m zMu%PjUv21$Y>v`+^#iIAY8aOc2`Pm37ww~T<#7iq-->8k&&BC{w4*OfksV$BQ3MO1 z&c{1$ssKPMB;0uVoaWDIlJ|ZmrAFA6DOZ54A52RN3)ECZ(eMqTRZzw@Mh=el#`@NO zd$tA^Pz+4?^!R@}+}!wd0{RZdBDOY;_;iBO(&7yC_;e!X_70AMX8QKuTy%d)LdFh; z_U6AFZSCh|fU(9f`k_{#Ec7{t~_w{-fx> z2K}R)ogJU)8~)P&%g^-h#Q%H3|26*qjQa09f7`!$nAsTrfxrEK9HwXgkJGG7_^j;o z_^d470bpnThsMOp0LAjJy?+D7!SHX8zGeUN{-cA5^`F4}QsUHRdnvFZ}mSFwlQb{68b5{eRPf{ol0so16R&hQ{BS`PU6PLH%E1#^$DG zj^7@A3)B5g%D+S!#=le%D}7T3d}gSB@>M{<))ik9pN5(KKh$plj(?PB|0gL$^sUXU z-0;Z-r4<#PY*NNHrjBO#Oic8Q z?Ef7x;4?F_v;B9(`27kv=;@*UW&OL5zbE*2dAnvrG=nOkoIdix!~w<4vWT$>y12No z!O%NOggXY{=Qy%Kkc)}Ah&9h@&(hrdnB;l#J^EaH?HbS+=YFd+?Y`Qa+7uKosu(0U zxYUD+bF6VO24Sqd2I@yq@}mYoLt`aHLsKD^kZ`Igr#ZG|LMkL3UI9JVn|d?yPhlGF zM>RlZvkMwm{8a-!+_nPbYXyK$Mi&#F90b}k`sdHxRw&$L2Y_AtlAePv5c1E(Uc7yO zA&SH8;}e*s2KykKyRB*fEG7+rnBd^>o$3PsM6MX7+ z04<)cWuTvZwg;!D!(-#;mzR@921kGPx2-6|#sGeSw`%|@@zdfTE#W)#1PlQPBXdpr zXc-FRfs|m9}7;-;eY=(w7HI{p5R0}N0rbK_^%rY!iUkl0WB9vrJfD0)zJ|BF~Wm*5nBw6ETnBNs3afO|Fk8E}vq z?)Q(o2>>L$L%7GLtmwHf_w8=&k0El((`~?O6C+bV`a1g>0AKi<0nnG-x1(1d?yu@4 zAN54(9-n&my8MXFAYk7kzI}q`$7W!F!zCY4kcq7m0qEyF2!Q%LVB#2lfV82OCDR=} 
zhJbhA&#$P@?xc5JhOe5jFV*EQcp`B&&9&}9^Ut*F?mbdxJq6R2@0P;fEx zHDB4S8aZDDbwI;7=O#~leT-G4G4}|y%?%${#y}~(v~hayF|CZt3*)DJ4EImIgMyDM z{ulhCaJ1Xtoq+TLc$ZrNF&Rwtej#Dd`!kKYEY!383!#pZ*GQOep^`T@) z`;Pb^_5d9O*)f3tZttky2lGfhz*UC{0s19%eS-p+X8ZQ)08$F2&d;UrDts_~0Zi;- zD$nx$P;zTRcxZS0ekO}TggTp$aK#)TM6={w zltR-V)(Q-#aRy{n7zAB6GBLav zA_dmp^>KX~`qr=p2&GkC#3%dGH-m7j@f(fIFCNUm*!Ap>Tq2yp$>Ik#1cX2jH_Q zh``3?#N<7{0QHXzlee==0$xtOH2iy9{>d1DJ0q5UaAdBKg@fFZcf*61w+6mwe{zC) zqJs66HGk?XH9sHCFBZ(*)X=d$G+NeZkr30UCY$K0IzH~-eRI_M!Ow;VCuX4T_d%I= zbh~=NqQB1(bU!=9okPfU@J)k7f%_rF_O4U75gw~9qxTR_jGVvs6nIi^5H8?sqS*#- z5U#e#ZUz8z&wxs)zVsl}(Zv9G?Sn!FPGphgy25=jTb*eLcv1}4BsX4 z|0VA}&kcco@Qu&A{l+W(_{OjP#rp^$&|-ZH3*RB~x0vyK3t#+Yx+8Bvplypjrt4*7 zs0VEQj1}Onh+g?P0$AgN{v4;x3hn^pN^f0JU-rk_iw)kRvkSV9l8w&@zPq~kTF2h) zH(b5>nL+<;G1+)l0%QpKq3pIG2(KT!`5lM=yC$z6LXpaMF6_(!f#g0Gz|{O?U*Yyc zP>>ykEa=nsP7vA_e5msQBB;R0`UycbM0QPp`nPlR-HFcr?ktfZ{29G~46Jeg01;4Y zp?~lI!b{%S`J#vm%sWpOv(aq@pLc9xYrcDKVrsej7`P@FY7e@W z#^)9t%xDY;#xDW&94ZUHaOy(|d(CgCg(E$lMeQ@4?88C}4|HdG2ILu&#Fvh~9Z>UL z?ghwOG8PX8_MAgED$B6cosO{`F#g^i`Hh~K69Ie1XZLAfgiqTu*m#6XF%_Wj zu>t`2WB+st4&pgS-KXhi5BT>_53?XTa=O#X8`8O>C;h2hT}i1}^T zzo#nWql>y>rFa~}sg0B3vDwP(;T8)$iW7#CtKzb^1dA4xSt;2MVr(S_G4fgR?mwW( z$m^x&$Zm9c+ZmCFX7%yhbEGpeqYX{qw&F(zXNX=dh3gMI&S+}~wWQrAl1uB`8g>=8 z&vp^}GI~e4s!8O*v?m#>f00CdNWn&F@_ABl=BMs+<0!}y@2{|I9R)tYp4m{(PEXJ)*II@?puK|qW6|wyG}1L|ri`~e0xR0g zUt7>OR6$;s#DSsDxFRW=khM6QNr=O3Y=A^U zj)u~v(VpccX^uW}K%R84WT*#Z;voraavY7-8Q#r8Tby?9+*d6a7KtaZwxsJQ!r6pn zi!75A3W4O>`P!Jsl4P*{IVr0vFT1rb3|M_1Ht`4>HiO!lTF=avSoFn>3ayLuQ97~w ze(zpt_H-v-)WZM-tES0+z(Q!)w-*+^+%jdB+EP#VD4p?rp;p&cdv(=>I~0Ow*?P`z zkHyLs2BYsgD1kI?4j;2yQ@THH#P`n0;r!hF=<3Z1+|IWP7{<%jIp}Apv;>TjJUyjg zsjN2_uw@h5Sp}Km1=4gA=zK(p_SIcYzt&vk!#rOx%K6VBm*kvLx_KoWnfoiUO*~v% znGG&R00eU1?p!#P$-DnRfO+QD7IScmlL~IUCu*O>4pSK%B{y<5PP>t|E~XwIY%|4t z*Y`DcaJX==8?>0`u{0NOw}KS7SZ}aRq2k9}ZrUP$@u&8#Nw!s?;1Y6K+p-%^zrMFaS*JXpK-r>2k-;fL6%rD%bRF#(3z=JjfLK zI-r@w2_>Z)@+IjcUWgRrO%b$jAs8ZBhQEF+zH^Fsk>P?B^}xHH;iLMbJni(~L+IXI z)j)0PQq=){2OI?tgbxg{Fam!X>(TtzW=eRYemrX+(P8l0QF|WE4$??hSF2 zcJg3hd&x*fA!$t&pTX(1+y=BZm6exq`Qy2W&a^dUF#o_y8GL-Wi$DK>g9kH0IG0Qz z{zGb>c&o&DQCBm1E#K?M&?2<59A&tx3IhX79E<_bplD$4RmhWPusTv2QHk!IVhlad zRR%q0#Q`BchiKOeGP z@xuGZM|GBr4`~P7+O`+2ZMUVb>QXu;H($l8(NCof4GVD9kPPv~vjpoE0kJ0!g@C`uIv7 zn<)IDydq2<5yO-Yr^F^C8&*o^6>*pvsATqtA zHq-iv4xp1fMIh^X?oz9zU!a)w^csp?BvM-BIl}*Zyv_R6dA?ulXqKYkJY>4H?V>3} zWtJinK8Nz!5HT1L*9`y1sHW>R@Jw^rr zC?hPg@96G%LX{po8<26w_S0dEnj6|;?d;gpwXiQy+rUN&oYO>PR#gi9BN&G#!!z9g zE)6PB0=G+nW6HYE%Ju>p^m4RCN)S|WEJgCr+i?oQtm@*4T1t7LcS{>&%JW5qw2A#bQcy84+ey}xeU&~_pUhyiPa(8G{KK1ONNoOl3%eY=N zRo4FXL7Byo@hq;qCo7F+;Arg$0qW1^o5*N|Ap+H{CU4$^108A^`MD|l@d=?;QW(@y z9`nWs|3gxR%k2*NS1o%RR+qGlsJyv?w9i5W>%K_cKwsS4S%zyNYq25Tu%D-0tSGfW zRp%#3$y-wSkcIhq|GarmfO=k za962(-tus5bx;eIu4AXkD&Q?hkrCsWD`%U9!Ijgi#uvM_<_B?x(ysYPSKFM&H~vJg zjiq2<=R3FWtY}x9O-Me896K)pizf zK75z}8Y}ELbgwR*aUx#WPqy|wsERlYmi^u)Hor*wal{s?-^Ao*HHW?{`tc5+Nf_S; z*|^G8@Z-0Q?LZsB+?+zJaj98D!mS&QGYQ}EVV&kUi3_}yRWWF`=`}6plk#t4BSLo_Di|pMHBK0!J?swN<1S6?_5I| zJ}y}$&U&bV5OTh1%V5>`@>q7|Q+3Pnea00y3C&P}33@RTmQ}|#pjjawDdfH|*K%`} z4g%r<-0U*&fDzOBkif#|<*BtywJ{W3!<{4FMa__yTCNlB`5f89_w{~|Osly{Zg&&I z`g&DxDM2zKiU8$eIwuC5{UW^d@I;k62hOIvb~JwZApupUxEv;aMIbJh!}9atrH-pb zS2!HXiXBQw`ZcWy0k3VV`O=s`X?9xd;^wp(UO84;HvP^`Aby&y8a6uZ;Ao81=EX3J znVS6M)%$(eEogpyoXqlQV0}K0@;if!pv;wRz-H5^%+J(qqDP=Tb%^o2Rn9hpf7FSY5%`Q(GJo;@re4)Zji z`w%JwwHXdYl0Y(&rZ>0HubJEi^y5;k6GSSTYha*e_M-*{HZ+gy&e=#uEhedno0wSe zwEd(qki3JHeku?v8Os;*VvyBcL=UA&nd`nhuOMUl-4ZbKrU2V8!LE8-&F9t`0;auF zQr%8~u6iAjb%RBeohFKoSA+LRTYJXXPmCc`%oD}7;R{s);a?@bm{Z`K?nH)xx40l5 
zDLnU!EH6o6A_temLp2sfCG=t5NS7^0ToQYToW*41g8{zaxG$}tntrMt(xxQKIg4k`1PQ|l%Y2Ih<~wi*2(I&cYT_-Z#? zT}+aUu3I#+*pcl~RV)$a7CuwWJ{e-AU$#hRs44&`B=sxtcy>!w*_f#TwVmO$pFj6l z_7py`l#uo{;2gZu0Lj3iAqpf1DvQ?O0sRjI7EEnRkEGP~(DRS`gzC7ba#I%r+0){H z@}HqYpEA;fg&h~ggLG#q8-tZ8AP`{V(A0+> zkerHJ3xAFNoCZRaiaI_Ign3cHB56WlQi;scPsrO#T)&wZk+m!IaF6d$(XDp+G{Fwu zC9=96_XL=;CQK~{mXLhsh~G@!v`BbW>(Wk4h`O0w0%*G{JVrNg(85gcJCCr!t>|#q zlg!Mf3G6*u|I=ZBp_QBXVyUX1;=j>eZD_0==}l?;@e7Abbn97&St#7#eoe((Pa9Re zqHmLYgkQ?g;ro2r!+R7nYGRm4eE=#ay9%2(?-iyxG# zB{~~;Osy-{WMt3jwC=@lVu7Q?`d-FFex)BjKefPg9Q{h~{Nd>A^u>x1_a#+OT8nia+7M}U=s|{15!Ta8ru6es!lnvMQkYN%uFIC+U8WMhGL1xlXL5d} z{`=ayg_k;ka0!YstJA0#)X7*5I(Nl&nqv8a_B>O><{V!?5d(%yGSX98F49FdvEcY) zQ?k}v9Uof{n--R*HORyT8+zf@A2G}}<0bIh{yX{dd3D~2D#o8kGNSqq=qF1rl$dYS znHbatVAh5dB+)xh-t^k}TjqAUHKcG-pq823<9& z8b2>4z@yObnnH3pH*5n zFzjpIeK;objFRs-LYpj)U3Lp{j*H>tK=S*wY|nP5nUJ92^!IAR=ullH+{Lv&&X|r*Nev4H!}Fd8j^8ncA?SXzcP~M4V;t^`J?uq77hOevH$pjsW=01pG;;Rz z22fWWm*ZM9ZuHoTUEwBsEpf`ZkGN)P0T=EZ4h^HHR@0QV%Uu!`gH|yCW5Q(fSL=&_ zORi8G`Fr$&E`Z~FavE$9ZDRF4=Nlhbv-O#i* zSkN_V9dFdowaP`%*{e|udV4aSG{1C$QpoAvsvYsOt7rz~_gNVfN-&azrspv)S1?!~ z3bQCy3K#3m^rFq+-%3SN`qWR^@7#p}nfp}tChSH3W(KQ=rgD?y*;Lzdoa|G?j?&9> z+KKEo6D)u6;RgsiMvKLW2TOg_>gxH0)!Mc(7Ww5!hK<0}qJ|$v#Wjb?;&dBMICXRAdjJZog|;&h9KC3+JiRXdr8)%)Pk3Sj|jx`7uBVgzX(vL5m9|OR-$UroRGS zx72BQeOIVawVn-Cf4Je8WP$1BvqGM_M*-O|LC7zhl|RsM=3IlG`T51Q&QJ)0)(=OR z&icZJ1NTJ+9*{>V6L2{VmJpv9gxoDUgpve2%K{f?VYyF~>Da zr>T&9+UF8qo~g;~OxCzXgMT!z7G{02s^Zgv?I(oI=F_PYw~<&E&dk<>ILQ!zx2t)^ z9UZpZ@Bak#x6t6$Qo|k**VCRetiXPgVEL(135 zarAJirFSXOP#~Oi1Dm%)W()`f6ql(@tx76L*S6zXZkJAFOI0Hw8MjKtTBGLE!PnvQoQ}A}dxuD^p53V&C$pB;@K!mT^Ak3Z^Zh z;^!(`bJ+0b+fCi_=AS|BIOOtH#5l+9aSHYguUcWxdUx~vW&H#7>V=)UQ}*h&t$!>1 zoVh+Q>S5^eN>$(fz&g1}D3SjC<7h0h4uYWTwdO3%`Z!R0$WgtzYZ1o*)E~`+K)_h{ zk00&~)s8Ev!)pR)_Tkd=*=z49`O-wRnj2m^D%4-Ck4CB_3*gsq`b}^m7Hj53iGBv4 z!4&pno7NY{#oEppe#CDdPQj~6>kZ(5kKRKG(tF9OY$d;8G?M4yMqEPv|Qh%g>Dq+6L#ovm&;J32<|oMrI{`FwzAJ+gcot>Cx!#u~$r-(la@ z=4Hei&3SD~5bXGZXQmaUHvN73?Vps5@|^Fl&aV(%d7VpBWkV2{qFN%$g}kY=?`@vpX}7+F&@&}cZ1719Zqs?co$v#zh?{B(p;#^;g`G%jM* zF9BPM3bf;K(us^-#>=m<4x^FzjrOi3tbEsm!Lr1U5FOQeXkW%(KqDX0?Dbh}In38k zx#$|+hJ2A6&|bR`(14QYhV)^O6J2V0kJkXf9HO|^NJ+nNPzNfhak)N;?eRP&u9R={$ z0#%F;)(f$z112xi_C zgzMPn;#tfb7CgZ($Ou2MA(5|^VN1iE)+R>TaBe7<6~F71?VY^6xgr;jqUPfPsaAmy z)73qf=NV>rls!A^2nF(VnB6oW=-Q^0u6Uda6$f@A@<;o<66+|f<;gmfQ<`KeAneqc z8XLi8&{io$#kInam8aYGy+WoeIkpam2vqo}g_bE38qRX`7-AM*7r=*9qMJ9`P@tFB z)<|B|CVAFqD0~CYtGUPw9VDc&~iCdF@M4q*0iov*gs9hY+hk6c z^5zn4h|eS_P4r4~vz9p+dMEU(=YZ9Ds|Pc`C4*$aV#*E?7XocLm&{Dn@`%D;3r~p# z5u$eJJ-c?|-VEcox6~wDJV5avH-T~IZK^7*n%>}I##Snoq#X6B^3&lm-KtbcDpY2h zoaH#kn9Wm!^^b~+C%>?kr_X-47aou(*WIzBpwrjAz#53)M!hn#84!KqA?=7hf4YBC zb$qsU9IR;o<~3X6md0Mq|Bk!-j{(D1VOJ=ff5W<%K6WVblj9zL%wuR+xm)z)O#>j`#0o?)F{JCfM@{b3)o zB3$aI;ukAkwvlU1c`$1ydha0HjgwXnWY}DH8$|J;X$&@~epp6?qT7RL)mr2PhEs*Q z^~;mB+1$iT)RG`sn)F?4VTX~0!Sh(CQf(=!O%q$o797$H@Csi?>rwkiOn;$p3p2%+ zKEdwk(Q>vo(w@|gwAd@;`-SFxjjhzq^?U=izHNeWqCXZ3mW=UFG+6640DWO)I_kz( zCr3cUh+R+lSrq@SYyJjq&zB2Hp+>{^kgcc>-1ZSbL1mZ3fH%hl$o9}s7BAOewULTk zxM0x^>>nMCK3w}RoFIbmEaDpPvtxpMNbvU0i$YpLUKCQPcO&E%mN@qZ{3D_TYlWu` zam+*|E;I_zti|Li99K*CoFJyU46US6QgFDMaG%>KMCaB>`ee(}rICfy^&u)^szfVq zmk{T_2e6UBS6Gr{qAyA)Ogl>YZ=?)(0irsnm7G}#@Y*d<3N~S^+e6x8{4V&L?I0v? 
zwB#Ivk$E5FL?laZ8}C48d-;$v+$>Xk2+}Eht~0b(I=uuuBMhV#4s0%hLvN|#p?8d=wa!q{Bs;D|M|I^q|Zo0zOdtAXr>#7!%c+fRPqj?Fuvte(HZv>N@| zimxx(EeD%b)nfiC@hY8K%qLk6fXQ)YlUg03%p#Lqv50eS+8;My2nNW$${l{Gb)To( z%CDNh+|Dw_1SftBBJglTr7X=BC0A+hwY(uvC7EbZW2cC()BN;Iupg#>sjIqu_kOYX zR-i-ZB!bp`p=&X;KnL$2l>c&05`SUf$clj}E+}a8NL$0G_oXUSXo}6%N)ztI$LePa zggV1)D<^lC=cCmZ;q0V!;OzesjGBtC#8AaLV@m2Wwe3(oGd78&`vR$n5Uw#k0&;9~ zB!MhBJ{z}NHVKTaLC}c5^&Uu9CQ-xih!=cbXH;2(N@x8i>}rmJiDEgfpp-V^CT zrT#z=o9qn}-c)a^H` zd1LvGgKmEoHmvP`sS2R#I>U7-^G6=Y!ja#Q^rz0|ipCBoM_6o!HuN zaTD<52O5TC_iVBe7tXJUsW>3M8qnrER~SvKK9>mWK$x4@Uy`PU| zKwnSp?i}x=LuvaB?I@gntCZGF1{~5}rJF5}cA2%O^{X;ga{QKASdPa(I_pdc z>ysLhpu2%bR6uW3@n-+br`ITppx<2{dz5~G2G;mL08l`$ztV~R&W1_E%0~g+!TUm$ zTvfK*sg^NsTheT(zrW)LmQ6bWB*dv5%X9C1EQ9x!y1}iivPI6-?fS#KCrg_VpVRow z8Rx{$JlB6IotMF+(aq$DQ_XB_)y|Ni)_BpX z;c#~Cn;wU9x{Lyy%W2Hbp#UOARhP-37Xv-5+{TI7GMEuSoG zJRS(;d2n)*qvM2M)TD_@hxXdRi@h*6Mw?gy`&qqlAP`p~;(HIOO(Qk!r|1Y*ht&-2iP1r**Pqt42Yru*_!|ZG38O&rr2BT!Mg5^7H#qzvJOD%(nVD zJ2mdoe&~1fna}1`MuOe=35So)H_p``hyy&l?g}b{B*jtIF+vA}ahtqRw9Bj((8{j{ z$s@yH?(ag5xwT=X8Ns>CONdnUq!DXu9q*)2MN>#rQohgbP-KG2b`Bw<6Rl_!XX=@t*qGtb_tb^f%qg&S0_u8zoH0g7cs&WZubwq3n{ zTo#sze|U-3=48GJ5@EEV;et+jF27}8<~cUlYh_tE$AU%eaM|S_13&XMdP1P8k%^?y z6MBJBHvWs!Z&Q~8C1wphA8+{yL7tl63YJAU(}qU4D2mDP_%oSG#HJIJro#L2je3MB z+n>Gr@o))e=>_XLrd(oGk-Xi7mS!1d=mU+fl!BsCVk*}+%NK~L1EANiIdaAuDO3iw zVZuhXl&h2jHCmppRgfDATGys?t)8k{WD8#og+L>~R51khS(F|1mgmLo?$77!Nk)82SiKUlRAv*BD5cx*!G zv;++3nV9IM#jCD(MZe`Eg5ck}a-q^~3y!j8W$>$BX5G#$t$Uql;*lj>lHggON;E?Q z?$bBwutCWUiq}+FlZXzDgQZ98y*yzrh-Y?AYn=z(e!b-R6q_yGSXF?=CKKXa5r^Qu zC53B#sWvah#y9#td|j|9pjlJS+dIj`>Q2@m~S9Uh-~Erd#Ev$%Zs=KY@ZHOVOFf~ zIPJk3QHvp!=LC#W&E&%_K%G-3CvPhG@q?MiwX-r(Niu-wCA8&fDC7cn z{%d_sE2 zFdwKet|`-zI)*J08_R8#|Ej$y#FM(ILCk#^ldCpPN>02Wpa3Tkp_L!)pt@c6dVZiA zPeH<(NqjPI5Cl`cPB=naj{fp|L@FoRdk*#G)h0T)8hTXKZ0(bTb}6|}?S0&~J24HJpq@-~=!w(GY#6 zgDs3v7^U=5U3NY$u`P8pr5j|^_+j;97~`h`PKSrI#j6tm#9>knHC|P*+hwMrI23~~ z&D3JF%DiCHX0>7Cg{i{9vXZWrmSljtV*K|#2i0&ZPvf;$?q2qz%QE;=^Hn4+qkQGK zwr}3%%JN7>pS5Sbrs~LutUgnm&~Qq1M%nR~g^866w>74`H*}R}vJXVUx3~=W20!rT ziz6<7a(C%|Y;3Tw3{i8t*jYt@-$i)w9M$HLadT>{LqSCc3;D*!<9KeBcuj5Jq33}F zrPnP#UZbmxu+fe1NLqB5sUA?G9+Mm$UE_U@i!m5xs9nH**7a_2DnUD{&}QqX6#=8` z5M)y)sBO9HnP6zrQ8ZN-^P3=-*WQ{uyM|{P8TGjr`?^`vWWD0cRTF{Tc5F93b81tW(LwwdyBKa}OocCgVuk;u1nMr~lnJDw89}T~i z?O}M0(DJ&7YXLuC=rCU%zsDKJlH-|+87nhe)=j%jqRQO$!_WX`O=msRq8wURuvbXfO=^b436tzX^p zi@%jN%zS+IJEA2^)L3GTP~Yrv#><`U*lC~~jG-LPi}Q9|*1uaNOG6AYr2(iYLQYAs3T(zpc2IZb?ze*~3ezLVDRsiptW8a&kqcIF`aj zgc|eNVmS0RRr=?wcRrcTu}FL@weO`tLc*fWf9d2;9hI6$4RV}aJ{?yCU6q*68|5+j zTJ?F@atL`v_I5G!3Zk}&>R1|o!hZ`2{8Xm46z`;{mn>j*YNfl}XJ0O9a{JltE?H*iKeh{4Oq6qezxVU>1**U){F(u;0%z{8g@s<)kl}`(G)E z@hQm9QVm>flM&p?(vr71oFuEAfc7>esNIKRoUmg_GcTwb$)hs|vd8h?7~pc?=X;6s zagUHyH2SC)H26>@s8!r%P$6P~PDdseJpHC{ zK6hZf=`fq&3-Em+-Zkv_;(&8l*Q)TX>k=&fJlYwRo93KB#h|S@K1|Q6F#%;HMx7|; zZ`Z3+R3hi%wgfpU3J8Ba zI~g#yd7d0ZRH2dWsVScJS??DWoKmqe5{K8JC_GWk3B}8!z8wv7k?)AQ#MY|?>M(Jf znfb(3cE{bMXrmg@MiX#!W)nLnEHAxL)7<5x6O6ru3yrDy2*Wve=P;1f;r+rav)>7Y zHc_L&8?w?r^D~~&DDB=AKJtA_6+`?vS@W)}KQSzXVdBwe3oB2?uWe>{U zk&b}^v=q_;6O0@eJfouX&UwRTH!sZ8j4n>De16Arhp5&<k3lD%0nqUe|ogP2@MY87S2b%VjkITBa7(9=vB3nLB#VM07o&2ap#g z-)JzmO*oz?T(#BEdgeF!9AugMA;gjP zsCZbm2A9;2b0cY+gMObC0x87nzXp&3e^RSYd^qRxwdv_GC}xcs2wVH%^EpzEOh7NT zUv@jiV2}nrI@ixy9@FE^Y*sikHG?*3DQ!X>Gp9!&Tc>MEi~>5WoJ0#L7J?JCnRb$ndiBpob!dbNB218?|oy_&Ib=bz>~{{)5oo zk}dOwU&U{%0C(@aC50!6U+kjze%!Sj0+Q+~QRZF4+xlWz$gK+yPg##$vSvp1>%D5n z+89GUF65*Hp-Y9vZX$T7{BRu3QBb-&DT2!IwMQK9C&PS8^7F0&Ibps)pN!=lPuxMn zn@pVeoEx*YFhkYqFGNT?_)H;)c{J$-b9uoFE58qhm0&m*VDvIU?nZp~@(ePoaC+Yl z^GhOJq6{Ac$H{aB6Zp_!1-* 
zH(s4uQJVy&8Y4>f5;x)d1O`Ub?=}RIOLBu6S@sB%Pw!|m2F2BsQ}q26ZIdltFHtUJ zR@@@n@;`rBde_PMRXN^4+LC!}jO6p>ez!ukUUZaoZi!-=h`alTX&I+?E442b6f^sK z%`&Fty;;Zuf9719TLz%>r8KkzU!uh=>7#Sw<8agt!nl6GHthdRHW^2URiu}d=K7E{)VNc`*g51;Z136|Oeq~mM_-9?~r z@;fUju$A;;UcUfo)Jj2PfElu!BrYzT_Vct@$z<8LwAHvn19uu&F5;9`pfVqYQ5B6D zp&irEannpS{mSJcn(Rb&5}@4M5ZG>Q-<;T0ykD_Y+Rf=1^z%Cc>ZTDZE#^*d&Xv9{ zF9^W3fPaQjD;s|hDAYnNE5(usVWK4~M*Y-5A77uLyhnp@NQb3wsAtZu-6eR?pzYVE z(kSH$vk|5Yw34Wh0MW?ByklH3&Bh{^laX;B32mch4^{mBUfBMv+&rg7jQChMQk1n& ztU+bgbesHT71tMzt}GZ)`BVMVITZ7{3w+wEwD6&{V~Jm+UulZNOJNUl8DCt$D!;w- zN;Q^QTBpL~QWov@s$f4moGA2t| z0!BSbQP8BOIxoJhzoYZgVfgv^Nz&Q3vYA68)ugPsC8(d3X((kXVc~rHY_? z8PYd?p_qC9&Dgd&rfmftI-MlxNBE1|gk~WNA>&L0Wqx%{EgVFNR4OjUNyi^Yyo=0o zizD+%GF=ig6xyEL>=lgDB%vCo>tUK*yW2R4l=KbEm~%$8STAtn_s&bQyu7@j3@suH z$NYpw$YfXI=1f${j@Yrf55|^6Nd~x@63Zte(p5%6@<#d@+IkAEgN01$+RD#eVpCa$ zG?mDbiyQ8?(5-c3`W09(HFF8@!S*Cnt7fd1^pC%?a_-guX0FC{%%|Vv-5}l9Gb)di{ z-$d`yOxjS{}sRHl+AuoQ2E;ie6-6N`9bY#Kb|MtlzUex+Op*y;n!@dwzH`#EIW$a6)%~g zsBH%~N%6cSv2|%uIQiN2Zi$Y~bH};+B@~Y%y&kxs$tmbx9t2Yh#1J0&fE$u>7 zZ7uKU1K>aF&al+f&G-~%lA6Qbf5GCmU{#Hou2yX$dbN8!B?d1X0j~xMDE&##T<#Gm zZ(K6edKkj+jro&F)XaRog(ccZ?X~$3vKO2qrB+k12C%*cs;<#U<9cl#If0NGhC zGb5(5VD>jm!yJu?L;#C?Hqx!i(4s9q<;qxue67gBX1lZ;mBpRR#1FM3J_r%gsXcfr zpRl%PI)Sj%pMvY_@#7e-)%%Eu!=vD>!%d1|3oJ0;j>65spn)>aLzIH#678m?PKsQq zdm>hy9*ObZx)6K^@q=>-!;nJ-+-ASSJU{l2X!s@fi&nbz;x0mi>Rm_?ubxRP3(oNi zT&q{(HRNFPV|;&s!n(H^*5;Fq{4%2)(>V>3ptDw!5_@!-%mo>JE>jwxPv6ddGTWV? z@O)ndyJcdVqo94>ha51+MYF((mc04N$U4&9-lEMQpfxaq&5vFN1%+zo=pfuGJQ_ID z`!S*VVUILE76+;?PA=6#M1vb;D(?|EzGJv~s8uiW}V|fwjU9~ArY0+xx$MTZN zMC!1Z#u?nzv&|kH_A61Af>~3OY#B3{cvAbAK>!p@QPwO{=#`d#!pp3a)zwtQVOA-@ zv@-$xF=*$z!=5jWo=YH{V6 z5I%ox*C+j+93QKQ*~|mWuGI@gF5o~0J;d0%abFa-mtq>hMxHBp_W_;iGAW8C!K zZ5_(8>U7w!`$cgiFxGdar`)ip11Ys3kW6Sb`*Kx^*}U!mZyzpc3wk5~ zM~U7Sd&ObbMd#QN+rv9wh(*EKg3zkv!rLhBTAiT!2$PlF1!gKSr%?YnWhzJW$bI75 z`kIeXHD#!yBp4A;Z)cpIf3J`vi+LlCB*cpHLB;_^AtKWZ7^?OK-cR_|ZVjBz1QM$s z?J@O)0qQ4R>{VK-9K}<+jQ(T%z{8AIJpI@^WKSf!8E5aiK%EKa&i5W9!Us-`u0H2T zmI9pv1!2Q$ME&3nuWbJhgGx!eO0XGaKf^8&*f;e-!=!vIG0dF{{T@FDeJuS*dAi%8w<(sGkDf2ve0prVdqenBL=)`2^iymHBr;?~@ zPg0~X;((r?F&*>_WgS97Ki!w|Yv;L=hgRSO(>5r~)7IRR-I(b+bA(_yE||3Naa}be z;D2HZejdNcld~2xD3~Ix+bZda3Mc**W>IX+J)0OZcZNCzQ2DJEP2P=J#fVU1i%v1` z@;Y4gch6n-{w~?zP+MCZzhq=IjB>f)M~igjsvcq`nxJ>$C6UZolm_p~d%m^UOhgag zv>FV$_NOt^{=WbaKJUT)^|3^J2U#4bEXSXE;>CT5au%rFZ_FlFzY`ahzq44*vsCU5&v$gpWF8s-u zkj1lUz4l55Mg2BrI78Pz5^leuviN^%Z5l`$jI8d&_sk0Tmj>k7MB%2ap7UTByApwy z=Rnrp?fUUaj7=^_{3qm#0~Y93{vp1(Q?l?JlyXsYaiszPm&-(yHF~p5%Q`k~iM8dJ z&i+`>DLkoT(2xnu8`OCcVgSR_LKiIjBvF}mB&-!1osJWDWwlrF#g(&8T~c(|pk1x1 z-7|x1JOjML|C>*>h5unayOh{Osn@d8sT1wqQRVd7^wc>tGG4E+uPx;yx;|gsmVb{@ zhtdhQfghVzwi)p13PXu@?`wz1)8F;7DB6PL*nE2`tY(ObF1)^lEE8Ht*Z@ps% zyB&D6?PIOKSq2^@hA1UG(I92CLLz8Y_M0s)e%HI9L2hnE+@Un1z&hXG0D z%M0koI~T!Vo9)bS=Gz|Hq&A!53dCO}Reqq@(-9z*ozj~sB2B?_$&-31!HFSrWwZy- zezq!7J0G?$4#;3k1CpA4cd#$L&$DnC*oE~(`~I=PrVYRoJE|l#^|UY`Yb#7x7#s3BP&g}+{8bjH!Bpvz0r-9oOk7{e zy-2wCuMcQw;-qAhRLjy?Yyd$9>|6q#KXNnwO5&CQ1u6HWyEo>82dMP2Ihb&nxm+IH zRSZgS#ZZ!`&Fn81Isau7(D8csZr6m8!yM#KyiAP@SYj(sS6+Rne6-1YGN6K+W}479 z$Y9dq4;75S=Jfep?u8sew3v@bzTieTgb2G9+5Aegs_z@NRzji&V;n7vaNW$9hsUy_ z!OAz+er{jgD-((iCCZ4cc7IKme;evP^6O|kcFC?)qYpZxSu~FjD7j}-0DQSQsUoyf zOs;!OVTfe}4m%qbe?Zn|6v-}1RA|BeV~aqzq%o!Uhj>!BEgTVmB5&Yl-HYs)^&)9n zXFW;dYC``1aLHcXh)@N8EIb4`)Bfd1x^?b0vrUQQ!>q!LB#DiTlf~{8` z#C7{^#QP+D|Ca>cY>LlwJ<|}qZMMOBiZke`!B`JUw?<#^LNB1g$DTJn5 z`6}zrX-sy21MlWvk9i#&1e)7%l+s)>mZX%b|(8g>BM**DRNurdR2RvjV z@k9nDsMzv;(rLMdYz5xCSa$rF3v==}%KT}*I!i`mg@sZx(hT~gXaDHU^?+b4g04*C 
z+EavLchG0NEq9ZBn*G^b?~5&0J~SUD2~qSlb*%5_Uy?Rp%;ne5O4v_q$_eYCQ}J}Z zk$62W^Q4P14U5gn7k~ru-m`o_AdDJ%{ zO8$QQDm*vec{GWa64+lkLQyvc^RfgDZ7 z4U1BsWWo~4vd~Y1xWH|(?0ZE*V1PzyWE(Qxri1kWBt&a(K@WvF*2kG)vyBkF!|4lc zhQ67ai{CGk*{iz_vvE&WbcG{Ti5VVF8P*kq+1ivZK!o1wJfDSH<=SY-AK z1bJ5T)3+!yruebol`^BTA9kyoe#0bRkU7EDk70%uzDT>w*v``=oCwXQs`ob(@Gy%E zZ{vt3TISFYi#u>1MW_+bWf%f%0?zfK7TlSYh!9!lh`lvu6;SgwHolNt%8v;3ZF zTS_tFQ@G>}w$x%<%UWS}^!b_C_=;ueZ>G-s9a6h>fppu+X%CAxg~ zXE3KQn&aiE&=dItoylkuS=}946OPqPmXhY9AA@2AaiPF+OE4MMa_cBNrF0UN&z`Vp9{EJ!*)~_*ju!DWM0rC0kyHB4cr2^{1?Sp? zvnNg>^LoSnDh!0k*uYr%s=P!eYtzB-W7Yip({Lw{~xVE=T(F$nyG%lhu@pnE%jk`DUK$4p-oj#a7iakcD+Dis`ke?n0egrYt`KaDh{1-U0wYId~*z-C3 z^$S^~w#`0CG5Eh4=wKd;Gkw}*B<{x0hlUV1ld7}KA?zF z2k#W1t$qT*V(HhB}V!9 zi-inOBBe{VbLbRBVNWS1? zCeM717)=9be#;ssZTYO(Ofl(O&pm@jqEFm@R!_OnAG_Es)3_OtI}SQK1rE^swSMOS zT*u#gn^+_DFjYYVto}7&S*n|b`COI0tTIgimpCo2FHaT?L~+`hjaVOBsGLrMZje@I z)bhKDueqvX#)6PW@%?5OAiZY=WeC^tVUD$o<1ROhBkYHqM1hiOFEw??@GLT9=v0=T zB#rUYQs)k$SPdK%uD87lZ`bvrFvoXFN1r0qogMyJ(V%iSlzgS4ZgurD9)~pUl z2*|bl{Q8`1v}vagRhX@Rl22ty$`3?8r>|x*?%{7BDKON1S`D% zJ;Dh*ql8#NFOmRC098+lj4ltRZ(*t*&6--8Nt|r=e(-Y1(q^UZiN~JIDr!UZW5NxH3US` zgTyI?Rtb!Lx8EV1Sw3Yvfvx7*AW6Op4ZXBfg>>u&ka*dY8PsZW#XU37D)L>t=altr z{RpuPk}117L^zYJ%@6P(X`8wiZ5HQ_{-uTL$GbN;3EtUEG+mxEvcF36W*+K%i` z>n?21lV-%13Kt?24QIHb#n^|2zIF9xBnv1fZ+v|piZAx6b^`H;qv(0rgbp62|)Bv zAiakAdv2&zLSkC9HPGc#9!}mhTRwpqdgH*%FlfnzeOnZr*u;ZKr4?wQ7vJ9B@_>i* z02CreMVh?2vv=z!Tj|e{TaFIA4Z49roizlC3h|dg-jHIy8anYW`rx? zz3Y%P@6miJ5)lbm^oJXtxz&#oR7?6wp->r&W;o2ZR!kIv3wU`&LZa^BdXN;+HWTqR zNhH%`Boo55)Bi@UROk`aGzh+h&Z^1Ln!x4;E+wUb`rBDgVm|=E+qcI)A>FNX)dcqq z;{O%$0?P(;1_ zOdDiKXGKTZ=&hcz3F_1G({2dfD*ASA^Ndn}7kkZ6Xd)hvLu&uKF=F=ePZM5LEzJ8? ztzPfsAV}@HF<=e@?S=1J=^FI)>O+XBVr-K`A?+2?XVR~AoL}iORHg`4cZ{_soMPS` zjrIFYcf7}&g<*AVzV}F=|B%u5Zc-o?T6QrfNlwleS0C89aVk2A)HngdtE(0jmh3@HUxuClyxvVGUzHODtiqz! 
z9X)%b*?2aZmaf@(`Y8K^UP{je1x12Y&}aFT6G^%6*uk(ebP>3<{{ut4!+)14=XW_E z5p`b3jTJG09j@U$4)Vq>@7>EytO#AWXFA8TxbV#{9dM&uf*XBd1*5&VF=!0a@qv@U zVt7wSu|FDFJ^ff9-Xit}tMNgexSdjR#C62M3QjNu?DNsxaUb}KOIO@EuNe0VEXvRk z)=)RUk%F>1Z;Vy*&rLQ)&9}g?7&(I~g(QsZzz2aRGf}bO9ub<7zi*^6#~H(3DoWOH zO2?(q-1`N{-(ip+iKH)jLYa{6rOrr%&U^dRG=1}rdCH#QlmF*elZ>U_AStV89ZT5bMo zmC;e0rt5bEy3S!oQzd@*h|oj8fyb3m4_-VwHz&6jPrlb)ej%`Ed@T(cqR!7zFHu82>v?A)29T9xiA{Rp6`-_~P zYZg`{5%;VuVj3^D{qWIKogdGM1Kxlqg0Z01y|F*d0u62lhwFnlK%M#v(wn+2i)$j_ z)DF@}zioI`xP+83dY(<9fd(&Xmf}3-?c#sY-*jXoWBC5E<}05%@T#M~nn%)<>q#|X zH_a)pbKVqT+Uhf>EY=qI2w^P z`IRLq0Po!x=Xhfs0wloTr0&s7M;DZgH?Y~O%8IhejX46eXhH?4LAE5U5~*0u7RUO$ zC=Z9EAyd_!{&jmjE(7A90=7s|zhociyX?J>C&3+V|5n8hL2}7sEaS*-gtYQ!Yq;*lj_D44aRt?0*XBnOU1;=52 zRtj-Vyeu2)P*R#!k7lQHCERihu3@a+|#OQ`-RPJaNU3`zWpP&i<4vuF)x*D;riI-+C|( zeY#`SGF5we0ZvIX1CeMGlzN^kXL65Cru!+N$94Q~Nf;AS`I9v&6b>V80_@T#$t!MR zr!A9WL>vt77ZmyQ*@-uu^}7Qk)=mapRHqOAjxet`nsJ%KqKD$T%mZ-2d~!@KLxDP@ zDKv1lQS=&d%>7FHf$#$xHS4McG6Ry59*$bX$ttE**OaP zm)`RU&I&v*D~(+!`{rD;Ltq7{+4u!&@$x^zNXnr`tiyYUN=9SVNu&Yki~IrAVg=mi zQ!5aO4%EvalRu_<6;a^>VX$e=Rg)lmQXe}jx0lJ%f-TTaA-y}oQu{FJE_d|kTdKQs zEx7Pv;eO^CQa0Tt4ixh+V#JAYL<2iC6Qu@bgkmh{uasc@ZK|ce^fwM}{CCcQ^tKr( zjCUiE?mZtPB|cMc5+OH&B56V{&@mGRaviKuzNI1jIMn*nRA%0M4X+f*(l5%s`ez)<%#SN)Sn zt@Jx8H0_Y*g!CN0Hcjr$5;`Wk9RUZl$do0|!rbbw0Sdo76}xg&+!N|V@1X&_MI}vk z5WZR}2rOi&-_#&QRlQ6_Sq#^FCkrG|>Unqk>A`2_k;IZ#4T*Sz$Z&Vg!y()2YhcLZ zU@cHFAWZQxjGRIz%GUs&g?z?Uu58q)HUDM+r)z>~F-O1dtiCOAA5OCA(e38WbWOggBdU&SiAV&TC;FlpEabwiw$OIvDoOf%uBHR zK|I20s+vZ|)Wxxdz3z0+qJ-j3%X*<;l>WXOL6{F41n*CeAr2L>7K?Rz2HhP<6jZ1t zG>S^nwa#55g4g??(+t^oGBOw?qJFDBLdFsE9qC3COx-{gofrP%iIIo70h|nDbGowkTBAK zJn_Y7*velTx$fH=hdfey0S9AK4Eg|;~+#sn1#rG(@ zxo&p?Te&t!)^SFA=aC#NH3RtsEb3ZDC-@)gtY@efn;DB?f_?$~*IVeqk|U8kyG<+gS*+WQn@E8T`O%)WP+^_!7ctsK~5-S`-H33qreB zzB?`E3)qAZ^;<#eVsLFKhdg-Ch4}qaP~*|GRHG&j1Rh!~{KIepm8pQiB^n`BER3#6 zKDDZ-rX3fRlF|OrR=NHCxq0bvwcLg4qbo#8%bFkbR23i|#>`2Z$9;W) z%;1&xW4&BRU(#y{jbFsqM%KDgXy9aIbHYbCViqw1v&>^kr}sH*i_Z% z{bn1fQ{Y{^)+xJ9j|6BQ!$lO!iwxA9OiZ zYo&TP!!_*NU!66ZohT1iuaLDKDK!Da=WxOjgKI}FVcCT8oO_r&1DaK)eDV>rUlcQDpZ)Bx6U8zoiRbIM*gE)B_nVR8!hQEDX#A5 zu_)*Cy-bA77d9_GJ!vL2oiH~Mkd@ach8v+||NBD#yWuZBmc^;Mt7$1dywOYLYxA*5 zC*ix*G#GULT%_%35&|yQr3Em3?x95BqXPU`<&5nJCd%UV4)uoq0ac8lE0CLoRGPsS zi6PdG=>{h;I+Sg0chgqL9D4*qZ%}qQG%oD6FZ@t2UWwAxL4sG09Q>$J3Oc1ZMegr0ZOeEvfuoTNLvTFyN z01@NnQaT03P#H(R8A=lgeS9GJL5EL=&nvnIGQ42gNCo$sT0Qj`Xlsh=0&#_x&8NvL zt@{nfG#e6q>HFU3=*$H1mv&ZM5-zPCimM-IW6mF%?m*~Z_W*p7kUjjPtdre2{u7Ag zQ0Gr35EDP_Sk;mnGGQU~>Mr>}A`8+XsQXR|P>+JZ^@Za2Gck@2@K|rw;t%?QDZQhn zxJ<`rJ8)L+5Rdh?>?uE#FaWi7U}*I?6#eq7vJEREBvk<_`q1z;$X0$@9OMu-EdM+)D%zJ?b1FCO^2?@YLP_=~K>p?4 zR)w%7F$-cZcX8SYu9U<1%62>uZ6vYAi3;11o2nL<81HsO`^f~wJ6>rPzxJ|0eeb*e zMxm{a5jumWb=}5Rs=qS28~{2^T40&a*_&C3&3Bnb@s{nEYO|VVX%V9kL@d|lTn>LgnefFt&ulUcS?-Aw*sVxU(;U|P6*61BE*FL_tqmATd1b3_N3P@2Rb;p-&%S^u6=KpCJ0vQVh=@BgRujo=BNx^)m&sz!ZEJ#JKaVCQ?nT`KF=wZGPWNfeG4ligkI#dK0z%Q9rk7 zHsm=!h4J1YXcVYsCo%{$p2ZT6&a1^eliq}6U3CGj*F6xCWbhWNv|Nzp(Z}cP6T_<_ z`#(PR;Tfy1$SURG({bXDlp2zl^vCvjyvxIBleq5D^<3SPn}lTim@b6ZxT;I7M`)J= zVCl(_lljT@@*`@mQ3UWqIlTMPDeiQLgog5o2a z+#4XkYA_lnAe7%dznda@noZKagYP0n7h(_RcE#D;Z&U<-FRofX73#=6=Azh|O4_5|(dmOUe*XD{xnZDs9$gk+#$Jej-)0DrrOOurK zREJ(~gXxh=O^6oJZepqoSv>&eMc90F95o3Nz zL(l#52R;;!ZP+Rb)ufg#vNvsXv_98Ibt|9dFQ4KH)+#RM7?;&$9c^LLrcKUS*h@b) zf|?!({wK+Xwn>E4asZ9-C4^#k*M(0_uO9cXX3$B3k7kZ%9Ctt~R6I1CM?`0SF``e` zVfkx5q;elu*?`1C`3=b{m=bdmeFhytJ~#S}2+;I%2ml4UkOvu=kTgERQ{IC>%Ys2b z3-ms|gZwXj1E0P5j5_77mU?;4F&!SIu^j=&)qfh!Z6+99M`mmjLY<|HNK;g8=%)j-j|>(+iT-I`S5y6;)pDjI7^iJj&!`bB!`Ico 
zgeF6z)#D;{j%MvB@IXr(qO37!VLN6+!UZE_AapUn)S3=!nk0;UOX=g={0PMV)BIMR z$zxC#O23IcqM+&23XNoWEHU8qPL&vdCAKp@uk+og!kLsJ$;*V7QrKxN5aE#uve0B+ zlBu!~`4Je=y$XjYJ!!Hs<@vX1CxT06eP2&P7oW|>dLMVtGj2QL=w~tK-7X@eg^qq? zO9HL0@duD9xjwO@)S+h8B3!aBUIKz2Ia^UGO&cE4+A$X5feekcp!F{G}KpI z60d4Esdq8cwgz-`=0v3LI#oXBBn3+G%Zo zr)LFgg6A?7R#4>r>ET?2wEiH3?TS&iNUk(RhwBnvpzf6c%J?85kv(jk?w|Jft(KB+^II4>6|q0rjUM$t$l(Y`Sf}u=+VVhqGx!IYAY&6&A0-fOz5j z$M3d2@;2a%yKZz_ARt;WxGbgi@c#VjaxzU^*+z_pHyX!eCd-_YRZ<3Tl(QP-?dT)g z1I9=%O2-M~t%GG7=r-sP<$#`2tbXEbfr8$t8!{&Uvl2L^TKVg`#s*=9)oIc2&x<(3 z`2VVoVEXdqVpuY5lvZ)6oD&iZSfj7mZZK-Q9Bz^jO&-h+sd*};1BEcn&_fv=7wK@r z!DeC#P-@8WM!0UnhFy!UKVXb*6k@3UDv6;Dms%0_=XYf(4*L5XgW&vI*icH9E@@({ zw3?1n=2lhHH4t97jDf)KEY`@Id+LR;*pajY1-#Tg96+G77Qd0JltGFD15$=nn=P#a zr`9=6yb;F4`yukNVLMcZUM2A0+*+mqS;o|H@8Dw&Tkb-X)2leiH#2>LiphKD0I7&) zB&mqsjg$!aM8AFVzlx6;lk|dKed^-}9`bpI3&^2tujN=|!BxZ3f52Pf*yNa^9VDx_DF?1%xz8c;2$ zX(1hb1S7f=$!VcrK|wD0`7v7Dt(lDLf48u^xw*$W3Ku6vjXT(%R={~c zRfsohg?GG!xn_x}!+Pv&fnp>;Oe96L9ZktIByZT|_esXv8|~uKg`UeO5ayv5F|{e& z{yJOq+>#RLt)AxRw`%|?l!Sd_cxFwrZmfyzJTWG=ZEIrNww=kuwkJ*|wr$(C?c`+M z?|b*z*ZFgPRJ!^ubai#DC)WzP9#%ZEaI7F3`oEnKx$^gCX#H5WQY4-hDxO=EiKf;3 zq0@5B;t%iS!*Gj5-;j)2NhB3(1cSe{0M7i~yt+2>qjXyCUaF+og>gz19aG<1(#Nsr zWaQFz0<^p%V-gaSUbu^#o|`?Q3t6@?JLRPf&D6NB6yketk?`xL3IPG8a0o~m>@1z@ zsgwxlZzMmz)P3pQN41@yAS&I=%-a@HRx?n!1kN$a&4qr(RS+I+DzfG<$qOoh zmus~+E+T6lSgBYZ7@UNLis~jAlj3S)(VW(&t<`i3jCmO9!UG?eVCD(g2J@0zQ&x2{ zL(P&9`W|9x6}ZVE9C{18%JfNR)wa&Cmno0rTp3QfAf&Rbv^oifpklaLq+{8qI-Ewd z>^nMZNUkKM*i=|@komb?PFW{e6~Z&jd~Fi(eISV%v7#_arUISvQLHRZrtjf-i@_>u z!or3v^u`dzhEP0PBY8syO^yf6H19VU;Nb2o6wQka!UQ)^rVGA}%f2gN%6SQmt@FO5 zTsR<3?{m9!xvzfP$J{}<9l)^S%h&mA(n0P==D} z%1-{XdvF+8TUnB*HT-P2t*Txmd)Tp7D6%yoMg!lXvt&L=VeX76)9T%1m|nRGEPpdw z>0ogowOj`}Z5&4$9eXP|jK~Cr>xPPnBgOSREtF$pT^vDqdXqs-@re>jL>!p2eM2bK4`f&e)y;zHtpQvHon&Lr@wlIwO6?lK!88jI%5&q1@n}ey0Ibe546-Y=Zb-j2?1aXS`Xosl zR>|gGL|x9@PH9+$;4Nd`P_eUqd<>IeYiKISvvkiE{T{)dc7a%79N`Mh@bzpwl_ujV z+MFkdXT|=coP?w>8!3*j*setu=A|%Igqjx{5FwS6;~a8#0hDHwYnz z;lPW`NR+->9gG4NYyJ$YI%=2B_-uW+Rg0vg+U)>T;G zc1kcWFfC(9_&X(u=fz9s7*?(J6MCuTe!V2M6^*Xtml?nz1{!jD^E@p4n1HM-!MW5j zWL@Q!1&v9)>La00LB`S--dq(&QHT?}=V?xMWxMF}!kRj;>L*&tLI}Y~x)xz33ygAJ zlUhT0ma=Tnzexrqd z)kjQ!HsoHE>p1O9R7W`Ih@ib&Eoq)I+?z8!p+we&Wa66YUHz$#{1(7gbNS*>vi2+B z5d!p-P;6?fT9G*(@?l-4GF62q1a;E;BfO?pfT=pqlW@KnG&y2~HG;VOl1A1LOwYiA z90oZoB7%?-V}?11rQ9b_5YquAb>B5^s9v(%M8{d2}}hFV`5ep%NcJGiK)T zdB3Oj^3!Kh1^hFoy|od8HBwelVI;s~lh71lTGh_XK+)wxga z9#$Obm9ymVNZLdB;(JZr>3J?RiF*^rKmE?zTD)zxNoR)w8mR3BN_zd2h8#su{M1w6 zP&nY=Rh2&r816Izbd@JoKg1XG12a*vN#p-aaOU6F_>IM3bqk6)K9lEyNwtmFYvq+? 
zrr+~oWZYUYwdP{GE7rwH0ypxTh(#Y54(@i%L>?70>9o?Mu)@qpF-3E_-2!9{t7%0| zK$Jqe1XlBy`MP4vf^XX+R`A7~&@f?t_4uOFBc%Lsbtl|y*KmiW`rLDt%##p1hEpH z-AF1J?8|uKz<31QU0;?LpBtSQ!7pbVgRX#GofZB3l0C2^|=$m z;q<&28fQ)Zk(SHr$Cl$uv_PWJZ%@u(E9e)=$MSk{C-c6P(&SWP2^3zX_HXIg6nH0i z5qV1Vz!Uq5F0Cjirl??k=3@m* zcg?|1$6S@uJ})tgK2P8>6$A}deK8FZOe#Jg0?8ybX1!_*b2eN4`t7Q{9CchA%=U7( zBB4pgVB;Qt3}MyBkEPc%`%+Fch}L8mw6Z9cY3Yj65qF&MO5)s*8j0F94hbJ2PNyQ!cf5JYsqqWy&tci%7KGCL~s)_tp(b3;X(dz$~kl>UlZZ{DjmgPQHg&#zH*M2g_$OQr`*x^Kd(9|-Xb{)yL+m&hp3x-%Gy*?U&n=VrY$!-wrlE<`v* zwwc|etF7c25?o|@jHt+{cy@}8-VjOrvD7qkINx7yw56fyuLWq~v_Y5q)I9nURpEZP zGTI-Lysze$_i?z6#s>o94Q1TbktQJ(T8xS1aiyFF#H-MW#XxSm)=a zSaJ&f6ZW90T>Dzm4G(I1$TF_7B_vOtTCYcGy%oKTAm5dod(pBEyF^QsH${7iXjQ!w zTaPHk#RR7kynkOicBf!Z->VSwdRi%!a-Rz|3-S+Yaz>9Kz)2`ye9oN*(&~$}dEfyDTIMsNVgbH`G9~#WRzGXdZ-ZMXnpv`% z9~Z^bO-!s&Yf*y8txgX44nkEsW>0;oAAUs6?k7FS*le(YY#W1O!w)=cUF3nKu@9`h zF}a+5QWmKxqT_AqUi9rnAat4gi7<$quJm%b>I4o^y>iAzLcV_W`Q4Ka(}m89K4EN1 zH2%E6Um(eZy=r;lkbM#JM&^)~OUU}O7-N9LVWrx+rMQX>4IZSx!97u`2~9TRs*uxZ z!>d`~Vd@XC_GM8pB^uPG7z+vYN>Z#Yr|FzM2JmUu#7J(s=u2Q?A=I9t*a7reLpyxC z;0X-^qv!-Cp;sEtAkZ~pL4;?|d?)gIop4DYV`9f2=a=!J9~;Uzr!2&@jS$qU-w2-L zr0Y0NkJ&c~2Q4~XF!SnV-Q zaCqNliU61T(Cpp+i85BWYK0Iu}m* zg4(~yHEr)8MUZ-9$|GGwh9u3qQpxT}!BNZpdd7xv!hq_tWycBUCDcxglo!h5^ z_c>APi2<5JS=Sz9qD?}vMB!7O2wIf!C2z}pgmS6#MJweygLild0^dxO*L&27wGMJ~ zgrnsV7BsxvEi8{lZeW&8rA@>0!85;h{|#5$$y-c)%gz&Kk?n%mi@xTPqhaZqJAN4`Wx8=uqH0P(BuPh)(*S3Xzz+yGUp#Iv`^i zrfCxq<+@s0h*!pU4cP?5-L3WDxSTZZ8mbw2?364ka)NXIM5g?w){Q_Ul&$FvO_G(i|a@WFAS3~%_ zzI93vo=Wi{YLGnG^Y$6FQO$`%N*5y$lTVzV9tq&L9@!T`>%Jj*#xVmN%UkeJm2GG6aHXMjOvM`atk+CWnJ9y>vWX3Se*o+_`t zX{Q7qk3~&0(R@}$L#hsBJg9E*w&)zh{l|q_R{#upV*liGp2+(%^*r$XJRaCclu>~J z;sVYfv$_aj^TCMlv-mYx@b+;MmLcc|Bcz)_hm}^jP;iMDna>r@?^S+5FH#(^qI9(J zmgzp=P3eT2wAYBpKXe~u7Ra1=dD@SW~xk zK~Qv6&El@6-B7>dz{@_>j!rhV6nTL?>VgkpRWBHOkv&@T6FCNB%Yu+0QTk5}ay zzmfifM7TGFg;~Nrg*|t(x_B8N>eic4y$$AXtd<_PKX?S11m?yIC6o^`ndU)-@{NBZ zc0P2xg_cv$-^s`T1RZ3&&*eZ*h*d+yA z^lRp{*sS?vqgJC_#0wQ8b|hhWsVVwYr{BpH)d;r>h+Sa=Qn%}4IH`P1*0ewZ;XqXa z?m0oi2Y*&X&UQIw{ik~kyFX!eVebO0KL>@=ejjzW)3BXE)<&0lO(%g;1z|5~Q2sV4 zHqBnb^-_Ejy#KteK8|IkH@U_<3bK}gX$SwiL{|<6Dk+NsKT7o=(EuvFQ8wz2x7+d> zJ?;#XP>=;d*Y;zq?7~}kZ`$KUIU;cB4+N#WXBPX#=BO3u*vGK$M=zdKLRMA)vbi~xVN43X&|N+@!I5TAe%4eSwwaW&Os4#ET}z> zPYoeWL2EMM-nKw{F+2cs-!^Lc;cHz2#u7tD*z^QV(6-|`-9vOQ7dq}8)pF}cYHgR0 zvKn+4!-qu3%WbdBz#_deijQ#N@wgMGSTa zgj#CZbAstE#h4SX7^A%Icz{VbVW*ZR+3d|0QroXZzz3rrd-_}%6|w|%|a@_?|E=S$f-6JqYG zb~4G(=^xlo0`Mr1Fvd1Uj!q87`qqCrTLTLi1~vda;4j0?4WJX$cQh8YwQ&N_3CSoj z&;#g1%^e(_gv|6EzLe|Gz2>jBGG;O3nsO?smoi zI_0mg4F4JaV+x>CH8*lHa|AH3{$t3?`)>@4^#6@P6Tr&A0AOce0kE+$09e^L0IV#G z05&FO02@8qKiK~sS=bl=EX+**&|jOs#Q23Va&W*fvvUCcU;68ZnTh2e{9iwRb=cXN z02~}&>wK;GkNuaAfABAVUn9)F@%)uBvD5!Uf93z`GcmIMWA_*P|Iq)$_3t`=$G?7< zSsDMKzx>}gSvi>gvu9>D04pmifR%~;OYe)mWK1kyd!}cC`K$X+zBs;e^EXF-=U=-2 z`1s1{-`xDO&%d%SVxXu068?9>!omJ;&@g|4_#61YLH*_b8vjk!zvEX@|Izt7|3~j@ z&hmAVfBC-&`bY08S%1gB_h1kil# z`5!3<`&S0Pd}#ltB1QGBzb@jw~dy1ej zl{Hpuw0b%ta8w5{X6D330*j9gti(ozWyRtEsLT5LVD*fQa!rklLdeNg864}t-mOE( zRY8}zQkK;m`O*Pa2q2sR)8UB40nyKwsz3#VXntYg;eij8`#=KUW7<;*r?3Iv zoS4DY{pFa!G(ZxpVaNHSY<#VNbM{!Ao$B^Y^z7{Ib0IZ2rdF5IuN0<1w8n5{fF%Kt zG3`u4P*+C`{P1SrFVU8FS9?HcN=%`jONrJIc-nps0H7s3a}%f<7obYJ8di{H(5@oT z1X5$5n+Cw?r*zWuB$!6m=GUK`@2<B}7cBqASKAgtYd%hLQ&K&+yoqVTjTKtu3A4lK;ZZfiRIUaKhBwTN4pn<_o-;Q=j$duT~`hwNjyH5NkTh&hn80h6D5bh&2HDISPD`_BF zSy>=R9OACi$^B3E58a~A8Y-ai53>HJy4>IO)o^~Nd~4mw4{W|bfw!B$0?)1u^1vVL zAYA}c&?rMhzZ=RkXUt_xxbDxC4`0YnU8#@C7~i%1&+jLnp|L6P1^c?Y3BEEOkMV8dr%Y5>Ns{LYM+x@&QrdO>gYyB2S)dNy>U_eb}mpF>1rR3jKHua@CGJ- 
z0L0)-VnVZ@K~tNcrP0Ht<|c58Rrd6r2{=G_D8##+klmD~X21)mkXQNdED#{3bJ>$r z7S?8lcPSoYR%bvt?L1R(_+FeSm^HxMUD;b+K+PX$>p&15mN;TpL_kYYbbi&5U;p^9 z%Lszm=dYsDC3dO{K?W2EH8gWQ+y$#A&j8NDR0V`-sSHi%Ft`4HFvg)KOv`v8*k%+eEd03|kGIs%!Q z)iann0XZLziD%#BKsRipRhM!1_H!!kC+2u|gkBceLv-_JG`KXH8B|nS(JDwh3)l*u zV-%M3LKps>6&h1YCEC$RyIK zlVLD2>qDbmD3IINXrMPHkV2btBjYOyLNYm<+BfG%J%aOWX}y;QfTvxidjs|v=oE~V zvs*dkr&{v|@A+iDTKC0LKv`xz>CjVZ;<4>%Cs`m5C?~hgw%)Z7n5T*5@!cNP>8{}t z_aw4P6FQDWwGr@kK8pBgz6dfS*xlq<@~rBs6KZ+ll1uD-Iy?$~aO0p38o z(}s|(fTy6YyWlLul$I+LkXv+jLA?pN+;QLos)wL1)c$P5l&n4!5XU$#K|MxAIjAv_ z7ijs~aOE$j;U{0l3Z8;`v?qU! zcfXX(c%f-i!WU3pq^kTv&;f&dia*1jDO*2r^`1?;vzPntQ`k%%LS$72l5&dN_6%O7 zw7M&HZ%GafK0ROg9wE4{6I`_P$YHY(m!Nx1jX}j5>-$5hvWYNE{xmpL13p>m#lGx*F4|TpkJr&k3H1VA2DFx;)^ZvyQ{$+N* zZ1?YA1JpVGD$!0-k)2`?ux7r%Xh~b^JFUSK{8N;n6WsFAfdD_qC7)-ZjKgjC-~2I**v+qPJJ57Z1kja0F5dwQ2Q z)nh#mR07LD-{z-cfzK09cC}9{FZ_ZdcgtG}(Mip8F^-q~M|b7ld}-NY3NM=%hYA@V z(GQ`=zAxEg1uwP01|A7zd|ZG+vGJE$a03t2Kfd?;($P2fVkmug6#AbR0V(*2XP;() zDfldKPi<_5Z}~_D9&`1+&Vtg>X%FRSU-pd8hbi~IX@rGh&o7&p-U*d_JI~Rg>r=z{ zYo6>M_vtVhlAry)?|KmaS^^G_vrLu~W$#YfyG=b9Nd zvHH6uBQtNw=XBt^kemt_3nG+sUf>>OR>)t?Kjg}+mE62tgimCNQC$_mhlghi+$Wo1@b*&QQ> z%~O?JKa_QOqq#+K~{AM20_(h zW-tHu)vQ=51i(46)Em?ZzS85oUAcFZGsag{*$hXYZJv}PQ4JnuyzO!)A z)sy&ZEE7Yyx#I(6gv)RQEy9!pA48{Z#K5_6o}SRK$uvvWAX}!L#gRO2^IgRFB4pV# za3dhS6wsN51A9pgAQ-q^L4q<+%brXeNMcvcoOAZk_dSfoQhEc|fL{okA+9%_XmnCZ zgx)FKC$Gj8YK7Q4_YB16LsKD;$J{9h@XSyuH#uRmu-P=h1i4eiP|y_F%K%mvRA=?sF4IvZ8YA8yEK)G|4 z@W*1L9D%H_?3LGarye84+JR%_ol!U_7OhHoADdAAFsNXt3-Xw%_M?mAPAF}(Pb+4L zW-~lKvpws{q#J~B^Byhm7gU$}^eE9FL`bmSi?ChwB@xa3Uh4?v-THGueew3b9+TCg z2}2WJxrt;TLUqzmC%l>4DqX*gwwt>(5FJtZc6S&h$$<8o`#5%8@xj)Jt4Vu{cz6?` zJliH`nO>*8f?;tS*?tKoK3vBYLr``#Pn;mJm*gG$Q>mUJtlt$RI(LSErNzWY4(Zj> zxG{E=M^Kk{TdXhU;ZKl$AL#sfC8enOkV*T5^6_}R-^_`Dw_~R`dBiN4# zESeha)6}$3qw?W6G~X_)Tr{2_&F{rKZ|?N_8WLZdVt*y(3+}ls%&j= zzwSiCD)AE4;hGg_X%c&}>vU;aEZWEwb;P>*_`SfM`O%dr7C5wo-qqKVtJOmhYuwZu zbuVNYmR)a`t;L*065PUm$?CRx8_fFE2&G?VZlZpXZH{ODa!u#TQ5Y`B9Ju z9#7QW@YDhYLP4Pc-13@L%uQP0FxDS!ZfSUoxFT^`;X}j6H92-2Zyf*RYQOzTOp}{{ zx-%w^`p)uUC2&g6Hk|M}z>Ok9d`yP5a{>MeMG+0QM<=>&Bix)0QIzSPj!TT-B{P#j{2& zHp97|DVO;0fQ^QMlYu!hiq7{kl+5K;46`zs4`TTTqw}*mEnqRK&BUr$j2ap~)ES2} zY*i-Lj+YcYD{#X>c6*QCx(ZK79!rK}X7gzEp7bNKvW*C*l7mu&8v%b7n|Hr!xTU2L z_4tL__?1%#5&W1lzXX!xjZIEIuU1h6PW_ad^fHqnPj1N1a3KlS&Z95BI-;`I;3zGr zq5-aiOSxj!)D&Lv1S-UM$i^WZ)}LC>;jeDE!CI9N;27%z6a?SN&O zl!wa_uM|MU(a|ZWh19pK`n!oGwrtHpHVAe1M$oxEzh66=bro9Qm%)5qHPXDM8h??> z=UA9%u?kn*yUe9qshrW#j+d^TM+~DwOxJtazqDNZmg6<4Y+Dg^&#hr9813t=RfI4M zSA@HfR-lVZ&dB)AQqbEU38c2H(FiqMCWZy3-y=##rfY>V6Uo-ahj;zGzK`oSHlkj; zev+PPl49~E@f62ymQZYo(B!DDsmrDDFpt3Py+TAsK(1jn^SWjBSTsIi{yu7C-oR_P zXJx6I)V&}}B2k29q({flkj2y?trXJkix%vLrciWl z2G*U(r7P$xVWfg0QIn^fkTddXHkOR#Ctv7|HDl)nT|6t0_T`y4i$W}hp6e7LSfa;t zPIy1rA!-qnKaM;`j=<*u30RP7key>=d#^TmvL^~ehAYTLD7H(fDg+s{5K;LBwF{={ zKQ@1(iH(ORKpD8TRtF~p#w?L`sbC=h|b$HQ`nAVSR{X55@&l#@Hy+zfDaQTIbbQOHS!+Y*_bX)22k zg(iga;EFCIqVqWm)~Y(ENp+JgU1<-`9C-kD5c@OpfYliHT_3K4wQg06ZEGea@6Zb5 zd?_*1bb&>HSAWEPFmFsKJ>MJCYnu6Fygt83AwMk%f=1RE*R~Do8i23QAP7aA<4A|y zQ%Y{nXA-(#SUR@gKWXi6kJp@XgwtJ#dxA1#o-#8TC{5Mg(*~inae@;bnS7EDPs2A1 zlJvYCG+A7s)EvT?wp?}+9g47Ibu|Sx*SLHbYWyvZHz$=_x}Wt1vP|Xk%>$L;heG*n zoOuvi43hx8=8KpOD`7pN^2P6v95aeCe`|k@1uCbes)1mKw5FuZWRIU%$|doZ;WHgV z9#+8vZas*vjOwrQg%b9AHNPbre_qtx=}+XafDJ7ws#2U;3GJ+%FovqE*SZI+X^XpN zld@v{D!_gnLAbyIdrS6N-+?JzehwEq8O)CQ*%_klTt$u=u;j+!7SHABh2$=PkHc`jIE=T1dG^F;b1y= z=k$}!cG}*ie90kxFClmh!RP__UT`NJ@Jr=ghSTvU>_IWrK)P9Jz)%$;YnjalC6;86 zpgN5a@Ph@*=kHfs2^F-{*hgD~Umebq^$QMErwxsl2PStm^1s#`8OYG7 
zi)GGyr|t`b*&9?Q^3I3Pla32$^NDEO_||9lU4fT{9@pi9g5SzJc|!4D$6bmo7)~!4 z8Ti3MBM^nZ-<(hWx@w|r4Us=vLR!h<*(-Ua%R2$LS~JGbnD$T!dOO_5c~#{egVu$> z_JqQY41f=CEp(`NO9}&sOAKn0X7YVrM7*qNxbh+|3>*3f*rD?St-o z&o^HS={NBx{C$2~M;Www7wClU7~kPKb1zCcpRVo?Jh2uZ-yphl=%)UwjQdw}&e+ei zUwfWEEyx4j7w7cP1vW9k?5%@Wz-u$|$PBkBX%#@9dL17qD92h=$QgWhtX~u>evCSOwF^1+TJnWmkeAR>(CFe49+$r zz%(S-Ns_t72^M=UbbizB3-8zpMrQ9VqQkuSP|DK9lB`icK-soY*4*H|FIgc+Id1u=^9;L6cx-@J#`{olY(-2zK)>_?Lk8}w;< z?uW3}K-7e!l0LdW8H)NLHsZsC2Q75sOPOUe$hd=Ue?U|A240P4igcH!pT`Q)fBB?z zdW^Se>M@@rT*58hN=dxxxdGvrT_GP3(UkT{2dJ9v6YaF0aaoj=9C*-4aD(OYgH<=tRktSnpifD(_yWU4k<~nDjv-o36`7H>wsO7C!W_5H$7je z(J3A+nxo&bfK~|40TwEcdl*ln_99h``o1h5s|9g!v2uFW5Akd>`g)rop)Xf`cG>>q zkz_ezi_Z+7@5ZduV|YCGvi~xuzWUqMT*PvN+JVH(?N`HS{l+jQj3Jcxp!Y_2-;nT>&Cv&EYPZ`Zb*4z zP+OqWVeNkbkvs~;K5+xtp3Rh>_!l%Wov&$|Kl<&T1^s< zZRgKY)7Q^&iXhNbv;pn$t)|{3G&a>yisf6m!E$ywhJ@hxKzKE)nEM7Js{7%{nzFNf zq9V&jVA1gQ!3iN#$?56|Pl~y!d3+KhNp&&R!~q&i{2vJ$e0z-YTftzQrGVd|3&<<# zV9do^E&FMM3kokc(+pSdk~lV6n+JC&uNj5|Bx?+kNwt}sgXGk;5J+-bDblYGnY}o8 zH>=gF68Gpi{8F9-<~d{e#f&P43*R^o*4}?kK+t1H%?roTysK!vd9d0hYY;NP`+GC$ zbTb~@Z$st^H1}=4Xk4ce0@H>-&;!pBe}_bsoY~m3*(%YP_l$;roT+@Pcv=M+d#;(O zXyXFmyPuI=Jwe*nE{{IKv@8B#{c(nPTfR|dM+k>sCW1F(WsJ>cxKEvLyK|=Wq(VQo z(brP^l1dmQ*<3H+Ez+fX9iP!J)+8HBl705V()2RQpOZC=nd&=Xnbp3y;7CS%bteiy zAvSl5O5@1gsF;oz4^ilj*hTsAW@e@Z$Z&v4B}uB2WnvLY&QqM9z%c0pz9DD_&qqIn z;;|tl#&#A|wa1LKg;_QV8RA|V+lEsJouz-HKIk_;N#Mz(@; ze=KD-qFMzAFD$$3$FrVkwQZ z{;>2jp27`%-Sveid+V>xmLyiQKM&N!3>$RpDRqXlwHChbFjAF^(7!LdSwCg1@xD-; zrv`6@;vL@*M4jqfGH4;OQU#Q?RLnP(!{z#{ohC|jw^TMH6GeYSShiJGTx-;jr0+L) zkJd1SPZ%;ri(cocRt1>O)qE=m3G}5nP%5qomYAMFKpFb;YCsDgX=rH{-n>MVy9(S= zZiyjFIaF1AHqCa)bi9N3BLM3mkAY5#(}`7raxs2E zIZk2m96E?h?9m2%mM)(K3p!X2XcOOx2o z@yfUfX0;8g_h$0jj+YGE315+@o%7Xj{*klQR|!jmwS=Yp1XTyP#hBd@J{wiZdX~JA zNb5z8%-8R2ot*=6j)ARrY}ED~>*l!@=><23C z^Y=0%JfAB+iXBlL?f};selvy_67N(_Wu%p?Dm9bFSgPta^9S6ys zP0lkU_YS;Qbe825W!XgeYkQeEDs2Z;!uEO2~o*J&0n$k?uyU2YfZ&TEv|n%|PI> zMWCs(jmnJS#FZc=Kb+n8+#~T*!GXA{5tY70w)2of!+p}vg>p*Dpm|0jzGjq=W^pLp z>#606J`o0;PGgR0o>#?H2P1qqBSw}=R2EReeTF@-bVGORR~u$hFuVBr-so;nE|arh zBaH-G&lGL~IS#>CE39?4l`wE=K}ac-La5uhoIz#au!t5Otu`3XC0kxbEINnHiL3RD zN|MsimW$_F6oVT#{M;uw|2eD|Zhq0kH!s?|#`2p}fDGVSH*Nq{SM?;-787ZKINYCx zJ(p>|(mz&P3?aY5Jvk*~vMv%Ini0hL=06V`=_e!g`4TuS<|b@9O4ta#mfiyH%p@b} zC>&D4ngt?ZGMl`SQhdQ%*ftF9s9&;@k&c`)Qd#eoaTXRwJT*&0Q2-d8ORo9mrGv5y z;M~PVheSH@a4u=+hBR_c55_q}fMuP)tukwNS$sK5KwO%&L>Ka0|J6(2SP>1YnbS8N zcla@2udl%tY1W=(?1J|;ZY~k+fh%)5)SG*AKf26$>ym16gp2zzWC`_xv9Z57y5V6> zXHGh&<843+ZDx3373*_uVvx#^@t|;^n!*Vw`*ovrD4Vc6Fvf5XHn46ogMLl(?2y+? 
zW#yqL?;Q?tgmtT|$hY3rps1R<>eHqETf62quX_{7UxLXQ%V~2~jMEsUWIJE;xF>tqM3$ICsQ6<_t~dj~`YpUtJ{kgNDz)bGF&T=LXMK_4ayc zsi*ULLoNy!n&h<2yW*++s>JD4Yk3%`!=P2YdgL1^q&_PzeP?V6${dJG1gG^0s2aU^ zv$x>B5l^%mAFL_11GG6{V9dW3NfqbuC(T)G6Bh1epSKdCZb((#c4*gd(&0+(;Fr;1 zG4@dv$E?Ts2yBb}poZRlZ!WhAS|s?!G*2>;Nkx|EvER4jKOC=3s?@FCsCa>B&#rbt zJmW39af#aUdY|W%SRZ|RrS`MzTlw9XGa{VCa$&u9DBe9SeZKBf1U7cRntWs_PM||+ zD+YXhy)c?hJb)XkzFI0E?S*fizn94B#zNYS$}&AIx`OeK8uH@Ev{JUr%+1YlX%GQ= zvOt1*wdarTKP`@{gVVDC`oB0^kn0D@+2G*cC`cqSBE;@^%q*(;kd!tbmARM-*16=F zwczz}9yzEE>QHKGI*-M@L(jUY%JZvjhnZrKQt>!1A||=R3Bx6>s%9LXIyjic&aV1& zk+9xPB?&Dy*hHlBDmOHf8dEmd2Anh)j4J3}58TRS=^xSQ0)%#SWT|`?1kqi}o(#Iw zu@o!&(rtiM_TIV16bNt}KcGkffsxYsm24Ln6wRLd?EZZSh@Q)Q#pzz9k-VIySdxhV z{NC7c`G-}6iNp|w8OuY;sw{``*zKNCTg)Cj)6;VzmC}u56kF*2$p)}roJ+Y3kOKM; z5;e>ULg@OI35cZDdr-G>xTH2;jRR0a3%BJcSaN{HG=GfOL7EEWHtMbzHqMR@$PKJ{ zhHu}SdlqjDbnG^N+g&X^9C;D%J~$mZ$&W89PwP9Gun395u{Z>|C!XwbE907uB_N8b z-t`x-iNN{uT3%*cPhKm*cH+3wfx_l_p;%H*Q~>Rnt@YvL@;<$z=4i+shbJ6n$Ta&c zt-H+i+^)f=qk(iuOb~Xb*2F?IP5!4yE*;(CI9Yk$Q_TE*(UM@h8;?d{Tet}cp|C`4 zpmLjR^#Hw}G z?i+q6#=K*;L1;&3RURbSw`#qFo!@6n_Vots9K!oeFC{LilIS5+A-+Oc6i$W2!fa<| zATXKAniykp5P^vbVhZNBEJdTC`*H}Vw2iTFnW36cASS@tJ`ENNcE>U5mU-DN2(`Bz z@yr+)n)1K!j4{n0LZfobr2$JE`)o!A24~@!(9_1WtcRWC%kL_Uk?%7?$~;a!RI|8+ zXHoQozu|n`Q6SOA5ec*ji8qw2P0SOF9UrjO99>9Uqn% z1xbUY$a`f%29{MY50Vbnhe|;WZQWyx&?)yXwG<(fMfuw+;P5qsR#iFQ;`u`1L|rl0 zKUBhZewRmUZ>vu{y`^y~fqgki9SdK1RzD1#B9F_c<~=Ma1HosUK$&M}r3q~1Cb>~u zbN)U1$1=gbJcVLlO@Woh@pJ{WOu1h4o!BYSM{c4#lW2PLw0tC8Q=<*4mdVe(Hx<8W z#T;^nneiH^EJGc1gZT+vFb($JK+Y<>FiWGxumr8dfvMxqlwdBz`~IDeZz91jET<~z z@6w$?4)Z{=*|ulisiNq78q_KtWIv<9@6g1)uR#VeF2GQuuJle&J9Fzzo$$XMyg9Yv zVf=g^g@6Y)U>c}$9dDzFwB(*u@6L7Y$G_PRC7Q~K)mtW=F|CDS~@denO}PA}i&7x(V4J7NRMTVXh=YTWsh}hNOo7 zy!I3}!a~FO4A}*LTrh4mk=|jB5Tpf3n#D%6tfkoG6ZDWM3ACjwa+Lz7bm&+du8jm6P0oAh@-gq3cLblUD~X86`pYNH)%i*stn#liCI`?!#}m zoO+rt=KwHQoo2_BU2OYA<))qk7gU(9VCRb`bP#cZ%j2=U{C~|!-MXD?q5GAr9pctI zym4rltoe`KED}8>gCdt1z$=S3v_eOJ>!HG*3vvHh~YQ0wzA(^M$tIg};8|9sx)d*#7)y)H<_Lca$ z+9m@w@wb+Hz5CRG^mLqvfHoYL9kZ1#cZ9xFfUz&9a7{dpNHt=NHL_I@2iJHRT95Ki z=}Xe0Zvlx=h_pi1UIhELcM6#bex@nU8FO*P^#_V5rcV&ZF^WMscKzcj5vBl~uO8|> z-oGD_f=XF}@-?u3-NPbG60v!uwvyBL8_?}*dTv~dVw5JgHL)J433iwm?Nc+|>J0NP z44{%@v&K;8int)XH)Pj_UwCI^Zb?hZ3rRBjB%JPxFN#Z8z8MWOnwiH&LLB`c07F2$ zzb=iSWXgen;kg1(_?_V)%T4P#CqAYJJFrC%`b3dexW9oV+98aqz0M}K%G+aVS~49; z5(+u{PNLW4GE0f6`jkx4$@N1qWoWYpeU zA1eWcuT!o_X>?Fm>W5&H*6#Oh451ddb~*$zGfE-@2@H(|bnN}0j=P4w%_r7$G6~tn z^zG<-?@1*Dm@{B5T@enf1ep|1&6t zBM~(auC*z*MU$AMJ3FsPVI>MjUGzP?wpq{74wu4@`iE~ACb^%k?PyD7{X>gzCCeGY zq$h5NPy=Ql2!gR(t@)@cTGQ#5#G2sIvc(n35n4Sy!=ctN<=2wGnUvEB3f-QW6nK>@ zD=%+WUoH@{+IK9T-|~({yBVu8MZwpxv|uu)IGv{m@1)V9$M?sHw&k)i-Yr0@LYMx2 zpg8r)S8!knfNC9Z%<%gqx`&*TrO;CH&r7J(I*Z@CED!P?DN7-K0f@6P!IjuaMZs4c z^#r8){tVJ_s%dE+h%y1cuEcf?4iO!Od$SMFMS($RWfC563$@?A*05&Wz(*_meo+JS zizB_y2JiK~{dxT>r2vr50H$t?pJoh*ip8|drn)QFnLwhoL0OsFwC=}fp-KgDJ<4vY z{Z9%*GBqj-h7Y}?`o(bl8HQ#ddQ)J5c0`PKXVVjy6KrNdUm4ll)?0m zc8{tn4Rn*o4@0P7OA75>KLE9@(Ffmed_pdyk9ymc`0A|_MV$}MXEJw1pns)BkuG6g2P65~ z4Eo-0Zr4V`9XrM&CnG5i)#TVERQVe;bZdt~G$&1@a7XY)3Zd?~|ENNSf}ONhQgZfC z(kGOQT=2>gB$M>Y{t7B?zNk^-*uL9D{nXK zPjlu%A>Egip3u8y$_xdW$lE`#mrHct?3LN>`b=4EYtryZvDsW7t}KBNPKp_-qO+To z{ghzYLX0Y|~i~f-q$9@Q_^nC5Yp3lStRW%kCe%DHg796IdYBvT|M48KRwfQksFq zXkzm%O_L!e_4CvbjuTA>u9?t#yOvKJ9a1tJoz}yS&0Vh6g;1eNX*55IER$U?B~m21 z`k7dfM@tx3-7#hQgLN&fyCu~sH+>c@-C{+!PaVQ;e;0P-e*=ZfKv2`jS%uy=jvrK) zExs_;Q4aQ@qrt|nGE^w9J1&mROfSR zxXaY3AhX}UkHCHR?M~rB#-{-1+sE6e3FsOaEB1o2Yv=6?^U92U?7>uUQP;TiFm;9>!)#YuYP_I^Q)Xdv$u{=Z$ zeAfu!cSP>CxO^3{C96CJLik}?FY>UmsR}1$JM$&OTv!#>^!-ufG 
zE^RAq43)p9^G%8DD3lMd-+yrOkI9b~!vZRCZ z_SRL*_g)dGkKYFeSppF7I0I$HfUgap2K_vP6~91*M}rpGzx|TP&w7cWq?Ts>f3LQF3))`4=XXx2XvjFMjv02|6H`x$z>@ zxj@8J+=U? zO;hk=Y~q5;!rr*&p#8^&F6;sGcl*+~l+KmWZaWH{<;G&m>cy;^n@@bu)Mj~`1onqy zb?~Bi8uQlzb|i{!TxWZ?wR{X=Q^;aT2eX|9h1yiDa|3F^beSvSSB`wH1)xWVx2Pp?+?d=R?NzoqZxrV8r8Dkpfv{l> zT}Lfb%!=(JC_?;mZ z_E_m99nGkkZ&Rj*JLiQM&`eCWmMMX66{7sqDNK8(=5F7=+`)p}u$Jy+{yH8rV7XVJ z--$!wCHop3=^`sT(4yb_Vlcsi{^NN?jQERt)$Hp%ScHyZbnEyI7Qq--`9WLG zuhOrpzQtVG6X_>?h%9e9!^gx|MukYAiwR$IALCZoD&BjmYPbuZr!0}H1K)-JkkCmpmh#-S*0ZVd3f#uaa zFm|O&p6~kiDov8UUh)p~shYEO+_nfM*J&!6sg9j^^26G|o~_vK>o%=dt5jGuB=m*N6uQOsAvmdICEB4K)#uUObM(J1yZVVLvm-aQnZEjk$XZpYfDO4lSUeG2mu zAkt*<(;!Hi9iDD(7LGm9vRyye{O%kSJS>jeESZgZJYxs7M!V{*q=Zzdw)}o=18}|r zfW%0Srhfmzb7q~ub?j*T)U?wY*F6c=-G0fqlze}+@V7t-6;bT5h0fF+v0|94^fW_U z#9|+5_TCcCfogEcAf^F2LthFRQMdM+S2hR*?h`AJt11Vyh$PLNYCpAhm=9536JNJ_ z$@MoQ+`c@Go1>GLG(|hN&z-y={BW4vJOspV;fA0IuAQ)zO85PyU|Q6jPKPAIw2~~e zEy0jc6~dj~*-BF1RVq#|pC5~^aN?pC8>wKpzlXJdY2TQmZX?zOrOu^LmTwd%mr=33yMJRoO^vF?f`(NYHp!fd@F`ft(Hp7k02+!9 z|GHcShf0q~-H{32{P4p^qFWgGu@F;@>56X>gfF(cYn#V^5fVMLMaBch)OfmIh`u8f zo%)Df;;Q2K-H3JW-OlUu!f(3);t|Lrma@~1js*z%)o3a`f{PWud^cFnZD%X-=VayY zG-;E|5qjd_o#l47#_hHJAf-OLi%?Pri#jNbCI9;FHc!wU-N%&7&(H(in}5Zs_?}1% z&eIv^!6*f-C-fE4YgtjWr1LO`w%`5_gP(fYk@Fv5RQc6Mp?fI<)X>5YzY_!mnNZ+- zot7w>ugq{lNb(;n1I4FEjSTfTA|2cYYtrVxW{BjgAw!buF*F=gT^xtXm4~H3Scp%5 zyXjXjYP_;=vM0yXG`7{ROEo$2@p0>)CvJ|MfFzKk$HjJ57(H!fceEThbTi)zGM1o9 ziL@C!R>g?;+7)tTy)~oTExxew0sT1rv(JYdpMzHk{IAltu%ak1szfnebAff?lq`mM&>D5n0M!Y4J~yFpJ@R`Mb*&|@G(-Op$o3m@Km z=ewGE1Dfm4N@8L{=+V)fM)@=8n}~(=qu+APcPF$&D^uk?h~ zL0B(^kbuw^req2QtH)ZIT&a;7qS;-#{msh{fRkZhBY}-=LzZSvb-5*4`Us!h$g&0x z*=rlCnCRJx-t0L>>KY3}v3*BJ8t2hOGO_%8_T;OTF>ItUYqZ^+`f5o|2vj-^S>mB@ zJhwM&Q%x7L9H+*oH--Y1ujF`j3C)`9LSvv)_bzluH z1+G?p^`p_$$QA%wd-EVj@avB%U`2W{TwxJJmJ#<<_sv68)X3pR+?FM`_{B(c<#7pT zwE==3F}ZInvfcsG&TQB$US`tOhq_iKoF2%DSl0NQ>3%rWsDFo6V;pp%taoI+-|M5L zNV|=Oxq1dnAYHFZcdlBXz-~vtlQu%G*xwO&7u+{T8_k_JoeA;a_N5>>szGy;eK>zQ z@dsUeU2N2OcZ+Q5wPw^>3yGMTFVqVex`IG;_u(N(iWxO6D)H|~ea#InhackBBDX2I z_=t2TuXSgxY4DEHOcx(2G;r#z@ssqoM+psOEpVENc9hsfGNG)h_1*-LoTzSAuG_-#*I9Y4?*sNV@h#8@(rK z|FSy&ZO1!VbC}@c4dAMlX;|b+k;{{EB+~$QRFMmQTx2`UxzxPMa;xh&T>i3-ATn9| zjt1gJzdWq!B(6&tXZby$Aqqj&mT#+we54&S$(DUi&HZu|wDU|+T(4|1VtJCViYGql zONkj9&NWmTAH2<*iK5;5u-x5xkSnaD*|?`3zvq0`st4rBHv7tfmPKy1QuTMQn&VW2 zo04m{>SfLZHA!u{uh}Ryj;(PDq5A}v#v(|YZ+Kz(FBGtckw1BGcqkn8<&N_^n}t#D zrG;?7nDJF@3FhOAjLrv;7^aTXT=`$Abu7-Nm8SsJ-y2QP`JeH|Oek28F5^{;t1^D0 z1}ajkuBdh3QB^f4!W?%p8WI#{=&rmGQb7>^&3vDl3E!m;gJDMtnTeV>0DT4Df zwQ}jI-GZZ!!hkE9ay0tX&g6O@#O_+H!K>hYp)cvB=_bb|5sZ*$SL^sV&kLH}cgwZbj>(d`>C$g63jzZ`@d=J-WFZKk3 z=9c#wmA5T}x3-2IO(a!5jI2!6>2pZ+-_hPd@HaLybJzFqyjtH_$mVSLInl^t#Nwx^J`$ zaeRa5>#Vn?aLn)e_*Y^!FfPZa8Qu%SB4He6#*(qQe0jYj8w)4oh8=bkAI<#sH<9Bl zElE)5<`iZ?1vD~(3|9uj$c&fMhzufJ|9h8G?S1@l3o?s$mvYh;MtI|Ij<|d=->v8` zpZbfeqkY=vF!j)H+)=JKg@Ro(b44c_LC1a*MR0U)oVMj%6`Y{F)9uht_y%D&7W#@z zxQQaE;5=%*i82S+2|C-v+)UCVZbdpu^zztd58>WF9wHhMVyR zErLetgn=GtUz0rz&|=;Z>PeAFHSJnlZYB;|E6&2dma%Px&#K>&fDiMrK2+?zRm&@PJG}P7mSQ5f zL*W!OeZJTaQBEBuO4ygLQ$~cXu{o3I!=S0^y|1phdpOO9RBhhGBaT^x_#HxJ)O-V!xXH zdPakYr}5^Ki`4fqQSZdeP>*A4Pg+9j_Lvl{kF0Dd_qsW*1zrJupAr-|n8;O)Ppn;C zN@2gaqgndyaA5A!3ij&yf7E-KFa6uHYSNR1Jzui*`lWjPDgYgcx|Nc3H6&^7>iup( zE}guU_#3sXwe-6O{<_y(d{{EAcbBJ|Dt6*c2d-6ZHu8dXwPJ%AZ44ClW3mw)$HpRI8uUN zUSy!$_?$Ln%e?(bN|-V|o0FL6cdSNFrY{uDvWTVmPvnfdmNKWTF z++SMW8+F`il48LX-jEBfVAL})m+OS|FRaPu@brX}nFV2WX7*buUnVe1R4?`n++MY+ zU9RtQsEB@tiG&J)gKx-lV6d<+-G9YfMEP127o!s~{o@;YJHwM1JDTb7yvy56qx-<~_CVq%!{e%Q3j^bY}aUzRTR3)~|o13>_H$$^b$c%t`_i1|2<3P_pYtU(=L3G6`-6AkWc!sX!EIMln!De&Fo5TgaH9 
z@8qP6<9nyn?}YSo!n03BRCYM3hb>+o&{APRQV&OgQ!xChk5a4+dy%Sr>jS0_9gHIF zsZcw^*RSE)(Rkc(7XWAC_Mw_K{`TMtaZp)NUXz=Lz%E054(O>sVlfw1`pLT z&YiGwal_w{nZETBbK_8;Wi;4?zrrK0Uldgdmy{1#1YuEPD)8eSma6?Vf9UnkT_dSf8gTbM6ZN!tqnTV9{m_?W-)QX@%kMxAs%A!h3eh>c))2kVG}&#|zg+bDormye4kTUGG|!ALRiV4*b1>{o1blpjUL zN=ytXW@Z!@(a$2U-|hRWK0?KC@GVS(@yJP8=*ScW!VErH@bX*4#=;X~iY`;%OOlW! ziwCw#d+pH1S%gVn!7 zpa=vznJ&Z_-dy%~5Z&@U4nfI>)u|&Ue@vJyZlmJoRWbkVoHP{+>3!4WC$!tu6_}2# zcRnmeXZq#K31Ka`Y# zX%TyFb(AKY#rOT`3Gf`doW!>Ht#u*Y~;ofFcL zC|SF^if<0xu2z3;;Xn9M^k_!@tU|&KIuOQhej9D?R&V1C1FZ?voF1RX*Wd_}pD99d z%+U>L$h>ECdG6=+LEoZF9u9*FM_d;JD=F(>b^u~`EZTHkLEiMnop@l)9mZUo9wmu2 zp!GcE*FLBVL)EE7s}i?+v|_8gDH{iUzrK8Y@Or67Z5|y_z0M0J!2a=}8isw!I*`8L z)YGAln|)-Mpzy<0Gk_++?Ra8)yFV>p$4mF=Q#PJ?>x#V+Y0YN0t za?9_z?F51T(w2&+{3lXYph1z0n;RaeSP#xM@Ty2D8YCsg2cWeYW;gO}~;n+5*oZ%oBdg&=hm zVE4|27BFbG3MrP70RO;Y+ywyUgF&L!pG1Rck$f%~;J$x})ZJ1*=BKD++3aZ8!@i26 zck6%5#p{@uqC}u(Md|h(@?p*f#5C|sxGt;^*7^63MCK^>yLjw$pOWZ2uzb~mu$S8d zCIOc2LyKF|IRd6pv&{z(Il#u+vB-4TYyQ|5at6u^@pf9(pa9m#dac$+45NVmZGkY^ z4#0ro+KZO-%X?A(ufDj_!jtKXrr!)Z;V8tuvI!UGET-vW^sNj2SR5V7<{+yAmq9zdqTm#Q&W!C6=R^SNYG2AgaI; zs%?x<0k7^TGjy4PIVcOZD8)i2*6pA6s$|J#uO2T#O(mhEJ$L`Df*O6W>e~0iW}8*6 z$Gr}8P`UpN6Q+Gsu~42|mNIU1owXNrle3 zyS;ajbc?T@!^(4w`>u>2Ry~kDo~3G5v+rUR4up0&lciN?5Pk6Ha>IRQ{D|KZk`E2XZ=2|! zPTq$43Mg?SruuxE#N#V%zj0p=+iBD_UH73oYe+8-h$8gos+b{+>f+~VHBiJmSur~F zo=K9mN`z(zcK3n?pfLuWVh-3Jvy_X(EPi7h3=L*{&PNxw2Mbvv@V*!(nc5=&1cc_8iq`N}>IAfOcp9W>hww*8R&cg)P8ne=P2cFmCnYC5wlB z2JlRNX_T?iNpccCAHN~Tfu=+0%Y+R2f72;idhM~I{oC&afl|d6xl-Uh8hc-#aLQv- zLOQA^@fJ8@>a&zAp>TmeW)Sb7SRkK2u*EFh-$^v>=n7mwu68KMSMh8i;?SFRJsD;* zw0s>}2pcMRuelXeTwCR=hrP@4G2ocXsOj$Xw*(P!TdSD5Zj=}ojhG7v#b$UIrw2!h zJO4}ufcioS(_P5< z9?TH@##sLyBy$S##hMX#BoKlO3NU8p|EDX2jEQ&^@2HIBTun``qb2zB z-YJMRB*sc0RIs30uLtMvsQ_3V3-}PT9~NPs>!`SB!fRv0uL=B@Gvd`GJr(3H*Q1k~oBF_GV zvBPqn1SuPhY%Ke;&t^LZIide19Z%XG^kw1N zUh{tBBw>%CJ8ac{Z-J>|AUL>O60JAarLPF(w*}V>6R<*Bo!UVE1NOHYOlM=Uxpe2= zmf3%SS(5#2QpJvUM)&aa-;hVuo5@^w$fA>IW+;;9WbSq8U*|R|!M-c!#;KP*}Q^Tzxkgh1U?m>CLgu+ym}b=q{JL&sZ>*KeF0(S6I^ z^Qs8~;a?%E|LI`Upx!IKmTnZRZ<6tcegtM=1B~*?JMS{9p6OyKh(%{R3hSl1SUks} z&bMii9om!64ShSl1+hL}3Il5}o(n8Bum6FTEolld>w(4ht^f(rM-M-X16it!2Hd;u2}qzE=FZMg`# zd1;08U!s2^yB-iAsy4S4Y*sV>)F>HAx?5zpbiE7xr7(M?gloNP3~!d`8PHpo^US++ z#r1n_?U08h%%{Cd|F@@z$RKCc*H)sLtB&B&Wk8bogl}p1T(ptGNO|`mPoTQKPqDGx zl0XEDs4*0fNJ6lcI#!Jufj>5VdEjt)_E3r5i|BnQh{CNGdj=SyF7U&T;P@TDmYuP) zD3f<^MMxg)b&+9=^T-7L3+{kbmcfE9EmNaTDcoPIw^2ugV(?=~>xT^*_Y&&;K`Kei zJW-k>AKtRYLPoF3VZ_8e5#fg)0=8r8b~9x8v?KIpBT!-h?|jzQUyi72k#0gw=2AEN zBFO#G-*N-Zo6#yCO?}-ijDJ~lR~yya3EQMW;9DSoG^-qfb8!UmD5#c&_^Cdxmt8jf z>)c|eetEq$x$6cLpN-GL=_9g`ZF%R(WZ?~#ex0SJCB8j6zV~%wtvZlAzM(Dd-8~nK z6n;a$-*vzFYo+P{yC?hp14)fghylo?)3X?RxI~Y_`Ms5^_l9oSwk;y~mhoRp+Y4Os z|9OW1!K8~#0>kX_ewzydNlAs17$S~mX=gAnQy>kGelJaWG1vP?P!o31BTiYjB4zW! 
z;$#SLs!sfkYK?v z$_3BIg9iAAM%{`&6z+~e6N$d<`Lu~LPY!(7nF+-Y3&2{{0pjhSh#Hfgf=Y1PqAz^~ zo;|X#b=E>b(jIJQz(9|p=y<-QyqV5mB_a$epZac8wg2b*jQTvq}Jd_ge;P>>x5y9tNI{kX^jCk&NfFSlo z)DRe^pHC4+@iGY%GH-#JrUaeiOhtSOP(x|_>3Y6{3}C%j4>xSFO=^VIANt=@zD|o} zH{v9ii#!@)CIN)}8Yz@CWAcrw0RCkT174mdHy3Zcn+y4l!Be1P@ggbDH~K<5!^pyH zk@hh1%EvH^W@{|YcJhDHYi=+(cn9k#f3=*0LT8Qsp{0F-(za9d5NN8Z?a>h5B24Kj z-KIB0$w?;n(%+3=tGWo0o%G*Y7UM+RZF@#0)im0F>vOTjo}H4d4A9*uRB|?}?C|-X z7=+^ap%aDhrg0|ngNTLo8AMI*^Z697{-Mwy>qG#l#b5avvq2GkQ_)xdp8*EG)cSEyrW*AB0V|?2W9V*#&v@_KGVM^{Qe+ z)uQ}0Y#S|8{BMMQ)=MdIvE{?JQirB=&zVJ3@F^+=IeIKNdxw$G&hAI}ms0yjuaa9g zZy+7D5_q}0GVlvbhJu7pSof==`l&vC_n!|K@W$XVktBHWYYsPwUstjd71gmqD*S*f zmE$shU!XGO5P6iJ?*s|^v$izuB#A$bRNl6L@(U-~GEjR&Yh7$rYJ2PT=*K`#YApWI~o7XRR9=@$Pr{T0(3cy+q0~uNdW3R?4ZKIkR zcrAGWmMNu*kmGCt%m=YCIpQN;3@=#*W`6ZsKY*r*i+O$U^p+`6h2hrT$)a8r`@=cd zC9j?~S=x1|iz0!hL3w+83982LFZ7#mM*b3FUTlvqKHhB{#SPV) zk5%vn*EXVI?l2ntKGWG3WqMGZzbxc2jNXy~(>?alF@p^3&%gSz`aajJanjoIA9>&b zJ(4cXgpj$(MG>?1Urq*#x*ZgUAt2Uvsbcg!El9R`jHH2zm#?WFjaJs)0$c`TX2bMf z2^m0^APU+ht+{vXqqGf9Yn?{r0pwV*%Z-QrJz*YppJ9e>5g(9?VPpGh2?$C@Zd`tS zx0x7fz8jqamCu~7<}X&h`eg^~T2%diE!%pj89l**ZTEVYFJeP(skbt{pYc9g)6HYq z)$-qat=W+LR5FTr^H{JU=j!5)vjmxUF^m<(R|jH%%A9?FpIDrzGdhuF>VzC9jMGWo z$-zwMPDOAGD-dR5hr?^M;HI*N#YRRrd|avWSx>#!fua!(eUFIYwkLCNiy(jxVgE&x z@lo=m1Bz@-$bfzIJfWXrMUt2tQe z&R4tKyEMb*Ci0v>r+2>O!n${tA@wJ${Tq-^nzz7CgsLj|yoJRqH*zHIy1hIw0^$4B z?6TaQi#uwiD`T>*xXk)<8Ai}AynXpWI7AzLS40ytdk@61NBk;K%$mmey33!f#$^XW zY&rPW;i{yLJpS`-0wa5Zc5*!_6Mbiq2XyDE3w};+3QO%c$IjS5%5s;h6wP&POrvth zFCZ11g9JHzLt{+`CM%NPNB#BBMBm;4F@R)YC3dGVi|xy-v~~%%iy1>Pn0JS`05sLl zK*+?sptX3|L|KPz74FLqH%mP?PtMDDA|iwd_2_D-=bJzU>XnNwJm9NOy1zVmk^3tz zB}KUFvr4{JZ0v9?l2LAG_~-ABkrh{7QIfP>XndTRPIbgcp&evDxgQGF4{G?+W#$s1j>N#YZ3 z?znZv)(VW-8z4QDczX|lxT?L#BqRiHD6xeeMD+fb?+c35X8lD4KA0oihW|Ov}7x2Xu3n?7(d4Ju3D`LN9THBZvP0l%CV%&g= zZKiLbuaBNaGbciH&JG1slCpaZ@oURJna~RIdK z&~!G}m!m2C(R|ULoIoKq>P!%0lO{wBQyE*yZSKudumZeSX&GxgYW9@o(AOj#ZuZli z{m{9x_Ik?YdWnXK&IzYqY0OfO|Kw6h3r6%GjZR#!G~Cqev0}B{_Vg?K#c<7iTW1#_ zdLvC$STXq+makaCnq*He7pMYI*%$j|-T~VCGg7G!*wvJFJy=?bYaL?u3W3Q||eR%XzfB9pcbPoi{OB6Z3k>*zemxgz&!G1XQM0%34wLF_qn zQk~ur9Y(ta_Xlh^I#x=peY9A48eWv-C0ge$bLxJxgeh9q=Abht2gL&uZC5=Eo-A#y zkeG6uUOLwaDK4X7*rwuiNPIBrrJU~1-bC2NEq<9%Q?;|fYsiz6@g$4nu*}F6?~aqy zr68JQ7-3sJP(9}Vv&_Prk=pav5}Y!LU_>5?i;_UAtqq>ZejCbt=3XJmuc0-zm0%)V zHhf|8PRx2P;xmTjwqvYboW%h4`ai{`Kh%ku4-cRj8dg9&ywcjoEs!PjgU$8=(h1W< z;XXLsQi>wPzeGmSL2|RQcTu^S>lRjC?gj5t2Z2bJ$?PPrnVdMtF|D=ljx;`s^wNjI z^RYFg-cX}O6G7#A_EvV46LD>h*@qEN&^%bJN-9mG)c+UCcksn2}e zy~H0T+cealO)1MKdGc@Wlbt3sj9)J{4d1d1ne)4Zeoh1q$Os~e9z~DBnIdvT>-4DY zY*rHCjm}(*ZNa^THY2bOkA|F1fAeSdw9YYDn~-6S3#dm(PEoUJi~KDGP!Y+|R}0~g z@DW(!%D`Y76<$>^mcfWkR&u_JbEFaJM&CtLQ($a^$yU!biliGbRLwcWEhM(B10|B8 z1pwig;G}mE%T^EEWKrVGpK>_HIWIXFk_#nsYLA2ikRVd6abNj6kVx1t33;>-e`n^77_e5s zHy8dlU|G!CTzfWtjLt~FF_r`-i2byn`LbnkKv`2rd)8SO#q>62Lox5(JrWiTTf1iB z$14Cp&`7Yu-W`+bzygBlf#f~6U#Toe)P}n+Hz}$Al6%_XqOdf-X03Zt15G9br%_9x z;H6=A)1Hm14Lr9^i}6->;I5O%3u9ajjgYoOGKuL+WCB}{Kgjkxnn5;zBaaxFqWkG) z8pX@zy`(VddCh7_*4LT<*xydp{DyCgvFF#2HX2;Q*vNmAAeIPDfJg9@BLZ zC$^IWdYl{ey;W+^c#1)$KBj82I$sA6HRB`wn!sBuxKo#QP?@-l$JP)aFrq5ZCj}5X zu50pa>qc2hiC(7D&2SqfkR#?cq)X*IZDqx*|s8Qi8lc!ES?=rJppngKmWrJWGSza(+kN#tY{QV#Q`za5HPN4*H zjFS{W@SVM37hiKmj0mwb{YFc4>YC{F@l*ecsi%?mvb@H+HQM|6>}a86I+c#!TMoyE zyo9&!1BaFleQvhypvqL$82Ni+14aFVx&u5|w^i%R3w|~gYz9lTf{ch;2heD=ww_oh zMRtFsjKt{}{yaca5&tdiDmFXyl}8v$Yp<~{JFES5rI;egYzMuw)2vK)SpxEj2(~6- zn&QmAg7+#*&~($si=a^8vJXMF6vRKKn&v;wuNDAkF-0hf>IDKSBZ+qqaN{Ea>DBxr z3)7>~;y#bqV8v3>=h9le8kh_eu6&G9x|s+XPMHymc}<;e_u-CzZ-2oIG#o%++*$Fy 
z<`k%EoF~o)UiUUvV@>XFIy5Y1)A3?dH6CTwAD(l>QktKwy1sp--tY2$j%a-b&e=V# zAt*Q=wv(}Ng`!!hRSE`lM!d`~lO_>kF^WCabtC>L@p#nD@S-gp;{q0#H`rJN=(LtlxiE&(FgHT~kw3_9ahul>RBj~j z*oO<%H-?g{Ug%dM%W>`$<$>q0n^sFwX&etDHZ&8rvPnWoqu9wNk*Bd&rT`M8u0-ge z5Zi25a{t8+96cAj-WSPds?)idZ6OzZ=GyL?BM65aI0M_h2Hy4E8Uk(pd(NjmaM0u; z%>c>U&;=m!c}^vxh0_L7*HB_}ZDSwBt2V_;)?||)At}*;(g+X{k3^sA`Wp`5MKOw| zQ(fN!kNTxVv$`95YRJB(x+iv&vcZcqj4#_}N z*>hP6c3864F{x&Z*aV2^_mjNv}K_mfg4 zAmc)WB|`-4h0El$Zi@O^Y!j890|v7GZW|-;3prwjjDT{hxF`a2jRANS^NUgP{W8Az zZCBUh4bt=W*zw08LP_w#l6D(Bk|b8Kjoly zvsbzUXQJXn3K-HFV#rsZ?vCXW#>Tr4b}9mL%Yda->0eIe00a!>-TNHCr1^ zugA0oZIt@I`(T$}hm$s_yL@utpuK%KYCj8z76Fbej?RB7GxP%m8=!|9=P`FGS(u!M zPVAx$F}k`4_5Z18*QG#y3!-CV0q+@kxiU6PPBTI489>HK^*(N=h2+RH>FbtWh@#dXSM0SS`6{rJ=6E2o?-7j4?2mYdz~=$ zew%xA!FBSkKkW~)Li)Og^6t#^&^OW_0_fj;SrW!4Xpt2#S=M-x_-Y*Y&ROSE18#A- zD~k<0vj#xwxT-;@W!3b|R{E|%q)Kz{4{hte-Y`AckB`UK5K-!Q&{lMvo6e_x;$2?m zc>273zx1t{KjmmU7PJ{#^23^!yX|pQ?4SuZ4%3mfiDnOUJSey>()hOhr?rV4Pr61N8F_^UXu*O_RLcR zWm609eJ;Yd?Y@Qcf_}v)uO8U0W3JrPDfEdO{V~-W)FYrUB}+HJsiPrbaa#5g0b1z1 zv-?3a;a48&Mh`ggQN{+Lv4d?QOvf_-)sIiI48I8;3RbtdZrNji>t=Ci_izc(sSX;2ctx-js)@L;3K_46sOiVcAnzrj9`fXbc%Jx^mDoJT)PI6w2 z(bl7+D$gMpJ126t8&Ly}=RNT~louQ0vB7+^;UM1z&Mq2c90QIy>`E751ZH=lD!8MY zUyUZ9(yP!Gp@@n?D6EPCD8B1P&LiHNByF;zQT@2D(7zYqtRl5!kr zSPo^Kk>>m_s9XQaV|mnum4(hnj7zy4JkQDq@X$fE6spRM{Tua-$W&Bg8Y^3z3X}tU zB4-Ar{KOdZ$3c^5+a>W8DB~4U>Ojz+;&l3*`~;JC=JE-OIVj?rc zLn;@nS7a>|*d;-5IOwCD${eJsd3;g6b&3r*D{2ko{prSKn`!l#afF{(emOPPV6ECV zv!^8MccGK-p8RkU{?oFVP-*ir*_n0vs?7J)8=l&o^DCISQBo-v5E&e zAs%>}^+RrUC+>^L1K9ox3;bZpzUZQHilv2EM7I=1zv z*IDbVwa?lAzBm`(MAbKHR@J;1b3Qdjy)&DDO&RGwr?D`WxEL}I3gmndr@TQ;vcO0^ zq;8zFHAw-ZzjwU^BpMa=z{H#&s}Z^t5hthGI5|c%wmSU{mEmvl$3=RP0^~_%UvTGF zbqIJg>8)ikf7)ELL4bycS7<=-+$CZtgovXrThjbMywT;{~@XgXbj5{)$W@}WV>ql3dg+lCqSp9F$R^)9Z zUL43!+=mTNdz6?RYG-YUQ5@s$zI2mhF;jM&%I<0>e1>RnuGg)Axf! 
zZ5z6_Zr+4t&~*Un)`w@t^}0?2<<KE*+^htLVCM!mdMH5deXFNNnHmG!Rb%sqJ- zMNj0)gLyRd?pBEG`zU@vZv@v-=8Qx1m=mx|`&MeKY~L_KywX3NZiFUdgobd#=cSsv zuKt8I?I_O5G`C{j!s`j@Q8y+rYzTUQ@jh_))wXnfUsofH@@u@(6ZIH#pru@k*(1_% zER-hghY3YOl>gvQ zR`_H(za-o)G-6U)j)?~(S^Dcl?lQSjHrsaUvNsgYnzbEde>;Eng2%Eiy5`zrGxn~a z_`{pG5UxQaF_84I+q2$Z-J^s7dn;c{b3Ev32{iBw$yLf<}eRkudc4C8;cl?pK42g+fblhD327%iYD6 z-3X}qP(32n{2VkOv*m0d0O2}=n<4KIz918?WDXux>p53r^cm>&6hEA0#BST>0I_~0 zNS4UiAw@r1LbVF*d!nVL4WNtQPP$JmGP{C(ic4(4Jsm%4p2^DBlx7B)>T#nS8Es*F z1+5cDJr|!FHQj_IW*{m^jMdqfrfEyK*o|ce0rH!Zas~%}YEz!Jw6Vc6d-V%6ayKJ zDTJY{ZieDqiIkOZw9UA>?-hyzn^lM=uWJ*$l zQG0j4hU)Npr@F9y;LpbqeYaBJAO~D@{#!P|_h7B=m|T3}3>dKu*FJa8-)NW?jka6q z!L1hyF?dY2+MHS;#t+ulB6~!1{%Dhpw_i;GF`e4*A5h>##yr~lG4c+SKK2;V-;hwZ zVS1He^^_qXb;ZbX;_a0K1eN{+3(3&N$_|%M#Fo6^U@+NVH_#$U1ZOdgK}9AKW^{Fb@x8_BdS5?e%Z3zI>s>Kh*Mk0znQAVI12%WYxpb1jq`QOlKJ5b=ZKmp0 znnGfK*FvK0_|bk=B!FdUtZ(<0?Y|p4*P3D`5G^WlJ#WPYb6EMKKl+mv<(DIx<#D@V%1y#`V}XKZJ8* zIPuDcIsve;@S0Soi3&GK1-+zwb*;z?BY0ha{W0-BBfC>^RgcForxdyI+7qlS``3u%}wJgM1<`$y$~HbYAcuj51}Fo(swWCsPrqa8-j6IYCRtSL(Q^NHsEg@k{nulFWxB@Vqgi3$Urv8Uqets5WotqMYp(iYrqjl^zreSv0l?amR>DRc6tw0u4*N486qq>0AUM@zuNGKVqlNfMp~AkL^rH-HeLv&|R4^bi0ev3-k*PylymV%W zgU^xSNiIkPV8xF4S8_wzJ)?>Mx`u?S*PFip$akl@p>Hc549**?6RU`|(mjz{&n{S> zyu{R3^Kxv%d@$+6`PThc5IPfdg~Eq$hw?&onhB|pULW8+_j-upf0Lc>|3*NTubIwu zxIp(Jav%nTWGlCuo5&PrODUbO$l5-CjunpQjzwSqtx&ZEh6}vnpn^f|GuzMLQ1#Q% zoQkL;v$mrH5BUjj$fIg*4e}gdXGAX<+RX+!SWS07-yec{qT!TM=}83}PlE>3WDen; z{RS`znxVZQR}xz_*_`-8AjSq_fB3H*z2)I_vT|qj=VS7-XYVhc zJm%A)fWS3zxb6h;oKx(~G%#)RDad(XRHb&&-`)9vBroMd8$LQ(-akll(djwK{9qAo zj{WNx!VPX9LU4~|_*R*(MQ7`ICwcOI9@b6L)7V=r%=mM3tO>Hxtq6I<8_HKdUOQfE z45$FxD9eywf=Dt1nYtPs6dkv1G4qS#)I%ygbNy<{(pb; z0?=rNkjG+5e3cU1ZpvS-3DmusW@Y@cbeT$d!7B`Vi#r?svE8Q`xOD(^0cJ?3dR7Wz zKmU~)CgL_bGkG_KKw8K&xavFvwnAld5OHMId<3w65rSu61IYcWSLy~AH0uKCJ4<2*TovcAcG)6^IQTj4*%54&#g1Ns=AA`!y=O< zunTXY|Jy}op#Kk~^}o8v-z4k*$mg! 
z9e*|1|9t-Y`oD{R@VI}I|8>B366AlE_}`WNp(Lze`CVm@mDhbN5{YCll5=>bCiGEWc|jA zS(v};zvX4tZ-AJY;d=(YJ!)olCP+3Wrf-b+TTuR+|5rr*#*FD1zGs2$dwyBjzbR$* zzf?2}9@}49neMM#{GH73{ekohcV)~GJl)Ff1jDY`|+>#|24q>Z1W$3{pY~{Sr7SdA^ESP z{VzxOUqUh~^Zy_uv$K3t#s5o4{vP0eA(9!`@u>e+)6;)HnEZ{!g9!f1;8Z*crdE>VIm;-&8U?E91Z3{iTxW82|sN zNHb>DEN0(th*iD$Z%}s0|e>`$xd~*EFs6-}F$c_GYW3a5W zug<3%x5Umz?{1swtIgWj+g5*ufY`7~1ksJ9p)kOakqKbmIo0I>n!BcuHj(7JlY zN1v24OMc*Z=6`@#07n>r3N5YpqRA2onw&3Tbr~H;W(_*8mJ*vL^VKVp%aT)d3T&qI&U2pSE!TbG?l&4-AL<2M$h8$4pHxMoiCihhzld zY(Y8GfD`$m@xW1IA$I&2`KD$t?nzmNk;Hw&*;j6SV2v~sNL2*E;Q@eD{i66;qGJiC zSV-`|0S+DjjKt7=JFrKNe0mwapmM?AUz`B=gm<=M-tO-E1A16|uvnN_ni`uJ8|=W> zQvoG=mGOK#TA&LOG7!0YW`+*%XJ*D`QS7&72H`D@OhF)?37rIeTVe@%;011EyrYL; zbva2zghgr=fklc#hWWbcQ=`H{D;t?dhDH%U5ApfsYJf-3ZVxxpK0CA}Ff~*#r@N^X zYNEnUfWOyMIZ2c-YdRT4<3zf1%vdH~PCW{iW8@ZfviuaB>C9Pycr^(_nzKhZvivA+f%Y!p>jmSA>9d-`B> zcaC(y0Q~r10Z2(Xd)}VF-=#ii*L-pzgLio6Uk}6LS&EwgF29W6UMzhW0O(rZ)sPl^ znxJ8JnpFT&4=nKqp@C$1m%3mXIqT^UXm`G@V4wF#d=@|Y6+U?-zI4dLI5pHh5cod2 zLw&%})B`FvzM%aS>+2B<84a!QKlJXZRh?$OFw{{Dft;H@Q#9sA;pYeiSd3#iOaSWW z8R|by064<~Hu*>(bx95Gz82?lU>8Gp8i3Y)ku0%$;R?X*qtjk;q(z_5H3W_#2oEkl zul?{QAKos)SJBqfxg7DKIKg{ndV6QELG26pLPue*1K**{pkvo@jbL^48)}{Wv4HqW z-GL>ouw`++cPi*HglV#K_T>8l@C6{@9AREf=S#=A6~n`QqlJSaT38)}umpE|(Ltm1 z9i2h=3Fh1oxKZ-5&4%ea3Id%Lm)ni-2tF{gwls9m%m+Ji_B#{XXWH*>x)6uUCqNT` z#Godi7Wofr;v4NfE>-H+&{rBmA@7&t2caO|_%^IYiAdrD?ZR_jn~xnI+nNALDMH~K zUw03D{#PI#5&z?qE~=o2iv?#%h_Tzf-REIF&??$jJak|c@W=w>!OTojr;n0v5CH+& z3`;OX1X^HaW@@T;=>oSFPUTGZEq4D3T9pkK`^&KBffsCPYAA*nD9zRf;Tgun$a~@o z0pv65l@JUc@&rahg);VJzg<=BOX!QW_A_|(GZ3HmE2{P@ERBz4jBxCR)ZFk*g1_GW zGi+^zgRdB?kP}Bs)29u~D(mZH@AIwy^AmYB^~=36ZqeqX9uOD1n^D)m?fzZBKk5PD zGYTAYyu!xF}C-S61fT-eC4%vWEJmsFH7tNjI;Z> z%*by8ApTbr25%htA8Mi)xx?iKA+do}a{au)m)m$}gg=_>+y?NWt@w<6Lb*gJ1&2k# zW9df?txS*q7{2}BFJ)i%#ZBI#9j)iRFT+G*IJ3;`>E44ww)mdPQA_VMjB|KnJkjd* zFZIn>>d8XGQ(e>ZgLlzDU;(LH9lXG-q`dilWv4o?4guID-e5+c-3PyiE*e!68L zM4;j$?h8Oie;v*G9}v8m^4rIGG~l(4@X`Q?(m;Nvpbq{`nfsga@|}`@`<)`^^_^16 z{oUNccS|%Ff72$18-YiEoy`J$$}u#>a1T!2%C1Vo%}=hJu!5iEV?w}97O{O zE`_h(-t(L2>Bj!z0?M-lDKnVig{}F7_+os|)3=(y(M-lG@yXQtT6~%gfPWP3PUMMA zyMqH@==APw@lKnh>E7Aso^I38gp2R}aoe{I4sy*;)52qk>o6kAXvgwNoB-atqicE_ z67B_pd+&eIZGbBbf53|0uY>pawHTV#Z*$E4Nd+(Rro%V2J$}N98+iJ~^1I;UsvY<_ z7x{Co_RF59G{{Gp^qQY46)ipOgTc%Xj$_wn#U|&OpJ)Z015IG=%jg^cm&m_1seYDr zq~QJyG;k*HcaP4)LI90z1LM0?oZZLr5f~4&yy!u-{fS=Wryvjy^z4`57!WSWNq5pN zuEKry0aa|khmJqWGo8Yh#2gUr&{_8{>TA8JPmLuY_LeV&6(IJsFF3RD(N`af^^n%L z?Y&1mu&!UlRK~k#}E$2SB*NZ{12X z*Lq`LB@>`LlJ8$jtvw+(UpWG(&oz*rWsZJun7iGD-wEvRB}<_8j9)qW-!&|sMb3V3 z75m)>-!)F}5?4UDEW^B@w6Pfv~MwA8-RWr zd_!dQ3Qa>Dekiy=8Z;Gwp<3u&F!)r}O+ka^&LMT2a#F|c>En3C5|OJ=n5l~Jo?>#T zNi z!Smi}xjYYyq4fz)754SlP2YQ>M7(kKV90t#KX9e3kc*SGUvNqW9yx?7l>ka%|2D+} zzfd$m7F&$dNc~}ZA7m>*w43zS=IprEv!}0PEaL)sCf4P+>=FIFk|e+dS9ZLrv)4V9 z-5n}R^^h-D8}PcSa%}}0jM6nR*6@3^Lr;+I z(M?lLxgpZ~F7wo`-YDkH8c{$)e9n_AymrLapJ}7va#S4Ed}1Z|4!i9{2z?}{r_kja zX~V~cRkLRGt@U@>0)E32SLi;?7+JP*+%en{VSP&9T5IOuKHn9vgmy+QCC3&x=rdd= zF6hB@Tv1iDE@n?G&Pn9lGe|U)p=(QMS`^`CAv@AC_rn6~Evdq_hqG>tTdrR1MgaK& z2@Z0?0y_06)Qh)u;qh@({Ig0bJ>=kVl0{Ey_3QS03H`~n(}FASqOXt&AZ@>F1E^CX zyg7oWF8>5u3{3Ns&nl=DVHJpZ*Orb`c`<_~M(<4@e|AKV#RyG3vKsrU`RL>k8oqq= zU{2c)C`kiCuLtUNWIzX5QNisXfU!fm|7yNUxF%tHxuGB(&)cC8=cGLLV`xLgH$L3w zHQT>}CxBqG4PHV#+q)%!8Ytz}BUcp39j-b8&cp|c_hM)dsl`cV1)PeB?)}$jZ@phs zAUG-!JTDhZih_C;VPk*9PPyzBpK(knKga_SYx(YIo3f+j=ATh}hX~?ovK;gtieaJM zRDZC3!tAQ#?nsqwIQJ*>xhWEwx1(W*BljJR^Pl##L%6MF;VI(kKj>PcAaLcKca+y@ zqloNC)n4Zy%=J_dVv7}YqoQ8t1g=kfw`q0zOj7Bu*8X0{GKf zcHdkE6NM(u*I&trXixus%B^#GW&+>bdw{Z@=LdE&Vd-C*56DE@Q`0+!jF7&m#$p^! 
zz@o%V+tIcz2{*Jg3P~!YmX3CwTq6W;WuVr^KLo?RIxfN5M`P}R%nW=)zv`R5?(Pn! z3>n@9GKhwWM_asl1~rSK($mRpkd`kaT8+{!9Et%HR+t%~r`;?mxizKH|pe2!U1*mADq3?5}zNV3-R8liXtfNajGK#5!?LnqkZ@? zq6^`12)vU;R^{vQo^vF79?u`1z5wTedmpi$b+dZ1if~owX`AZaaPBybdG(73&R$bn z;X2t!zlxY2pZAAr*kPxE+T+hoY-sKF?m%+1r4B>_l7XF+_rPJ2h15z#*aSFD6flXW zmL4^ay2QMigR<{mBWfNmO_i0sMadC*eA~v>aUC$eQv@$qM)!KkjgKZA?^72vt>Kr) z*%DtVG%A)2>8GMo1)Q=T8YCv9n$1X7u(^DD{g`uz3%yzj=D?Nhi>M($;w)73(Wq!OuAD>VmFb=#-hM)rD?D z-X)1rsk|!?MeU^1SH|Kx^I0VR@tn?5UP4*`nl2+_PEI%i8a>L-=Q!HGsW#eqj zDvS&K_&!u_e13nY;O%JaektWrAAJJA#W_TK{>6!SW^AjWyiOOjV=!X8>XvEm7`f1Y zYKbU)E$Z-(yUEJguRJ~Zni*b;g|K#ULmuwVIOB5TqDzNacnE%Mc&V;ik}MB>COU~Y z4!(`Y%A9*MRIzN|v8m@3^0zbe)-9K^X1~y;^ZM|bTYH`#cGcxHgR=E67ztV)>kdDO zw!Mf6lx7gTg-5MC&+?mvADKx)SH^mJxgv&Bk}mw6Y(k8LX%;ZZ$;A)_37cw6>Mx`P z%B-&Z*?*A@nl{QN~XwmIyN;w=2T$nk*b@VYFrX`hTom?pF8Z7Kh;ArFV4eh+W1$L~!> zPl!D1ZC3>t_k78NFd@4PV2%6=vT&B!B27?RQZLPzHv|aDY)f6uE1y?3I}*i5FX<~u zG(2WKh9)P;Ujix#mcR3mN8{!8q2%?>_Pjd z37^@b2twtdUrCI?KanJCHV!xLXRJL+UT%+9?^^+eK>?HR93w9Hmr}S+ov4{*dOM+LQdbT!O+dzHs~*h*fm>6)9Lymf*xLpmwU1yAE+0>NBxl~30P6?)@jQ0d&& zU&Hnnf;Z2{t;MA4+E8`HcAwlA<RbsE=|o+&`SArNl*|B5i}kMc=MT z2GL>@hn)~4Z)tcH~fVX!t=-`Gb_{J|M4d<*wpR0K=N5_5k=qcM);gGBFiEy@k? z=zq!d6WVR)Wmj8FjS2h6dZ`(f>6cAV^cuceE*SM@VeGAa>Otn?IZ^1xFsfyJFCI}L z()BWH_~X?ktu_x_@vlbnSxsBkg~fI#i~MPg*S+&U{&esU0_du=q2sRCG|v2UA9}7ygufWte0UGMiC>m(xVME!q-~O zvq?F-AJk$&b?9c3(9;rHrhB*vGcVBoIFvW1g41SPFkU-C)gB#;CXYYjP{LOZ*43x=1SDq6=rBWL5a}-%H|5Cgz0q+ z@UM48P(NzS3=OV*-VB(hS)*L$$XZ6RpAYh3(lUe}U3WDFE68^$=s!u$2Bdvx!Rn7a z)x6J$-w&agJHrPO_*UkDF|v)2^TzqizD zqRm55alLw`UO&u{fr1}7pnvYUW#A>mbMya|X#OG>XH;c1*o~$GaUoufi%(CT0ueXe zE;Z6Ce?#g?u8~?-*fb%JBrZv`p%T-SPC|x?vz_q3p(7SHx#ET+vz0x)Ys^b>@5D#% z@w9MR>I}heP)En>HfujuagZ(C;lK%bJ(cQvLe>pC5Rv9402swoC|YidXk`ynWS3i4 zODoLBw0?f-)dTs)9(8XoQ`t3dwes9?$CJ&TdAtASR-4-_dI(g*=%()b_RvrSlkPn^ zo_=V=8>b@Be{~LyQuCMdntPdqb! z>M8mz$sz%Y$lm^RpChlJ+uT~p>ak3&*l&Zi+^R22PbL5O)EJ+Ls&<@_A$?u6V7iO} zb=EBWVsORS!B5vis{&qqe4IzItbEDyE3FmVq=~6bH5b|#TEVRCr%nm9N7omovJazt`-FKtqBU}EXME|ZDQ0dz%rm2zvZFqlb-+Q` zBg}7CWEHr4f^_hwU6Z+1A_4NNcfF}!)P(z*%Ww&^dH;6fc3C1nwc%RdQWx$G85Swi zu46G3ONSXex(s8S>(^+}6^GEPM3e$@r@bp2a-!z;DJ)f6@9{=r-V-)Kv?L6Hswm;Y z;rHp$k@wlzCzM zf*0z@UN(|08&Ov(^WU}LuqJRi02RqSLbVX%j4h!FDL`rRXEI`=@GLG5pxKe<5%(Rc zY1)YE-4HC&T0E-~h%WXwp8c!^Ts_vkek*U}B=cP79_h9(V9#`kghJ24B7Z9I57Dsp zt74fl6lr_4j9`8;CSd5dx5@*q1+AkEYo7xs`$x)68Jbgd#f`ozi5S7e06q*09#D+a zfBXmUmEYy1=Vx@03(Sp6g3b*PLT3O$2WU$ewqUaROCl+B^Hjl(|&gV|-)48phpvANkq30kghcJ@2+3 z1HDR4ra1G9`*gT|XN+7i-j04phv=Ydy!xpgn$1ZDqt#ch7I=r6*KZ$3I%$V}8d+Pf zLO(#YV5JIb*O_)mMZL>f!V#4zx}&QPwEiPSG0wul4YuHxuEzCtRIJ>F+wVVw9Uc-2 z4G2DzGM?}8^1r$UuQ+hU32d|bdEiS8wwfFixRDW;znGI-{9U9!yAYca4vT^cw{Aj& zNuoJnJs6h?vI4*Fa3Vzq5eZ+a6+;wQKIQU%5tgHhOQhTf6#f%T$`Lj9(8ue?K}U}X zd2o&zTohr2id=EHzSZ;Ow=2$H3Apg~zS{uCU+b_kx@)s;lB;JXu7W+r;n7V1zki{k z3$@*NORXEpANPH6(7qa^DbjHf)sa`8;YW|1*IqTMlgV;y^aOL3+p;KKXJa0XF6M@P3!D}cHltd>3E9S~O<D9!FVRi1lLoaMrXF%L@XN(^~vw(k3#cj2UYw!8Bz$ zl&M{UmcuN2>-4)Cd$2;p_7l3B$5%mDJMH)b4x)|o?_gHoj7N`50;5mqapfkrYPEJ! 
zJ27$f`t~GVcZ+!Tb6U4dY`QFf#v7XE^t8+6FxE)@DRKSz^EOwO&61GfGToTm!>mP} zMPmh`u1cK*N>y0XcLvV%LGk+zzZVqH*j#JpAjMkH>!Yg=Yqz4`p#nWde%p1RakB7R zi`xu|0LmwpDV_(k3ZbhC`2i_sa^<`&)wgEYQBu?0o}bPMXoJ5-g%C*2ocBD(8&#cL z_^|e%Pgn~aTo2!0Az+2y!|IQ8ZFd03Y**1>1YvAp{%TkV`C*8<1Fw1d;#CA&;LVnvp~9cY(AbSY53;_-VoC4*c=Aalm)F;NM>y$%Mf?e^ zPqt%9Fkr5DdJ!t(tr!Ged}CTwTrJ@v_h6G#lU+WrFEpX&6vK@EU&cvm8BX8G5wRWGuf$32ne0UcH3m z5wU22+yT*tJhBaW($Xl&!moHMs9D2Y!|^9Mo4UyIPD`W9n!}aKxUa<-AW;3FduDX+ zieOFO-bKm8INr9@#>m`br?6%%Amg^sE)4eWNO&CMJ<}c|$#yR~IfjZil+Kg@C57#a zo;5)}Z&p^DGx%llR-JuSkP0BR7SCkB-Ym^H>vg-PguUQYFO`OhwaHZ~+tHY*61Ch2 zdK&|CjVZ$Mc_{B*Ht(h-v7l*9ow~-aG@Cw(hDQ3kq%!!pxD5hTyj2h z(R?1XxA%-v&MR7AmG^9*w@?-AQf`#o6B7z2)mJpZ1ghBO86Z#;pBZ)~#n7pn`jroF zIHWjo36DwKW+mC4m(_3c<{<>$SWQqlTZD2LG>BrnrcRO|6Gb~%uRiXr+fIy#BUYVo zVTwa4A^kS=BSop)3YPRfXDJfa4I!=a9eS;z)C)Uva7f%UD=nI6#g?^=FN7dew1g-f zP)k~;81|G3bQv)v3=5QIGeKAfacsTe7f-?roYbjW<|I~V9QAEi5Vf>?*J}wyEcQjX!9CC*(TRX7^#+O*$5|dFHhX4;RE}irnkoK2NX%D+Ch8hbM6Z+| zXE=066p7H@K%tz}x2I+94GIsCDGz-jj3FW}8siG8lu`YX%$!Z4?f3YQ?zqD;=ML** z-jhpReSKaMFb5=J!7j#Nap;J_|c?GWP`)h6X7py5S_5y&s|GDHqpH| zF4oqfs;9kP`mIU8x-@wp_Kr`EfF3lC7vV8^EbBuh5A7;ajAhz9Ib93O?v95N4O2U?ecj$sg_01Q64&M?NT6P*h;;w zEO*|%sQ#dJWa#S6oe709gT9=>V`LeEkK5FeR;LcB4e(?Y#{2MrS;cqCuHF*ctZm`0 zEYLPb@V?_D^J$dZyle<hRSY(s`kJQ(o` zmtNg*Q#M4d5wvYu$m}b8>wlIAC44plU_NU@CoM_SAn`M}8wrvMFDDNSAF(*L`v}81=mHUG%7g%{Po|}ryQbT|o3EvI6#EPcFfkgAz z?=|0^lIa)HA9u6GT3&NLJE=Z4{hvVVKAd%q!>kT`7_g+ABSXVMhQo9AY>kUS7XfQT zZ&Ancb^u^`OX|nejYS5&y)kG90RdB#Q=ynNzYC-dm+PagXvwjUBtvB8GsGfp^A`RX z(AGby>SrIyPyrA>MlRpP1stxs4DGw67LX#tSpeEUa}fEDpc+#`Dy>k8K2WO#*~rU0 z(VR-Y2V5v)@P;aD#jw6~TpAh#v(M$`x*ksEy+$^je&2UR(@0NRZH}EiLQPNiv}idT zX&mJmOS|$^KQD8dQL?$SsNt??#*WE8RbzMKIu^F;@AB~x{l_WUtGSU|NPl)f@o3+5my*V zD3G5>FZ)2N43DcgOc1Z}GkwVcS0T1*ApS`V-)4EDYCjWXd{T*he4JXdliVx+_V?Xk zkN!rgya6v!xpz-o1K))#kGgJF!FfKpHvk>}d1Z}Y;ARrnO!wtxZecmz$6BgdC%`8g zI&cK|&^wrMZi|4$CAl`l+Fp;fFqyScwZ|Wko;eJzu;Go2H-F|=D}ttEfPMUOtI^ru z2eiHpt&B*S-*!??xMS(ATCg_pjS`d|s{Qfpevqg6+3B-`wI(KIMzY}_cW%|!QqQ~J z2*|X;ye`MW)-^iaei_XCgep9l>{OGxPj!vgNgsR^5^lb=&5wKdCB>b(BzdUpft{aG8!X=z~}3MTyu%>*XjU&J}zc z0x}5nV*xQ@R|+q!x#sbrtD>Yfrh5=SClKXyPK=&N^~P~BvBeq=l4n$!*4uj(D?`wG z+6`59pYo=fZ>2ah^Vm7VhllY(vJs^PX`3PH=cCv7GiMo3h2T%v&!dL#HCxkFVqQAX z;u!o*P^#Ao;o7$3`Q@``|M^ifsvzrR;&dx>Bq^Phf*hJt2iT#J079%CAa+6b{37-| zgX#N#pj` z8E&Id3t1(-I^GGLyBh?@$hEuGK(_DAxuV4KxiH7)Pg~Qd6>x)&W6qoRKh6zqKBg2F zsU(?;h0%B_ZN=+xzP6x)8X+oa?pHO>%F{=2onhY7)$AGL)*H$C>11+T^AozlFs|#1 zcxmOR7&BWDO3|C^#Sg`_xyOz3WN$8pB~W&ubLy&j>$a<_;DCjoByMh;-1XZXHnsrw z6hRD3(~t(UJya>OJv)7Ah@a~Ufpa$)i=g=_a_32~UsR9M0`N_c_j$!hEvm+T!`bBH zb-iU>D~k6^+MjH!c4(`Vq?Ulrf;HblBF|zMC$e5SeMXfLdt-@x*QDG8QcLV|&S1iSj6Q zygRqXg6+ttvB+{T>Dlln%rTqYu)`$OyI#!#RtXm_M3~SF^YB0)Ip@$~;Nz-Kj_HFu z{@j`DC{S5r*Ue`cy=3tkkCD$r+<7@QpfA63mfR~9u^KR)7`@Yk+YL`8b%rHct5b1T z^^{=(U&woStQuShNzbMJ{v;1upa2}?5cTBDQ0X5NqN3OZ;OPjj zTnfNq;~RW57&i~q8(>Ls0e)M^70573-Kx693R&>pBr_yrtQ|ixSb^BQASIxjC|qv# zBrtFobsF>>T{PhtH4Y=KmXOPYQo9iai5jPJ{O!wo^mvoXvN_8fl5Um&MPJa=<5U`l zWL^D=1&dC$sF1X95G~4+vBDbp=wgSVs?bHis)gNvjFk{*F7AU(cik}fI72~R7PTz2 z@l{_`;csSv;@;DxrC$jSMnWpvtBuh%`wru?k{Qhg_~+b2Sd%CH%B}w#5omc? zhEe2DfdxVRP7X{`0VCjyB-FT${CSQkk>QlsAvTtqPHnm%a>PwrWLo%6%8y8Y=;xuS z1hr@~Yxi3-EU(i+ztQ6O-aR+%u{{g|bCNp-XQ#TpbVJwtt`qeBl||eLNF5jRBGcEi zFl@n(w@P(XKlMA}U?6hLOUeT*usv4w+BONZhny4m>h_MJinPhWKCjFRMqA2ObmFG~ zSY_~T|K;82{DSmdVCiH4g2|%g-E#REh#6eab&w1nS4$}_P>{|((*D|$h&6JVj)>=M zyDLOfs6(@33gcfrJ!Vub8Op83HtF0u?iAt^j3M6FtCwo}iNzOszQj~+Tyk4SNJ^(F zF^*p>YkMbCirpq=H`jj1QImAIEO!C{%!ljZ`UyK`~=A$_yeho*OS? 
(git binary patch data for a new image file elided)

diff --git a/docs/source/_static/API/clock_driven/surrogate/S2NN.svg b/docs/source/_static/API/clock_driven/surrogate/S2NN.svg
new file mode 100644
index 0000000..b771af2
--- /dev/null
+++ b/docs/source/_static/API/clock_driven/surrogate/S2NN.svg
@@ -0,0 +1,2379 @@
+ (SVG body elided: Matplotlib v3.3.2 figure, image/svg+xml, created 2022-02-07T19:45:46)
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/docs/source/clock_driven/13_neuromorphic_datasets.rst b/docs/source/clock_driven/13_neuromorphic_datasets.rst index 7377f6c..190f8c9 100644 --- a/docs/source/clock_driven/13_neuromorphic_datasets.rst +++ b/docs/source/clock_driven/13_neuromorphic_datasets.rst @@ -58,9 +58,7 @@ DVS128 Gesture数据集不支持自动下载,但它的 ``resource_url_md5()`` 运行这段代码,惊蜇框架将会完成以下工作: #. 检测数据集是否存在,如果存在,则进行MD5校验,确认数据集无误后,开始进行解压。将原始数据解压到同级目录下的 ``extract`` 文件夹 -#. DVS128 Gesture中的每个样本,是在不同光照环境下,对不同表演者进行录制的手势视频。一个AER文件中包含了多个手势,对应的会有一个csv文件来标注 - -整个视频内各个时间段内都是哪种手势。因此,单个的视频文件并不是一个类别,而是多个类别的集合。惊蜇框架会启动多线程进行划分,将每个视频中的每个手势类别文件单独提取出来 +#. DVS128 Gesture中的每个样本,是在不同光照环境下,对不同表演者进行录制的手势视频。一个AER文件中包含了多个手势,对应的会有一个csv文件来标注整个视频内各个时间段内都是哪种手势。因此,单个的视频文件并不是一个类别,而是多个类别的集合。惊蜇框架会启动多线程进行划分,将每个视频中的每个手势类别文件单独提取出来 下面是运行过程中的命令行输出: @@ -202,6 +200,52 @@ DVS128 Gesture数据集不支持自动下载,但它的 ``resource_url_md5()`` .. image:: ../_static/tutorials/clock_driven/13_neuromorphic_datasets/dvsg.* :width: 100% +固定时间间隔积分 +---------------------------- +使用固定时间间隔积分,更符合实际物理系统。例如每 ``10 ms`` 积分一次,则长度为 ``L ms`` 的数据,可以得到 ``math.floor(L / 10)`` 帧。但 +神经形态数据集中每个样本的长度往往不相同,因此会得到不同长度的帧数据。使用惊蜇框架提供的 :class:`spikingjelly.datasets.pad_sequence_collate` +和 :class:`spikingjelly.datasets.padded_sequence_mask` 可以很方便的对不等长数据进行对齐和还原。 + +示例代码: + +.. code:: python + + import torch + from torch.utils.data import DataLoader + from spikingjelly.datasets import pad_sequence_collate, padded_sequence_mask, dvs128_gesture + root='D:/datasets/DVS128Gesture' + train_set = dvs128_gesture.DVS128Gesture(root, data_type='frame', duration=1000000, train=True) + for i in range(5): + x, y = train_set[i] + print(f'x[{i}].shape=[T, C, H, W]={x.shape}') + train_data_loader = DataLoader(train_set, collate_fn=pad_sequence_collate, batch_size=5) + for x, y, x_len in train_data_loader: + print(f'x.shape=[N, T, C, H, W]={tuple(x.shape)}') + print(f'x_len={x_len}') + mask = padded_sequence_mask(x_len) # mask.shape = [T, N] + print(f'mask=\n{mask.t().int()}') + break + +输出为: + +.. code:: bash + + The directory [D:/datasets/DVS128Gesture\duration_1000000] already exists. 
+ x[0].shape=[T, C, H, W]=(6, 2, 128, 128) + x[1].shape=[T, C, H, W]=(6, 2, 128, 128) + x[2].shape=[T, C, H, W]=(5, 2, 128, 128) + x[3].shape=[T, C, H, W]=(5, 2, 128, 128) + x[4].shape=[T, C, H, W]=(7, 2, 128, 128) + x.shape=[N, T, C, H, W]=(5, 7, 2, 128, 128) + x_len=tensor([6, 6, 5, 5, 7]) + mask= + tensor([[1, 1, 1, 1, 1, 1, 0], + [1, 1, 1, 1, 1, 1, 0], + [1, 1, 1, 1, 1, 0, 0], + [1, 1, 1, 1, 1, 0, 0], + [1, 1, 1, 1, 1, 1, 1]], dtype=torch.int32) + + 自定义积分方法 ----------------------- 惊蜇框架支持用户自定义积分方法。用户只需要提供积分函数 ``custom_integrate_function`` 以及保存frames的文件夹名 ``custom_integrated_frames_dir_name``。 @@ -220,8 +264,9 @@ DVS128 Gesture数据集不支持自动下载,但它的 ``resource_url_md5()`` def integrate_events_to_2_frames_randomly(events: Dict, H: int, W: int): index_split = np.random.randint(low=0, high=events['t'].__len__()) frames = np.zeros([2, 2, H, W]) - frames[0] = sjds.integrate_events_segment_to_frame(events, H, W, 0, index_split) - frames[1] = sjds.integrate_events_segment_to_frame(events, H, W, index_split, events['t'].__len__()) + t, x, y, p = (events[key] for key in ('t', 'x', 'y', 'p')) + frames[0] = sjds.integrate_events_segment_to_frame(x, y, p, H, W, 0, index_split) + frames[1] = sjds.integrate_events_segment_to_frame(x, y, p, H, W, index_split, events['t'].__len__()) return frames 接下来创建数据集: diff --git a/docs/source/clock_driven/17_loihi_sim.rst b/docs/source/clock_driven/17_loihi_sim.rst new file mode 100644 index 0000000..bfe8a75 --- /dev/null +++ b/docs/source/clock_driven/17_loihi_sim.rst @@ -0,0 +1,387 @@ +Loihi仿真 +====================================== + +本教程作者: `fangwei123456 `_ + +LAVA-DL框架中Block的行为 +----------------------------------------------------- + +`lava.lib.dl.slayer.block` 封装突触和神经元到单个Block,可以通过如下流程使用Block来进行Loihi仿真: + +1.使用Block导出hdf5定义的网络 +2.加载网络,转换为LAVA中的Process +3.使用LAVA提供的Loihi仿真器仿真Process + +Block是为Loihi仿真而生,它并不是像 `nn.Sequential` 这样简单的把两个模块包装一下,而是有更复杂的行为。 + +根据对源代码的分析,我们的结论是: + +在 `slayer.block` 中: + +- `p_scale = 1 << 12` + +- `w_scale = scale` + +- `s_scale = scale * (1 << 6)` + +- 若不指定 `pre_hook_fx = None` 或其他特定的函数,则 `self.synapse.weight` 会被量化,然后限幅,最终取值范围是 `2k / w_scale, k = -128, -127, ..., 127`,共有256种取值 + +- `p_scale = 1 << 12, self.neuron.current_decay = int(p_scale * current_decay), self.neuron.voltage_decay = int(p_scale * voltage_decay)`, + 但在计算衰减时,衰减后的值会通过 `right_shift_to_zero(x, bits=12)` 还原 + +- `self.threshold = int(threshold * w_scale) / w_scale` + +- 计算神经动态时, `x, self.current_state, self.voltage_state, self.threshold` 都会先乘上 `s_scale` 进行计算,最后的输出再除以 `s_scale` 进行还原 + + +下面的内容是源代码的分析过程,不感兴趣的读者可以跳过。 + +以 `slayer.block.Dense` 为例,对其行为进行介绍。 + + +`slayer.block.Dense` 的参数说明如下: + + - neuron_params (dict, optional) –- a dictionary of CUBA LIF neuron parameter. Defaults to None. + + - in_neurons (int) –- number of input neurons. + + - out_neurons (int) –- number of output neurons. + + - weight_scale (int, optional) –- weight initialization scaling. Defaults to 1. + + - weight_norm (bool, optional) –- flag to enable weight normalization. Defaults to False. + + - pre_hook_fx (optional) –- a function pointer or lambda that is applied to synaptic weights before synaptic operation. None means no transformation. Defaults to None. + + - delay (bool, optional) -– flag to enable axonal delay. Defaults to False. + + - delay_shift (bool, optional) –- flag to simulate spike propagation delay from one layer to next. Defaults to True. + + - mask (bool array, optional) -– boolean synapse mask that only enables relevant synapses. None means no masking is applied. Defaults to None. 
+ + - count_log (bool, optional) -– flag to return event count log. If True, an additional value of average event rate is returned. Defaults to False. + +`slayer.block.Dense` 前向传播的流程为: + +`x` -> `synapse` -> `neuron` + +突触的量化 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +在 `synapse` 的前向传播中,在进行计算前,会对自身的权重做一次变换: + +.. code-block:: python + + # lava\lib\dl\slayer\synapse\layer.py + class Dense(torch.torch.nn.Conv3d, GenericLayer): + def forward(self, input): + # ... + if self._pre_hook_fx is None: + weight = self.weight + else: + weight = self._pre_hook_fx(self.weight) + # ... + +根据 `slayer.block.Dense` 的构造函数: + +.. code-block:: python + + # lava\lib\dl\slayer\block\cuba.py + class Dense(AbstractCuba, base.AbstractDense): + def __init__(self, *args, **kwargs): + super(Dense, self).__init__(*args, **kwargs) + self.synapse = synapse.Dense(**self.synapse_params) + if 'pre_hook_fx' not in kwargs.keys(): + self.synapse.pre_hook_fx = self.neuron.quantize_8bit + del self.synapse_params + +可以发现,在不专门指定 'pre_hook_fx' 的情况下,`self.synapse.pre_hook_fx = self.neuron.quantize_8bit`。 +因此,`slayer.block.Dense` 中的突触,默认是进行了量化。 + +我们查看量化函数的具体做法: + +.. code-block:: python + + # lava\lib\dl\slayer\neuron\base.py + class Neuron(torch.nn.Module): + def quantize_8bit(self, weight, descale=False): + if descale is False: + return quantize( + weight, step=2 / self.w_scale + ).clamp(-256 / self.w_scale, 255 / self.w_scale) + else: + return quantize( + weight, step=2 / self.w_scale + ).clamp(-256 / self.w_scale, 255 / self.w_scale) * self.w_scale + + # lava\lib\dl\slayer\utils\quantize.py + class _quantize(torch.autograd.Function): + @staticmethod + def forward(ctx, input, step=1): + return torch.round(input / step) * step + + @staticmethod + def backward(ctx, gradOutput): + return gradOutput, None + + def quantize(input, step=1): + return _quantize.apply(input, step) + + +在 `spikingjelly.clock_driven.lava_exchange.step_quantize `_ +中提供了一个量化函数的示意图: + +.. image:: ../_static/API/clock_driven/lava_exchange/step_quantize.* + :width: 100% + +可以看出,`self.synapse.weight` 被进行 `step = 2 / self.neuron.w_scale` 的量化,然后再被限幅到 `[-256 / self.neuron.w_scale, 255 / self.neuron.w_scale]`。 +因此,`self.synapse.weight` 量化后的取值范围为 `2k / self.neuron.w_scale, k = -128, -127, ..., 127`,共有256个取值,因而是8比特量化。 + + +神经动态的量化 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +在 `neuron` 的前向传播中,首先进行神经动态(LAVA的重置过程被融合进了神经动态),然后进行放电: + +.. code-block:: python + + # lava\lib\dl\slayer\neuron\cuba.py + class Neuron(base.Neuron): + def forward(self, input): + _, voltage = self.dynamics(input) + return self.spike(voltage) + +神经动态主要包括电流和电压的计算。电流和电压的衰减系数分别是 `self.current_decay` 和 `self.voltage_decay`,它们在初始化时被缩放了一次: + +.. code-block:: python + + # lava\lib\dl\slayer\neuron\cuba.py + class Neuron(base.Neuron): + def __init__( + self, threshold, current_decay, voltage_decay, + tau_grad=1, scale_grad=1, scale=1 << 6, + norm=None, dropout=None, + shared_param=True, persistent_state=False, requires_grad=False, + graded_spike=False + ): + super(Neuron, self).__init__( + threshold=threshold, + tau_grad=tau_grad, + scale_grad=scale_grad, + p_scale=1 << 12, + w_scale=scale, + s_scale=scale * (1 << 6), + norm=norm, + dropout=dropout, + persistent_state=persistent_state, + shared_param=shared_param, + requires_grad=requires_grad + ) + # ... 
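+            # (annotation, not part of the lava source) With the default scale = 1 << 6,
+            # this gives w_scale = 64 and s_scale = 64 * (1 << 6) = 4096, i.e. s_scale
+            # equals p_scale = 1 << 12. The decay parameters registered below are stored
+            # pre-multiplied by p_scale.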
+ self.register_parameter( + 'current_decay', + torch.nn.Parameter( + torch.FloatTensor([self.p_scale * current_decay]), + requires_grad=self.requires_grad, + ) + ) + self.register_parameter( + 'voltage_decay', + torch.nn.Parameter( + torch.FloatTensor([self.p_scale * voltage_decay]), + requires_grad=self.requires_grad, + ) + ) + # ... + +因此,它们实际的值并不是在构造时给定的 `current_decay` 和 `voltage_decay`,而是乘上了 `self.p_scale`,也就是 `1 << 12`。 + +它们在神经动态中进行计算时,又被 `quantize` 函数量化了一次: + +.. code-block:: python + + # lava\lib\dl\slayer\neuron\cuba.py + class Neuron(base.Neuron): + def dynamics(self, input): + # ... + # clamp the values only when learning is enabled + # This means we don't need to clamp the values after gradient update. + # It is done in runtime now. Might be slow, but overhead is negligible. + if self.requires_grad is True: + self.clamp() + + current = leaky_integrator.dynamics( + input, + quantize(self.current_decay), + self.current_state.contiguous(), + self.s_scale, + debug=self.debug + ) + + voltage = leaky_integrator.dynamics( + current, # bias can be enabled by adding it here + quantize(self.voltage_decay), + self.voltage_state.contiguous(), + self.s_scale, + self.threshold, + debug=self.debug + ) + # ... + +在训练时,每次前向传播前都会调用 `self.clamp()` 进行限幅: + +.. code-block:: python + + # lava\lib\dl\slayer\neuron\cuba.py + def clamp(self): + """A function to clamp the sin decay and cosine decay parameters to be + within valid range. The user will generally not need to call this + function. + """ + with torch.no_grad(): + self.current_decay.data.clamp_(0, self.p_scale) + self.voltage_decay.data.clamp_(0, self.p_scale) + + + +结合限幅和量化过程,我们可以得知,在进行神经动态计算电流和电压衰减时: + +-- 真正的衰减系数是 `quantize(self.current_decay)` 和 `quantize(self.voltage_decay)` + +-- 衰减系数的取值是量化的,取值范围为 `0, 1, 2, ..., self.p_scale` + + +接下来我们关注状态和阈值的量化。 + +收件根据构造函数,我们回顾一下几个系数之间的关系: + +.. code-block:: python + + # lava\lib\dl\slayer\neuron\cuba.py + class Neuron(base.Neuron): + def __init__( + self, threshold, current_decay, voltage_decay, + tau_grad=1, scale_grad=1, scale=1 << 6, + norm=None, dropout=None, + shared_param=True, persistent_state=False, requires_grad=False, + graded_spike=False + ): + super(Neuron, self).__init__( + # ... + p_scale=1 << 12, + w_scale=scale, + s_scale=scale * (1 << 6), + # ... + +根据 `base.Neuron` 的构造函数: + +.. code-block:: python + + # lava\lib\dl\slayer\neuron\base.py + class Neuron(torch.nn.Module): + def __init__( + self, threshold, + tau_grad=1, scale_grad=1, + p_scale=1, w_scale=1, s_scale=1, + norm=None, dropout=None, + persistent_state=False, shared_param=True, + requires_grad=True, + complex=False + ): + # ... + self.p_scale = p_scale + self.w_scale = int(w_scale) + self.s_scale = int(s_scale) + # quantize to proper value + self._threshold = int(threshold * self.w_scale) / self.w_scale + # ... + +可以发现阈值实际上是做了一个 `step = self.w_scale` 的量化。 + +最后,我们看一下 `self.s_scale` 在 `leaky_integrator.dynamics` 中的作用。查看源码: + +.. code-block:: python + + # lava\lib\dl\slayer\neuron\cuba.py + class Neuron(base.Neuron): + def dynamics(self, input): + # ... + current = leaky_integrator.dynamics( + input, + quantize(self.current_decay), + self.current_state.contiguous(), + self.s_scale, + debug=self.debug + ) + + voltage = leaky_integrator.dynamics( + current, # bias can be enabled by adding it here + quantize(self.voltage_decay), + self.voltage_state.contiguous(), + self.s_scale, + self.threshold, + debug=self.debug + ) + # ... 
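+    # (annotation, not part of the lava source) The leaky_integrator.dynamics() calls
+    # above ultimately rely on the reference implementation below; the value passed in
+    # as self.s_scale appears there as `w_scale`, and p_scale = 1 << 12 is why
+    # right_shift_to_zero is applied with bits=12.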
+ + # lava\lib\dl\slayer\neuron\dynamics\leaky_integrator.py + def _li_dynamics_fwd( + input, decay, state, threshold, w_scale, dtype=torch.int32 + ): + output_old = (state * w_scale).clone().detach().to(dtype).to(input.device) + decay_int = (1 << 12) - decay.clone().detach().to(dtype).to(input.device) + output = torch.zeros_like(input) + + threshold *= w_scale + + for n in range(input.shape[-1]): + output_new = right_shift_to_zero(output_old * decay_int, 12) + \ + (w_scale * input[..., n]).to(dtype) + if threshold > 0: + spike_new = (output_new >= threshold) + output_old = output_new * (spike_new < 0.5) + else: + output_old = output_new + + output[..., n] = output_new / w_scale + + return output + + # lava\lib\dl\slayer\utils\int_utils.py + def right_shift_to_zero(x, bits): + """Right shift with quantization towards zero implementation. + + Parameters + ---------- + x : torch.int32 or torch.int64 + input tensor. + bits : int + number of bits to shift. + + Returns + ------- + torch.int32 or torch.int64 + right shift to zero result. + + """ + # ... + + +可以发现,`input, state, threshold` 都会先乘上 `w_scale` 进行计算,最后再除以 `w_scale` 进行还原。`p_scale = 1 << 12`,因而 `right_shift_to_zero(x, bits=12)`。 + +最后的结论是,在 `slayer.block` 中: + +- `p_scale = 1 << 12` + +- `w_scale = scale` + +- `s_scale = scale * (1 << 6)` + +- 若不指定 `pre_hook_fx = None` 或其他特定的函数,则 `self.synapse.weight` 会被量化,然后限幅,最终取值范围是 `2k / w_scale, k = -128, -127, ..., 127`,共有256种取值 + +- `p_scale = 1 << 12, self.neuron.current_decay = int(p_scale * current_decay), self.neuron.voltage_decay = int(p_scale * voltage_decay)`, + 但在计算衰减时,最终的输出会通过 `right_shift_to_zero(x, bits=12)` 还原 + +- `self.threshold = int(threshold * w_scale) / w_scale` + +- 计算神经动态时, `x, self.current_state, self.voltage_state, self.threshold` 都会先乘上 `s_scale` 进行计算,最后的输出再除以 `s_scale` 进行还原 \ No newline at end of file diff --git a/docs/source/clock_driven/5_ann2snn.rst b/docs/source/clock_driven/5_ann2snn.rst index 3d2fe3f..01077f4 100644 --- a/docs/source/clock_driven/5_ann2snn.rst +++ b/docs/source/clock_driven/5_ann2snn.rst @@ -4,10 +4,7 @@ ANN转换SNN 本节教程主要关注 ``spikingjelly.clock_driven.ann2snn``,介绍如何将训练好的ANN转换SNN,并且在SpikingJelly框架上进行仿真。 -目前实现了两套实现:基于ONNX 和 基于PyTorch,在框架中被称为 ONNX kernel 和 PyTorch kernel。 -但是这两套实现各有特点,ONNX kernel的实现更加通用,支持更加复杂的拓扑结构(例如ResNet); -PyTorch kernel主要是为了简单测试,支持的模块比较有限且在现有配置下可能有很多bug。 -更多模块可以通过ONNX拓展,用户可自行实现... +较早的实现方案中有两套实现:基于ONNX 和 基于PyTorch。由于ONNX不稳定,本版本为PyTorch增强版,原生支持复杂拓扑(例如ResNet)。一起来看看吧! ANN转换SNN的理论基础 -------------------- @@ -107,7 +104,7 @@ SNN相比于ANN,产生的脉冲是离散的,这有利于高效的通信。 .. math:: \frac{V_T-V_0}{T} = z - V_{threshold} \frac{\sum_{t=1}^{T}\theta_t}{T} = z- V_{threshold} \frac{N}{T} -其中 :math:`N` 为 :math:`T` 时间步内脉冲数, :math:`\frac{N}{T}` 就是发放率 :math:`r`。利用 :math:`z= V_{threshold} a` +其中 :math:`N` 为 :math:`T` 时间步内脉冲数, :math:`\frac{N}{T}` 就是发放率 :math:`r`。利用 :math:`z= V_{threshold} a` 即: .. math:: @@ -123,19 +120,14 @@ SNN相比于ANN,产生的脉冲是离散的,这有利于高效的通信。 .. math:: r^l = W^l r^{l-1}+b^l- \frac{V^l_T}{T V_{threshold}} -详细的说明见文献 [#f1]_ 。ann2snn中的方法也主要来自文献 [#f1]_ +详细的说明见文献 [#f1]_ 。ann2snn中的方法也主要来自文献 [#f1]_ -转换和仿真 ----------- +转换到脉冲神经网络 +^^^^^^^^^^^^^^^^ -具体地,进行前馈ANN转SNN主要有两个步骤:即模型分析(英文:parse,直译:句法分析)和仿真模拟。 +转换主要解决两个问题: -模型分析 -^^^^^^^^ - -模型分析主要解决两个问题: - -1. ANN为了快速训练和收敛提出了批归一化(Batch Normalization)。批归一化旨在将ANN输出归一化到0均值,这与SNN的特性相违背。因此,需要将BN的参数吸收到前面的参数层中(Linear、Conv2d) +1. ANN为了快速训练和收敛提出了批归一化(Batch Normalization)。批归一化旨在将ANN输出归一化到0均值,这与SNN的特性相违背。因此,可以将BN的参数吸收到前面的参数层中(Linear、Conv2d) 2. 
根据转换理论,ANN的每层输入输出需要被限制在[0,1]范围内,这就需要对参数进行缩放(模型归一化) @@ -155,7 +147,7 @@ SNN相比于ANN,产生的脉冲是离散的,这有利于高效的通信。 ◆ 模型归一化 -对于某个参数模块,假定得到了其输入张量和输出张量,其输入张量的最大值为 :math:`\lambda_{pre}` ,输出张量的最大值为 :math:`\lambda` +对于某个参数模块,假定得到了其输入张量和输出张量,其输入张量的最大值为 :math:`\lambda_{pre}` ,输出张量的最大值为 :math:`\lambda` 那么,归一化后的权重 :math:`\hat{W}` 为: .. math:: @@ -171,65 +163,28 @@ ANN每层输出的分布虽然服从某个特定分布,但是数据中常常 到现在为止,我们对神经网络做的操作,在数值上是完全等价的。当前的模型表现应该与原模型相同。 -模型仿真 -^^^^^^^^ - -仿真前,我们需要将原模型中的ReLU激活函数变为IF神经元。 +转换中,我们需要将原模型中的ReLU激活函数变为IF神经元。 对于ANN中的平均池化,我们需要将其转化为空间下采样。由于IF神经元可以等效ReLU激活函数。空间下采样后增加IF神经元与否对结果的影响极小。 -对于ANN中的最大池化,目前没有非常理想的方案。目前的最佳方案为使用基于动量累计脉冲的门控函数控制脉冲通道 [#f1]_ 。当然在ONNX kernel中没有用,不过我们在``ann2snn.modules``依然有实现。还有文献提出使用空间下采样替代Maxpool2d。此处我们依然推荐使用avgpool2d。 - -仿真时,依照转换理论,SNN需要输入恒定的模拟输入。使用Poisson编码器将会带来准确率的降低。Poisson编码和恒定输入方式均已实现,感兴趣可通过配置进行不同实验。 +对于ANN中的最大池化,目前没有非常理想的方案。目前的最佳方案为使用基于动量累计脉冲的门控函数控制脉冲通道 [#f1]_ 。此处我们依然推荐使用avgpool2d。 +仿真时,依照转换理论,SNN需要输入恒定的模拟输入。使用Poisson编码器将会带来准确率的降低。 实现与可选配置 ^^^^^^^^^^^^^^^^^^^^^^^^ -ann2snn框架在2020年12月进行一次较大更新。最大改动就是将参数配置回归到了模块参数,并且尽可能考虑到了用户对灵活度和渐变操作的需求。这里我们将简单介绍一下这些类和方法。 -针对理论中提到的分析和仿真两大中心,设计了parser和simulator两大类。类的定义在``spikingjelly.ann2snn.__init__``中。 - -◆ parser类 -1. 类初始化函数 -- kernel:转换的kernel。可选范围为'onnx'、'pytorch',这将决定您使用的是ONNX kernel还是PyTorch kernel -- name:模型的名字,通常您可以取一个和任务、模型相关的名字,之后的文件夹生成将可能用到这个字符串 -- z_norm:许多深度学习模型会存在数据标准化(Z normalization)。如果您ANN模型有这个操作,这个参数的数据格式为:(mean, std),例如对于CIFAR10,z_norm可以为((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)) -- log_dir:保存临时文件的文件夹,如没有此参数则会根据参数name和当前时间自动生成 -- json:历史配置文件名。当您运行过一次parser后,程序会自动保存json文件到log_dir,您可以使用json文件进行parser快速初始化 - -2. parse函数 -- channelwise: 如果为``True``,则控制激活幅值的统计是channelwise的;否则,控制激活幅值的统计是layerwise的 -- robust: 如果为``True``,则控制激活幅值的统计是激活的99.9百分位;否则,控制激活幅值的统计是激活的最值 -- user_methods:默认使用``spikingjelly.ann2snn.kernel.onnx._o2p_converter``;当发现ONNX kernel遇到ONNX转换PyTorch的方法缺乏的时候,可以通过用户自定义函数的形式进行转换。函数接口可见``spikingjelly.ann2snn.kernel.onnx._o2p_converter``的staticmethods - -◆ simulator类 -1. 类初始化参数 -- snn:待仿真的转换后的SNN -- device:仿真的设备,支持单设备(输入为字符串)和多设备(输入为list,set,tuple类型) -- name:模型的名字,通常您可以取一个和任务、模型相关的名字,之后的文件夹生成将可能用到这个字符串 -- log_dir:保存临时文件的文件夹,如没有此参数则会根据参数name和当前时间自动生成 -- encoder:编码器,可选范围为'constant'、'poisson' - -2. simulate函数 -- data_loader:仿真的数据集的dataloader -- T:仿真时间 -- canvas:plt.fig类型,用于对仿真模型标量性能(例如准确率)的绘图 -- online_drawer:如果为``True``,则在线绘图;否则,仿真结束后绘图 -- func_dict:用户可以通过自己定义标量性能函数实现绘图 - -除此之外,用户可以通过继承simulate类进行仿真器的功能细化。 -比如``spikingjelly.ann2snn.__init__``实现了仿真分类任务的``classify_simulator`` - -3. classify_simulator.simulate函数 -除去继承的参数外, -- ann_acc:ANN转换前的分类准确率(0-1间的小数) -- fig_name: 仿真图像的名字 -- step_max: 如果为``True``,则图像中标明推理过程中的最大准确率 +ann2snn框架在2022年4月又迎来一次较大更新。取消了parser和simulator两大类。使用converter类替代了之前的方案。目前的方案更加简洁,并且具有更多转换设置空间。 +◆ Converter类 +该类用于将ReLU的ANN转换为SNN。这里实现了常见的三种模式。 +最常见的是最大电流转换模式,它利用前后层的激活上限,使发放率最高的情况能够对应激活取得最大值的情况。使用这种模式需要将参数mode设置为``max``[#f2]_。 +99.9%电流转换模式利用99.9%的激活分位点限制了激活上限。使用这种模式需要将参数mode设置为``99.9%``[#f1]_。 +缩放转换模式下,用户需要给定缩放参数到模式中,即可利用缩放后的激活最大值对电流进行限制。使用这种模式需要将参数mode设置为0-1的浮点数。 识别MNIST --------- 现在我们使用 ``ann2snn`` ,搭建一个简单卷积网络,对MNIST数据集进行分类。 -首先定义我们的网络结构: +首先定义我们的网络结构 (见``ann2snn.sample_models.mnist_cnn``): .. code-block:: python @@ -267,156 +222,179 @@ ann2snn框架在2020年12月进行一次较大更新。最大改动就是将参 .. 
code-block:: python - device = input('输入运行的设备,例如“cpu”或“cuda:0”\n input device, e.g., "cpu" or "cuda:0": ') - dataset_dir = input('输入保存MNIST数据集的位置,例如“./”\n input root directory for saving MNIST dataset, e.g., "./": ') - batch_size = int(input('输入batch_size,例如“64”\n input batch_size, e.g., "64": ')) - learning_rate = float(input('输入学习率,例如“1e-3”\n input learning rate, e.g., "1e-3": ')) - T = int(input('输入仿真时长,例如“100”\n input simulating steps, e.g., "100": ')) - train_epoch = int(input('输入训练轮数,即遍历训练集的次数,例如“10”\n input training epochs, e.g., "10": ')) - model_name = input('输入模型名字,例如“mnist”\n input model name, for log_dir generating , e.g., "mnist": ') + torch.random.manual_seed(0) + torch.cuda.manual_seed(0) + device = 'cuda' + dataset_dir = 'G:/Dataset/mnist' + batch_size = 100 + T = 50 -之后的所有临时文件都会储存到文件夹中。 +这里的T就是一会儿推理时使用的推理时间步。 -初始化数据加载器、网络、优化器、损失函数: +如果您想训练的话,还需要初始化数据加载器、优化器、损失函数,例如: .. code-block:: python - # 初始化网络 - ann = ANN().to(device) + lr = 1e-3 + epochs = 10 # 定义损失函数 loss_function = nn.CrossEntropyLoss() # 使用Adam优化器 - optimizer = torch.optim.Adam(ann.parameters(), lr=learning_rate, weight_decay=5e-4) + optimizer = torch.optim.Adam(ann.parameters(), lr=lr, weight_decay=5e-4) -训练ANN,并定期测试。训练时也可以使用utils中预先写好的训练程序: +训练ANN。示例中,我们的模型训练了10个epoch。训练时测试集准确率变化情况如下: .. code-block:: python - for epoch in range(train_epoch): - # 使用utils中预先写好的训练程序训练网络 - # 训练程序的写法和经典ANN中的训练也是一样的 - # Train the network using a pre-prepared code in ''utils'' - utils.train_ann(net=ann, - device=device, - data_loader=train_data_loader, - optimizer=optimizer, - loss_function=loss_function, - epoch=epoch - ) - # 使用utils中预先写好的验证程序验证网络输出 - # Validate the network using a pre-prepared code in ''utils'' - acc = utils.val_ann(net=ann, - device=device, - data_loader=test_data_loader, - epoch=epoch - ) - if best_acc <= acc: - utils.save_model(ann, log_dir, model_name+'.pkl') - -完整的代码位于 ``ann2snn.examples.cnn_mnist.py`` ,在代码中我们还使用了Tensorboard来保存训练日志。可以直接在Python命令行运行它: + Epoch: 0 100%|██████████| 600/600 [00:05<00:00, 112.04it/s] + Validating Accuracy: 0.972 + Epoch: 1 100%|██████████| 600/600 [00:05<00:00, 105.43it/s] + Validating Accuracy: 0.986 + Epoch: 2 100%|██████████| 600/600 [00:05<00:00, 107.49it/s] + Validating Accuracy: 0.987 + Epoch: 3 100%|██████████| 600/600 [00:05<00:00, 109.26it/s] + Validating Accuracy: 0.990 + Epoch: 4 100%|██████████| 600/600 [00:05<00:00, 103.98it/s] + Validating Accuracy: 0.984 + Epoch: 5 100%|██████████| 600/600 [00:05<00:00, 100.42it/s] + Validating Accuracy: 0.989 + Epoch: 6 100%|██████████| 600/600 [00:06<00:00, 96.24it/s] + Validating Accuracy: 0.991 + Epoch: 7 100%|██████████| 600/600 [00:05<00:00, 104.97it/s] + Validating Accuracy: 0.992 + Epoch: 8 100%|██████████| 600/600 [00:05<00:00, 106.45it/s] + Validating Accuracy: 0.991 + Epoch: 9 100%|██████████| 600/600 [00:05<00:00, 111.93it/s] + Validating Accuracy: 0.991 + +训练好模型后,我们快速加载一下模型测试一下保存好的模型性能: .. 
code-block:: python - >>> import spikingjelly.clock_driven.ann2snn.examples.cnn_mnist as cnn_mnist - >>> cnn_mnist.main() - 输入运行的设备,例如“cpu”或“cuda:0” - input device, e.g., "cpu" or "cuda:0": cuda:15 - 输入保存MNIST数据集的位置,例如“./” - input root directory for saving MNIST dataset, e.g., "./": ./mnist - 输入batch_size,例如“64” - input batch_size, e.g., "64": 128 - 输入学习率,例如“1e-3” - input learning rate, e.g., "1e-3": 1e-3 - 输入仿真时长,例如“100” - input simulating steps, e.g., "100": 100 - 输入训练轮数,即遍历训练集的次数,例如“10” - input training epochs, e.g., "10": 10 - 输入模型名字,用于自动生成日志文档,例如“cnn_mnist” - input model name, for log_dir generating , e.g., "cnn_mnist" - - Epoch 0 [1/937] ANN Training Loss:2.252 Accuracy:0.078 - Epoch 0 [101/937] ANN Training Loss:1.423 Accuracy:0.669 - Epoch 0 [201/937] ANN Training Loss:1.117 Accuracy:0.773 - Epoch 0 [301/937] ANN Training Loss:0.953 Accuracy:0.795 - Epoch 0 [401/937] ANN Training Loss:0.865 Accuracy:0.788 - Epoch 0 [501/937] ANN Training Loss:0.807 Accuracy:0.792 - Epoch 0 [601/937] ANN Training Loss:0.764 Accuracy:0.795 - Epoch 0 [701/937] ANN Training Loss:0.726 Accuracy:0.835 - Epoch 0 [801/937] ANN Training Loss:0.681 Accuracy:0.880 - Epoch 0 [901/937] ANN Training Loss:0.641 Accuracy:0.889 - 100%|██████████| 100/100 [00:00<00:00, 116.12it/s] - Epoch 0 [100/100] ANN Validating Loss:0.327 Accuracy:0.881 - Save model to: cnn_mnist-XXXXX\cnn_mnist.pkl - ...... - -示例中,这个模型训练10个epoch。训练时测试集准确率变化情况如下: - -.. image:: ../_static/tutorials/clock_driven/5_ann2snn/accuracy_curve.png - -最终达到98.8%的测试集准确率。 - -从训练集中,取出一部分数据,用于模型的归一化步骤。这里我们取192张图片。 + model.load_state_dict(torch.load('SJ-mnist-cnn_model-sample.pth')) + acc = val(model, device, test_data_loader) + print('ANN Validating Accuracy: %.4f' % (acc)) -.. code-block:: python +输出结果如下: - # 加载用于归一化模型的数据 - # Load the data to normalize the model - percentage = 0.004 # load 0.004 of the data - norm_data_list = [] - for idx, (imgs, targets) in enumerate(train_data_loader): - norm_data_list.append(imgs) - if idx == int(len(train_data_loader) * percentage) - 1: - break - norm_data = torch.cat(norm_data_list) - print('use %d imgs to parse' % (norm_data.size(0))) +.. code-block:: python + 100%|██████████| 200/200 [00:02<00:00, 89.44it/s] + ANN Validating Accuracy: 0.9870 -调用\ ``ann2snn``\ 中的类parser,并使用ONNX kernel。 +使用Converter进行转换非常简单,只需要参数中设置希望使用的模式即可。例如使用MaxNorm,需要先定义一个``ann2snn.Converter``,并且把模型forward给这个对象: .. code-block:: python - onnxparser = parser(name=model_name, - log_dir=log_dir + '/parser', - kernel='onnx') - snn = onnxparser.parse(ann, norm_data.to(parser_device)) + model_converter = ann2snn.Converter(mode='max', dataloader=train_data_loader) + snn_model = model_converter(model) -我们可以保存好我们转换好的snn模型,并且定义一个plt.figure用于绘图 +snn_model就是输出来的SNN模型。 -.. code-block:: python +按照这个例子,我们分别定义模式为``max``,``99.9%``,``1.0/2``,``1.0/3``,``1.0/4``,``1.0/5``情况下的SNN转换并分别推理T步得到准确率。 - torch.save(snn, os.path.join(log_dir,'snn-'+model_name+'.pkl')) - fig = plt.figure('simulator') +.. 
code-block:: python -现在,我们定义用于SNN的仿真器。由于我们的任务是分类,选择类``classify_simulator`` + print('---------------------------------------------') + print('Converting using MaxNorm') + model_converter = ann2snn.Converter(mode='max', dataloader=train_data_loader) + snn_model = model_converter(model) + print('Simulating...') + mode_max_accs = val(snn_model, device, test_data_loader, T=T) + print('SNN accuracy (simulation %d time-steps): %.4f' % (T, mode_max_accs[-1])) + + print('---------------------------------------------') + print('Converting using RobustNorm') + model_converter = ann2snn.Converter(mode='99.9%', dataloader=train_data_loader) + snn_model = model_converter(model) + print('Simulating...') + mode_robust_accs = val(snn_model, device, test_data_loader, T=T) + print('SNN accuracy (simulation %d time-steps): %.4f' % (T, mode_robust_accs[-1])) + + print('---------------------------------------------') + print('Converting using 1/2 max(activation) as scales...') + model_converter = ann2snn.Converter(mode=1.0 / 2, dataloader=train_data_loader) + snn_model = model_converter(model) + print('Simulating...') + mode_two_accs = val(snn_model, device, test_data_loader, T=T) + print('SNN accuracy (simulation %d time-steps): %.4f' % (T, mode_two_accs[-1])) + + print('---------------------------------------------') + print('Converting using 1/3 max(activation) as scales') + model_converter = ann2snn.Converter(mode=1.0 / 3, dataloader=train_data_loader) + snn_model = model_converter(model) + print('Simulating...') + mode_three_accs = val(snn_model, device, test_data_loader, T=T) + print('SNN accuracy (simulation %d time-steps): %.4f' % (T, mode_three_accs[-1])) + + print('---------------------------------------------') + print('Converting using 1/4 max(activation) as scales') + model_converter = ann2snn.Converter(mode=1.0 / 4, dataloader=train_data_loader) + snn_model = model_converter(model) + print('Simulating...') + mode_four_accs = val(snn_model, device, test_data_loader, T=T) + print('SNN accuracy (simulation %d time-steps): %.4f' % (T, mode_four_accs[-1])) + + print('---------------------------------------------') + print('Converting using 1/5 max(activation) as scales') + model_converter = ann2snn.Converter(mode=1.0 / 5, dataloader=train_data_loader) + snn_model = model_converter(model) + print('Simulating...') + mode_five_accs = val(snn_model, device, test_data_loader, T=T) + print('SNN accuracy (simulation %d time-steps): %.4f' % (T, mode_five_accs[-1])) + +观察控制栏输出: .. code-block:: python - sim = classify_simulator(snn, - log_dir=log_dir + '/simulator', - device=simulator_device, - canvas=fig - ) - sim.simulate(test_data_loader, - T=T, - online_drawer=True, - ann_acc=ann_acc, - fig_name=model_name, - step_max=True - ) - -模型仿真由于时间较长,我们设计了tqdm的进度条用于预估仿真时间。仿真结束时会有仿真器的summary + --------------------------------------------- + Converting using MaxNorm + 100%|██████████| 600/600 [00:04<00:00, 128.25it/s] Simulating... + 100%|██████████| 200/200 [00:13<00:00, 14.44it/s] SNN accuracy (simulation 50 time-steps): 0.9777 + --------------------------------------------- + Converting using RobustNorm + 100%|██████████| 600/600 [00:19<00:00, 31.06it/s] Simulating... + 100%|██████████| 200/200 [00:13<00:00, 14.75it/s] SNN accuracy (simulation 50 time-steps): 0.9841 + --------------------------------------------- + Converting using 1/2 max(activation) as scales... + 100%|██████████| 600/600 [00:04<00:00, 126.64it/s] ]Simulating... 
+ 100%|██████████| 200/200 [00:13<00:00, 14.90it/s] SNN accuracy (simulation 50 time-steps): 0.9844 + --------------------------------------------- + Converting using 1/3 max(activation) as scales + 100%|██████████| 600/600 [00:04<00:00, 126.27it/s] Simulating... + 100%|██████████| 200/200 [00:13<00:00, 14.73it/s] SNN accuracy (simulation 50 time-steps): 0.9828 + --------------------------------------------- + Converting using 1/4 max(activation) as scales + 100%|██████████| 600/600 [00:04<00:00, 128.94it/s] Simulating... + 100%|██████████| 200/200 [00:13<00:00, 14.47it/s] SNN accuracy (simulation 50 time-steps): 0.9747 + --------------------------------------------- + Converting using 1/5 max(activation) as scales + 100%|██████████| 600/600 [00:04<00:00, 121.18it/s] Simulating... + 100%|██████████| 200/200 [00:13<00:00, 14.42it/s] SNN accuracy (simulation 50 time-steps): 0.9487 + --------------------------------------------- + +模型转换的速度可以看到是非常快的。模型推理速度200步仅需11s完成(GTX 2080ti)。 +根据模型输出的随时间变化的准确率,我们可以绘制不同设置下的准确率图像。 .. code-block:: python - simulator is working on the normal mode, device: cuda:0 - 100%|██████████| 100/100 [00:46<00:00, 2.15it/s] - --------------------simulator summary-------------------- - time elapsed: 46.55072790000008 (sec) - --------------------------------------------------------- + fig = plt.figure() + plt.plot(np.arange(0, T), mode_max_accs, label='mode: max') + plt.plot(np.arange(0, T), mode_robust_accs, label='mode: 99.9%') + plt.plot(np.arange(0, T), mode_two_accs, label='mode: 1.0/2') + plt.plot(np.arange(0, T), mode_three_accs, label='mode: 1.0/3') + plt.plot(np.arange(0, T), mode_four_accs, label='mode: 1.0/4') + plt.plot(np.arange(0, T), mode_five_accs, label='mode: 1.0/5') + plt.legend() + plt.xlabel('t') + plt.ylabel('Acc') + plt.show() + +.. image:: ../_static/tutorials/clock_driven/5_ann2snn/accuracy_mode.png -通过最后的输出,可以知道,仿真器使用了46.6s。转换后的SNN准确率可以从simulator文件夹中plot.pdf看到,最高的转换准确率为98.51%。转换带来了0.37%的性能下降。通过增加推理时间可以减少转换损失。 +不同的设置可以得到不同的结果,有的推理速度快,但是最终精度低,有的推理慢,但是精度高。用户可以根据自己的需求选择模型设置。 .. [#f1] Rueckauer B, Lungu I-A, Hu Y, Pfeiffer M and Liu S-C (2017) Conversion of Continuous-Valued Deep Networks to Efficient Event-Driven Networks for Image Classification. Front. Neurosci. 11:682. .. [#f2] Diehl, Peter U. , et al. Fast classifying, high-accuracy spiking deep networks through weight and threshold balancing. Neural Networks (IJCNN), 2015 International Joint Conference on IEEE, 2015. .. [#f3] Rueckauer, B., Lungu, I. A., Hu, Y., & Pfeiffer, M. (2016). Theory and tools for the conversion of analog to spiking convolutional neural networks. arXiv preprint arXiv:1612.04052. -.. [#f4] Sengupta, A., Ye, Y., Wang, R., Liu, C., & Roy, K. (2019). Going deeper in spiking neural networks: Vgg and residual architectures. Frontiers in neuroscience, 13, 95. \ No newline at end of file +.. [#f4] Sengupta, A., Ye, Y., Wang, R., Liu, C., & Roy, K. (2019). Going deeper in spiking neural networks: Vgg and residual architectures. Frontiers in neuroscience, 13, 95. diff --git a/docs/source/clock_driven_en/0_neuron.rst b/docs/source/clock_driven_en/0_neuron.rst index cddadfb..ef438d0 100644 --- a/docs/source/clock_driven_en/0_neuron.rst +++ b/docs/source/clock_driven_en/0_neuron.rst @@ -7,7 +7,7 @@ Translator: `YeYumin `_ This tutorial focuses on :class:`spikingjelly.clock_driven.neuron` and introduces spiking neurons and clock-driven simulation methods. 
-Spiking Nneuron Model +Spiking Neuron Model ----------------------------------------------- In ``spikingjelly``, we define the neuron which can only output spikes, i.e. 0 or 1, as a "spiking neuron". Networks that use spiking neurons are called Spiking Neural Networks (SNNs). @@ -235,4 +235,4 @@ The results are as follows: :width: 100% .. image:: ../_static/tutorials/clock_driven/0_neuron/2.* - :width: 100% \ No newline at end of file + :width: 100% diff --git a/docs/source/clock_driven_en/13_neuromorphic_datasets.rst b/docs/source/clock_driven_en/13_neuromorphic_datasets.rst index 22806b0..bce30c2 100644 --- a/docs/source/clock_driven_en/13_neuromorphic_datasets.rst +++ b/docs/source/clock_driven_en/13_neuromorphic_datasets.rst @@ -203,8 +203,55 @@ We will get the images like: .. image:: ../_static/tutorials/clock_driven/13_neuromorphic_datasets/dvsg.* :width: 100% +Fixed Duration Integrating +-------------------------------------- +Integrating by fixed duration is more compatible with the practical application. For example, if we set duration as ``10 ms``, +then a sample with length ``L ms`` can be integrated to frames with frame number ``math.floor(L / 10)``. However, the lengthes +of samples in neuromorphic datasets are not identical, and we will get frames with different frame numbers when integrating +with fixed duration. Fortunately, we can use :class:`spikingjelly.datasets.pad_sequence_collate` and +:class:`spikingjelly.datasets.padded_sequence_mask` to pad/unpad frames. + +Example codes: + +.. code:: python + + import torch + from torch.utils.data import DataLoader + from spikingjelly.datasets import pad_sequence_collate, padded_sequence_mask, dvs128_gesture + root='D:/datasets/DVS128Gesture' + train_set = dvs128_gesture.DVS128Gesture(root, data_type='frame', duration=1000000, train=True) + for i in range(5): + x, y = train_set[i] + print(f'x[{i}].shape=[T, C, H, W]={x.shape}') + train_data_loader = DataLoader(train_set, collate_fn=pad_sequence_collate, batch_size=5) + for x, y, x_len in train_data_loader: + print(f'x.shape=[N, T, C, H, W]={tuple(x.shape)}') + print(f'x_len={x_len}') + mask = padded_sequence_mask(x_len) # mask.shape = [T, N] + print(f'mask=\n{mask.t().int()}') + break + +The outputs are: + +.. code:: bash + + The directory [D:/datasets/DVS128Gesture\duration_1000000] already exists. + x[0].shape=[T, C, H, W]=(6, 2, 128, 128) + x[1].shape=[T, C, H, W]=(6, 2, 128, 128) + x[2].shape=[T, C, H, W]=(5, 2, 128, 128) + x[3].shape=[T, C, H, W]=(5, 2, 128, 128) + x[4].shape=[T, C, H, W]=(7, 2, 128, 128) + x.shape=[N, T, C, H, W]=(5, 7, 2, 128, 128) + x_len=tensor([6, 6, 5, 5, 7]) + mask= + tensor([[1, 1, 1, 1, 1, 1, 0], + [1, 1, 1, 1, 1, 1, 0], + [1, 1, 1, 1, 1, 0, 0], + [1, 1, 1, 1, 1, 0, 0], + [1, 1, 1, 1, 1, 1, 1]], dtype=torch.int32) + Custom Integrating Method ------------------------ +---------------------------- SpikingJelly provides user-defined integrating method. The user should provide a function ``custom_integrate_function`` and the name of directory ``custom_integrated_frames_dir_name`` for saving frames. 
@@ -224,8 +271,9 @@ a function: def integrate_events_to_2_frames_randomly(events: Dict, H: int, W: int): index_split = np.random.randint(low=0, high=events['t'].__len__()) frames = np.zeros([2, 2, H, W]) - frames[0] = sjds.integrate_events_segment_to_frame(events, H, W, 0, index_split) - frames[1] = sjds.integrate_events_segment_to_frame(events, H, W, index_split, events['t'].__len__()) + t, x, y, p = (events[key] for key in ('t', 'x', 'y', 'p')) + frames[0] = sjds.integrate_events_segment_to_frame(x, y, p, H, W, 0, index_split) + frames[1] = sjds.integrate_events_segment_to_frame(x, y, p, H, W, index_split, events['t'].__len__()) return frames Now let us use this function to create frames dataset: diff --git a/docs/source/clock_driven_en/5_ann2snn.rst b/docs/source/clock_driven_en/5_ann2snn.rst index ab421da..7c6a05b 100644 --- a/docs/source/clock_driven_en/5_ann2snn.rst +++ b/docs/source/clock_driven_en/5_ann2snn.rst @@ -2,22 +2,89 @@ spikingjelly.clock_driven.ann2snn ======================================= Author: `DingJianhao `_, `fangwei123456 `_ -This tutorial focuses on ``spikingjelly.clock_driven.ann2snn``,introduce how to convert the trained feedforward ANN to SNN and simulate it on the SpikingJelly framework. +This tutorial focuses on ``spikingjelly.clock_driven.ann2snn``, introduce how to convert the trained feedforward ANN to SNN and simulate it on the SpikingJelly framework. -Currently support conversion of Pytorch modules including ``nn.Conv2d`` , ``nn.Linear`` , ``nn.MaxPool2d`` , ``nn.AvgPool2d`` , ``nn.BatchNorm1d`` , ``nn.BatchNorm2d`` , ``nn.Flatten`` , ``nn.ReLU`` ,other module solutions are under development... +There are two sets of implementations in earlier implementations: ONNX-based and PyTorch-based. Due to the instability of ONNX, this version is an enhanced version of PyTorch, which natively supports complex topologies (such as ResNet). Let's have a look! Theoretical basis of ANN2SNN ---------------------------- -Compared with ANN, SNN generates discrete spikes, which is conducive to efficient communication. Today, ANN is popular, while direct training of SNN requires far more resources. Naturally, people will think of using very mature ANN to switch to SNN, and hope that SNN can have similar performance. This leads to the question of how to build a bridge between ANN and SNN. The current SNN mainstream method is to use frequency coding. So for the output layer, we will use the number of neuron output spikes to determine the category. Is the firing rate related to ANN? +Compared with ANN, the generated pulses of SNN are discrete, which is conducive to efficient communication. Today, with the popularity of ANN, the direct training of SNN requires more resources. Naturally, we will think of using the now very mature ANN to convert to SNN, and hope that SNN can have similar performance. This involves the problem of how to build a bridge between ANN and SNN. Now the mainstream way of SNN is to use frequency encoding, so for the output layer, we will use the number of neuron output pulses to judge the category. Is there a relationship between the release rate and ANN? -Fortunately, there is a strong correlation between the non-linear activation of ReLU neurons in ANN and the firing rate of IF neurons in SNN (reset by subtracting the threshold :math:`V_{threshold}` ). We can use this feature for conversion. The neuron update method mentioned here is the Soft method mentioned in the `Clock Driven Tutorial `_. 
+Fortunately, there is a strong correlation between the nonlinear activation of ReLU neurons in ANN and the firing rate of IF neurons in SNN (reset by subtracting the threshold: math:`V_{threshold}`). this feature to convert. The neuron update method mentioned here is the Soft method mentioned in `Time-driven tutorial `_. -The following figure shows this correspondence: the left figure is a curve obtained by giving a constant input to an IF neuron and observing its firing over a period of time. The right one is the ReLU activation curve, which satisfies :math:`activation = max(input,0)`. +Experiment: Relationship between IF neuron spiking frequency and input +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +We gave constant input to the IF neuron and observed its output spikes and spike firing frequency. First import the relevant modules, create a new IF neuron layer, determine the input and draw the input of each IF neuron :math:`x_{i}`: + +.. code-block:: python + + import torch + from spikingjelly.clock_driven import neuron + from spikingjelly import visualizing + from matplotlib import pyplot as plt + import numpy as np + + plt.rcParams['figure.dpi'] = 200 + if_node = neuron.IFNode(v_reset=None) + T = 128 + x = torch.arange(-0.2, 1.2, 0.04) + plt.scatter(torch.arange(x.shape[0]), x) + plt.title('Input $x_{i}$ to IF neurons') + plt.xlabel('Neuron index $i$') + plt.ylabel('Input $x_{i}$') + plt.grid(linestyle='-.') + plt.show() + +.. image:: ../_static/tutorials/clock_driven/5_ann2snn/0.* + :width: 100% + +Next, send the input to the IF neuron layer, and run the ``T=128`` step to observe the pulses and pulse firing frequency of each neuron: + +.. code-block:: python + + s_list = [] + for t in range(T): + s_list.append(if_node(x).unsqueeze(0)) + + out_spikes = np.asarray(torch.cat(s_list)) + visualizing.plot_1d_spikes(out_spikes, 'IF neurons\' spikes and firing rates', 't', 'Neuron index $i$') + plt.show() + +.. image:: ../_static/tutorials/clock_driven/5_ann2snn/1.* + :width: 100% + +It can be found that the frequency of the pulse firing is within a certain range, which is proportional to the size of the input :math:`x_{i}`. + +Next, let's plot the firing frequency of the IF neuron against the input :math:`x_{i}` and compare it with :math:`\mathrm{ReLU}(x_{i})`: + +.. code-block:: python + + plt.subplot(1, 2, 1) + firing_rate = np.mean(out_spikes, axis=1) + plt.plot(x, firing_rate) + plt.title('Input $x_{i}$ and firing rate') + plt.xlabel('Input $x_{i}$') + plt.ylabel('Firing rate') + plt.grid(linestyle='-.') + + plt.subplot(1, 2, 2) + plt.plot(x, x.relu()) + plt.title('Input $x_{i}$ and ReLU($x_{i}$)') + plt.xlabel('Input $x_{i}$') + plt.ylabel('ReLU($x_{i}$)') + plt.grid(linestyle='-.') + plt.show() .. image:: ../_static/tutorials/clock_driven/5_ann2snn/2.* :width: 100% +It can be found that the two curves are almost the same. It should be noted that the pulse frequency cannot be higher than 1, so the IF neuron cannot fit the input of the ReLU in the ANN is larger than 1. + +Theoretical basis of ANN2SNN +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + The literature [#f1]_ provides a theoretical basis for analyzing the conversion of ANN to SNN. The theory shows that the IF neuron in SNN is an unbiased estimator of ReLU activation function over time. For the first layer of the neural network, the input layer, discuss the relationship between the firing rate of SNN neurons :math:`r` and the activation in the corresponding ANN. 
Assume that the input is constant as :math:`z \in [0,1]`. @@ -55,393 +122,279 @@ Similarly, for the higher layers of the neural network, literature [#f1]_ furthe For details, please refer to [#f1]_. The methods in ann2snn also mainly come from [#f1]_ . -Conversion and simulation -------------------------- - -Specifically, there are two main steps for converting feedforward ANN to SNN: model parsing and model simulation. - -model parsing -^^^^^^^^^^^^^ +Converting to spiking neural network +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -Model parsing mainly solves two problems: +Conversion mainly solves two problems: -1. Researchers propose Batch Normalization for fast training and convergence. Batch normalization aims to normalize the output of ANN to 0 mean, which is contrary to the characteristics of SNN. Therefore, the parameters of BN need to be absorbed into the previous parameter layer (Linear, Conv2d) +1. ANN proposes Batch Normalization for fast training and convergence. Batch normalization aims to normalize the ANN output to 0 mean, which is contrary to the properties of SNNs. Therefore, the parameters of BN can be absorbed into the previous parameter layers (Linear, Conv2d) -2. According to the conversion theory, the input and output of each layer of ANN need to be limited to the range of [0,1], which requires scaling of the parameters (model normalization) +2. According to the transformation theory, the input and output of each layer of ANN need to be limited to the range of [0,1], which requires scaling the parameters (model normalization) -◆ Absorbing BatchNorm parameters +◆ BatchNorm parameter absorption -Assume that the parameters of BatchNorm are :math:`\gamma` (BatchNorm.weight), :math:`\beta` (BatchNorm.bias), :math:`\mu`(BatchNorm.running_mean), :math:`\sigma`(BatchNorm.running_std, square root of running_var).For specific parameter definitions, see ``torch.nn.batchnorm``. -Parameter modules (such as Linear) have parameters :math:`W` and :math:`b`. Absorbing BatchNorm parameters is transfering the parameters of BatchNorm to :math:`W` and :math:`b` of the parameter module through calculation,, so that the output of the data in new module is the same as when there is BatchNorm. -In this regard, the new model's :math:`\bar{W}` and :math:`\bar{b}` formulas are expressed as: +Assume that the parameters of BatchNorm are: math:`\gamma` (``BatchNorm.weight``), :math:`\beta` (``BatchNorm.bias``), :math:`\mu` (``BatchNorm. .running_mean``) , +:math:`\sigma` (``BatchNorm.running_var``, :math:`\sigma = \sqrt{\mathrm{running\_var}}`). For specific parameter definitions, see +`torch.nn.BatchNorm1d `_ . +Parameter modules (eg Linear) have parameters :math:`W` and :math:`b` . BatchNorm parameter absorption is to transfer the parameters of BatchNorm to :math:`W` and :math:`b` of the parameter module by operation, so that the output of the new module of data input is the same as when there is BatchNorm. +For this, the :math:`\bar{W}` and :math:`\bar{b}` formulas for the new model are expressed as: .. math:: - \bar{W} = \frac{\gamma}{\sigma} W + \bar{W} = \frac{\gamma}{\sigma} W .. 
-◆ Model normalization
+◆ Model Normalization

-For a parameter module, assuming that the input tensor and output tensor are obtained, the maximum value of the input tensor is :math:`\lambda_{pre}`, and the maximum value of the output tensor is :math:`\lambda`
+For a parameter module, suppose that its input tensor and output tensor have been obtained; the maximum value of the input tensor is :math:`\lambda_{pre}`, and the maximum value of the output tensor is :math:`\lambda`.

 Then, the normalized weight :math:`\hat{W}` is:

 .. math::
-    \hat{W} = W * \frac{\lambda_{pre}}{\lambda}
+    \hat{W} = W * \frac{\lambda_{pre}}{\lambda}

 The normalized bias :math:`\hat{b}` is:

 .. math::
-    \hat{b} = b / \lambda
-
-Although the output distribution of each layer of ANN obeys a certain distribution, there are often large outliers in the data, which will reduce the overall neuron firing rate.
-To solve this problem, robust normalization adjusts the scaling factor from the maximum value of the tensor to the p-percentile of the tensor. The recommended percentile value in the literature is 99.9
-
-So far, the operations we have done on neural networks are completely equivalent. The performance of the current model should be the same as the original model.
-
-Model simulation
-^^^^^^^^^^^^^^^^
-
-Before simulation, we need to change the ReLU activation function in the original model into an IF neuron.
-For the average pooling in ANN, we need to transform it into spatial subsampling. Because IF neuron can be equivalent to ReLU activation function. Adding IF neurons after spatial downsampling has little effect on the results.
-There is currently no ideal solution for maximum pooling in ANN. The best solution at present is to control the spike channel [#f1]_ with a gated function based on the momentum accumulation spike. This is also the default method in ann2snn. There are also literatures proposing to use spatial subsampling to replace Maxpool2d.
-
-In simulation, according to the conversion theory, SNN needs to input a constant analog input. Using a Poisson encoder will bring about a decrease in accuracy. Both Poisson coding and constant input have been implemented, and one can perform different experiments if interested.
-
-Optional configuration
-^^^^^^^^^^^^^^^^^^^^^^
-
-In view of the various optional configurations in the conversion, the ``Config`` class implemented in ``ann2snn.utils`` is used to load the default configuration and save the configuration. By loading the default configuration in Config and modifying it, one can set the parameters required when running.
-
-Below are the introductions of the configuration corresponding to different parameters, the feasible input range, and why this configuration is needed.
-
-(1) conf['parser']['robust_norm']
-
-Available value:``bool``
-
-Note:when ``True``, use robust normalization
-
-(2) conf['simulation']['reset_to_zero']
-
-Available value: ``None``, floating point
-
-Note: When floating point, voltage of neurons that just fired spikes will be set to :math:``V_{reset}``; when ``None``, voltage of neurons that just fired spikes will subtract :math:``V_{threshold}``. For model that need normalization, setting to ``None`` is default, which has theoretical guaratee.
-
-(3) conf['simulation']['encoder']['possion']
-
-Available value:``bool``
+    \hat{b} = \frac{b}{\lambda}

-Note: When ``True``, use Possion encoder; otherwise, use constant input over T steps.
+Although the output of each layer of the ANN obeys a certain distribution, the data often contains large outliers, which lowers the overall neuron firing rate.
+To address this, robust normalization adjusts the scaling factor from the maximum value of the tensor to a p-quantile of the tensor. The recommended quantile value in the literature is 99.9%.

-(4) conf['simulation']['avg_pool']['has_neuron']
+So far, all the operations performed on the network are numerically equivalent, so the current model should perform the same as the original model.

-Available value:``bool``
+In the conversion, we need to change the ReLU activation function in the original model into IF neurons.
+Average pooling in the ANN is converted to spatial downsampling. Since IF neurons are equivalent to the ReLU activation function, adding IF neurons or not after spatial downsampling has minimal effect on the results.
+There is currently no ideal solution for max pooling in ANNs. The best solution so far is to control the spike channel with a gating function based on momentum-accumulated spikes [#f1]_. Here we still recommend using ``AvgPool2d``.
+When simulating, according to the conversion theory, the SNN needs a constant analog input. Using a Poisson encoder instead will reduce accuracy.

-Note: When ``True``, avgpool2d is converted to spatial subsampling with a layer of IF neurons; otherwise, it is only converted to spatial subsampling.
+Implementation and optional configuration
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

-(5) conf['simulation']['max_pool']['if_spatial_avg']
+The ann2snn framework received a major update in April 2022. The parser and simulator classes have been removed and replaced by the Converter class. The current scheme is more compact and leaves more room for conversion settings.

-Available value:``bool``
+◆ Converter class
+This class is used to convert a ReLU-based ANN to an SNN. Three common modes are implemented here.
+The most common one is the maximum current conversion mode, which uses the activation limits of the preceding and following layers so that the case with the highest firing rate corresponds to the case where the activation reaches its maximum. Using this mode requires setting the parameter ``mode`` to ``max`` [#f2]_.
+The 99.9% current conversion mode uses the 99.9% activation quantile to limit the upper activation bound. Using this mode requires setting the parameter ``mode`` to ``99.9%`` [#f1]_.
+In the scaling conversion mode, the user passes a scaling factor as the mode, and the current is limited by the scaled maximum activation. Using this mode requires setting the parameter ``mode`` to a float between 0 and 1.

-Note: When ``True``,maxpool2d is converted to avgpool2d. As referred in many literatures, this method will cause accuracy degrading.
+Classify MNIST
+--------------

-(6) conf['simulation']['max_pool']['if_wta']
+Now we use ``ann2snn`` to build a simple convolutional network to classify the MNIST dataset.

-Available value:``bool``
-
-Note: When ``True``, maxpool2d in SNN is identical with maxpool2d in ANN. Using maxpool2d in ANN means that when a spike is available in the Receptive Field, output a spike.
- -(7) conf['simulation']['max_pool']['momentum'] - -Available value: ``None``, floating point [0,1] - -Note: By default, maxpool2d layer is converted into a gated function controled channel based on momentum cumulative spikes. When set to ``None``, the spike is accumulated directly. If set to floating point in the range of [0,1], spike momentum is accumulated. - -The default configuration is: - -.. code-block:: python - - default_config = - { - 'simulation': - { - 'reset_to_zero': False, - 'encoder': - { - 'possion': False - }, - 'avg_pool': - { - 'has_neuron': True - }, - 'max_pool': - { - 'if_spatial_avg': False, - 'if_wta': False, - 'momentum': None - } - }, - 'parser': - { - 'robust_norm': True - } - } - - - -MNIST classification --------------------- - -Now, use ``ann2snn`` to build a simple convolutional network to classify the MNIST dataset. - -First define our network structure: - -.. code-block:: python - - class ANN(nn.Module): - def __init__(self): - super().__init__() - self.network = nn.Sequential( - nn.Conv2d(1, 32, 3, 1), - nn.BatchNorm2d(32, eps=1e-3), - nn.ReLU(), - nn.AvgPool2d(2, 2), - - nn.Conv2d(32, 32, 3, 1), - nn.BatchNorm2d(32, eps=1e-3), - nn.ReLU(), - nn.AvgPool2d(2, 2), - - nn.Conv2d(32, 32, 3, 1), - nn.BatchNorm2d(32, eps=1e-3), - nn.ReLU(), - nn.AvgPool2d(2, 2), - - nn.Flatten(), - nn.Linear(32, 10), - nn.ReLU() - ) - - def forward(self,x): - x = self.network(x) - return x - -Note: In the defined network, the order of module definition must be consistent with the forward order, otherwise it will affect the automatic analysis of the network.It is best to use ``nn.Sequence(·)`` to completely define the network. After each Conv2d and Linear layer, a ReLU layer must be placed, which can be separated by a BatchNorm layer. No ReLU is added after the pooling layer. If you encounter a situation where you need to expand the tensor, define a ``nn.Flatten`` module in the network. In the forward function, you need to use the defined Flatten instead of the view function. - -Define our hyperparameters: - -.. code-block:: python - - device = input('输入运行的设备,例如“cpu”或“cuda:0”\n input device, e.g., "cpu" or "cuda:0": ') - dataset_dir = input('输入保存MNIST数据集的位置,例如“./”\n input root directory for saving MNIST dataset, e.g., "./": ') - batch_size = int(input('输入batch_size,例如“64”\n input batch_size, e.g., "64": ')) - learning_rate = float(input('输入学习率,例如“1e-3”\n input learning rate, e.g., "1e-3": ')) - T = int(input('输入仿真时长,例如“100”\n input simulating steps, e.g., "100": ')) - train_epoch = int(input('输入训练轮数,即遍历训练集的次数,例如“10”\n input training epochs, e.g., "10": ')) - model_name = input('输入模型名字,例如“mnist”\n input model name, for log_dir generating , e.g., "mnist": ') - -The program searches for the trained model archive (a file with the same name as `model_name`) according to the specified folder, and all subsequent temporary files will be stored in that folder. - -Load the default conversion configuration and save - -.. code-block:: python - - config = utils.Config.default_config - print('ann2snn config:\n\t', config) - utils.Config.store_config(os.path.join(log_dir,'default_config.json'),config) - - -Initialize data loader, network, optimizer, loss function - -.. code-block:: python - - # Initialize the network - ann = ANN().to(device) - # Define loss function - loss_function = nn.CrossEntropyLoss() - # Use Adam optimizer - optimizer = torch.optim.Adam(ann.parameters(), lr=learning_rate, weight_decay=5e-4) - -Train ANN and test it regularly. 
You can also use the pre-written training program in utils during training. +First define our network structure (see ``ann2snn.sample_models.mnist_cnn``): .. code-block:: python - for epoch in range(train_epoch): - # Train the network using a pre-prepared code in ''utils'' - utils.train_ann(net=ann, - device=device, - data_loader=train_data_loader, - optimizer=optimizer, - loss_function=loss_function, - epoch=epoch - ) - # Validate the network using a pre-prepared code in ''utils'' - acc = utils.val_ann(net=ann, - device=device, - data_loader=test_data_loader, - epoch=epoch - ) - if best_acc <= acc: - utils.save_model(ann, log_dir, model_name+'.pkl') - -The complete code is located in ``ann2snn.examples.if_cnn_mnist.py``, in the code we also use Tensorboard to save training logs. You can run it directly on the Python command line: + class ANN(nn.Module): + def __init__(self): + super().__init__() + self.network = nn.Sequential( + nn.Conv2d(1, 32, 3, 1), + nn.BatchNorm2d(32, eps=1e-3), + nn.ReLU(), + nn.AvgPool2d(2, 2), -.. code-block:: python + nn.Conv2d(32, 32, 3, 1), + nn.BatchNorm2d(32, eps=1e-3), + nn.ReLU(), + nn.AvgPool2d(2, 2), - >>> import spikingjelly.clock_driven.ann2snn.examples.if_cnn_mnist as if_cnn_mnist - >>> if_cnn_mnist.main() - 输入运行的设备,例如“cpu”或“cuda:0” - input device, e.g., "cpu" or "cuda:0": cuda:15 - 输入保存MNIST数据集的位置,例如“./” - input root directory for saving MNIST dataset, e.g., "./": ./mnist - 输入batch_size,例如“64” - input batch_size, e.g., "64": 128 - 输入学习率,例如“1e-3” - input learning rate, e.g., "1e-3": 1e-3 - 输入仿真时长,例如“100” - input simulating steps, e.g., "100": 100 - 输入训练轮数,即遍历训练集的次数,例如“10” - input training epochs, e.g., "10": 10 - 输入模型名字,用于自动生成日志文档,例如“mnist” - input model name, for log_dir generating , e.g., "mnist" - - If the input of the main function is not a folder with valid files, an automatic log file folder is automatically generated. - Terminal outputs root directory for saving logs, e.g., "./": ./log-mnist1596804385.476601 - - Epoch 0 [1/937] ANN Training Loss:2.252 Accuracy:0.078 - Epoch 0 [101/937] ANN Training Loss:1.424 Accuracy:0.669 - Epoch 0 [201/937] ANN Training Loss:1.117 Accuracy:0.773 - Epoch 0 [301/937] ANN Training Loss:0.953 Accuracy:0.795 - Epoch 0 [401/937] ANN Training Loss:0.865 Accuracy:0.788 - Epoch 0 [501/937] ANN Training Loss:0.807 Accuracy:0.792 - Epoch 0 [601/937] ANN Training Loss:0.764 Accuracy:0.795 - Epoch 0 [701/937] ANN Training Loss:0.726 Accuracy:0.834 - Epoch 0 [801/937] ANN Training Loss:0.681 Accuracy:0.880 - Epoch 0 [901/937] ANN Training Loss:0.641 Accuracy:0.888 - Epoch 0 [100/100] ANN Validating Loss:0.328 Accuracy:0.881 - Save model to: ./log-mnist1596804385.476601\mnist.pkl - ... - Epoch 9 [901/937] ANN Training Loss:0.036 Accuracy:0.990 - Epoch 9 [100/100] ANN Validating Loss:0.042 Accuracy:0.988 - Save model to: ./log-mnist1596804957.0179427\mnist.pkl - -In the example, this model is trained for 10 epochs. The changes in the accuracy of the test set during training are as follows: - -.. image:: ../_static/tutorials/clock_driven/5_ann2snn/accuracy_curve.png - -In the end, the accuracy on test dataset is 98.8%. - -Take a part of the data from the training set and use it for the normalization step of the model. Here we take 1/500 of the training data, which is 100 pictures. But it should be noted that the range of the data tensor taken from the dataset is [0, 255], and it needs to be divided by 255 to become a floating point tensor in the range of [0.0, 1.0] to match the feasible range of firing rate. 
+ nn.Conv2d(32, 32, 3, 1), + nn.BatchNorm2d(32, eps=1e-3), + nn.ReLU(), + nn.AvgPool2d(2, 2), -.. code-block:: python + nn.Flatten(), + nn.Linear(32, 10), + nn.ReLU() + ) - norm_set_len = int(train_data_dataset.data.shape[0] / 500) - print('Using %d pictures as norm set'%(norm_set_len)) - norm_set = train_data_dataset.data[:norm_set_len, :, :].float() / 255 - norm_tensor = torch.FloatTensor(norm_set).view(-1,1,28,28) + def forward(self,x): + x = self.network(x) + return x -Call the standard conversion function ``standard_conversion`` implemented in ``ann2snn.utils`` to realize ANN conversion and SNN simulation. +Note: If you need to expand the tensor, define a ``nn.Flatten`` module in the network, and use the defined Flatten instead of the view function in the forward function. -.. code-block:: python - - utils.standard_conversion(model_name=model_name, - norm_data=norm_tensor, - test_data_loader=test_data_loader, - device=device, - T=T, - log_dir=log_dir, - config=config - ) - -In the process, the normalized model structure is output: - -.. code-block:: python - - ModelParser( - (network): Sequential( - (0): Conv2d(1, 32, kernel_size=(3, 3), stride=(1, 1)) - (1): ReLU() - (2): AvgPool2d(kernel_size=2, stride=2, padding=0) - (3): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1)) - (4): ReLU() - (5): AvgPool2d(kernel_size=2, stride=2, padding=0) - (6): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1)) - (7): ReLU() - (8): AvgPool2d(kernel_size=2, stride=2, padding=0) - (9): Flatten() - (10): Linear(in_features=32, out_features=10, bias=True) - (11): ReLU() - ) - ) - -At the same time, one can also observe the structure of SNN: - -.. code-block:: python - - SNN( - (network): Sequential( - (0): Conv2d(1, 32, kernel_size=(3, 3), stride=(1, 1)) - (1): IFNode( - v_threshold=1.0, v_reset=None - (surrogate_function): Sigmoid() - ) - (2): AvgPool2d(kernel_size=2, stride=2, padding=0) - (3): IFNode( - v_threshold=1.0, v_reset=None - (surrogate_function): Sigmoid() - ) - (4): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1)) - (5): IFNode( - v_threshold=1.0, v_reset=None - (surrogate_function): Sigmoid() - ) - (6): AvgPool2d(kernel_size=2, stride=2, padding=0) - (7): IFNode( - v_threshold=1.0, v_reset=None - (surrogate_function): Sigmoid() - ) - (8): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1)) - (9): IFNode( - v_threshold=1.0, v_reset=None - (surrogate_function): Sigmoid() - ) - (10): AvgPool2d(kernel_size=2, stride=2, padding=0) - (11): IFNode( - v_threshold=1.0, v_reset=None - (surrogate_function): Sigmoid() - ) - (12): Flatten() - (13): Linear(in_features=32, out_features=10, bias=True) - (14): IFNode( - v_threshold=1.0, v_reset=None - (surrogate_function): Sigmoid() - ) - ) - ) - -It can be seen that the activation of ReLU in the ANN model is replaced by the IFNode of SNN. Each layer of AvgPool2d is followed by a layer of IFNode. - -Due to the long time of model simulation, the current accuracy and simulation progress are continuously output: +Define our hyperparameters: .. code-block:: python - [SNN Simulating... 1.00%] Acc:0.990 - [SNN Simulating... 2.00%] Acc:0.990 - [SNN Simulating... 3.00%] Acc:0.990 - [SNN Simulating... 4.00%] Acc:0.988 - [SNN Simulating... 5.00%] Acc:0.990 - …… - [SNN Simulating... 95.00%] Acc:0.986 - [SNN Simulating... 96.00%] Acc:0.986 - [SNN Simulating... 97.00%] Acc:0.986 - [SNN Simulating... 98.00%] Acc:0.986 - [SNN Simulating... 
99.00%] Acc:0.987
-    SNN Simulating Accuracy:0.987
-    Summary:  ANN Accuracy:98.7900%  SNN Accuracy:98.6500%  [Decreased 0.1400%]
-
-Through the final output, we can know that the accuracy of ANN's MNIST classification is 98.79%. The accuracy of the converted SNN is 98.65%. The conversion resulted in a 0.14% performance degradation.
+    torch.random.manual_seed(0)
+    torch.cuda.manual_seed(0)
+    device = 'cuda'
+    dataset_dir = 'G:/Dataset/mnist'
+    batch_size = 100
+    T = 50
+
+Here ``T`` is the number of time steps that will be used later for SNN inference.
+
+If you want to train the ANN, you also need to initialize the data loader, the optimizer and the loss function, for example:
+
+.. code-block:: python
+
+    lr = 1e-3
+    epochs = 10
+    # define the loss function
+    loss_function = nn.CrossEntropyLoss()
+    # Use Adam optimizer
+    optimizer = torch.optim.Adam(ann.parameters(), lr=lr, weight_decay=5e-4)
+
+Train the ANN. In this example, the model is trained for 10 epochs; the test accuracy during training changes as follows:
+
+.. code-block:: python
+
+    Epoch: 0 100%|██████████| 600/600 [00:05<00:00, 112.04it/s]
+    Validating Accuracy: 0.972
+    Epoch: 1 100%|██████████| 600/600 [00:05<00:00, 105.43it/s]
+    Validating Accuracy: 0.986
+    Epoch: 2 100%|██████████| 600/600 [00:05<00:00, 107.49it/s]
+    Validating Accuracy: 0.987
+    Epoch: 3 100%|██████████| 600/600 [00:05<00:00, 109.26it/s]
+    Validating Accuracy: 0.990
+    Epoch: 4 100%|██████████| 600/600 [00:05<00:00, 103.98it/s]
+    Validating Accuracy: 0.984
+    Epoch: 5 100%|██████████| 600/600 [00:05<00:00, 100.42it/s]
+    Validating Accuracy: 0.989
+    Epoch: 6 100%|██████████| 600/600 [00:06<00:00, 96.24it/s]
+    Validating Accuracy: 0.991
+    Epoch: 7 100%|██████████| 600/600 [00:05<00:00, 104.97it/s]
+    Validating Accuracy: 0.992
+    Epoch: 8 100%|██████████| 600/600 [00:05<00:00, 106.45it/s]
+    Validating Accuracy: 0.991
+    Epoch: 9 100%|██████████| 600/600 [00:05<00:00, 111.93it/s]
+    Validating Accuracy: 0.991
+
+After training, we reload the saved model to check its performance:
+
+.. code-block:: python
+
+    model.load_state_dict(torch.load('SJ-mnist-cnn_model-sample.pth'))
+    acc = val(model, device, test_data_loader)
+    print('ANN Validating Accuracy: %.4f' % (acc))
+
+The output is as follows:
+
+.. code-block:: python
+
+    100%|██████████| 200/200 [00:02<00:00, 89.44it/s]
+    ANN Validating Accuracy: 0.9870
+
+Converting with ``Converter`` is very simple: you only need to set the desired mode in the parameters. For example, to use MaxNorm, first define an ``ann2snn.Converter`` and forward the model through this object:
+
+.. code-block:: python
+
+    model_converter = ann2snn.Converter(mode='max', dataloader=train_data_loader)
+    snn_model = model_converter(model)
+
+``snn_model`` is the resulting SNN model.
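+The helper ``val`` used above and below comes from the example script rather than from the ann2snn API. As a rough, illustrative sketch of its assumed behaviour (not the exact example code): for an SNN it calls the converted model once per time step, accumulates the output, records the accuracy after every step, and resets the neuron states between batches with ``functional.reset_net``:
+
+.. code-block:: python
+
+    import torch
+    import numpy as np
+    from spikingjelly.clock_driven import functional
+
+    def val(net, device, data_loader, T=None):
+        net = net.to(device).eval()
+        correct = np.zeros(T) if T is not None else 0.0
+        total = 0
+        with torch.no_grad():
+            for img, label in data_loader:
+                img, label = img.to(device), label.to(device)
+                if T is None:                      # plain ANN: a single forward pass
+                    correct += (net(img).argmax(1) == label).sum().item()
+                else:                              # SNN: accumulate the output over T time steps
+                    out = 0.
+                    for t in range(T):
+                        out += net(img)
+                        correct[t] += (out.argmax(1) == label).sum().item()
+                    functional.reset_net(net)      # clear membrane potentials before the next batch
+                total += label.numel()
+        return correct / total                     # scalar for the ANN, per-time-step accuracy for the SNN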
+Following this example, we convert the model with ``mode`` set to ``max``, ``99.9%``, ``1.0/2``, ``1.0/3``, ``1.0/4`` and ``1.0/5``, and run T steps of inference for each converted SNN to obtain its accuracy.
+
+.. code-block:: python
+
+    print('---------------------------------------------')
+    print('Converting using MaxNorm')
+    model_converter = ann2snn.Converter(mode='max', dataloader=train_data_loader)
+    snn_model = model_converter(model)
+    print('Simulating...')
+    mode_max_accs = val(snn_model, device, test_data_loader, T=T)
+    print('SNN accuracy (simulation %d time-steps): %.4f' % (T, mode_max_accs[-1]))
+
+    print('---------------------------------------------')
+    print('Converting using RobustNorm')
+    model_converter = ann2snn.Converter(mode='99.9%', dataloader=train_data_loader)
+    snn_model = model_converter(model)
+    print('Simulating...')
+    mode_robust_accs = val(snn_model, device, test_data_loader, T=T)
+    print('SNN accuracy (simulation %d time-steps): %.4f' % (T, mode_robust_accs[-1]))
+
+    print('---------------------------------------------')
+    print('Converting using 1/2 max(activation) as scales...')
+    model_converter = ann2snn.Converter(mode=1.0 / 2, dataloader=train_data_loader)
+    snn_model = model_converter(model)
+    print('Simulating...')
+    mode_two_accs = val(snn_model, device, test_data_loader, T=T)
+    print('SNN accuracy (simulation %d time-steps): %.4f' % (T, mode_two_accs[-1]))
+
+    print('---------------------------------------------')
+    print('Converting using 1/3 max(activation) as scales')
+    model_converter = ann2snn.Converter(mode=1.0 / 3, dataloader=train_data_loader)
+    snn_model = model_converter(model)
+    print('Simulating...')
+    mode_three_accs = val(snn_model, device, test_data_loader, T=T)
+    print('SNN accuracy (simulation %d time-steps): %.4f' % (T, mode_three_accs[-1]))
+
+    print('---------------------------------------------')
+    print('Converting using 1/4 max(activation) as scales')
+    model_converter = ann2snn.Converter(mode=1.0 / 4, dataloader=train_data_loader)
+    snn_model = model_converter(model)
+    print('Simulating...')
+    mode_four_accs = val(snn_model, device, test_data_loader, T=T)
+    print('SNN accuracy (simulation %d time-steps): %.4f' % (T, mode_four_accs[-1]))
+
+    print('---------------------------------------------')
+    print('Converting using 1/5 max(activation) as scales')
+    model_converter = ann2snn.Converter(mode=1.0 / 5, dataloader=train_data_loader)
+    snn_model = model_converter(model)
+    print('Simulating...')
+    mode_five_accs = val(snn_model, device, test_data_loader, T=T)
+    print('SNN accuracy (simulation %d time-steps): %.4f' % (T, mode_five_accs[-1]))
+
+Observe the console output:
+
+.. code-block:: python
+
+    ---------------------------------------------
+    Converting using MaxNorm
+    100%|██████████| 600/600 [00:04<00:00, 128.25it/s] Simulating...
+    100%|██████████| 200/200 [00:13<00:00, 14.44it/s] SNN accuracy (simulation 50 time-steps): 0.9777
+    ---------------------------------------------
+    Converting using RobustNorm
+    100%|██████████| 600/600 [00:19<00:00, 31.06it/s] Simulating...
+    100%|██████████| 200/200 [00:13<00:00, 14.75it/s] SNN accuracy (simulation 50 time-steps): 0.9841
+    ---------------------------------------------
+    Converting using 1/2 max(activation) as scales...
+    100%|██████████| 600/600 [00:04<00:00, 126.64it/s] Simulating...
+    100%|██████████| 200/200 [00:13<00:00, 14.90it/s] SNN accuracy (simulation 50 time-steps): 0.9844
+    ---------------------------------------------
+    Converting using 1/3 max(activation) as scales
+    100%|██████████| 600/600 [00:04<00:00, 126.27it/s] Simulating...
+    100%|██████████| 200/200 [00:13<00:00, 14.73it/s] SNN accuracy (simulation 50 time-steps): 0.9828
+    ---------------------------------------------
+    Converting using 1/4 max(activation) as scales
+    100%|██████████| 600/600 [00:04<00:00, 128.94it/s] Simulating...
+    100%|██████████| 200/200 [00:13<00:00, 14.47it/s] SNN accuracy (simulation 50 time-steps): 0.9747
+    ---------------------------------------------
+    Converting using 1/5 max(activation) as scales
+    100%|██████████| 600/600 [00:04<00:00, 121.18it/s] Simulating...
+    100%|██████████| 200/200 [00:13<00:00, 14.42it/s] SNN accuracy (simulation 50 time-steps): 0.9487
+    ---------------------------------------------
+
+Model conversion is very fast, and simulating the converted SNN on the test set takes only about 11 s (GTX 2080ti).
+Using the accuracy recorded at every time step, we can plot the accuracy curves of the different settings.
+
+.. code-block:: python
+
+    fig = plt.figure()
+    plt.plot(np.arange(0, T), mode_max_accs, label='mode: max')
+    plt.plot(np.arange(0, T), mode_robust_accs, label='mode: 99.9%')
+    plt.plot(np.arange(0, T), mode_two_accs, label='mode: 1.0/2')
+    plt.plot(np.arange(0, T), mode_three_accs, label='mode: 1.0/3')
+    plt.plot(np.arange(0, T), mode_four_accs, label='mode: 1.0/4')
+    plt.plot(np.arange(0, T), mode_five_accs, label='mode: 1.0/5')
+    plt.legend()
+    plt.xlabel('t')
+    plt.ylabel('Acc')
+    plt.show()
+
+.. image:: ../_static/tutorials/clock_driven/5_ann2snn/accuracy_mode.png
+
+Different settings lead to different trade-offs: some converge quickly during inference but reach a lower final accuracy, while others are slower but more accurate. Users can choose the setting that fits their needs.

 .. [#f1] Rueckauer B, Lungu I-A, Hu Y, Pfeiffer M and Liu S-C (2017) Conversion of Continuous-Valued Deep Networks to Efficient Event-Driven Networks for Image Classification. Front. Neurosci. 11:682.
 .. [#f2] Diehl, Peter U. , et al. Fast classifying, high-accuracy spiking deep networks through weight and threshold balancing. Neural Networks (IJCNN), 2015 International Joint Conference on IEEE, 2015.
 .. [#f3] Rueckauer, B., Lungu, I. A., Hu, Y., & Pfeiffer, M. (2016). Theory and tools for the conversion of analog to spiking convolutional neural networks. arXiv preprint arXiv:1612.04052.
-.. [#f4] Sengupta, A., Ye, Y., Wang, R., Liu, C., & Roy, K. (2019). Going deeper in spiking neural networks: Vgg and residual architectures. Frontiers in neuroscience, 13, 95.
\ No newline at end of file
+.. [#f4] Sengupta, A., Ye, Y., Wang, R., Liu, C., & Roy, K. (2019). Going deeper in spiking neural networks: Vgg and residual architectures. Frontiers in neuroscience, 13, 95.
diff --git a/docs/source/conf.py b/docs/source/conf.py
index f571b37..2834fc6 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -80,7 +80,7 @@ napoleon_use_ivar = True
 # so a file named "default.css" will overwrite the builtin "default.css".
html_static_path = ['_static'] -autodoc_mock_imports = ['loris', 'readline', '_C_gemm', '_C_neuron', 'torchaudio', 'onnx', 'onnxruntime', 'gym', 'cloudpickle'] +autodoc_mock_imports = ['loris', 'readline', '_C_gemm', '_C_neuron', 'torchaudio', 'onnx', 'onnxruntime', 'gym', 'cloudpickle', 'rarfile'] autoclass_content = 'both' autodoc_member_order = 'bysource' autodoc_inherit_docstrings = False diff --git a/docs/source/index.rst b/docs/source/index.rst index 8e18604..7ff47c0 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -15,7 +15,7 @@ 奇数版本是开发版,随着GitHub/OpenI不断更新。偶数版本是稳定版,可以从PyPI获取。 -从 `PyPI `_ 安装最新的稳定版本(0.0.0.0.8): +从 `PyPI `_ 安装最新的稳定版本: .. code-block:: bash @@ -76,7 +76,7 @@ * :ref:`search` -引用 +引用和出版物 ------------------------- 如果您在自己的工作中用到了惊蜇(SpikingJelly),您可以按照下列格式进行引用: @@ -86,11 +86,14 @@ title = {SpikingJelly}, author = {Fang, Wei and Chen, Yanqi and Ding, Jianhao and Chen, Ding and Yu, Zhaofei and Zhou, Huihui and Tian, Yonghong and other contributors}, year = {2020}, - publisher = {GitHub}, - journal = {GitHub repository}, howpublished = {\url{https://github.com/fangwei123456/spikingjelly}}, + note = {Accessed: YYYY-MM-DD}, } +其中的 `YYYY-MM-DD` 需要更改为您的工作使用的惊蜇(SpikingJelly)版本对应的最后一次代码修改日期。 + +使用惊蜇(SpikingJelly)的出版物可见于 `Publications using SpikingJelly `_。 + 项目信息 ------------------------- 北京大学信息科学技术学院数字媒体所媒体学习组 `Multimedia Learning Group `_ 和 `鹏城实验室 `_ 是SpikingJelly的主要开发者。 @@ -124,7 +127,7 @@ Note that SpikingJelly is based on PyTorch. Please make sure that you have insta The odd version number is the developing version, which is updated with GitHub/OpenI repository. The even version number is the stable version and available at PyPI. -Install the last stable version (0.0.0.0.8) from `PyPI `_: +Install the last stable version from `PyPI `_: .. code-block:: bash @@ -195,11 +198,14 @@ If you use SpikingJelly in your work, please cite it as follows: title = {SpikingJelly}, author = {Fang, Wei and Chen, Yanqi and Ding, Jianhao and Chen, Ding and Yu, Zhaofei and Zhou, Huihui and Tian, Yonghong and other contributors}, year = {2020}, - publisher = {GitHub}, - journal = {GitHub repository}, howpublished = {\url{https://github.com/fangwei123456/spikingjelly}}, + note = {Accessed: YYYY-MM-DD}, } +Note: To specify the version of framework you are using, the default value YYYY-MM-DD in the note field should be replaced with the date of the last change of the framework you are using, i.e. the date of the latest commit. + +Publications using SpikingJelly are recorded in `Publications using SpikingJelly `_. If you use SpikingJelly in your paper, you can also add it to this table by pull request. + About ------------------------- `Multimedia Learning Group, Institute of Digital Media (NELVT), Peking University `_ and `Peng Cheng Laboratory `_ are the main developers of SpikingJelly. diff --git a/docs/source/spikingjelly.clock_driven.lava_exchange.rst b/docs/source/spikingjelly.clock_driven.lava_exchange.rst new file mode 100644 index 0000000..4e43761 --- /dev/null +++ b/docs/source/spikingjelly.clock_driven.lava_exchange.rst @@ -0,0 +1,10 @@ +spikingjelly.clock_driven.lava_exchange package +====================================== + +Module contents +--------------- + +.. 
automodule:: spikingjelly.clock_driven.lava_exchange + :members: + :undoc-members: + :show-inheritance: \ No newline at end of file diff --git a/docs/source/spikingjelly.clock_driven.rst b/docs/source/spikingjelly.clock_driven.rst index 9d96b52..9c203fe 100644 --- a/docs/source/spikingjelly.clock_driven.rst +++ b/docs/source/spikingjelly.clock_driven.rst @@ -14,6 +14,7 @@ spikingjelly.clock_driven package spikingjelly.clock_driven.rnn spikingjelly.clock_driven.surrogate spikingjelly.clock_driven.ann2snn + spikingjelly.clock_driven.lava_exchange Module contents --------------- diff --git a/docs/source/spikingjelly.datasets.rst b/docs/source/spikingjelly.datasets.rst index 95081a6..10b9437 100644 --- a/docs/source/spikingjelly.datasets.rst +++ b/docs/source/spikingjelly.datasets.rst @@ -28,6 +28,14 @@ spikingjelly.datasets.dvs128\_gesture module :undoc-members: :show-inheritance: +spikingjelly.datasets.es\_imagenet module +------------------------------------------ + +.. automodule:: spikingjelly.datasets.es_imagenet + :members: + :undoc-members: + :show-inheritance: + spikingjelly.datasets.n\_caltech101 module ------------------------------------------ @@ -44,6 +52,14 @@ spikingjelly.datasets.n\_mnist module :undoc-members: :show-inheritance: +spikingjelly.datasets.nav\_gesture module +------------------------------------- + +.. automodule:: spikingjelly.datasets.nav_gesture + :members: + :undoc-members: + :show-inheritance: + spikingjelly.datasets.speechcommands module ------------------------------------------- @@ -58,4 +74,4 @@ Module contents .. automodule:: spikingjelly.datasets :members: :undoc-members: - :show-inheritance: + :show-inheritance: \ No newline at end of file diff --git a/publications.md b/publications.md index 2c66afb..a6a868c 100644 --- a/publications.md +++ b/publications.md @@ -1,15 +1,29 @@ ## Publications using SpikingJelly -| Papers | Codes | -| ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ----------------------------------------------------------------------------------- | -| [Incorporating Learnable Membrane Time Constant to Enhance Learning of Spiking Neural Networks](https://arxiv.org/abs/2007.05785) | https://github.com/fangwei123456/Parametric-Leaky-Integrate-and-Fire-Spiking-Neuron | -| [Pruning of Deep Spiking Neural Networks through Gradient Rewiring](https://arxiv.org/abs/2105.04916) | https://github.com/Yanqi-Chen/Gradient-Rewiring | -| [Optimal ANN-SNN Conversion for Fast and Accurate Inference in Deep Spiking Neural Networks](https://arxiv.org/abs/2105.11654) | https://github.com/DingJianhao/OptSNNConvertion-RNL-RIL | -| [Deep Residual Learning in Spiking Neural Networks](https://arxiv.org/abs/2102.04159) | https://github.com/fangwei123456/Spike-Element-Wise-ResNet | -| [Spiking Neural Networks Trained via Proxy](https://arxiv.org/abs/2109.13208) | https://github.com/SRKH/ProxyLearning | -| [StereoSpike: Depth Learning with a Spiking Neural Network](https://arxiv.org/abs/2109.13751) | https://github.com/urancon/StereoSpike | -| [An Odor Recognition Algorithm of Electronic Noses Based on Convolutional Spiking Neural Network for Spoiled Food Identification](https://iopscience.iop.org/article/10.1149/1945-7111/ac1699/meta) | | -| [Cascade Spiking Neuron Network For Event-based Image Classification In Noisy 
Environment](https://www.techrxiv.org/articles/preprint/Cascade_Spiking_Neuron_Network_For_Event-based_Image_Classification_In_Noisy_Environment/16571043) | | -| [Keys to Accurate Feature Extraction Using Residual Spiking Neural Networks](https://arxiv.org/abs/2111.05955) | https://github.com/VicenteAlex/Spiking_ResNet | +| Papers | Codes | +| ------------------------------------------------------------ | ------------------------------------------------------------ | +| [Incorporating Learnable Membrane Time Constant to Enhance Learning of Spiking Neural Networks](https://arxiv.org/abs/2007.05785) | https://github.com/fangwei123456/Parametric-Leaky-Integrate-and-Fire-Spiking-Neuron | +| [Pruning of Deep Spiking Neural Networks through Gradient Rewiring](https://arxiv.org/abs/2105.04916) | https://github.com/Yanqi-Chen/Gradient-Rewiring | +| [Optimal ANN-SNN Conversion for Fast and Accurate Inference in Deep Spiking Neural Networks](https://arxiv.org/abs/2105.11654) | https://github.com/DingJianhao/OptSNNConvertion-RNL-RIL | +| [Deep Residual Learning in Spiking Neural Networks](https://arxiv.org/abs/2102.04159) | https://github.com/fangwei123456/Spike-Element-Wise-ResNet | +| [Spiking Neural Networks Trained via Proxy](https://arxiv.org/abs/2109.13208) | https://github.com/SRKH/ProxyLearning | +| [StereoSpike: Depth Learning with a Spiking Neural Network](https://arxiv.org/abs/2109.13751) | https://github.com/urancon/StereoSpike | +| [An Odor Recognition Algorithm of Electronic Noses Based on Convolutional Spiking Neural Network for Spoiled Food Identification](https://iopscience.iop.org/article/10.1149/1945-7111/ac1699/meta) | | +| [Cascade Spiking Neuron Network For Event-based Image Classification In Noisy Environment](https://www.techrxiv.org/articles/preprint/Cascade_Spiking_Neuron_Network_For_Event-based_Image_Classification_In_Noisy_Environment/16571043) | | +| [Keys to Accurate Feature Extraction Using Residual Spiking Neural Networks](https://arxiv.org/abs/2111.05955) | https://github.com/VicenteAlex/Spiking_ResNet | +| [Human-Level Control through Directly-Trained Deep Spiking Q-Networks](https://arxiv.org/abs/2201.07211) | https://github.com/AptX395/Deep-Spiking-Q-Networks | +| [Deep Reinforcement Learning with Spiking Q-learning](https://arxiv.org/abs/2201.09754) | | +| [Event-based Video Reconstruction via Potential-assisted Spiking Neural Network](https://arxiv.org/abs/2201.10943) | https://github.com/LinZhu111/EVSNN | +| [Optimal ANN-SNN Conversion for High-accuracy and Ultra-low-latency Spiking Neural Networks](https://openreview.net/forum?id=7B3IJMM1k_M) | https://github.com/putshua/SNN-conversion-QCFS | +| [Optimized Potential Initialization for Low-latency Spiking Neural Networks](https://arxiv.org/abs/2202.01440) | | +| [AutoSNN: Towards Energy-Efficient Spiking Neural Networks](https://arxiv.org/abs/2201.12738) | | +| [Neural Architecture Search for Spiking Neural Networks](https://arxiv.org/abs/2201.10355) | https://github.com/Intelligent-Computing-Lab-Yale/Neural-Architecture-Search-for-Spiking-Neural-Networks | +| [FEAS: A Faster Event-driven Accelerator Supporting Inhibitory Spiking Neural Network](https://ieeexplore.ieee.org/document/9720483/) | | +| [Neuromorphic Data Augmentation for Training Spiking Neural Networks](https://arxiv.org/abs/2203.06145) | | +| [SIT: A Bionic and Non-Linear Neuron for Spiking Neural Network](https://arxiv.org/abs/2203.16117) | | +| [Building and training a deep spiking neural network for ECG 
classification](https://www.sciencedirect.com/science/article/pii/S1746809422002713) | | +| [DynSNN: A Dynamic Approach to Reduce Redundancy in Spiking Neural Networks](https://ieeexplore.ieee.org/abstract/document/9746566) | | +| [Object Detection with Spiking Neural Networks on Automotive Event Data](https://arxiv.org/abs/2205.04339) | | If you use SpikingJelly in your paper, you can also add it to this table by pull request. + diff --git a/requirements.txt b/requirements.txt index 75b8c1d..66d49ca 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,4 +4,3 @@ numpy tqdm torchvision scipy -onnx==1.8.0 diff --git a/setup.py b/setup.py index debce56..951736c 100644 --- a/setup.py +++ b/setup.py @@ -19,7 +19,7 @@ with open("./README.md", "r", encoding="utf-8") as fh: setup( install_requires=install_requires, name="spikingjelly", - version="0.0.0.0.9", + version="0.0.0.0.13", author="PKU MLG, PCL, and other contributors", author_email="fwei@pku.edu.cn, chyq@pku.edu.cn", description="A deep learning framework for SNNs built on PyTorch.", diff --git a/spikingjelly/clock_driven/ann2snn/__init__.py b/spikingjelly/clock_driven/ann2snn/__init__.py index 635b1ff..c9da244 100644 --- a/spikingjelly/clock_driven/ann2snn/__init__.py +++ b/spikingjelly/clock_driven/ann2snn/__init__.py @@ -1,476 +1,2 @@ -import numpy as np -import torch -import torch.nn as nn -import os -from tqdm import tqdm -import json -from spikingjelly.clock_driven import neuron,encoding,functional -from collections import defaultdict -import copy -import time -import inspect -import matplotlib.pyplot as plt -import warnings - -from spikingjelly.clock_driven.ann2snn.kernels.onnx import _o2p_converter as onnx2pytorch - -class parser: - def __init__(self, name='', kernel='onnx', **kargs): - try: - with open(kargs['json'], 'r') as f: - self.config = json.load(f) - except KeyError: - try: - self.log_dir = kargs['log_dir'] - except KeyError: - from datetime import datetime - current_time = datetime.now().strftime('%b%d_%H-%M-%S') - log_dir = os.path.join( - self.__class__.__name__ + '-' + current_time + - ('' if len(name) == 0 else '_' + name)) - self.log_dir = log_dir - self.config = kargs - print('parser log_dir:', self.log_dir) - self.config['log_dir'] = self.log_dir - self.kernel = kernel - assert(self.kernel.lower() in ('onnx','pytorch')) - if not os.path.isdir(self.log_dir): - os.makedirs(self.log_dir) - with open(os.path.join(self.log_dir,'parser_args.json'), 'w') as fw: - json.dump(self.config, fw) - - def parse(self, model: nn.Module, data: torch.Tensor, **kargs) -> nn.Module: - model_name = model.__class__.__name__ - model.eval() - - for m in model.modules(): - if hasattr(m,'weight'): - assert(data.get_device() == m.weight.get_device()) - - try: - model = z_norm_integration(model=model, z_norm=self.config['z_norm']) - except KeyError: - pass - layer_reduc = False - for m in model.modules(): - if isinstance(m, (nn.BatchNorm2d, nn.BatchNorm1d, nn.BatchNorm3d)): - layer_reduc = True - break - if self.kernel.lower() == 'onnx': - try: - import onnx - import onnxruntime as ort - except ImportError: - print(Warning("Package onnx or onnxruntime not found: launch pytorch convert engine," - " only support very simple arctitecture")) - self.kernel = 'pytorch' - else: - pass - - if self.kernel.lower() == 'onnx': - # use onnx engine - - data = data.cpu() - model = model.cpu() - - import spikingjelly.clock_driven.ann2snn.kernels.onnx as onnx_kernel - - onnx_model = onnx_kernel.pytorch2onnx_model(model=model, data=data, 
log_dir=self.config['log_dir']) - # onnx_kernel.print_onnx_model(onnx_model.graph) - onnx.checker.check_model(onnx_model) - if layer_reduc: - onnx_model = onnx_kernel.layer_reduction(onnx_model) - # onnx.checker.check_model(onnx_model) - onnx_model = onnx_kernel.rate_normalization(onnx_model, data.numpy(), **kargs) #**self.config['normalization'] - onnx_kernel.save_model(onnx_model,os.path.join(self.config['log_dir'],model_name+".onnx")) - - convert_methods = onnx2pytorch - try: - user_defined = kargs['user_methods'] - assert (user_defined is dict) - for k in user_defined: - convert_methods.add_method(op_name=k, func=user_defined[k]) - except KeyError: - print('no user-defined conversion method found, use default') - except AssertionError: - print('user-defined conversion method should be organized into a dict!') - model = onnx_kernel.onnx2pytorch_model(onnx_model, convert_methods) - else: - # use pytorch engine - - import spikingjelly.clock_driven.ann2snn.kernels.pytorch as pytorch_kernel - - if layer_reduc: - model = pytorch_kernel.layer_reduction(model) - model = pytorch_kernel.rate_normalization(model, data)#, **self.config['normalization'] - - self.ann_filename = os.path.join(self.config['log_dir'], model_name + ".pth") - torch.save(model, os.path.join(self.config['log_dir'], "debug.pth")) - torch.save(model, self.ann_filename) - model = self.to_snn(model) - return model - - def to_snn(self, model: nn.Module, **kargs) -> nn.Module: - for name, module in model._modules.items(): - if hasattr(module, "_modules"): - model._modules[name] = self.to_snn(module, **kargs) - if module.__class__.__name__ == "AvgPool2d": - new_module = nn.Sequential(module, neuron.IFNode(v_reset=None)) - model._modules[name] = new_module - if "BatchNorm" in module.__class__.__name__: - try: - # NSIFNode是能够产生正负脉冲的模型,现在版本被删除 - new_module = nn.Sequential(module, neuron.NSIFNode(v_threshold=(-1.0, 1.0), v_reset=None)) - except AttributeError: - new_module = module - model._modules[name] = new_module - if module.__class__.__name__ == "ReLU": - new_module = neuron.IFNode(v_reset=None) - model._modules[name] = new_module - try: - if module.__class__.__name__ == 'PReLU': - p = module.weight - assert (p.size(0) == 1 and p != 0) - if -1 / p.item() > 0: - model._modules[name] = neuron.NSIFNode(v_threshold=(1.0 / p.item(), 1.0), - bipolar=(1.0, 1.0), v_reset=None) - else: - model._modules[name] = neuron.NSIFNode(v_threshold=(-1 / p.item(), 1.0), - bipolar=(-1.0, 1.0), v_reset=None) - except AttributeError: - assert False, 'NSIFNode has been removed.' 
- if module.__class__.__name__ == "MaxPool2d": - new_module = nn.AvgPool2d( - kernel_size=module.kernel_size, - stride=module.stride, - padding=module.padding) - model._modules[name] = new_module - return model - -def z_norm_integration(model: nn.Module, z_norm=None) -> nn.Module: - if z_norm is not None: - (z_norm_mean, z_norm_std) = z_norm - z_norm_mean = torch.from_numpy(np.array(z_norm_mean).astype(np.float32)) - z_norm_std = torch.from_numpy(np.array(z_norm_std).astype(np.float32)) - bn = nn.BatchNorm2d(num_features=len(z_norm_std)) - bn.weight.data = torch.ones_like(bn.weight.data) - bn.bias.data = torch.zeros_like(bn.bias.data) - bn.running_var.data = torch.pow(z_norm_std, exponent=2) - bn.eps - bn.running_mean.data = z_norm_mean - return nn.Sequential(bn, model) - else: - return model - -import threading -mutex_schedule = threading.Lock() -mutex_shared = threading.Lock() -global_shared = {} - -class simulator: - def __init__(self, snn, device, name='', **kargs): - snn.eval() - try: - self.log_dir = kargs['log_dir'] - except KeyError: - from datetime import datetime - current_time = datetime.now().strftime('%b%d_%H-%M-%S') - log_dir = os.path.join( - self.__class__.__name__ + '-' + current_time + - ('' if len(name)==0 else '_' + name)) - self.log_dir = log_dir - print('simulator log_dir:',self.log_dir) - if not os.path.isdir(self.log_dir): - os.makedirs(self.log_dir) - - try: - self.fig = kargs['canvas'] - self.ax = self.fig.add_subplot(1, 1, 1) - plt.ion() - except KeyError: - self.fig = None - - try: - encoder = kargs['encoder'] - except KeyError: - encoder = 'constant' - if encoder == 'poisson': - self.encoder = encoding.PoissonEncoder() - else: - self.encoder = lambda x: x - - if isinstance(device,(list,set,tuple)): - if len(device)==1: - device = device[0] - self.pi = False - else: - self.pi = True # parallel inference - else: - self.pi = False - if self.pi: - print('simulator is working on the parallel mode, device(s):', device) - else: - print('simulator is working on the normal mode, device:', device) - self.device = device - - global global_shared, mutex_schedule, mutex_shared - self.mutex_shared = mutex_shared - self.mutex_schedule = mutex_schedule - self.global_shared = global_shared - if self.pi: - self.global_shared['device_used'] = defaultdict(int) - self.global_shared['device_stat'] = defaultdict(int) - self.global_shared['distri_model'] = {} - self.global_shared['batch'] = 0 - self.global_shared['batch_sum'] = 0 - self.global_shared['T'] = None - for dev in self.device: - self.global_shared['distri_model'][dev] = copy.deepcopy(snn).to(dev) - else: - self.global_shared['distri_model'] = {} - self.global_shared['distri_model'][self.device] = copy.deepcopy(snn).to(self.device) - self.config = dict() - self.config['device'] = self.device - self.config['name'] = name - self.config['log_dir'] = self.log_dir - self.config = {**self.config, **kargs} - - - def simulate(self, data_loader, T, **kargs): - self.config['T'] = T - self.config = {**self.config, **kargs} - with open(os.path.join(self.log_dir,'simulator_args.json'), 'w') as fw: - json.dump({k: self.config[k] for k in self.config.keys() if _if_json_serializable(self.config[k])} - , fw) - try: - if kargs['online_drawer']: - if isinstance(self.device, (list, set, tuple)): - warnings.warn("online drawer deprecated because package Matplotlib is not thread safe!") - except KeyError: - pass - try: - func_dict = kargs['func_dict'] - except KeyError: - func_dict = {} - for n in self._get_user_defined_static_methods(): - 
func_dict[n] = getattr(self,n) - try: - assert(len(func_dict.keys())>0) - except AssertionError: - raise KeyError("Please add valid func_dict for simulator, or use pre-defined subclass of ``simulator``!") - if self.pi: - threads = [] - start = time.perf_counter() - global global_shared - self.global_shared['T'] = T - for value_name in func_dict: - self.global_shared[value_name] = [] - self.global_shared['batch_sum'] = len(data_loader) - for batch, (data, targets) in enumerate(tqdm(data_loader)): - self.global_shared['batch'] = batch - if self.pi: - distributed = False - while not distributed: - time.sleep(0.001) # time delay - for device in self.device: - if self.global_shared['device_used'][device] == 0: - t = threading.Thread(target=self.get_values, - kwargs=dict(data=data, - targets=targets, - device=device, - T=T, - func_dict=func_dict, - **kargs) - ) - t.start() - threads.append(t) - distributed = True - self.global_shared['device_stat'][device] += 1 - break - else: - self.get_values(data=data, - targets=targets, - device=self.device, - T=T, - func_dict=func_dict, - **kargs) - if self.pi: - for t in threads: - t.join() - elapsed = time.perf_counter() - start - print('--------------------simulator summary--------------------') - print('time elapsed:', elapsed, '(sec)') - if self.pi: - print('load stat:',self.global_shared['device_stat']) - print('---------------------------------------------------------') - - try: - if kargs['canvas'] is not None: - plt.ioff() - plt.close() - except KeyError: - pass - - ret_dict = {} - - for value_name in func_dict: - ret_dict[value_name] = self.global_shared[value_name] - return ret_dict - - def get_values(self, data, targets, device, T, func_dict, **kargs): - if self.pi: - if mutex_shared.acquire(): - getattr(self, '_pre_batch_sim')(**kargs) - mutex_shared.release() - else: - getattr(self, '_pre_batch_sim')(**kargs) - global global_shared - data = data.to(device) - targets = targets.to(device) - values_list = defaultdict(list) - - if self.pi: - if mutex_schedule.acquire(): - self.global_shared['device_used'][device] = 1 - mutex_schedule.release() - - snn = self.global_shared['distri_model'][device] - functional.reset_net(snn) - with torch.no_grad(): - for t in range(T): - enc = self.encoder(data).float().to(device) - out = snn(enc) - if t == 0: - counter = out - else: - counter += out - for value_name in func_dict.keys(): - value = func_dict[value_name](data=data, - targets=targets, - out_spike=out, - out_spike_cnt=counter, - device=device, - **kargs) - values_list[value_name].append(value) - - for value_name in func_dict.keys(): - values_list[value_name] = np.array(values_list[value_name]).astype(np.float32) - - if self.pi: - if mutex_shared.acquire(): - for value_name in func_dict.keys(): - self.global_shared[value_name].append(values_list[value_name]) - getattr(self, '_after_batch_sim')(**kargs) - mutex_shared.release() - else: - for value_name in func_dict.keys(): - self.global_shared[value_name].append(values_list[value_name]) - getattr(self, '_after_batch_sim')(**kargs) - - if self.pi: - if mutex_schedule.acquire(): - self.global_shared['device_used'][device] = 0 - mutex_schedule.release() - - def _get_user_defined_static_methods(self): - method = [] - attrs = dir(self) - for attr in attrs: - if attr[0] != '_': - user_defined = inspect.isroutine(getattr(self, attr)) - static_method = False - for cls in inspect.getmro(type(self)): - if attr in cls.__dict__: - v = cls.__dict__[attr] - if isinstance(v, staticmethod): - static_method = True - if 
user_defined and static_method: - method.append(attr) - return method - - def _pre_batch_sim(self, **kargs): - pass - - def _after_batch_sim(self, **kargs): - pass - - - -class classify_simulator(simulator): # 一个分类任务的实例 - def __init__(self, snn, device, **kargs): - super().__init__(snn, device, **kargs) - self.global_shared['accu_correct'] = 0.0 - self.global_shared['accu_total'] = 0.0 - self.global_shared['acc'] = 0.0 - # try: - # self.fig = kargs['canvas'] - # self.ax = self.fig.add_subplot(1, 1, 1) - # plt.ion() - # except KeyError: - # self.fig = None - - @staticmethod - def correct_num(targets, out_spike_cnt, **kargs) -> float: - n = (out_spike_cnt.max(1)[1] == targets).float().sum().item() - return n - - @staticmethod - def total_num(targets, **kargs) -> float: - n = targets.size(0) - return n - - def _after_batch_sim(self, **kargs): - import matplotlib.pyplot as plt - T = self.global_shared['T'] - self.global_shared['accu_correct'] += self.global_shared['correct_num'][-1] - self.global_shared['accu_total'] += self.global_shared['total_num'][-1] - self.global_shared['acc'] = self.global_shared['accu_correct'] \ - / self.global_shared['accu_total'] - np.savetxt(os.path.join(self.log_dir, 'acc.csv'), self.global_shared['acc'], delimiter=",") - - if self.fig is not None: - self.ax.cla() - x = np.arange(self.global_shared['acc'].shape[0]) - self.ax.plot(x,self.global_shared['acc'] * 100,label='SNN Acc') - - try: - ann_acc = kargs['ann_acc'] * 100 - self.ax.plot(x, np.ones_like(x) * ann_acc, label='ANN', c='g', linestyle=':') - self.ax.text(0, ann_acc + 1, "%.3f%%" % (ann_acc), fontdict={'size': '8', 'color': 'g'}) - except KeyError: - pass - try: - self.ax.set_title("%s\n[%.1f%% dataset]" % ( - kargs['fig_name'], - 100.0 * (self.global_shared['batch']+1) / self.global_shared['batch_sum'] - )) - except KeyError: - pass - try: - if kargs['step_max']: - y = self.global_shared['acc'] * 100 - argmax = np.argmax(y) - disp_bias = 0.3 * float(T) if x[argmax] / T > 0.7 else 0 - self.ax.text(x[argmax] - 0.8 - disp_bias, y[argmax] + 0.8, "MAX:%.3f%% T=%d" % (y[argmax], x[argmax]), - fontdict={'size': '12', 'color': 'r'}) - self.ax.scatter([x[argmax]], [y[argmax]], c='r') - except KeyError: - pass - - self.ax.set_xlabel("T") - self.ax.set_ylabel("Percentage(%)") - self.ax.legend() - plt.savefig(os.path.join(self.log_dir,'plot.pdf')) - - try: - if kargs['online_drawer']: - if not isinstance(self.device, (list, set, tuple)): - plt.pause(0.001) - except KeyError: - pass - -def _if_json_serializable(x): - try: - json.dumps(x) - return True - except: - return False \ No newline at end of file +from spikingjelly.clock_driven.ann2snn.converter import Converter +from spikingjelly.clock_driven.ann2snn.utils import download_url \ No newline at end of file diff --git a/spikingjelly/clock_driven/ann2snn/converter.py b/spikingjelly/clock_driven/ann2snn/converter.py new file mode 100644 index 0000000..29914c8 --- /dev/null +++ b/spikingjelly/clock_driven/ann2snn/converter.py @@ -0,0 +1,107 @@ +from spikingjelly.clock_driven.ann2snn.modules import * +from tqdm import tqdm +from spikingjelly.clock_driven import neuron +import copy + + +class Converter(nn.Module): + + def __init__(self, dataloader, mode='Max'): + """ + * :ref:`API in English ` + + .. 
_Converter.__init__-cn: + + :param dataloader: 数据加载器 + :type dataloader: Dataloader + :param mode: 转换模式。目前支持三种模式,最大电流转换模式,99.9%电流转换模式,以及缩放转换模式 + :type mode: str, float + + ``Converter`` 用于将ReLU的ANN转换为SNN。这里实现了常见的三种模式。 + 最常见的是最大电流转换模式,它利用前后层的激活上限,使发放率最高的情况能够对应激活取得最大值的情况。 + 99.9%电流转换模式利用99.9%的激活分位点限制了激活上限。 + 缩放转换模式下,用户需要给定缩放参数到模式中,即可利用缩放后的激活最大值对电流进行限制。 + + * :ref:`中文API ` + + .. _Converter.__init__-en: + + :param dataloader: Dataloader for converting + :type dataloader: Dataloader + :param mode: Conversion mode. Now support three mode, MaxNorm, RobustNorm(99.9%), and scaling mode + :type mode: str, float + + ``Converter`` is used to convert ReLU's ANN to SNN. Three common methods are implemented here. + The most common is the maximum mode, which utilizes the upper activation limits of + the front and rear layers so that the case with the highest firing rate corresponds to the case where the + activation achieves the maximum value. + The 99.9% mode utilizes the 99.9% activation quantile to limit the upper activation limit. + In the scaling conversion mode, the user needs to specify the scaling parameters into the mode, and the current + can be limited by the activated maximum value after scaling. + + """ + super().__init__() + self.mode = mode + self.dataloader = dataloader + self._check_mode() + self.device = None + + def forward(self, relu_model): + relu_model = copy.deepcopy(relu_model) + if self.device is None: + self.device = next(relu_model.parameters()).device + relu_model.eval() + model = self.set_voltagehook(relu_model, mode=self.mode).to(self.device) + for _, (imgs, _) in enumerate(tqdm(self.dataloader)): + model(imgs.to(self.device)) + model = self.replace_by_ifnode(model) + return model + + def _check_mode(self): + err_msg = 'You have used a non-defined VoltageScale Method.' 
+ if isinstance(self.mode, str): + if self.mode[-1] == '%': + try: + float(self.mode[:-1]) + except ValueError: + raise NotImplemented(err_msg) + elif self.mode.lower() in ['max']: + pass + else: + raise NotImplemented(err_msg) + elif isinstance(self.mode, float): + try: + assert(self.mode <= 1 and self.mode > 0) + except AssertionError: + raise NotImplemented(err_msg) + else: + raise NotImplemented(err_msg) + + + @staticmethod + def set_voltagehook(model, mode='MaxNorm'): + for name, module in model._modules.items(): + if hasattr(module, "_modules"): + model._modules[name] = Converter.set_voltagehook(module, mode=mode) + if module.__class__.__name__ == 'ReLU': + model._modules[name] = nn.Sequential( + nn.ReLU(), + VoltageHook(mode=mode) + ) + return model + + @staticmethod + def replace_by_ifnode(model): + for name,module in model._modules.items(): + if hasattr(module, "_modules"): + model._modules[name] = Converter.replace_by_ifnode(module) + if module.__class__.__name__ == 'Sequential' and len(module) == 2 and \ + module[0].__class__.__name__ == 'ReLU' and \ + module[1].__class__.__name__ == 'VoltageHook': + s = module[1].scale.item() + model._modules[name] = nn.Sequential( + VoltageScaler(1.0 / s), + neuron.IFNode(v_threshold=1., v_reset=None), + VoltageScaler(s) + ) + return model \ No newline at end of file diff --git a/spikingjelly/clock_driven/ann2snn/examples/cnn_fashionmnist.py b/spikingjelly/clock_driven/ann2snn/examples/cnn_fashionmnist.py deleted file mode 100644 index 86ce1ea..0000000 --- a/spikingjelly/clock_driven/ann2snn/examples/cnn_fashionmnist.py +++ /dev/null @@ -1,175 +0,0 @@ -import torch -import torch.nn as nn -import torchvision -import os -from torch.utils.tensorboard import SummaryWriter -import spikingjelly.clock_driven.ann2snn.examples.utils as utils -from spikingjelly.clock_driven.ann2snn import parser, classify_simulator -import matplotlib.pyplot as plt - -class ANN(nn.Module): - def __init__(self): - super().__init__() - # 网络结构:类似AlexNet的结构 - # Network structure: AlexNet-like structure - self.network = nn.Sequential( - nn.Conv2d(1, 32, kernel_size=3, padding=1), - nn.BatchNorm2d(32), - nn.ReLU(), - nn.AvgPool2d(kernel_size=2, stride=2), - nn.Dropout2d(0.2), - nn.Conv2d(32, 64, kernel_size=3, stride=1, padding=1), - nn.BatchNorm2d(64), - nn.ReLU(), - nn.AvgPool2d(kernel_size=2, stride=2), - nn.Dropout2d(0.2), - nn.Conv2d(64, 128, kernel_size=3, stride=1, padding=1), - nn.BatchNorm2d(128), - nn.ReLU(), - nn.Conv2d(128, 256, kernel_size=3, stride=1, padding=1), - nn.BatchNorm2d(256), - nn.ReLU(), - nn.Dropout2d(0.2), - nn.Conv2d(256, 256, kernel_size=3, stride=1, padding=1), - nn.BatchNorm2d(256), - nn.ReLU(), - nn.AvgPool2d(kernel_size=2, stride=2), - nn.Flatten(), - nn.Linear(256 * 3 * 3, 1024), - nn.ReLU(), - nn.Dropout(0.2), - nn.Linear(1024, 512), - nn.ReLU(), - nn.Linear(512, 10) - ) - - def forward(self,x): - x = self.network(x) - return x - -def main(log_dir=None): - torch.random.manual_seed(0) - torch.cuda.manual_seed(0) - - train_device = input('输入运行的设备,例如“cpu”或“cuda:0”\n input device, e.g., "cpu" or "cuda:0": ') - parser_device = input('输入分析模型的设备,例如“cpu”或“cuda:0”\n input parsing device, e.g., "cpu" or "cuda:0": ') - simulator_device = parser_device - # simulator_device = input( - # '输入SNN仿真的设备(支持多线程),例如“cpu,cuda:0”或“cuda:0,cuda:1”\n input SNN simulating device (support multithread), e.g., "cpu,cuda:0" or "cuda:0,cuda:1": ').split( - # ',') - dataset_dir = input('输入保存MNIST数据集的位置,例如“./”\n input root directory for saving FashionMNIST dataset, e.g., 
"./": ') - batch_size = int(input('输入batch_size,例如“128”\n input batch_size, e.g., "128": ')) - learning_rate = float(input('输入学习率,例如“1e-3”\n input learning rate, e.g., "1e-3": ')) - T = int(input('输入仿真时长,例如“400”\n input simulating steps, e.g., "400": ')) - train_epoch = int(input('输入训练轮数,即遍历训练集的次数,例如“100”\n input training epochs, e.g., "100": ')) - model_name = input('输入模型名字,例如“cnn_fashionmnist”\n input model name, for log_dir generating , e.g., "cnn_fashionmnist": ') - - load = False - if log_dir == None: - from datetime import datetime - current_time = datetime.now().strftime('%b%d_%H-%M-%S') - log_dir = model_name + '-' + current_time - if not os.path.exists(log_dir): - os.makedirs(log_dir) - else: - if not os.path.exists(log_dir): - os.makedirs(log_dir) - if not os.path.exists(os.path.join(log_dir, model_name + '.pkl')): - print('%s has no model to load.' % (log_dir)) - load = False - else: - load = True - - if not load: - writer = SummaryWriter(log_dir) - - # 初始化数据加载器 - # initialize data loader - train_data_dataset = torchvision.datasets.FashionMNIST( - root=dataset_dir, - train=True, - transform=torchvision.transforms.ToTensor(), - download=True) - train_data_loader = torch.utils.data.DataLoader( - train_data_dataset, - batch_size=batch_size, - shuffle=True, - drop_last=True) - test_data_loader = torch.utils.data.DataLoader( - dataset=torchvision.datasets.FashionMNIST( - root=dataset_dir, - train=False, - transform=torchvision.transforms.ToTensor(), - download=True), - batch_size=100, - shuffle=True, - drop_last=False) - - ann = ANN().to(train_device) - loss_function = nn.CrossEntropyLoss() - if not load: - optimizer = torch.optim.Adam(ann.parameters(), lr=learning_rate, weight_decay=5e-4) - best_acc = 0.0 - for epoch in range(train_epoch): - # 使用utils中预先写好的训练程序训练网络 - # 训练程序的写法和经典ANN中的训练也是一样的 - # Train the network using a pre-prepared code in ''utils'' - utils.train_ann(net=ann, - device=train_device, - data_loader=train_data_loader, - optimizer=optimizer, - loss_function=loss_function, - epoch=epoch - ) - # 使用utils中预先写好的验证程序验证网络输出 - # Validate the network using a pre-prepared code in ''utils'' - acc = utils.val_ann(net=ann, - device=train_device, - data_loader=test_data_loader, - loss_function=loss_function, - epoch=epoch - ) - if best_acc <= acc: - utils.save_model(ann, log_dir, model_name + '.pkl') - writer.add_scalar('val_accuracy', acc, epoch) - ann = torch.load(os.path.join(log_dir, model_name + '.pkl')) - print('validating best model...') - ann_acc = utils.val_ann(net=ann, - device=train_device, - data_loader=test_data_loader, - loss_function=loss_function - ) - - # 加载用于归一化模型的数据 - # Load the data to normalize the model - percentage = 0.004 # load 0.004 of the data - norm_data_list = [] - for idx, (imgs, targets) in enumerate(train_data_loader): - norm_data_list.append(imgs) - if idx == int(len(train_data_loader) * percentage) - 1: - break - norm_data = torch.cat(norm_data_list) - print('use %d imgs to parse' % (norm_data.size(0))) - - onnxparser = parser(name=model_name, - log_dir=log_dir + '/parser', - kernel='onnx') - snn = onnxparser.parse(ann, norm_data.to(parser_device)) - - torch.save(snn, os.path.join(log_dir, 'snn-' + model_name + '.pkl')) - fig = plt.figure('simulator') - sim = classify_simulator(snn, - log_dir=log_dir + '/simulator', - device=simulator_device, - canvas=fig - ) - sim.simulate(test_data_loader, - T=T, - online_drawer=True, - ann_acc=ann_acc, - fig_name=model_name, - step_max=True - ) - -if __name__ == '__main__': - main('./cnn_fashionmnist') diff --git 
a/spikingjelly/clock_driven/ann2snn/examples/cnn_mnist.py b/spikingjelly/clock_driven/ann2snn/examples/cnn_mnist.py index 0dac65a..1019d70 100644 --- a/spikingjelly/clock_driven/ann2snn/examples/cnn_mnist.py +++ b/spikingjelly/clock_driven/ann2snn/examples/cnn_mnist.py @@ -1,219 +1,153 @@ import torch -import torch.nn as nn import torchvision -import os -from torch.utils.tensorboard import SummaryWriter -import spikingjelly.clock_driven.ann2snn.examples.utils as utils -from spikingjelly.clock_driven.ann2snn import parser, classify_simulator +import torch.nn as nn +import spikingjelly +from spikingjelly.clock_driven import ann2snn +from tqdm import tqdm +from spikingjelly.clock_driven.ann2snn.sample_models import mnist_cnn +import numpy as np import matplotlib.pyplot as plt -class ANN(nn.Module): - def __init__(self): - super().__init__() - # 网络结构:三层卷积块串联一个全连接层,每个卷积块由一个卷积层、一个批正则化、一个ReLU激活和一个平均池化层组成 - # Network structure: Three convolution blocks connected with a full-connection layer, each convolution - # block consists of a convolution layer, a batch normalization, a ReLU activation and an average pool - # layer. - self.network = nn.Sequential( - nn.Conv2d(1, 32, 3, 1), - nn.BatchNorm2d(32, eps=1e-3), - nn.ReLU(), - nn.AvgPool2d(2, 2), - - nn.Conv2d(32, 32, 3, 1), - nn.BatchNorm2d(32, eps=1e-3), - nn.ReLU(), - nn.AvgPool2d(2, 2), - - nn.Conv2d(32, 32, 3, 1), - nn.BatchNorm2d(32, eps=1e-3), - nn.ReLU(), - nn.AvgPool2d(2, 2), - - nn.Flatten(), - nn.Linear(32, 10), - nn.ReLU() - ) - - def forward(self,x): - x = self.network(x) - return x - - -def main(log_dir=None): - ''' - :return: None - - 使用Conv-ReLU-[Conv-ReLU]-全连接-ReLU的网络结构训练并转换为SNN,进行MNIST识别。运行示例: - - .. code-block:: python - - >>> import spikingjelly.clock_driven.ann2snn.examples.cnn_mnist as cnn_mnist - >>> cnn_mnist.main() - 输入运行的设备,例如“cpu”或“cuda:0” - input device, e.g., "cpu" or "cuda:0": cuda:15 - 输入保存MNIST数据集的位置,例如“./” - input root directory for saving MNIST dataset, e.g., "./": ./mnist - 输入batch_size,例如“64” - input batch_size, e.g., "64": 128 - 输入学习率,例如“1e-3” - input learning rate, e.g., "1e-3": 1e-3 - 输入仿真时长,例如“100” - input simulating steps, e.g., "100": 100 - 输入训练轮数,即遍历训练集的次数,例如“10” - input training epochs, e.g., "10": 10 - 输入模型名字,用于自动生成日志文档,例如“cnn_mnist” - input model name, for log_dir generating , e.g., "cnn_mnist" - - Epoch 0 [1/937] ANN Training Loss:2.252 Accuracy:0.078 - Epoch 0 [101/937] ANN Training Loss:1.423 Accuracy:0.669 - Epoch 0 [201/937] ANN Training Loss:1.117 Accuracy:0.773 - Epoch 0 [301/937] ANN Training Loss:0.953 Accuracy:0.795 - Epoch 0 [401/937] ANN Training Loss:0.865 Accuracy:0.788 - Epoch 0 [501/937] ANN Training Loss:0.807 Accuracy:0.792 - Epoch 0 [601/937] ANN Training Loss:0.764 Accuracy:0.795 - Epoch 0 [701/937] ANN Training Loss:0.726 Accuracy:0.835 - Epoch 0 [801/937] ANN Training Loss:0.681 Accuracy:0.880 - Epoch 0 [901/937] ANN Training Loss:0.641 Accuracy:0.889 - 100%|██████████| 100/100 [00:00<00:00, 116.12it/s] - Epoch 0 [100/100] ANN Validating Loss:0.327 Accuracy:0.881 - Save model to: cnn_mnist-XXXXX\cnn_mnist.pkl - ...... 
- --------------------simulator summary-------------------- - time elapsed: 46.55072790000008 (sec) - --------------------------------------------------------- - ''' +def val(net, device, data_loader, T=None): + net.eval().to(device) + correct = 0.0 + total = 0.0 + if T is not None: + corrects = np.zeros(T) + with torch.no_grad(): + for batch, (img, label) in enumerate(tqdm(data_loader)): + img = img.to(device) + if T is None: + out = net(img) + correct += (out.argmax(dim=1) == label.to(device)).float().sum().item() + else: + for m in net.modules(): + if hasattr(m, 'reset'): + m.reset() + for t in range(T): + if t == 0: + out = net(img) + else: + out += net(img) + corrects[t] += (out.argmax(dim=1) == label.to(device)).float().sum().item() + total += out.shape[0] + return correct / total if T is None else corrects / total + +def main(): torch.random.manual_seed(0) torch.cuda.manual_seed(0) - - train_device = input('输入运行的设备,例如“cpu”或“cuda:0”\n input device, e.g., "cpu" or "cuda:0": ') - parser_device = input('输入分析模型的设备,例如“cpu”或“cuda:0”\n input parsing device, e.g., "cpu" or "cuda:0": ') - simulator_device = parser_device - # simulator_device = input( - # '输入SNN仿真的设备(支持多线程),例如“cpu,cuda:0”或“cuda:0,cuda:1”\n input SNN simulating device (support multithread), e.g., "cpu,cuda:0" or "cuda:0,cuda:1": ').split( - # ',') - dataset_dir = input('输入保存MNIST数据集的位置,例如“./”\n input root directory for saving MNIST dataset, e.g., "./": ') - batch_size = int(input('输入batch_size,例如“64”\n input batch_size, e.g., "64": ')) - learning_rate = float(input('输入学习率,例如“1e-3”\n input learning rate, e.g., "1e-3": ')) - T = int(input('输入仿真时长,例如“100”\n input simulating steps, e.g., "100": ')) - train_epoch = int(input('输入训练轮数,即遍历训练集的次数,例如“10”\n input training epochs, e.g., "10": ')) - model_name = input('输入模型名字,例如“cnn_mnist”\n input model name, for log_dir generating , e.g., "cnn_mnist": ') - - load = False - if log_dir == None: - from datetime import datetime - current_time = datetime.now().strftime('%b%d_%H-%M-%S') - log_dir = model_name+'-'+current_time - if not os.path.exists(log_dir): - os.makedirs(log_dir) - else: - if not os.path.exists(log_dir): - os.makedirs(log_dir) - if not os.path.exists(os.path.join(log_dir,model_name+'.pkl')): - print('%s has no model to load.'%(log_dir)) - load = False - else: - load = True - - if not load: - writer = SummaryWriter(log_dir) - - # 初始化数据加载器 - # initialize data loader + device = 'cuda' + dataset_dir = 'G:/Dataset/mnist' + batch_size = 100 + T = 50 + # 训练参数 + lr = 1e-3 + epochs = 10 + + model = mnist_cnn.CNN().to(device) train_data_dataset = torchvision.datasets.MNIST( root=dataset_dir, train=True, transform=torchvision.transforms.ToTensor(), download=True) train_data_loader = torch.utils.data.DataLoader( - train_data_dataset, + dataset=train_data_dataset, batch_size=batch_size, shuffle=True, - drop_last=True) + drop_last=False) + test_data_dataset = torchvision.datasets.MNIST( + root=dataset_dir, + train=False, + transform=torchvision.transforms.ToTensor(), + download=True) test_data_loader = torch.utils.data.DataLoader( - dataset=torchvision.datasets.MNIST( - root=dataset_dir, - train=False, - transform=torchvision.transforms.ToTensor(), - download=True), - batch_size=100, + dataset=test_data_dataset, + batch_size=50, shuffle=True, drop_last=False) - ann = ANN().to(train_device) - loss_function = nn.CrossEntropyLoss() - if not load: - optimizer = torch.optim.Adam(ann.parameters(), lr=learning_rate, weight_decay=5e-4) - best_acc = 0.0 - for epoch in range(train_epoch): - # 
使用utils中预先写好的训练程序训练网络 - # 训练程序的写法和经典ANN中的训练也是一样的 - # Train the network using a pre-prepared code in ''utils'' - utils.train_ann(net=ann, - device=train_device, - data_loader=train_data_loader, - optimizer=optimizer, - loss_function=loss_function, - epoch=epoch - ) - # 使用utils中预先写好的验证程序验证网络输出 - # Validate the network using a pre-prepared code in ''utils'' - acc = utils.val_ann(net=ann, - device=train_device, - data_loader=test_data_loader, - loss_function=loss_function, - epoch=epoch - ) - if best_acc <= acc: - utils.save_model(ann, log_dir, model_name + '.pkl') - writer.add_scalar('val_accuracy', acc, epoch) - ann = torch.load(os.path.join(log_dir, model_name + '.pkl')) - print('validating best model...') - ann_acc = utils.val_ann(net=ann, - device=train_device, - data_loader=test_data_loader, - loss_function=loss_function - ) - - # 加载用于归一化模型的数据 - # Load the data to normalize the model - percentage = 0.004 # load 0.004 of the data - norm_data_list = [] - for idx, (imgs, targets) in enumerate(train_data_loader): - norm_data_list.append(imgs) - if idx == int(len(train_data_loader) * percentage) - 1: - break - norm_data = torch.cat(norm_data_list) - print('use %d imgs to parse' % (norm_data.size(0))) - - # 调用parser,使用kernel为onnx - # Call parser, use onnx kernel - onnxparser = parser(name=model_name, - log_dir=log_dir + '/parser', - kernel='onnx') - snn = onnxparser.parse(ann, norm_data.to(parser_device)) - - # 保存转换好的SNN模型 - # Save SNN model - torch.save(snn, os.path.join(log_dir,'snn-'+model_name+'.pkl')) - fig = plt.figure('simulator') + # loss_function = nn.CrossEntropyLoss() + # optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=5e-4) + # for epoch in range(epochs): + # model.train() + # for (img, label) in train_data_loader: + # optimizer.zero_grad() + # out = model(img.to(device)) + # loss = loss_function(out, label.to(device)) + # loss.backward() + # optimizer.step() + # torch.save(model.state_dict(), 'SJ-mnist-cnn_model-sample.pth') + # print('Epoch: %d' % epoch) + # acc = val(model, device, train_data_loader) + # print('Validating Accuracy: %.3f' % (acc)) + # print() + + model.load_state_dict(torch.load('SJ-mnist-cnn_model-sample.pth')) + acc = val(model, device, test_data_loader) + print('ANN Validating Accuracy: %.4f' % (acc)) + + print('---------------------------------------------') + print('Converting using MaxNorm') + model_converter = ann2snn.Converter(mode='max', dataloader=train_data_loader) + snn_model = model_converter(model) + print('Simulating...') + mode_max_accs = val(snn_model, device, test_data_loader, T=T) + print('SNN accuracy (simulation %d time-steps): %.4f' % (T, mode_max_accs[-1])) + + print('---------------------------------------------') + print('Converting using RobustNorm') + model_converter = ann2snn.Converter(mode='99.9%', dataloader=train_data_loader) + snn_model = model_converter(model) + print('Simulating...') + mode_robust_accs = val(snn_model, device, test_data_loader, T=T) + print('SNN accuracy (simulation %d time-steps): %.4f' % (T, mode_robust_accs[-1])) + + print('---------------------------------------------') + print('Converting using 1/2 max(activation) as scales...') + model_converter = ann2snn.Converter(mode=1.0 / 2, dataloader=train_data_loader) + snn_model = model_converter(model) + print('Simulating...') + mode_two_accs = val(snn_model, device, test_data_loader, T=T) + print('SNN accuracy (simulation %d time-steps): %.4f' % (T, mode_two_accs[-1])) + + print('---------------------------------------------') + 
print('Converting using 1/3 max(activation) as scales') + model_converter = ann2snn.Converter(mode=1.0 / 3, dataloader=train_data_loader) + snn_model = model_converter(model) + print('Simulating...') + mode_three_accs = val(snn_model, device, test_data_loader, T=T) + print('SNN accuracy (simulation %d time-steps): %.4f' % (T, mode_three_accs[-1])) + + print('---------------------------------------------') + print('Converting using 1/4 max(activation) as scales') + model_converter = ann2snn.Converter(mode=1.0 / 4, dataloader=train_data_loader) + snn_model = model_converter(model) + print('Simulating...') + mode_four_accs = val(snn_model, device, test_data_loader, T=T) + print('SNN accuracy (simulation %d time-steps): %.4f' % (T, mode_four_accs[-1])) + + print('---------------------------------------------') + print('Converting using 1/5 max(activation) as scales') + model_converter = ann2snn.Converter(mode=1.0 / 5, dataloader=train_data_loader) + snn_model = model_converter(model) + print('Simulating...') + mode_five_accs = val(snn_model, device, test_data_loader, T=T) + print('SNN accuracy (simulation %d time-steps): %.4f' % (T, mode_five_accs[-1])) + + fig = plt.figure() + plt.plot(np.arange(0, T), mode_max_accs, label='mode: max') + plt.plot(np.arange(0, T), mode_robust_accs, label='mode: 99.9%') + plt.plot(np.arange(0, T), mode_two_accs, label='mode: 1.0/2') + plt.plot(np.arange(0, T), mode_three_accs, label='mode: 1.0/3') + plt.plot(np.arange(0, T), mode_four_accs, label='mode: 1.0/4') + plt.plot(np.arange(0, T), mode_five_accs, label='mode: 1.0/5') + plt.legend() + plt.xlabel('t') + plt.ylabel('Acc') + plt.show() - # 定义用于分类的SNN仿真器 - # define simulator for classification task - sim = classify_simulator(snn, - log_dir=log_dir + '/simulator', - device=simulator_device, - canvas=fig - ) - # 仿真SNN - # Simulate SNN - sim.simulate(test_data_loader, - T=T, - online_drawer=True, - ann_acc=ann_acc, - fig_name=model_name, - step_max=True - ) if __name__ == '__main__': - main('./cnn_mnist') + print('Downloading SJ-mnist-cnn_model-sample.pth...') + ann2snn.download_url("https://ndownloader.figshare.com/files/34960191", './SJ-mnist-cnn_model-sample.pth') + main() diff --git a/spikingjelly/clock_driven/ann2snn/examples/model_sample/__init__.py b/spikingjelly/clock_driven/ann2snn/examples/model_sample/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/spikingjelly/clock_driven/ann2snn/examples/model_sample/cifar10/__init__.py b/spikingjelly/clock_driven/ann2snn/examples/model_sample/cifar10/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/spikingjelly/clock_driven/ann2snn/examples/model_sample/cifar10/vgg.py b/spikingjelly/clock_driven/ann2snn/examples/model_sample/cifar10/vgg.py deleted file mode 100644 index 425927f..0000000 --- a/spikingjelly/clock_driven/ann2snn/examples/model_sample/cifar10/vgg.py +++ /dev/null @@ -1,70 +0,0 @@ -'''VGG11/13/16/19 in Pytorch.''' -import torch -import torch.nn as nn - - -cfg = { - 'VGG11': [64, 'M', 128, 'M', 256, 256, 'M', 512, 512, 'M', 512, 512, 'M'], - 'VGG13': [64, 64, 'M', 128, 128, 'M', 256, 256, 'M', 512, 512, 'M', 512, 512, 'M'], - 'VGG16': [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 'M', 512, 512, 512, 'M', 512, 512, 512, 'M'], - 'VGG19': [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 256, 'M', 512, 512, 512, 512, 'M', 512, 512, 512, 512, 'M'], -} - - -class VGG(nn.Module): - def __init__(self, vgg_name): - super(VGG, self).__init__() - self.features = self._make_layers(cfg[vgg_name]) - self.classifier = 
nn.Linear(512, 10) - - def forward(self, x): - out = self.features(x) - out = out.view(out.size(0), -1) - out = self.classifier(out) - return out - - def _make_layers(self, cfg): - layers = [] - in_channels = 3 - for x in cfg: - if x == 'M': - layers += [nn.MaxPool2d(kernel_size=2, stride=2)] - else: - layers += [nn.Conv2d(in_channels, x, kernel_size=3, padding=1), - nn.BatchNorm2d(x), - nn.ReLU(inplace=True)] - in_channels = x - layers += [nn.AvgPool2d(kernel_size=1, stride=1)] - return nn.Sequential(*layers) - - -class VGG_no_bias_bn(nn.Module): - def __init__(self, vgg_name): - super(VGG_no_bias_bn, self).__init__() - self.features = self._make_layers(cfg[vgg_name]) - self.classifier = nn.Linear(512, 10,bias=False) - - def forward(self, x): - out = self.features(x) - out = out.view(out.size(0), -1) - out = self.classifier(out) - return out - - def _make_layers(self, cfg): - layers = [] - in_channels = 3 - for x in cfg: - if x == 'M': - layers += [nn.AvgPool2d(kernel_size=2, stride=2)] - else: - layers += [nn.Conv2d(in_channels, x, kernel_size=3, padding=1,bias=False), - nn.ReLU(inplace=True)] - in_channels = x - layers += [nn.AvgPool2d(kernel_size=1, stride=1)] - return nn.Sequential(*layers) - -def test(): - net = VGG('VGG11') - x = torch.randn(2,3,32,32) - y = net(x) - print(y.size()) \ No newline at end of file diff --git a/spikingjelly/clock_driven/ann2snn/examples/model_sample/imagenet/__init__.py b/spikingjelly/clock_driven/ann2snn/examples/model_sample/imagenet/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/spikingjelly/clock_driven/ann2snn/examples/model_sample/imagenet/resnet.py b/spikingjelly/clock_driven/ann2snn/examples/model_sample/imagenet/resnet.py deleted file mode 100644 index 0dae608..0000000 --- a/spikingjelly/clock_driven/ann2snn/examples/model_sample/imagenet/resnet.py +++ /dev/null @@ -1,339 +0,0 @@ -import torch -import torch.nn as nn - - -__all__ = ['ResNet', 'resnet18', 'resnet34', 'resnet50', 'resnet101', - 'resnet152', 'resnext50_32x4d', 'resnext101_32x8d', - 'wide_resnet50_2', 'wide_resnet101_2'] - - - -def conv3x3(in_planes, out_planes, stride=1, groups=1, dilation=1): - """3x3 convolution with padding""" - return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride, - padding=dilation, groups=groups, bias=False, dilation=dilation) - - -def conv1x1(in_planes, out_planes, stride=1): - """1x1 convolution""" - return nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=stride, bias=False) - - -class BasicBlock(nn.Module): - expansion = 1 - - def __init__(self, inplanes, planes, stride=1, downsample=None, groups=1, - base_width=64, dilation=1, norm_layer=None): - super(BasicBlock, self).__init__() - if norm_layer is None: - norm_layer = nn.BatchNorm2d - if groups != 1 or base_width != 64: - raise ValueError('BasicBlock only supports groups=1 and base_width=64') - if dilation > 1: - raise NotImplementedError("Dilation > 1 not supported in BasicBlock") - # Both self.conv1 and self.downsample layers downsample the input when stride != 1 - self.conv1 = conv3x3(inplanes, planes, stride) - self.bn1 = norm_layer(planes) - self.relu = nn.ReLU(inplace=True) - self.conv2 = conv3x3(planes, planes) - self.bn2 = norm_layer(planes) - self.downsample = downsample - self.stride = stride - - def forward(self, x): - identity = x - - out = self.conv1(x) - out = self.bn1(out) - out = self.relu(out) - - out = self.conv2(out) - out = self.bn2(out) - - if self.downsample is not None: - identity = self.downsample(x) - - out += identity - out = 
self.relu(out) - - return out - - -class Bottleneck(nn.Module): - # Bottleneck in torchvision places the stride for downsampling at 3x3 convolution(self.conv2) - # while original implementation places the stride at the first 1x1 convolution(self.conv1) - # according to "Deep residual learning for image recognition"https://arxiv.org/abs/1512.03385. - # This variant is also known as ResNet V1.5 and improves accuracy according to - # https://ngc.nvidia.com/catalog/model-scripts/nvidia:resnet_50_v1_5_for_pytorch. - - expansion = 4 - - def __init__(self, inplanes, planes, stride=1, downsample=None, groups=1, - base_width=64, dilation=1, norm_layer=None): - super(Bottleneck, self).__init__() - if norm_layer is None: - norm_layer = nn.BatchNorm2d - width = int(planes * (base_width / 64.)) * groups - # Both self.conv2 and self.downsample layers downsample the input when stride != 1 - self.conv1 = conv1x1(inplanes, width) - self.bn1 = norm_layer(width) - self.conv2 = conv3x3(width, width, stride, groups, dilation) - self.bn2 = norm_layer(width) - self.conv3 = conv1x1(width, planes * self.expansion) - self.bn3 = norm_layer(planes * self.expansion) - self.relu = nn.ReLU(inplace=True) - self.downsample = downsample - self.stride = stride - - def forward(self, x): - identity = x - - out = self.conv1(x) - out = self.bn1(out) - out = self.relu(out) - - out = self.conv2(out) - out = self.bn2(out) - out = self.relu(out) - - out = self.conv3(out) - out = self.bn3(out) - - if self.downsample is not None: - identity = self.downsample(x) - - out += identity - out = self.relu(out) - - return out - - -class ResNet(nn.Module): - - def __init__(self, block, layers, num_classes=1000, zero_init_residual=False, - groups=1, width_per_group=64, replace_stride_with_dilation=None, - norm_layer=None): - super(ResNet, self).__init__() - if norm_layer is None: - norm_layer = nn.BatchNorm2d - self._norm_layer = norm_layer - - self.inplanes = 64 - self.dilation = 1 - if replace_stride_with_dilation is None: - # each element in the tuple indicates if we should replace - # the 2x2 stride with a dilated convolution instead - replace_stride_with_dilation = [False, False, False] - if len(replace_stride_with_dilation) != 3: - raise ValueError("replace_stride_with_dilation should be None " - "or a 3-element tuple, got {}".format(replace_stride_with_dilation)) - self.groups = groups - self.base_width = width_per_group - self.conv1 = nn.Conv2d(3, self.inplanes, kernel_size=7, stride=2, padding=3, - bias=False) - self.bn1 = norm_layer(self.inplanes) - self.relu = nn.ReLU(inplace=True) - self.avgpool1 = nn.AvgPool2d(kernel_size=3, stride=2, padding=1) - self.layer1 = self._make_layer(block, 64, layers[0]) - self.layer2 = self._make_layer(block, 128, layers[1], stride=2, - dilate=replace_stride_with_dilation[0]) - self.layer3 = self._make_layer(block, 256, layers[2], stride=2, - dilate=replace_stride_with_dilation[1]) - self.layer4 = self._make_layer(block, 512, layers[3], stride=2, - dilate=replace_stride_with_dilation[2]) - self.avgpool2 = nn.AdaptiveAvgPool2d((1, 1)) - self.fc = nn.Linear(512 * block.expansion, num_classes) - - for m in self.modules(): - if isinstance(m, nn.Conv2d): - nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu') - elif isinstance(m, (nn.BatchNorm2d, nn.GroupNorm)): - nn.init.constant_(m.weight, 1) - nn.init.constant_(m.bias, 0) - - # Zero-initialize the last BN in each residual branch, - # so that the residual branch starts with zeros, and each residual block behaves like an identity. 
- # This improves the model by 0.2~0.3% according to https://arxiv.org/abs/1706.02677 - if zero_init_residual: - for m in self.modules(): - if isinstance(m, Bottleneck): - nn.init.constant_(m.bn3.weight, 0) - elif isinstance(m, BasicBlock): - nn.init.constant_(m.bn2.weight, 0) - - def _make_layer(self, block, planes, blocks, stride=1, dilate=False): - norm_layer = self._norm_layer - downsample = None - previous_dilation = self.dilation - if dilate: - self.dilation *= stride - stride = 1 - if stride != 1 or self.inplanes != planes * block.expansion: - downsample = nn.Sequential( - conv1x1(self.inplanes, planes * block.expansion, stride), - norm_layer(planes * block.expansion), - ) - - layers = [] - layers.append(block(self.inplanes, planes, stride, downsample, self.groups, - self.base_width, previous_dilation, norm_layer)) - self.inplanes = planes * block.expansion - for _ in range(1, blocks): - layers.append(block(self.inplanes, planes, groups=self.groups, - base_width=self.base_width, dilation=self.dilation, - norm_layer=norm_layer)) - - return nn.Sequential(*layers) - - def _forward_impl(self, x): - # See note [TorchScript super()] - x = self.conv1(x) - x = self.bn1(x) - x = self.relu(x) - x = self.avgpool1(x) - - x = self.layer1(x) - x = self.layer2(x) - x = self.layer3(x) - x = self.layer4(x) - - x = self.avgpool2(x) - x = torch.flatten(x, 1) - x = self.fc(x) - - return x - - def forward(self, x): - return self._forward_impl(x) - - -def _resnet(arch, block, layers, pretrained, progress, **kwargs): - model = ResNet(block, layers, **kwargs) - if pretrained: - raise NotImplementedError - - return model - - -def resnet18(pretrained=False, progress=True, **kwargs): - r"""ResNet-18 model from - `"Deep Residual Learning for Image Recognition" `_ - - Args: - pretrained (bool): If True, returns a model pre-trained on ImageNet - progress (bool): If True, displays a progress bar of the download to stderr - """ - return _resnet('resnet18', BasicBlock, [2, 2, 2, 2], pretrained, progress, - **kwargs) - - -def resnet34(pretrained=False, progress=True, **kwargs): - r"""ResNet-34 model from - `"Deep Residual Learning for Image Recognition" `_ - - Args: - pretrained (bool): If True, returns a model pre-trained on ImageNet - progress (bool): If True, displays a progress bar of the download to stderr - """ - return _resnet('resnet34', BasicBlock, [3, 4, 6, 3], pretrained, progress, - **kwargs) - - -def resnet50(pretrained=False, progress=True, **kwargs): - r"""ResNet-50 model from - `"Deep Residual Learning for Image Recognition" `_ - - Args: - pretrained (bool): If True, returns a model pre-trained on ImageNet - progress (bool): If True, displays a progress bar of the download to stderr - """ - return _resnet('resnet50', Bottleneck, [3, 4, 6, 3], pretrained, progress, - **kwargs) - - -def resnet101(pretrained=False, progress=True, **kwargs): - r"""ResNet-101 model from - `"Deep Residual Learning for Image Recognition" `_ - - Args: - pretrained (bool): If True, returns a model pre-trained on ImageNet - progress (bool): If True, displays a progress bar of the download to stderr - """ - return _resnet('resnet101', Bottleneck, [3, 4, 23, 3], pretrained, progress, - **kwargs) - - -def resnet152(pretrained=False, progress=True, **kwargs): - r"""ResNet-152 model from - `"Deep Residual Learning for Image Recognition" `_ - - Args: - pretrained (bool): If True, returns a model pre-trained on ImageNet - progress (bool): If True, displays a progress bar of the download to stderr - """ - return _resnet('resnet152', 
Bottleneck, [3, 8, 36, 3], pretrained, progress, - **kwargs) - - -def resnext50_32x4d(pretrained=False, progress=True, **kwargs): - r"""ResNeXt-50 32x4d model from - `"Aggregated Residual Transformation for Deep Neural Networks" `_ - - Args: - pretrained (bool): If True, returns a model pre-trained on ImageNet - progress (bool): If True, displays a progress bar of the download to stderr - """ - kwargs['groups'] = 32 - kwargs['width_per_group'] = 4 - return _resnet('resnext50_32x4d', Bottleneck, [3, 4, 6, 3], - pretrained, progress, **kwargs) - - -def resnext101_32x8d(pretrained=False, progress=True, **kwargs): - r"""ResNeXt-101 32x8d model from - `"Aggregated Residual Transformation for Deep Neural Networks" `_ - - Args: - pretrained (bool): If True, returns a model pre-trained on ImageNet - progress (bool): If True, displays a progress bar of the download to stderr - """ - kwargs['groups'] = 32 - kwargs['width_per_group'] = 8 - return _resnet('resnext101_32x8d', Bottleneck, [3, 4, 23, 3], - pretrained, progress, **kwargs) - - -def wide_resnet50_2(pretrained=False, progress=True, **kwargs): - r"""Wide ResNet-50-2 model from - `"Wide Residual Networks" `_ - - The model is the same as ResNet except for the bottleneck number of channels - which is twice larger in every block. The number of channels in outer 1x1 - convolutions is the same, e.g. last block in ResNet-50 has 2048-512-2048 - channels, and in Wide ResNet-50-2 has 2048-1024-2048. - - Args: - pretrained (bool): If True, returns a model pre-trained on ImageNet - progress (bool): If True, displays a progress bar of the download to stderr - """ - kwargs['width_per_group'] = 64 * 2 - return _resnet('wide_resnet50_2', Bottleneck, [3, 4, 6, 3], - pretrained, progress, **kwargs) - - -def wide_resnet101_2(pretrained=False, progress=True, **kwargs): - r"""Wide ResNet-101-2 model from - `"Wide Residual Networks" `_ - - The model is the same as ResNet except for the bottleneck number of channels - which is twice larger in every block. The number of channels in outer 1x1 - convolutions is the same, e.g. last block in ResNet-50 has 2048-512-2048 - channels, and in Wide ResNet-50-2 has 2048-1024-2048. 
- - Args: - pretrained (bool): If True, returns a model pre-trained on ImageNet - progress (bool): If True, displays a progress bar of the download to stderr - """ - kwargs['width_per_group'] = 64 * 2 - return _resnet('wide_resnet101_2', Bottleneck, [3, 4, 23, 3], - pretrained, progress, **kwargs) diff --git a/spikingjelly/clock_driven/ann2snn/examples/resnet18_cifar10.py b/spikingjelly/clock_driven/ann2snn/examples/resnet18_cifar10.py index 0c3d6f0..b6ef6e6 100644 --- a/spikingjelly/clock_driven/ann2snn/examples/resnet18_cifar10.py +++ b/spikingjelly/clock_driven/ann2snn/examples/resnet18_cifar10.py @@ -1,47 +1,50 @@ import torch -import torch.nn as nn import torchvision -import os -from torch.utils.tensorboard import SummaryWriter -import spikingjelly.clock_driven.ann2snn.examples.utils as utils -from spikingjelly.clock_driven.ann2snn import parser, classify_simulator -from spikingjelly.clock_driven.ann2snn.examples.model_sample.cifar10 import resnet -import matplotlib.pyplot as plt +from tqdm import tqdm +import spikingjelly.clock_driven.ann2snn as ann2snn +from spikingjelly.clock_driven.ann2snn.sample_models import cifar10_resnet -def main(log_dir=None): - torch.random.manual_seed(0) - torch.cuda.manual_seed(0) - - train_device = input('输入运行的设备,例如“cpu”或“cuda:0”\n input training device, e.g., "cpu" or "cuda:0": ') - parser_device = input('输入分析模型的设备,例如“cpu”或“cuda:0”\n input parsing device, e.g., "cpu" or "cuda:0": ') - simulator_device = parser_device - # simulator_device = input('输入SNN仿真的设备(支持多线程),例如“cpu,cuda:0”或“cuda:0,cuda:1”\n input SNN simulating device (support multithread), e.g., "cpu,cuda:0" or "cuda:0,cuda:1": ').split(',') - dataset_dir = input('输入保存cifar10数据集的位置,例如“./”\n input root directory for saving cifar10 dataset, e.g., "./": ') - batch_size = int(input('输入batch_size,例如“128”\n input batch_size, e.g., "128": ')) - T = int(input('输入仿真时长,例如“400”\n input simulating steps, e.g., "400": ')) - model_name = input('输入模型名字,例如“resnet18_cifar10”\n input model name, for log_dir generating , e.g., "resnet18_cifar10": ') - z_norm_mean = (0.4914, 0.4822, 0.4465) - z_norm_std = (0.2023, 0.1994, 0.2010) +def val(net, device, data_loader, T=None): + net.eval().to(device) + correct = 0.0 + total = 0.0 + with torch.no_grad(): + for batch, (img, label) in enumerate(tqdm(data_loader)): + img = img.to(device) + if T is None: + out = net(img) + else: + for m in net.modules(): + if hasattr(m, 'reset'): + m.reset() + for t in range(T): + if t == 0: + out = net(img) + else: + out += net(img) + correct += (out.argmax(dim=1) == label.to(device)).float().sum().item() + total += out.shape[0] + acc = correct / total + print('Validating Accuracy: %.3f' % (acc)) + return acc - load = False - if log_dir == None: - from datetime import datetime - current_time = datetime.now().strftime('%b%d_%H-%M-%S') - log_dir = model_name + '-' + current_time - if not os.path.exists(log_dir): - os.makedirs(log_dir) - else: - if not os.path.exists(log_dir): - os.makedirs(log_dir) - - if not load: - writer = SummaryWriter(log_dir) +def main(): + torch.random.manual_seed(0) + torch.cuda.manual_seed(0) + device = 'cuda:9' + dataset_dir = '~/dataset/cifar10' + batch_size = 100 + T = 400 transform = torchvision.transforms.Compose([ - torchvision.transforms.ToTensor() + torchvision.transforms.ToTensor(), + torchvision.transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)) ]) + model = cifar10_resnet.ResNet18() + model.load_state_dict(torch.load('SJ-cifar10-resnet18_model-sample.pth')) + train_data_dataset = 
torchvision.datasets.CIFAR10( root=dataset_dir, train=True, @@ -63,44 +66,16 @@ def main(log_dir=None): shuffle=True, drop_last=False) - ann = resnet.ResNet18().to(train_device) - loss_function = nn.CrossEntropyLoss() - checkpoint_state_dict = torch.load('./SJ-cifar10-resnet18_model-sample.pth') - ann.load_state_dict(checkpoint_state_dict) - - # 加载用于归一化模型的数据 - # Load the data to normalize the model - percentage = 0.004 # load 0.004 of the data - norm_data_list = [] - for idx, (imgs, targets) in enumerate(train_data_loader): - norm_data_list.append(imgs) - if idx == int(len(train_data_loader) * percentage) - 1: - break - norm_data = torch.cat(norm_data_list) - print('use %d imgs to parse' % (norm_data.size(0))) - - onnxparser = parser(name=model_name, - log_dir=log_dir + '/parser', - kernel='onnx', - z_norm=(z_norm_mean, z_norm_std)) - - snn = onnxparser.parse(ann, norm_data.to(parser_device)) - ann_acc = utils.val_ann(torch.load(onnxparser.ann_filename).to(train_device),train_device,test_data_loader,loss_function) - torch.save(snn, os.path.join(log_dir, 'snn-' + model_name + '.pkl')) - fig = plt.figure('simulator') - sim = classify_simulator(snn, - log_dir=log_dir + '/simulator', - device=simulator_device, - canvas=fig - ) - sim.simulate(test_data_loader, - T=T, - online_drawer=True, - ann_acc=ann_acc, - fig_name=model_name, - step_max=True - ) + print('ANN accuracy:') + val(model, device, test_data_loader) + print('Converting...') + model_converter = ann2snn.Converter(device=device,mode='Max', dataloader=train_data_loader) + snn_model = model_converter(model) + print('SNN accuracy:') + val(snn_model, device, test_data_loader, T=T) if __name__ == '__main__': - utils.download_sample_pth("https://ndownloader.figshare.com/files/26676110",'./SJ-cifar10-resnet18_model-sample.pth') - main('./resnet18_cifar10') \ No newline at end of file + print('Downloading SJ-cifar10-resnet18_model-sample.pth') + ann2snn.download_url("https://ndownloader.figshare.com/files/26676110",'./SJ-cifar10-resnet18_model-sample.pth') + main() + diff --git a/spikingjelly/clock_driven/ann2snn/examples/utils.py b/spikingjelly/clock_driven/ann2snn/examples/utils.py deleted file mode 100644 index 798420a..0000000 --- a/spikingjelly/clock_driven/ann2snn/examples/utils.py +++ /dev/null @@ -1,170 +0,0 @@ -import torch -import os -import numpy as np -from tqdm import tqdm -import requests - -def train_ann(net, device, data_loader, optimizer, loss_function, epoch=None): - ''' - * :ref:`API in English ` - - .. _train_ann-cn: - - :param net: 训练的模型 - :param device: 运行的设备 - :param data_loader: 训练集 - :param optimizer: 神经网络优化器 - :param loss_function: 损失函数 - :param epoch: 当前训练期数 - :return: ``None`` - - 经典的神经网络训练程序预设,便于直接调用训练网络 - - * :ref:`中文API ` - - .. 
_train_ann-en: - - :param net: network to train - :param device: running device - :param data_loader: training data loader - :param optimizer: neural network optimizer - :param loss_function: neural network loss function - :param epoch: current training epoch - :return: ``None`` - - Preset classic neural network training program - ''' - net.train() - losses = [] - correct = 0.0 - total = 0.0 - for batch, (img, label) in enumerate(data_loader): - img = img.to(device) - optimizer.zero_grad() - out = net(img) - loss = loss_function(out, label.to(device)) - loss.backward() - optimizer.step() - losses.append(loss.item()) - correct += (out.max(dim=1)[1] == label.to(device)).float().sum().item() - total += out.shape[0] - if batch % 100 == 0: - acc = correct / total - print('Epoch %d [%d/%d] ANN Training Loss:%.3f Accuracy:%.3f' % (epoch, - batch + 1, - len(data_loader), - np.array(losses).mean(), - acc)) - correct = 0.0 - total = 0.0 - - -def val_ann(net, device, data_loader, loss_function, epoch=None): - ''' - * :ref:`API in English ` - - .. _val_ann-cn: - - :param net: 待验证的模型 - :param device: 运行的设备 - :param data_loader: 测试集 - :param epoch: 当前训练期数 - :return: 验证准确率 - - 经典的神经网络训练程序预设,便于直接调用训练网络 - - * :ref:`中文API ` - - .. _val_ann-en: - - :param net: network to test - :param device: running device - :param data_loader: testing data loader - :param epoch: current training epoch - :return: testing accuracy - - Preset classic neural network training program - ''' - net.eval() - correct = 0.0 - total = 0.0 - losses = [] - with torch.no_grad(): - for batch, (img, label) in enumerate(tqdm(data_loader)): - img = img.to(device) - out = net(img) - loss = loss_function(out, label.to(device)) - correct += (out.argmax(dim=1) == label.to(device)).float().sum().item() - total += out.shape[0] - losses.append(loss.item()) - acc = correct / total - if epoch == None: - print('ANN Validating Accuracy:%.3f' % (acc)) - else: - print('Epoch %d [%d/%d] ANN Validating Loss:%.3f Accuracy:%.3f' % (epoch, - batch + 1, - len(data_loader), - np.array(losses).mean(), - acc)) - return acc - - -def save_model(net, log_dir, file_name): - ''' - * :ref:`API in English ` - - .. _save_model-cn: - - :param net: 要保存的模型 - :param log_dir: 日志文件夹 - :param file_name: 文件名 - :return: ``None`` - - 保存模型的参数,以两种形式保存,分别为Pytorch保存的完整模型(适用于网络模型中只用了Pytorch预设模块的) - 以及模型参数(适用于网络模型中有自己定义的非参数模块无法保存完整模型) - - * :ref:`中文API ` - - .. _save_model-en: - - :param net: network model to save - :param log_dir: log file folder - :param file_name: file name - :return: ``None`` - - Save the model, which is saved in two forms, the full model saved by Pytorch (for the network model only possessing - the Pytorch preset module) and model parameters only (for network models that have their own defined nonparametric - modules. In that case, Pytorch cannot save the full model) - ''' - if not os.path.exists(log_dir): - os.makedirs(log_dir) - torch.save(net, os.path.join(log_dir, file_name)) - torch.save(net.state_dict(), os.path.join(log_dir, 'param_' + file_name)) - print('Save model to:', os.path.join(log_dir, file_name)) - - -def download_sample_pth(url, filename): - ''' - * :ref:`API in English ` - - .. _download_sample_pth-cn: - - :param url: 链接 - :param filename: 文件名 - :return: ``None`` - - 下载例子的模型文件 - - * :ref:`中文API ` - - .. 
_download_sample_pth-en: - - :param url: links - :param filename: file name - :return: ``None`` - - Download model state dict for examples - ''' - print('Downloading %s from %s, please wait...'%(filename,url)) - r = requests.get(url, allow_redirects=True) - open(filename, 'wb').write(r.content) \ No newline at end of file diff --git a/spikingjelly/clock_driven/ann2snn/kernels/__init__.py b/spikingjelly/clock_driven/ann2snn/kernels/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/spikingjelly/clock_driven/ann2snn/kernels/onnx.py b/spikingjelly/clock_driven/ann2snn/kernels/onnx.py deleted file mode 100644 index 5aae4b4..0000000 --- a/spikingjelly/clock_driven/ann2snn/kernels/onnx.py +++ /dev/null @@ -1,1215 +0,0 @@ -import onnx -import onnx.helper as helper -import onnx.numpy_helper as numpy_helper -import collections -import numpy as np -import torch -import torch.nn as nn -import os -import tqdm -import onnxruntime as ort -from collections import defaultdict -import json - - -class Mul(nn.Module): - def __init__(self): - super().__init__() - def forward(self, input1, input2): - return input1 * input2 - - -class Add(nn.Module): - def __init__(self): - super().__init__() - def forward(self,input1,input2): - return input1 + input2 - - -class Reshape(nn.Module): - def __init__(self): - super().__init__() - - def forward(self, input1, input2): - return torch.reshape(input1,shape=list(input2)) - - -class Concat(nn.Module): - def __init__(self, dim=[1]): - super().__init__() - self.dim = dim - if not isinstance(self.dim, list): - self.dim = [self.dim] - for i, d in enumerate(self.dim): - if not isinstance(d, int): - self.dim[i] = int(d) - - def forward(self, *args): - args = list(args) - for i,a in enumerate(args): - args[i] = a.type_as(args[0]) - return torch.cat(args,dim=self.dim[0]) - -class Shape(nn.Module): - def __init__(self): - super().__init__() - - def forward(self, input): - return torch.IntTensor([input.size(i) for i in range(len(input.size()))]) - -class Gather(nn.Module): - def __init__(self,dim=1): - super().__init__() - self.dim= int(dim) - - def forward(self, input1, input2): - return torch.gather(input1,dim=self.dim,index=input2.cpu()) - -class Unsqueeze(nn.Module): - def __init__(self, dim=[1]): - super().__init__() - self.dim = dim - if not isinstance(self.dim, list): - self.dim = [self.dim] - for i,d in enumerate(self.dim): - if not isinstance(d,int): - self.dim[i] = int(d) - - def forward(self, input): - x = input - for i in self.dim: - x = torch.unsqueeze(x,dim=i) - return x - -class TopologyAnalyser: - def __init__(self): - ''' - * :ref:`API in English ` - - .. _TopologyAnalyser.__init__-cn: - - 这个类通过onnx分析模型的拓扑结构,方便后续处理 - 此处还有更多更好的实现方法,欢迎开发者不断优化 - - * :ref:`API in English ` - - .. 
_TopologyAnalyser.__init__-en: - - This class analyzes the topological structure of the model through onnx to facilitate subsequent processing - There are better implementation methods here, developers are welcome to continue to optimize - ''' - self.data_nodes = [] - self.module_output = collections.OrderedDict() - self.module_op = collections.OrderedDict() - self.module_idx = collections.OrderedDict() - self.param_idx = collections.OrderedDict() - self.edge = collections.OrderedDict() - self.reverse_edge = collections.OrderedDict() # 快速计算前驱结点 - - def add_data_node(self, a): - if not a in self.data_nodes: - self.data_nodes.append(a) - - def insert(self, a, b, info=None): - self.add_data_node(a) - self.add_data_node(b) - if a not in self.edge.keys(): - self.edge[a] = [(b, info)] - else: - self.edge[a].append((b, info)) - if b not in self.reverse_edge.keys(): - self.reverse_edge[b] = [a] - else: - self.reverse_edge[b].append(a) - - def findNext(self, id): - if isinstance(id, str): - if id in self.edge.keys(): - return self.edge[id] - else: - return [] - elif isinstance(id, list): - l = [] - for i in id: - l += self.findNext(i) - return l - - def findPre(self, id): - l = [] - if isinstance(id, str): - for pre_id in self.reverse_edge[id]: - if pre_id in self.reverse_edge.keys(): - for pre_pre_id in self.reverse_edge[pre_id]: - if pre_pre_id in self.edge.keys(): - for item in self.edge[pre_pre_id]: - if item[0] == pre_id: - l.append(item) - elif isinstance(id, list): - for i in id: - l += self.findPre(i) - return l - - def find_pre_module(self, module_name): - if module_name in self.module_output.keys(): - ids = self.module_output[module_name] - return set(['%s:%s' % (k[1]['op'], k[1]['param_module_name']) for k in self.findPre(ids)]) - else: - return set() - - def find_next_module(self, module_name): - if module_name in self.module_output.keys(): - ids = self.module_output[module_name] - return set(['%s:%s' % (k[1]['op'], k[1]['param_module_name']) for k in self.findNext(ids)]) - else: - return set() - - def update_module_idx(self, onnx_graph): - for idx, n in enumerate(onnx_graph.node): - trainable_input = n.input[1:] - op = n.op_type - k = set() - for i in trainable_input: - n = self._get_module_name_from_value_name(i) - if n is not None: - k.add(n) - if len(k) > 1: - # TODO: sanity check, raise error - pass - if len(k) == 1: - param_module_name = list(k)[0] - self.module_op[param_module_name] = op - self.module_idx[param_module_name] = idx - - def analyse(self, onnx_graph): # 输入的onnx graph需要保证所以常量在已经在initializer中 - # 先把该往initializer放下面的参数,保证下面只有运算没有常量 - for idx, constant in enumerate(onnx_graph.initializer): - self.param_idx[constant.name] = idx - for idx, n in enumerate(onnx_graph.node): - param_module_name = None - op = n.op_type - inputs = n.input - outputs = n.output - # print(inputs,outputs) - k = set() - trainable_input = inputs[1:] - for i in trainable_input: - n = self._get_module_name_from_value_name(i) - if n is not None: - k.add(n) - if len(k) > 1: - # TODO: sanity check, raise error - pass - if len(k) == 1: - param_module_name = list(k)[0] - self.module_op[param_module_name] = op - self.module_idx[param_module_name] = idx - if op is not None: - for o in outputs: - for i in inputs: - self.insert(i, o, {'op': op, 'param_module_name': param_module_name}) - if param_module_name is not None: - if param_module_name not in self.module_output.keys(): - self.module_output[param_module_name] = [o] - else: - self.module_output[param_module_name].append(o) - return self - - @staticmethod - 
def _get_module_name_from_value_name(value_name): - module_name = None - if len(value_name.split('.')) > 1: - l = value_name.split('.')[:-1] - l = '.'.join(l) - module_name = l # [1:] - module_name.replace(' ', '') - return module_name - -def pytorch2onnx_model(model: nn.Module, data, **kargs) -> onnx.ModelProto: - ''' - - * :ref:`API in English ` - - .. _pytorch2onnx_model-cn: - - :param model: 待转换的PyTorch模型 - - :param data: 用于转换的数据(用来确定输入维度) - - :param log_dir: 输出文件夹 - - 转换PyTorch模型到onnx模型 - - * :ref:`API in English ` - - .. _pytorch2onnx_model-en: - - :param model: the PyTorch model to be converted - - :param data: The data used for conversion (used to determine the input dimension) - - :param log_dir: output folder - - Convert PyTorch model to onnx model - - ''' - try: - log_dir = kargs['log_dir'] - except KeyError: - print('pytorch2onnx_model need argument log_dir!') - dump_input_size = [data.size(i) for i in range(len(data.size()))] - dump_input_size[0] = 1 - fname = os.path.join(log_dir,'onnxmodel') - try: - dynamic_axes = {'input': {0: 'batch_size'}, - 'output': {0: 'batch_size'}} - torch.onnx.export(model, torch.ones(dump_input_size), fname, - input_names=['input'], - output_names=['output'], - dynamic_axes=dynamic_axes) - except BaseException: - raise NotImplementedError("Models with multiple inputs are not supported yet!") - return onnx.load(fname) - -def onnx2pytorch_model(model: onnx.ModelProto, _converter) -> nn.Module: - model = _pt_model(model, _converter) - model = model.reduce() - return model - -def layer_reduction(model: onnx.ModelProto) -> onnx.ModelProto: - graph = model.graph - topo_analyser = TopologyAnalyser() - graph = move_constant_to_initializer(graph) - topo_analyser.analyse(graph) - - absorb_bn(graph, topo_analyser) - remove_unreferenced_initializer(graph) - update_topology(graph) - print("Finish layer reduction!") - return model - -def rate_normalization(model: onnx.ModelProto, data: torch.Tensor, **kargs) -> onnx.ModelProto: - ''' - - * :ref:`API in English ` - - .. _rate_normalization-cn: - - :param model: ANN模型,类型为onnx.ModelProto - - :param data: 用于转换的数据,类型为torch.Tensor - - :param channelwise: 如果为``True``,则控制激活幅值的统计是channelwise的;否则,控制激活幅值的统计是layerwise的 - - :param robust: 如果为``True``,则控制激活幅值的统计是激活的99.9百分位;否则,控制激活幅值的统计是激活的最值 - - :param eps: epsilon;未设置值时默认1e-5 - - 发放率归一化 - - * :ref:`API in English ` - - .. 
_rate_normalization-en: - - :param model: ANN model, the type is onnx.ModelProto - - :param data: the data used for conversion, the type is torch.Tensor - - :param channelwise: If ``True`` , the statistics that control the activation amplitude are channelwise; otherwise, the statistics that control the activation amplitude are layerwise - - :param robust: If ``True``, the statistic of the control activation amplitude is the 99.9th percentile of activation; otherwise, the statistic of the activation amplitude is the maximum value of activation - - :param eps: epsilon; if no value is set, the default is 1e-5 - - normalize the firing rate - - ''' - - try: - channelwise = kargs['channelwise'] - except KeyError: - channelwise = False - try: - robust_norm = kargs['robust'] - except KeyError: - robust_norm = False - try: - eps = kargs['eps'] - except KeyError: - eps = 1e-5 - topo_analyser = update_topology(model.graph) - output_debug = {} - output_statistics = get_intermediate_output_statistics(model, data, - channelwise=channelwise) # if want debug, debug=output_debug - model = normalize_model(model, output_statistics, topo_analyser, robust_norm=robust_norm, - channelwise=channelwise, eps=eps) - return model - -def save_model(model: onnx.ModelProto, f=None): - fb = model.SerializeToString() - if f is not None: - if hasattr(f, 'write'): - f.write(fb) - else: - with open(f, "wb") as f: - f.write(fb) - return fb - -def move_constant_to_initializer(graph): - constant_idx = [] - for idx, n in enumerate(graph.node): - op = n.op_type - if op == 'Constant': - constant_idx.append(idx) - if len(constant_idx): - for idx in reversed(constant_idx): - n = graph.node[idx] - graph.initializer.append( - numpy_helper.from_array(numpy_helper.to_array(n.attribute[0].t), n.output[0])) - graph.node.remove(n) - return graph - -def print_onnx_model(graph): - print(onnx.helper.printable_graph(graph)) - -def absorb_bn(graph, topo_analyser): - print("\nAbsorbing BatchNorm Parameters...\n") - for mn in tqdm.tqdm(reversed(topo_analyser.module_output.keys())): - if topo_analyser.module_op[mn] == 'BatchNormalization': - pre_m = topo_analyser.find_pre_module(mn) - next_m = topo_analyser.find_next_module(mn) - bn_weight_idx = topo_analyser.param_idx[graph.node[topo_analyser.module_idx[mn]].input[1]] - bn_weight = np.array(numpy_helper.to_array(graph.initializer[bn_weight_idx])) - bn_bias_idx = topo_analyser.param_idx[graph.node[topo_analyser.module_idx[mn]].input[2]] - bn_bias = np.array(numpy_helper.to_array(graph.initializer[bn_bias_idx])) - bn_mean_idx = topo_analyser.param_idx[graph.node[topo_analyser.module_idx[mn]].input[3]] - bn_mean = np.array(numpy_helper.to_array(graph.initializer[bn_mean_idx])) - bn_var_idx = topo_analyser.param_idx[graph.node[topo_analyser.module_idx[mn]].input[4]] - bn_var = np.array(numpy_helper.to_array(graph.initializer[bn_var_idx])) - bn_eps = graph.node[topo_analyser.module_idx[mn]].attribute[0].f - bn_std = np.sqrt(bn_var + bn_eps) - if len(pre_m) == 1 and list(pre_m)[0].split(':')[0] in ['Conv', 'Gemm']: - pre_mn = list(pre_m)[0].split(':')[1] - weight_idx = topo_analyser.param_idx[graph.node[topo_analyser.module_idx[pre_mn]].input[1]] - weight = np.array(numpy_helper.to_array(graph.initializer[weight_idx])) - if len(graph.node[topo_analyser.module_idx[pre_mn]].input) == 2: - bias = None - else: - bias_idx = topo_analyser.param_idx[graph.node[topo_analyser.module_idx[pre_mn]].input[2]] - bias = np.array(numpy_helper.to_array(graph.initializer[bias_idx])) - wrsp_args = (-1, 1) if 
len(weight.shape) == 2 else (-1, 1, 1, 1) - - weight_ = weight * bn_weight.reshape(*wrsp_args) / bn_std.reshape(*wrsp_args) - bias_ = ((bias if bias is not None else 0) - bn_mean.reshape(-1)) * bn_weight.reshape( - -1) / bn_std.reshape(-1) \ - + bn_bias.reshape(-1) - assert (list(pre_m)[0].split(':')[0] in ['Conv', 'Gemm']) - args = {} - for attr in graph.node[topo_analyser.module_idx[pre_mn]].attribute: - args[attr.name] = helper.get_attribute_value(attr) - new_node = onnx.helper.make_node( - list(pre_m)[0].split(':')[0], - inputs=[graph.node[topo_analyser.module_idx[pre_mn]].input[0], pre_mn + ".new.weight", pre_mn + ".new.bias"], - outputs=[graph.node[topo_analyser.module_idx[mn]].output[0]], - **args - ) - graph.initializer.append(numpy_helper.from_array(weight_.astype(np.float32), pre_mn + ".new.weight")) - graph.initializer.append(numpy_helper.from_array(bias_.astype(np.float32), pre_mn + ".new.bias")) - graph.node.remove(graph.node[topo_analyser.module_idx[pre_mn]]) - graph.node.insert(topo_analyser.module_idx[pre_mn], new_node) - graph.node.remove(graph.node[topo_analyser.module_idx[mn]]) - else: - weight_ = bn_weight / bn_std - bias_ = bn_bias - bn_weight * bn_mean / bn_std - name = graph.initializer[bn_weight_idx].name - graph.initializer.remove(graph.initializer[bn_weight_idx]) - graph.initializer.insert(bn_weight_idx, numpy_helper.from_array(weight_.astype(np.float32), name)) - name = graph.initializer[bn_bias_idx].name - graph.initializer.remove(graph.initializer[bn_bias_idx]) - graph.initializer.insert(bn_bias_idx, numpy_helper.from_array(bias_.astype(np.float32), name)) - name = graph.initializer[bn_mean_idx].name - graph.initializer.remove(graph.initializer[bn_mean_idx]) - graph.initializer.insert(bn_mean_idx, - numpy_helper.from_array(np.zeros_like(bn_mean).astype(np.float32), name)) - name = graph.initializer[bn_var_idx].name - graph.initializer.remove(graph.initializer[bn_var_idx]) - graph.initializer.insert(bn_var_idx, - numpy_helper.from_array(np.ones_like(bn_var).astype(np.float32), name)) - -def remove_unreferenced_initializer(graph): - in_graph = set() - in_initializer = set() - for node in graph.node: - in_graph.update(node.input) - in_graph.update(node.output) - for init in graph.initializer: - in_initializer.add(init.name) - not_in_graph = in_initializer - in_graph - l = len(graph.initializer) - for i in range(l - 1, -1, -1): - if graph.initializer[i].name in not_in_graph: - graph.initializer.remove(graph.initializer[i]) - -def update_topology(graph): - topo_analyser = TopologyAnalyser() - move_constant_to_initializer(graph) - topo_analyser.analyse(graph) - return topo_analyser - -def find_node_by_output(output_name, graph): - flag = False - idx, node = None, None - for idx, node in enumerate(graph.node): - if output_name in node.output: - flag = True - break - if not flag: - idx, node = None, None - return idx, node - -def scale_node_weight_bias(topo_analyser, graph, node_idx, scale): - initializer = graph.initializer - node = graph.node[node_idx] - if len(node.input) < 2: - return - weight_idx = topo_analyser.param_idx[node.input[1]] - bias_idx = topo_analyser.param_idx[node.input[2]] if len(node.input) >= 3 else None - weight = np.array(numpy_helper.to_array(initializer[weight_idx])) - bias = np.array(numpy_helper.to_array(initializer[bias_idx])) if bias_idx is not None else None - - w_scale = scale.reshape([*scale.shape] + [1 for _ in range(len(weight.shape) - 1)]) \ - if len(scale.shape) == 1 else scale - b_scale = scale - - weight_ = weight * w_scale - name 
= initializer[weight_idx].name - initializer.remove(initializer[weight_idx]) - initializer.insert(weight_idx, numpy_helper.from_array(weight_.astype(np.float32), name)) - if bias is not None: - bias_ = bias * b_scale - name = initializer[bias_idx].name - initializer.remove(initializer[bias_idx]) - initializer.insert(bias_idx, numpy_helper.from_array(bias_.astype(np.float32), name)) - -def get_onnx_output(model, numpy_tensor): - ort_session = ort.InferenceSession(model.SerializeToString()) - outputs = ort_session.run(None, {'input': numpy_tensor}) - return outputs - -def get_intermediate_output_statistics(model, numpy_tensor, channelwise=False, debug=None): - graph = model.graph - output_needed_module = {} - output_needed_all_input = {} - for idx, node in enumerate(graph.node): - output = node.output - input = node.input - if 'input' in node.input: - for out in output: - output_needed_module[out] = set([idx]) - output_needed_all_input[out] = set(input) - else: - s = set() - s_i = set() - for in_ in input: - s |= (output_needed_module[in_] if in_ in output_needed_module.keys() else set()) - s_i |= (output_needed_all_input[in_] if in_ in output_needed_all_input.keys() else set()) - for out in output: - output_needed_module[out] = s | set([idx]) - output_needed_all_input[out] = s_i | set(input) - - output_statistics = {} - if not channelwise: - statistic = {'shape': numpy_tensor.shape, - 'min': np.min(numpy_tensor), - 'max': np.max(numpy_tensor) if np.max(numpy_tensor) > 0 else np.abs(np.min(numpy_tensor)), - '99.9': np.percentile(numpy_tensor, 99.9) - } - else: - axis_args = (0, 2, 3) if len(numpy_tensor.shape) == 4 else (0) - statistic = {'shape': numpy_tensor.shape, - 'min': np.min(numpy_tensor, axis=axis_args), - 'max': np.max(numpy_tensor, axis=axis_args), - '99.9': np.percentile(numpy_tensor, 99.9, axis=axis_args) - } - output_statistics['input'] = statistic - print("\nGetting intermediate output statistics...\n") - for out in tqdm.tqdm(output_needed_module.keys()): - keep_nodes = [graph.node[i] for i in list(output_needed_module[out])] - keep_initializer = [init for init in graph.initializer - if init.name in list(output_needed_all_input[out])] - var_out = [] - value_info = onnx.ValueInfoProto() - value_info.name = out - var_out.append(value_info) - new_graph = onnx.helper.make_graph(keep_nodes, graph.name, graph.input, - var_out, keep_initializer) - tmp_model = onnx.helper.make_model(new_graph) - tmp_model.ir_version = model.ir_version - tmp_model.producer_name = model.producer_name - tmp_model.producer_version = model.producer_version - tmp_model.domain = model.domain - tmp_model.model_version = model.model_version - tmp_model.doc_string = model.doc_string - if len(tmp_model.metadata_props) > 0: - values = {p.key: p.value for p in model.metadata_props} - onnx.helper.set_model_props(tmp_model, values) - # fix opset import - for oimp in model.opset_import: - op_set = tmp_model.opset_import.add() - op_set.domain = oimp.domain - op_set.version = oimp.version - - ort_session = ort.InferenceSession(tmp_model.SerializeToString()) - outputs = ort_session.run(None, {'input': numpy_tensor}) - if debug is not None: - # print(out,outputs[0].reshape(1,-1)[0,10:20]) - debug[out] = outputs[0] - if not channelwise: - statistic = {'shape': outputs[0].shape, - 'min': np.min(outputs[0]), - 'max': np.max(outputs[0]) if np.max(outputs[0]) > 0 else np.abs(np.min(outputs[0])), - '99.9': np.percentile(outputs[0], 99.9) if np.percentile(outputs[0], 99.9) > 0 else np.abs(np.min(outputs[0])) - } - else: - 
axis_args = (0, 2, 3) if len(outputs[0].shape) == 4 else (0) - statistic = {'shape': outputs[0].shape, - 'min': np.min(outputs[0], axis=axis_args), - 'max': np.max(outputs[0], axis=axis_args), - '99.9': np.percentile(outputs[0], 99.9, axis=axis_args) - } - # print(np.max(statistic['max']),np.max(outputs[0])) - output_statistics[out] = statistic - print("Finished getting intermediate output statistics!") - if debug is not None: - return output_statistics,debug - else: - return output_statistics - -def normalize_model(model, output_statistics, topo_analyser, robust_norm=True, channelwise=False, eps=1e-5): - nodes = model.graph.node - graph = model.graph - initializer = model.graph.initializer - if robust_norm: - statistic_key = '99.9' - else: - statistic_key = 'max' - node_scaled_range = {} - seperate_scale = collections.OrderedDict() - print("\nNormalizing model...\n") - for node_idx, node in enumerate(tqdm.tqdm(nodes)): - output = node.output - input = node.input - op = node.op_type - if input[0] == 'input': # single input model - l = output_statistics[input[0]]['shape'][1] - node_scaled_range[input[0]] = np.ones(l) if channelwise else 1.0 \ - * output_statistics[input[0]][ - statistic_key] - - if op in ['Conv', 'Gemm']: - weight_idx = topo_analyser.param_idx[input[1]] - bias_idx = topo_analyser.param_idx[input[2]] if len(input) == 3 else None - weight = np.array(numpy_helper.to_array(initializer[weight_idx])) - bias = np.array(numpy_helper.to_array(initializer[bias_idx])) if bias_idx is not None else None - - l = output_statistics[output[0]]['shape'][1] - input_real_range = node_scaled_range[input[0]] - input_range = output_statistics[input[0]][statistic_key] - output_range = output_statistics[output[0]][statistic_key] - demand = np.ones(l) if channelwise else 1.0 - w_scale = demand / (output_range + eps) * ( - input_range / (input_real_range + eps)) if not channelwise else \ - (demand / (output_range + eps)).reshape(-1, 1).dot( - (input_range / (input_real_range + eps)).reshape(1, -1)) - w_scale = w_scale.reshape([*w_scale.shape, 1, 1]) if len(weight.shape) == 4 else w_scale - b_scale = 1 / (output_range + eps) - node_scaled_range[output[0]] = demand - - weight_ = weight * w_scale - - name = initializer[weight_idx].name - initializer.remove(initializer[weight_idx]) - initializer.insert(weight_idx, numpy_helper.from_array(weight_.astype(np.float32), name)) - if bias is not None: - bias_ = bias * b_scale - name = initializer[bias_idx].name - initializer.remove(initializer[bias_idx]) - initializer.insert(bias_idx, numpy_helper.from_array(bias_.astype(np.float32), name)) - - elif op == 'BatchNormalization': # var=1 mean=0 - weight_idx = topo_analyser.param_idx[input[1]] - bias_idx = topo_analyser.param_idx[input[2]] - weight = np.array(numpy_helper.to_array(initializer[weight_idx])) - bias = np.array(numpy_helper.to_array(initializer[bias_idx])) - - # node_scaled_range[output[0]] = node_scaled_range[input[0]] * self.output_statistics[input[0]][statistic_key] / self.output_statistics[output[0]][statistic_key] - # lamda_last = self.output_statistics[input[0]][statistic_key] - # lamda = self.output_statistics[output[0]][statistic_key] - # weight_ = weight * node_scaled_range[output[0]] - # bias_ = bias / lamda - - # print(output_statistics[output[0]]) - input_real_range = node_scaled_range[input[0]] - input_range = output_statistics[input[0]][statistic_key] - output_range = output_statistics[output[0]][statistic_key] - demand = 1.0 - w_scale = demand / (output_range + eps) * (input_range / 
(input_real_range + eps)) - b_scale = 1 / (output_range + eps) - node_scaled_range[output[0]] = demand - weight_ = weight * w_scale - bias_ = bias * b_scale - # print(output[0],op,input[0], input_range, output_range, demand, input_real_range, w_scale) - - name = initializer[weight_idx].name - initializer.remove(initializer[weight_idx]) - initializer.insert(weight_idx, numpy_helper.from_array(weight_.astype(np.float32), name)) - name = initializer[bias_idx].name - initializer.remove(initializer[bias_idx]) - initializer.insert(bias_idx, numpy_helper.from_array(bias_.astype(np.float32), name)) - - elif op == 'Add': - l = output_statistics[output[0]]['shape'][1] - demand = np.ones(l) if channelwise else 1.0 - node_scaled_range[output[0]] = demand - output_range = output_statistics[output[0]][statistic_key] - - # node_scaled_range[output[0]] = 1.0 - # lamda = self.output_statistics[output[0]][statistic_key] - # lamda_lasts = {} - for i in input: - if i in output_statistics.keys(): - # lamda_lasts[i] = self.output_statistics[i][statistic_key] - # scale = lamda_lasts[i] / lamda - input_real_range = node_scaled_range[i] - input_range = output_statistics[i][statistic_key] - scale = demand / (output_range + eps) * (input_range / (input_real_range + eps)) - - # print(output[0], op, i, input_range, output_range, demand, input_real_range, scale) - - idx, _ = find_node_by_output(i, graph) - if idx is not None and nodes[idx].op_type in ['Conv', 'Gemm', 'BatchNormalization']: - scale_node_weight_bias(topo_analyser, graph, idx, scale) - else: - scale = scale.reshape( - [1, *scale.shape] + [1 for _ in range(len(output_statistics[i]['shape']) - 2)]) \ - if len(scale.shape) == 1 else scale - initializer.append(numpy_helper.from_array(scale.astype(np.float32), "scale_" + i)) - if idx not in seperate_scale.keys(): - # seperate_scale[node_idx] = [(i,"scale_"+i,"scaled_"+i)] - seperate_scale[node_idx] = {i: ("scale_" + i, "scaled_" + i)} - else: - # seperate_scale[node_idx].append((i,"scale_"+i,"scaled_"+i)) - seperate_scale[node_idx][i] = ("scale_" + i, "scaled_" + i) - pass - elif op in ['Gather', 'Unsqueeze', 'Shape', 'Concat']: - continue - # elif op == "Concat": - # raise NotImplementedError("Not supported %s yet!"%(op)) - elif op == "Softmax": - raise NotImplementedError("Not supported %s yet!" % (op)) - else: # single input single output module - # print(op,self.output_statistics[output[0]]['shape']) - input_range = output_statistics[input[0]][statistic_key] - output_range = output_statistics[output[0]][statistic_key] - input_scaled_range = node_scaled_range[input[0]] - output_scaled_range = input_scaled_range / (input_range + eps) * output_range - node_scaled_range[output[0]] = output_scaled_range - - # print(output[0], op, input[0], input_range, output_range, output_scaled_range) - # print(op, node_scaled_range[output[0]],'=',input_scaled_range,'/',input_range,'*',output_range) - # else: - # raise NotImplementedError("Not supported yet! 
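
# A minimal sketch of the layer-wise normalization idea implemented in normalize_model above:
# scale each parametric layer's weight by lambda_in / lambda_out and its bias by 1 / lambda_out,
# where lambda is a robust (99.9 percentile) estimate of the activation range on calibration data.
# The toy two-layer ANN and the calibration tensor below are illustrative assumptions.
import numpy as np
import torch
import torch.nn as nn

ann = nn.Sequential(nn.Linear(8, 16), nn.ReLU(), nn.Linear(16, 4), nn.ReLU())
x = torch.rand(256, 8)

# Pass 1: record the robust activation range after every ReLU, using the original weights.
acts = []
with torch.no_grad():
    h = x
    for m in ann:
        h = m(h)
        if isinstance(m, nn.ReLU):
            acts.append(float(np.percentile(h.numpy(), 99.9)))

# Pass 2: rescale so that each (scaled) layer output lies in [0, 1].
lam_in = 1.0  # the network input is assumed to be already in [0, 1]
k = 0
for m in ann:
    if isinstance(m, nn.Linear):
        lam_out = acts[k]
        k += 1
        m.weight.data *= lam_in / lam_out
        if m.bias is not None:
            m.bias.data /= lam_out
        lam_in = lam_out
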
%s"%(op)) - - if len(seperate_scale.keys()) != 0: - print("Making new scale node...") - - for node_idx in reversed(seperate_scale.keys()): - args = {} - for attr in nodes[node_idx].attribute: - args[attr.name] = helper.get_attribute_value(attr) - input = [str(i) if i not in seperate_scale[node_idx].keys() else seperate_scale[node_idx][i][1] \ - for i in nodes[node_idx].input] - - output = [str(i) for i in nodes[node_idx].output] - - new_node = onnx.helper.make_node( - nodes[node_idx].op_type, - inputs=input, - outputs=output, - **args - ) - nodes.remove(nodes[node_idx]) - nodes.insert(node_idx, new_node) - - for i in seperate_scale[node_idx].keys(): - new_node = onnx.helper.make_node( - 'Mul', - inputs=[seperate_scale[node_idx][i][0], i], - outputs=[seperate_scale[node_idx][i][1]] - ) - nodes.insert(node_idx, new_node) - print("Finished normalizing model!") - return model - -def _pre_onnx_shape_inference(model:onnx.ModelProto): - ''' - 为了对模型进行shape inference,需要先对onnxmodel运行此函数进行准备 - - To perform shape inference for model, need to run this function on onnxmodel to prepare - - This function has referenced code in https://github.com/onnx/onnx/issues/2660#issuecomment-605874784 - ''' - if model.ir_version < 4: - return - - def add_some_graph_info(graph:onnx.GraphProto): - inputs = {i.name for i in graph.input} - vi_dict = {vi.name: vi for vi in graph.value_info} - for init in graph.initializer: - if init.name in inputs: - continue - vi = vi_dict.get(init.name) - if vi is None: - vi = graph.value_info.add() - vi.name = init.name - - tensor_type = vi.type.tensor_type - if tensor_type.elem_type == onnx.TensorProto.UNDEFINED: - tensor_type.elem_type = init.data_type - if not tensor_type.HasField("shape"): - tensor_type.shape.dim.extend([]) - for dim in init.dims: - tensor_type.shape.dim.add().dim_value = dim - - for node in graph.node: - for attr in node.attribute: - if attr.ref_attr_name != "": - continue - - if attr.type == onnx.AttributeProto.GRAPH: - add_some_graph_info(attr.g) - if attr.type == onnx.AttributeProto.GRAPHS: - for g in attr.graphs: - add_some_graph_info(g) - return add_some_graph_info(model.graph) - -class _pt_model(nn.Module): - def __init__(self, path_or_model, _converter=None): - super(_pt_model, self).__init__() - if path_or_model is not None: - if isinstance(path_or_model, str): - onnx_model = onnx.load(path_or_model) - else: - onnx_model = path_or_model - self.onnx_model = onnx_model - - self.loaded_weights = load_parameters(self, onnx_model.graph.initializer) - self.module_list = nn.ModuleList([]) - self.op_tree = {} - - _pre_onnx_shape_inference(onnx_model) - inferred_model = onnx.shape_inference.infer_shapes(onnx_model) - self.value_info = inferred_model.graph.value_info - self.dim_info = {} - for idx, v in enumerate(self.value_info): - self.dim_info[v.name] = len(v.type.tensor_type.shape.dim) - - self.graph = defaultdict(list) - # self.V = set() - for idx, node in enumerate(onnx_model.graph.node): - op = node.op_type - # if op=='MatMul': # TODO temporary - # op = 'Gemm' - (op_idx, inputs, outputs) = getattr(_converter, 'convert_' + op.lower())(node, self) - for out_seq, output in enumerate(outputs): - self.op_tree[str(output)] = (int(op_idx), [str(i) for i in inputs], out_seq) - for output in outputs: - for input in inputs: - self.graph[input].append(output) - # self.V.update(inputs) - # self.V.update(outputs) - # print(self.V) - self.op_tree = json.dumps(self.op_tree) - - self.input_name = [i.name for i in onnx_model.graph.input] - self.output_name = [i.name for i in 
onnx_model.graph.output] - - for out in onnx_model.graph.output: - self.graph[out.name] = [] - - def TopologicalSort(G): - in_degrees = dict((u, 0) for u in G) - for u in G: - for v in G[u]: - in_degrees[v] += 1 - Q = [u for u in G if in_degrees[u] == 0] - res = [] - while Q: - u = Q.pop() - res.append(u) - for v in G[u]: - in_degrees[v] -= 1 - if in_degrees[v] == 0: - Q.append(v) - return res - - self.compute_seq = TopologicalSort(self.graph) - # print(self.compute_seq) - # self.tensors = {} - - for k in self.loaded_weights.keys(): - if isinstance(self.loaded_weights[k], torch.FloatTensor): - setattr(self, 'P' + k.replace('.', '@'), torch.nn.Parameter(self.loaded_weights[k])) - else: - # print(self.loaded_weights[k]) - self.register_buffer('P' + k.replace('.', '@'), self.loaded_weights[k]) - - self.reserved_tensors_name = list(self.loaded_weights.keys()) - # print('reserve', self.reserved_tensors_name) - # print(self.tensors) - - def refresh_running_tensor(self): - self.tensors = {} - for k in set(self.tensors.keys()) | set(self.reserved_tensors_name): - if k not in self.reserved_tensors_name: - del self.tensors[k] - else: - self.tensors[k] = getattr(self, 'P' + k.replace('.', '@')) - - - def forward(self, input): - op_tree = json.loads(self.op_tree) - - tensors = {} - for k in set(tensors.keys()) | set(self.reserved_tensors_name): - if k not in self.reserved_tensors_name: - del tensors[k] - else: - tensors[k] = getattr(self, 'P' + k.replace('.', '@')) - - # self.refresh_running_tensor() - if not isinstance(input, list) or not isinstance(input, tuple): - input = [input] - for i, n in enumerate(self.input_name): - tensors[n] = input[i] - for name in self.compute_seq: - if name in op_tree.keys(): - op_idx, inputs, out_seq = op_tree[name] - # print(name,op_idx, inputs,out_seq) - args = [] - for input in inputs: - args.append(tensors[input]) - # print(len(args)) - # print(type(args[0])) - result = self.module_list[op_idx](*args) - - if not isinstance(result, tuple): - tensors[name] = result - # print(' %s = self.module_list[%d] (%s)'%(name,op_idx,inputs)) - else: - tensors[name] = result[out_seq] - # print(' %s = self.module_list[%d] (%s)[%d]'%(name,op_idx,inputs,out_seq) ) - - if len(self.output_name) == 1: - return tensors[self.output_name[0]] - else: - ret = [] - for output in self.output_name: - ret.append(tensors[output]) - return ret - - def reduce(self): - import copy - net = _pt_model(None) - for k in self.reserved_tensors_name: - if isinstance(self.loaded_weights[k], torch.FloatTensor): - setattr(net, 'P' + k.replace('.', '@'), - torch.nn.Parameter(getattr(self,'P' + k.replace('.', '@')).data.detach().clone()) ) - else: - net.register_buffer('P' + k.replace('.', '@'), - getattr(self,'P' + k.replace('.', '@')).data.clone() ) - - net.compute_seq = copy.deepcopy(self.compute_seq) - - net.input_name = copy.deepcopy(self.input_name) - net.output_name = copy.deepcopy(self.output_name) - net.module_list = copy.deepcopy(self.module_list) - net.op_tree = copy.deepcopy(self.op_tree) - net.reserved_tensors_name = copy.deepcopy(self.reserved_tensors_name) - return net - -def load_parameters(model:_pt_model, initializer): - param_dict = {} - for init in initializer: - param_dict[init.name] = torch.from_numpy(numpy_helper.to_array(init).copy()) - return param_dict - -class _o2p_converter: - def __init__(self): - ''' - * :ref:`API in English ` - - .. 
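
# Illustration only: the TopologicalSort helper above is Kahn's algorithm applied to the
# tensor-dependency graph (edges run from each tensor to the tensors computed from it).
# The toy three-node graph below is an assumption used purely to show the resulting compute order.
from collections import defaultdict

G = defaultdict(list)
G['input'] = ['conv_out']
G['conv_out'] = ['relu_out']
G['relu_out'] = []

in_degrees = {u: 0 for u in G}
for u in G:
    for v in G[u]:
        in_degrees[v] += 1
queue = [u for u in G if in_degrees[u] == 0]
order = []
while queue:
    u = queue.pop()
    order.append(u)
    for v in G[u]:
        in_degrees[v] -= 1
        if in_degrees[v] == 0:
            queue.append(v)
print(order)  # ['input', 'conv_out', 'relu_out']
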
_ONNX_Converter.__init__-cn: - - 该类主要将onnx模型转换为Pytorch的ANN模型,从而转换为SpikingJelly的SNN模型 - 链接中 [#f1]_ 提供了一个onnx-pytorch转换的主要版本。更复杂的版本可以在这里找到。 - 大多数使用过的onnx运算符已在此处定义,但仍然有一些未被覆盖,或没有被完美实现 - 用户可以通过添加如下面例子所示的静态方法来定义您的例外情况 - - * :ref:`API in English ` - - .. _ONNX_Converter.__init__-en: - - This class mainly convert an onnx model to Pytorch ANN model, and thus to SpikingJelly SNN model - The link [#f1]_ has provided a primary version of onnx-pytorch conversion. More complex version can be found here. - Most used onnx operators has covered here, yet still there are some left, or not being defined perfectly - User can define your exceptions by adding static method like below - - .. [#f1] https://gist.github.com/qinjian623/6aa777037534c1c1dccbb66f832e93b8 - ''' - pass - - def add_method(self, op_name, func): - setattr(self, 'convert_'+op_name, staticmethod(func)) - - @staticmethod - def convert_conv(node, model:_pt_model): - attr_map = { - "pads": "padding", - "strides": "stride", - "kernel_shape": "kernel_size", - "group": "groups", - "dilations": "dilation" - } - assert len(node.output) == 1 - with_bias = False - if len(node.input) == 3: - with_bias = True - bias = model.loaded_weights[node.input[2]] - del model.loaded_weights[node.input[2]] - weight = model.loaded_weights[node.input[1]] - del model.loaded_weights[node.input[1]] - in_channels = weight.shape[1] - out_channels = weight.shape[0] - kwargs = {} - for att in node.attribute: - kwargs[attr_map[att.name]] = list(att.ints) if att.name != 'group' else att.i - if 'padding' in kwargs: - assert(kwargs["padding"][0]==kwargs["padding"][2] and kwargs["padding"][1]==kwargs["padding"][3]) - kwargs["padding"] = kwargs["padding"][0],kwargs["padding"][1] - groups = 1 if 'groups' not in kwargs else kwargs['groups'] - in_channels *= groups - conv = nn.Conv2d(in_channels, out_channels, **kwargs, bias=with_bias) - conv.weight.data = weight - if with_bias: - conv.bias.data = bias - model.module_list.append(conv) - return len(model.module_list)-1, node.input[:1], node.output - - @staticmethod - def convert_relu(node, model:_pt_model): - relu = nn.ReLU() - model.module_list.append(relu) - return len(model.module_list)-1, node.input, node.output - - @staticmethod - def convert_prelu(node, model:_pt_model): - weight = model.loaded_weights[node.input[1]] - del model.loaded_weights[node.input[1]] - prelu = nn.PReLU() - prelu.weight.data = weight - model.module_list.append(prelu) - return len(model.module_list) - 1, node.input[:-1], node.output - - @staticmethod - def convert_shape(node, model:_pt_model): - shape = Shape() - model.module_list.append(shape) - return len(model.module_list) - 1, node.input, node.output - - @staticmethod - def convert_gather(node, model:_pt_model): - attr_map = { - "axis": "dim" - } - kwargs = {} - for att in node.attribute: - if att.name in attr_map: - kwargs[attr_map[att.name]] = att.f - gather = Gather(**kwargs) - model.module_list.append(gather) - return len(model.module_list) - 1, node.input, node.output - - @staticmethod - def convert_unsqueeze(node, model:_pt_model): - attr_map = { - "axes": "dim" - } - kwargs = {} - for att in node.attribute: - if att.name in attr_map: - kwargs[attr_map[att.name]] = att.f - unsqueeze = Unsqueeze(**kwargs) - model.module_list.append(unsqueeze) - return len(model.module_list) - 1, node.input, node.output - - @staticmethod - def convert_concat(node, model:_pt_model): - attr_map = { - "axis": "dim" - } - kwargs = {} - for att in node.attribute: - if att.name in attr_map: - kwargs[attr_map[att.name]] 
= att.f - - concat = Concat(**kwargs) - model.module_list.append(concat) - return len(model.module_list) - 1, node.input, node.output - - @staticmethod - def convert_reshape(node, model:_pt_model): - reshape = Reshape() - model.module_list.append(reshape) - return len(model.module_list) - 1, node.input, node.output - - @staticmethod - def convert_matmul(node, model:_pt_model): - class MatMul(nn.Module): - def __init__(self): - super().__init__() - def forward(self,input1,input2): - return input1 @ input2 - mul = MatMul() - model.module_list.append(mul) - return len(model.module_list)-1, node.input, node.output - - - @staticmethod - def convert_batchnormalization(node, model:_pt_model): - attr_map = { - "epsilon": "eps", - "momentum": "momentum" - } - assert len(node.input) == 5 - assert len(node.output) == 1 - weight = model.loaded_weights[node.input[1]] - bias = model.loaded_weights[node.input[2]] - running_mean = model.loaded_weights[node.input[3]] - running_var = model.loaded_weights[node.input[4]] - del model.loaded_weights[node.input[1]] - del model.loaded_weights[node.input[2]] - del model.loaded_weights[node.input[3]] - del model.loaded_weights[node.input[4]] - dim = weight.shape[0] - kwargs = {} - # _check_attr(node.attribute, rebuild_batchnormalization.bn_attr_map) - for att in node.attribute: - if att.name in attr_map: - kwargs[attr_map[att.name]] = att.f - bn = None - if model.dim_info[node.output[0]] == 5: - bn = nn.BatchNorm3d(num_features=dim) - elif model.dim_info[node.output[0]] == 4: - bn = nn.BatchNorm2d(num_features=dim) - elif model.dim_info[node.output[0]] == 2 or model.dim_info[node.output[0]] == 3: - bn = nn.BatchNorm1d(num_features=dim) - bn.weight.data = weight - bn.bias.data = bias - bn.running_mean.data = running_mean - bn.running_var.data = running_var - model.module_list.append(bn) - return len(model.module_list)-1, node.input[:1], node.output - - @staticmethod - def convert_add(node, model:_pt_model): - add = Add() - model.module_list.append(add) - return len(model.module_list)-1, node.input, node.output - - @staticmethod - def convert_mul(node, model:_pt_model): - mul = Mul() - model.module_list.append(mul) - return len(model.module_list)-1, node.input, node.output - - @staticmethod - def convert_averagepool(node, model:_pt_model): - attr_map = { - "pads": "padding", - "strides": "stride", - "kernel_shape": "kernel_size", - } - kwargs = {} - for att in node.attribute: - kwargs[attr_map[att.name]] = list(att.ints) - if 'padding' in kwargs: - assert (kwargs["padding"][0] == kwargs["padding"][2] and kwargs["padding"][1] == kwargs["padding"][3]) - kwargs["padding"] = kwargs["padding"][0], kwargs["padding"][1] - ap = nn.AvgPool2d(**kwargs) - model.module_list.append(ap) - return len(model.module_list)-1, node.input, node.output - - @staticmethod - def convert_globalaveragepool(node, model:_pt_model): - gap = nn.AdaptiveAvgPool2d((1, 1)) - model.module_list.append(gap) - model.module_list.append(gap) - return len(model.module_list) - 1, node.input, node.output - - @staticmethod - def convert_maxpool(node, model:_pt_model): - attr_map = { - "pads": "padding", - "strides": "stride", - "kernel_shape": "kernel_size", - } - kwargs = {} - for att in node.attribute: - kwargs[attr_map[att.name]] = list(att.ints) - if 'padding' in kwargs: - assert (kwargs["padding"][0] == kwargs["padding"][2] and kwargs["padding"][1] == kwargs["padding"][3]) - kwargs["padding"] = kwargs["padding"][0], kwargs["padding"][1] - ap = nn.MaxPool2d(**kwargs) - model.module_list.append(ap) - return 
len(model.module_list) - 1, node.input, node.output - - @staticmethod - def convert_flatten(node, model:_pt_model): - if len(node.attribute) == 0: - axis = 1 - else: - axis = node.attribute[0].i - if axis==1: - flatten = nn.Flatten() - model.module_list.append(flatten) - return len(model.module_list)-1, node.input, node.output - else: - raise NotImplementedError("Not Implemented yet!") - - @staticmethod - def convert_gemm(node, model:_pt_model): - weight = model.loaded_weights[node.input[1]] - bias = model.loaded_weights[node.input[2]] - del model.loaded_weights[node.input[2]] - del model.loaded_weights[node.input[1]] - in_features = weight.shape[1] - out_features = weight.shape[0] - linear = nn.Linear(in_features=in_features, out_features=out_features) - linear.weight.data = weight - linear.bias.data = bias - model.module_list.append(linear) - return len(model.module_list)-1, node.input[:1], node.output - - @staticmethod - def convert_pad(node, model:_pt_model): - mode = node.attribute[0].s - pads = list(node.attribute[1].ints) - value = node.attribute[2].f - try: - assert(mode == b'constant') - assert(sum(pads[:4]) == 0) - except AssertionError: - print("Now only support converting to nn.ConstantPad2d") - pad = nn.ConstantPad2d([*pads[2:4],*pads[3:5]],value) - model.module_list.append(pad) - return len(model.module_list)-1, node.input, node.output diff --git a/spikingjelly/clock_driven/ann2snn/kernels/pytorch.py b/spikingjelly/clock_driven/ann2snn/kernels/pytorch.py deleted file mode 100644 index fbc245d..0000000 --- a/spikingjelly/clock_driven/ann2snn/kernels/pytorch.py +++ /dev/null @@ -1,127 +0,0 @@ -import numpy as np -import torch -import torch.nn as nn -import copy -from collections import defaultdict - -def layer_reduction(model: nn.Module) -> nn.Module: - relu_linker = {} # 字典类型,用于通过relu层在network中的序号确定relu前参数化模块的序号 - param_module_relu_linker = {} # 字典类型,用于通过relu前在network中的参数化模块的序号确定relu层序号 - activation_range = defaultdict(float) # 字典类型,保存在network中的序号对应层的激活最大值(或某分位点值) - - module_len = 0 - module_list = nn.ModuleList([]) - last_parammodule_idx = 0 - for n, m in model.named_modules(): - Name = m.__class__.__name__ - # 加载激活层 - if isinstance(m,nn.Softmax): - Name = 'ReLU' - print(UserWarning("Replacing Softmax by ReLU.")) - if isinstance(m,nn.ReLU) or Name == "ReLU": - module_list.append(m) - relu_linker[module_len] = last_parammodule_idx - param_module_relu_linker[last_parammodule_idx] = module_len - module_len += 1 - activation_range[module_len] = -1e5 - # 加载BatchNorm层 - if isinstance(m,(nn.BatchNorm2d,nn.BatchNorm1d)): - if isinstance(module_list[last_parammodule_idx], (nn.Conv2d,nn.Linear)): - absorb(module_list[last_parammodule_idx], m) - else: - module_list.append(copy.deepcopy(m)) - # 加载有参数的层 - if isinstance(m,(nn.Conv2d,nn.Linear)): - module_list.append(m) - last_parammodule_idx = module_len - module_len += 1 - # 加载无参数层 - if isinstance(m,nn.MaxPool2d): - module_list.append(m) - module_len += 1 - if isinstance(m,nn.AvgPool2d): - module_list.append(nn.AvgPool2d(kernel_size=m.kernel_size, stride=m.stride, padding=m.padding)) - module_len += 1 - # if isinstance(m,nn.Flatten): - if m.__class__.__name__ == "Flatten": - module_list.append(m) - module_len += 1 - network = torch.nn.Sequential(*module_list) - setattr(network,'param_module_relu_linker',param_module_relu_linker) - setattr(network, 'activation_range', activation_range) - return network - -def rate_normalization(model: nn.Module, data: torch.Tensor, **kargs) -> nn.Module: - if not hasattr(model,"activation_range") or not 
hasattr(model,"param_module_relu_linker"): - raise(AttributeError("run layer_reduction first!")) - try: - robust_norm = kargs['robust'] - except KeyError: - robust_norm = False - x = data - i = 0 - with torch.no_grad(): - for n, m in model.named_modules(): - Name = m.__class__.__name__ - if Name in ['Conv2d', 'ReLU', 'MaxPool2d', 'AvgPool2d', 'Flatten', 'Linear']: - x = m.forward(x) - a = x.cpu().numpy().reshape(-1) - if robust_norm: - model.activation_range[i] = np.percentile(a[np.nonzero(a)], 99.9) - else: - model.activation_range[i] = np.max(a) - i += 1 - i = 0 - last_lambda = 1.0 - for n, m in model.named_modules(): - Name = m.__class__.__name__ - if Name in ['Conv2d', 'ReLU', 'MaxPool2d', 'AvgPool2d', 'Flatten', 'Linear']: - if Name in ['Conv2d', 'Linear']: - relu_output_layer = model.param_module_relu_linker[i] - if hasattr(m, 'weight') and m.weight is not None: - m.weight.data = m.weight.data * last_lambda / model.activation_range[relu_output_layer] - if hasattr(m, 'bias') and m.bias is not None: - m.bias.data = m.bias.data / model.activation_range[relu_output_layer] - last_lambda = model.activation_range[relu_output_layer] - i += 1 - return model - -def save_model(model: nn.Module, f): - if isinstance(f,str): - torch.save(model,f) - return - -def absorb(param_module, bn_module): - if_2d = len(param_module.weight.size()) == 4 # 判断是否为BatchNorm2d - bn_std = torch.sqrt(bn_module.running_var.data + bn_module.eps) - if not if_2d: - if param_module.bias is not None: - param_module.weight.data = param_module.weight.data * bn_module.weight.data.view(-1, 1) / bn_std.view( - -1, - 1) - param_module.bias.data = (param_module.bias.data - bn_module.running_mean.data.view( - -1)) * bn_module.weight.data.view(-1) / bn_std.view( - -1) + bn_module.bias.data.view(-1) - else: - param_module.weight.data = param_module.weight.data * bn_module.weight.data.view(-1, 1) / bn_std.view( - -1, - 1) - param_module.bias.data = (torch.zeros_like( - bn_module.running_mean.data.view(-1)) - bn_module.running_mean.data.view( - -1)) * bn_module.weight.data.view(-1) / bn_std.view(-1) + bn_module.bias.data.view(-1) - else: - if param_module.bias is not None: - param_module.weight.data = param_module.weight.data * bn_module.weight.data.view(-1, 1, 1, - 1) / bn_std.view(-1, 1, - 1, 1) - param_module.bias.data = (param_module.bias.data - bn_module.running_mean.data.view( - -1)) * bn_module.weight.data.view(-1) / bn_std.view( - -1) + bn_module.bias.data.view(-1) - else: - param_module.weight.data = param_module.weight.data * bn_module.weight.data.view(-1, 1, 1, - 1) / bn_std.view(-1, 1, - 1, 1) - param_module.bias.data = (torch.zeros_like( - bn_module.running_mean.data.view(-1)) - bn_module.running_mean.data.view( - -1)) * bn_module.weight.data.view(-1) / bn_std.view(-1) + bn_module.bias.data.view(-1) - return param_module \ No newline at end of file diff --git a/spikingjelly/clock_driven/ann2snn/modules.py b/spikingjelly/clock_driven/ann2snn/modules.py index 0822f1e..250606f 100644 --- a/spikingjelly/clock_driven/ann2snn/modules.py +++ b/spikingjelly/clock_driven/ann2snn/modules.py @@ -1,133 +1,94 @@ -import torch import torch.nn as nn -import torch.nn.functional as F - - -class MaxPool2d(nn.Module): - def __init__(self, kernel_size, stride=None, padding=0, dilation=1, - return_indices=False, ceil_mode=False, momentum=None): - ''' - * :ref:`API in English ` - - .. _MaxPool2d.__init__-cn: - - :param kernel_size: 窗口取最大的大小 - :param stride: 窗口的步长. 
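
# A small self-contained check of the BatchNorm absorption rule implemented by absorb() above:
# W' = W * gamma / sigma and b' = (b - mu) * gamma / sigma + beta, with sigma = sqrt(running_var + eps).
# The layer sizes and the randomized running statistics are illustrative assumptions.
import torch
import torch.nn as nn

conv = nn.Conv2d(3, 8, 3, bias=True)
bn = nn.BatchNorm2d(8)
bn.eval()
bn.running_mean.uniform_(-1, 1)
bn.running_var.uniform_(0.5, 2.0)
bn.weight.data.uniform_(0.5, 1.5)
bn.bias.data.uniform_(-0.5, 0.5)

with torch.no_grad():
    sigma = torch.sqrt(bn.running_var + bn.eps)
    fused = nn.Conv2d(3, 8, 3, bias=True)
    fused.weight.copy_(conv.weight * (bn.weight / sigma).view(-1, 1, 1, 1))
    fused.bias.copy_((conv.bias - bn.running_mean) * bn.weight / sigma + bn.bias)

    x = torch.rand(1, 3, 16, 16)
    print((bn(conv(x)) - fused(x)).abs().max())  # expected to be on the order of 1e-6
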
默认值为 :attr:`kernel_size` - :param padding: 隐式两侧填充零的大小 - :param dilation: 控制窗口中元素的步幅的参数 - :param return_indices: 当 ``True`` ,将返回最大序号并输出 - :param ceil_mode: 当 ``True`` ,将使用 `ceil` 而不是 `floor` 来计算输出形状 - :param momentum: 当在[0,1]中,将在门控函数中使用在线动量统计; - 当为 ``None`` 时,将在门控函数中使用累计脉冲数 - :return: ``None`` - - 基于文献 [#f1]_ 中2.2.6章节设计MaxPool2d模块。为了兼容Pytorch的MaxPool2d模块,众多参数设定和Pytorch相同。详情请见 ``torch.nn.MaxPool2d`` 。 - 基本想法是对输入脉冲进行统计,统计量可以控制门控函数确定以哪一路输入信号作为输出。 - 根据 `momentum` 参数类型不同可以有不同的统计功能。 `momentum` 参数支持None值和[0,1]区间的浮点数数值作为输出。 - 假定在t时刻,脉冲输入张量为 :math:`s_t` ,脉冲统计量为 :math:`p_t` - 当 `momentum` 参数为 ``None`` 时,统计量为累计脉冲数 - - .. math:: - p_t = p_{t-1} + s_t - - 当 `momentum` 参数为[0,1]区间的浮点数时,统计量为在线的动量累积 - - .. math:: - p_t = momentum * p_{t-1} + (1-momentum) * s_t - - * :ref:`中文API ` - - .. _MaxPool2d.__init__-en: - - :param kernel_size: the size of the window to take a max over - :param stride: the stride of the window. Default value is :attr:`kernel_size` - :param padding: implicit zero padding to be added on both sides - :param dilation: a parameter that controls the stride of elements in the window - :param return_indices: if ``True``, will return the max indices along with the outputs. - Useful for :class:`torch.nn.MaxUnpool2d` later - :param ceil_mode: when ``True``, will use `ceil` instead of `floor` to compute the output shape - :param momentum: when in [0,1], will use online momentum statistics in gate functions; - when ``None``, will use accumulated spike in gate functions - :return: ``None`` - - Design the MaxPool2d module based on section 2.2.6 in [#f1]_ . In order to be compatible with Pytorch's MaxPool2d module, many parameter settings are the same as Pytorch. See ``torch.nn.MaxPool2d`` for details. - The basic idea is to accumulate the input spikes, which can control the gating function to determine which input spike is used as output. - Depending on the type of `momentum` parameter, different statistical functions can be used. - `momentum` supports the floating-point value in [0,1] or value ``None`` - Assume at time t, the spike input is :math:`s_t` and the spike statistic is :math:`p_t`. - When `momentum` is ``None``, the statistic is sum of spikes over time. - - .. math:: - p_t = p_{t-1} + s_t - - When `momentum` is a floating point in [0,1], the statistic is online momentum of spikes. - - .. math:: - p_t = momentum * p_{t-1} + (1-momentum) * s_t - - .. [#f1] Rueckauer B, Lungu I-A, Hu Y, Pfeiffer M and Liu S-C (2017) Conversion of Continuous-Valued Deep Networks to - Efficient Event-Driven Networks for Image Classification. Front. Neurosci. 11:682. 
- ''' - - super(MaxPool2d, self).__init__() - self.kernel_size = kernel_size - self.stride = stride or kernel_size - self.padding = padding - self.dilation = dilation - self.return_indices = return_indices - self.ceil_mode = ceil_mode - - assert (momentum is None or momentum <= 1) - self.momentum = momentum +import torch +import numpy as np - self.v = 0 +class VoltageHook(nn.Module): + def __init__(self, scale=1.0, momentum=0.1, mode='Max'): + """ + * :ref:`API in English ` - def forward(self, dv: torch.Tensor): - if self.momentum is not None: - self.v = self.v * self.momentum + (1 - self.momentum) * dv - else: - self.v += dv - (dv_out, ind) = F.max_pool2d(self.v, self.kernel_size, self.stride, - self.padding, self.dilation, self.ceil_mode, True) - unpool_dv_out = F.max_unpool2d(dv_out, ind, self.kernel_size, self.stride, self.padding, self.v.size()) - max_gate = (unpool_dv_out != 0.0).float() - gated_spk = dv * max_gate - spk = F.max_pool2d(gated_spk, self.kernel_size, self.stride, - self.padding) - return spk - - def reset(self): - ''' - :return: None - - 重置神经元为初始状态 - ''' - self.v = 0 - - -class AccuLayer(nn.Module): - def __init__(self, momentum=None): - super(AccuLayer, self).__init__() - - assert (momentum is None or momentum <= 1) + .. _voltageHook.__init__-cn: + + :param scale: 缩放初始值 + :type scale: float + :param momentum: 动量值 + :type momentum: float + :param mode: 模式。输入“Max”表示记录ANN激活最大值,“99.9%”表示记录ANN激活的99.9%分位点,输入0-1的float型浮点数表示记录激活最大值的对应倍数。 + :type mode: str, float + + ``VoltageHook`` 用于在ANN推理中确定激活的范围。 + + * :ref:`中文API ` + + .. _voltageHook.__init__-en: + + :param scale: initial scaling value + :type scale: float + :param momentum: momentum value + :type momentum: float + :param mode: The mode. Value "Max" means recording the maximum value of ANN activation, "99.9%" means recording the 99.9% precentile of ANN activation, and a float of 0-1 means recording the corresponding multiple of the maximum activation value. + :type mode: str, float + + ``VoltageHook`` is used to determine the range of activations in ANN inference. + + """ + super().__init__() + self.register_buffer('scale', torch.tensor(scale)) + self.mode = mode + self.num_batches_tracked = 0 self.momentum = momentum - self.v = 0 - self.t = 0.0 - - def forward(self, spk: torch.Tensor): - self.t += 1.0 - if self.momentum is not None: - self.v = self.v * self.momentum + (1 - self.momentum) * spk - return self.v + + def forward(self, x): + err_msg = 'You have used a non-defined VoltageScale Method.' + if isinstance(self.mode, str): + if self.mode[-1] == '%': + try: + s_t = torch.tensor(np.percentile(x.detach().cpu(), float(self.mode[:-1]))) + except ValueError: + raise NotImplemented(err_msg) + elif self.mode.lower() in ['max']: + s_t = x.max().detach() + else: + raise NotImplemented(err_msg) + elif isinstance(self.mode, float) and self.mode <= 1 and self.mode > 0: + s_t = x.max().detach() * self.mode + else: + raise NotImplemented(err_msg) + + if self.num_batches_tracked == 0: + self.scale = s_t else: - self.v += spk - return self.v / self.t + self.scale = (1 - self.momentum) * self.scale + self.momentum * s_t + self.num_batches_tracked += x.shape[0] + return x + +class VoltageScaler(nn.Module): + def __init__(self, scale=1.0): + """ + * :ref:`API in English ` + + .. _voltageScaler.__init__-cn: + + :param scale: 缩放值 + :type scale: float + + ``VoltageScaler`` 用于SNN推理中缩放电流。 + + * :ref:`中文API ` + + .. 
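
# Usage sketch for the VoltageHook defined above (the toy model, the calibration loop and the
# '99.9%' mode choice are assumptions for illustration). During ANN inference the hook passes x
# through unchanged and only records a running estimate of the activation range in `scale`.
import torch
import torch.nn as nn
from spikingjelly.clock_driven.ann2snn.modules import VoltageHook, VoltageScaler

calib_net = nn.Sequential(
    nn.Linear(8, 16),
    nn.ReLU(),
    VoltageHook(momentum=0.1, mode='99.9%'),
)
with torch.no_grad():
    for _ in range(10):
        calib_net(torch.rand(32, 8))
scale = calib_net[2].scale.item()
# A converted SNN can then rescale currents with VoltageScaler, e.g. VoltageScaler(1. / scale)
# before the spiking neuron; how the converter wires this up is not shown in this file.
print(scale, VoltageScaler(1. / scale))
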
_voltageScaler.__init__-en: + + :param scale: scaling value + :type scale: float + + ``VoltageScaler`` is used for scaling current in SNN inference. + + """ + super().__init__() + self.register_buffer('scale', torch.tensor(scale)) - def reset(self): - ''' - :return: None + def forward(self, x): + return x * self.scale - 重置神经元为初始状态 - ''' - self.t = 0.0 - self.v = 0.0 \ No newline at end of file + def extra_repr(self): + return '%f' % self.scale.item() \ No newline at end of file diff --git a/spikingjelly/clock_driven/ann2snn/examples/model_sample/cifar10/resnet.py b/spikingjelly/clock_driven/ann2snn/sample_models/cifar10_resnet.py similarity index 89% rename from spikingjelly/clock_driven/ann2snn/examples/model_sample/cifar10/resnet.py rename to spikingjelly/clock_driven/ann2snn/sample_models/cifar10_resnet.py index 95607b8..5dce8ed 100644 --- a/spikingjelly/clock_driven/ann2snn/examples/model_sample/cifar10/resnet.py +++ b/spikingjelly/clock_driven/ann2snn/sample_models/cifar10_resnet.py @@ -30,12 +30,14 @@ class BasicBlock(nn.Module): kernel_size=1, stride=stride, bias=False), nn.BatchNorm2d(self.expansion*planes) ) + self.relu1 = nn.ReLU() + self.relu2 = nn.ReLU() def forward(self, x): - out = F.relu(self.bn1(self.conv1(x))) + out = self.relu1(self.bn1(self.conv1(x))) out = self.bn2(self.conv2(out)) out += self.shortcut(x) - out = F.relu(out) + out = self.relu2(out) return out @@ -60,13 +62,16 @@ class Bottleneck(nn.Module): kernel_size=1, stride=stride, bias=False), nn.BatchNorm2d(self.expansion*planes) ) + self.relu1 = nn.ReLU() + self.relu2 = nn.ReLU() + self.relu3 = nn.ReLU() def forward(self, x): - out = F.relu(self.bn1(self.conv1(x))) - out = F.relu(self.bn2(self.conv2(out))) + out = self.relu1(self.bn1(self.conv1(x))) + out = self.relu2(self.bn2(self.conv2(out))) out = self.bn3(self.conv3(out)) out += self.shortcut(x) - out = F.relu(out) + out = self.relu3(out) return out @@ -84,6 +89,7 @@ class ResNet(nn.Module): self.layer4 = self._make_layer(block, 512, num_blocks[3], stride=2) self.linear = nn.Linear(512*block.expansion, num_classes) self.flatten = nn.Flatten() + self.relu = nn.ReLU() def _make_layer(self, block, planes, num_blocks, stride): strides = [stride] + [1]*(num_blocks-1) @@ -94,7 +100,7 @@ class ResNet(nn.Module): return nn.Sequential(*layers) def forward(self, x): - out = F.relu(self.bn1(self.conv1(x))) + out = self.relu(self.bn1(self.conv1(x))) out = self.layer1(out) out = self.layer2(out) out = self.layer3(out) diff --git a/spikingjelly/clock_driven/ann2snn/sample_models/mnist_cnn.py b/spikingjelly/clock_driven/ann2snn/sample_models/mnist_cnn.py new file mode 100644 index 0000000..f2046fa --- /dev/null +++ b/spikingjelly/clock_driven/ann2snn/sample_models/mnist_cnn.py @@ -0,0 +1,28 @@ +import torch.nn as nn + +class CNN(nn.Module): + def __init__(self): + super().__init__() + self.network = nn.Sequential( + nn.Conv2d(1, 32, 3, 1), + nn.BatchNorm2d(32), + nn.ReLU(), + nn.AvgPool2d(2, 2), + + nn.Conv2d(32, 32, 3, 1), + nn.BatchNorm2d(32), + nn.ReLU(), + nn.AvgPool2d(2, 2), + + nn.Conv2d(32, 32, 3, 1), + nn.BatchNorm2d(32), + nn.ReLU(), + nn.AvgPool2d(2, 2), + + nn.Flatten(), + nn.Linear(32, 10) + ) + + def forward(self,x): + x = self.network(x) + return x \ No newline at end of file diff --git a/spikingjelly/clock_driven/ann2snn/utils.py b/spikingjelly/clock_driven/ann2snn/utils.py new file mode 100644 index 0000000..72489f4 --- /dev/null +++ b/spikingjelly/clock_driven/ann2snn/utils.py @@ -0,0 +1,29 @@ +import requests +import os +from tqdm import tqdm + +def 
download_url(url, dst): + headers = { + 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:67.0) Gecko/20100101 Firefox/67.0' + } + + response = requests.get(url, headers=headers, stream=True) # (1) + file_size = int(response.headers['content-length']) # (2) + if os.path.exists(dst): + first_byte = os.path.getsize(dst) # (3) + else: + first_byte = 0 + if first_byte >= file_size: # (4) + return file_size + + header = {"Range": f"bytes={first_byte}-{file_size}"} + + pbar = tqdm(total=file_size, initial=first_byte, unit='B', unit_scale=True, desc=dst) + req = requests.get(url, headers=header, stream=True) # (5) + with open(dst, 'ab') as f: + for chunk in req.iter_content(chunk_size=1024): # (6) + if chunk: + f.write(chunk) + pbar.update(1024) + pbar.close() + return file_size \ No newline at end of file diff --git a/spikingjelly/clock_driven/cu_kernel_opt.py b/spikingjelly/clock_driven/cu_kernel_opt.py index cfc54af..62979f2 100644 --- a/spikingjelly/clock_driven/cu_kernel_opt.py +++ b/spikingjelly/clock_driven/cu_kernel_opt.py @@ -1,67 +1,179 @@ +import logging +import torch +import time +import numpy as np +from .. import configure +import os +import threading +import datetime +from torch.utils.tensorboard import SummaryWriter +import re try: import cupy - import torch - import time - import numpy as np - from ..configure import cuda_threads, cuda_compiler_options - - - def cal_fun_t(n, device, f, *args, **kwargs): - if n <= 2: - torch.cuda.synchronize(device) - t_start = time.perf_counter() - f(*args, **kwargs) - torch.cuda.synchronize(device) - return (time.perf_counter() - t_start) - # warm up - f(*args, **kwargs) - torch.cuda.synchronize(device) - - t_list = [] - for _ in range(n * 2): - torch.cuda.synchronize(device) - t_start = time.perf_counter() - f(*args, **kwargs) - torch.cuda.synchronize(device) - t_list.append(time.perf_counter() - t_start) - t_list = np.asarray(t_list) - return t_list[n:].mean() - - def cal_blocks(numel: int): - return (numel + cuda_threads - 1) // cuda_threads - - def get_contiguous(*args): - ret_list = [] - for item in args: - if isinstance(item, torch.Tensor): - ret_list.append(item.contiguous()) - - elif isinstance(item, cupy.ndarray): - ret_list.append(cupy.ascontiguousarray(item)) - - else: - raise TypeError - return ret_list - - def wrap_args_to_raw_kernel(device: int, *args): - # note that the input must be contiguous - # check device and get data_ptr from tensor - ret_list = [] - for item in args: - if isinstance(item, torch.Tensor): - assert item.get_device() == device - assert item.is_contiguous() - ret_list.append(item.data_ptr()) - - elif isinstance(item, cupy.ndarray): - assert item.device.id == device - assert item.flags['C_CONTIGUOUS'] - ret_list.append(item) - - else: - raise TypeError - - return tuple(ret_list) - -except ImportError: - pass \ No newline at end of file +except BaseException as e: + logging.info(f'spikingjelly.clock_driven.cu_kernel_opt: {e}') + pass + +def cuda_timer(device, f, *args, **kwargs): + start = torch.cuda.Event(enable_timing=True) + end = torch.cuda.Event(enable_timing=True) + start.record() + f(*args, **kwargs) + end.record() + torch.cuda.synchronize(device) + return start.elapsed_time(end) + +def cal_fun_t(n, device, f, *args, **kwargs): + assert n > 2 + # warm up + cuda_timer(device, f, *args, **kwargs) + + t_list = [] + for _ in range(n * 2): + t_list.append(cuda_timer(device, f, *args, **kwargs)) + t_list = np.asarray(t_list) + return t_list[n:].mean() + +def cal_blocks(numel: int): + return (numel + 
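
# Usage sketch for the CUDA-event based timing helpers defined above (needs a CUDA device; the
# matrix sizes and n=16 repetitions are arbitrary assumptions). cal_fun_t warms the GPU up, runs
# the function 2*n times and returns the mean of the last n timings in milliseconds.
import torch
from spikingjelly.clock_driven.cu_kernel_opt import cal_fun_t

if torch.cuda.is_available():
    device = 0
    a = torch.rand(1024, 1024, device=f'cuda:{device}')
    b = torch.rand(1024, 1024, device=f'cuda:{device}')
    t_mean = cal_fun_t(16, device, torch.matmul, a, b)
    print(f'mean time of torch.matmul: {t_mean:.3f} ms')
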
configure.cuda_threads - 1) // configure.cuda_threads + +def get_contiguous(*args): + ret_list = [] + for item in args: + if isinstance(item, torch.Tensor): + ret_list.append(item.contiguous()) + + elif isinstance(item, cupy.ndarray): + ret_list.append(cupy.ascontiguousarray(item)) + + else: + raise TypeError + return ret_list + +def wrap_args_to_raw_kernel(device: int, *args): + # note that the input must be contiguous + # check device and get data_ptr from tensor + ret_list = [] + for item in args: + if isinstance(item, torch.Tensor): + assert item.get_device() == device + assert item.is_contiguous() + ret_list.append(item.data_ptr()) + + elif isinstance(item, cupy.ndarray): + assert item.device.id == device + assert item.flags['C_CONTIGUOUS'] + ret_list.append(item) + + else: + raise TypeError + + return tuple(ret_list) + +class GPUMonitor(threading.Thread): + def __init__(self, log_dir: str = None, gpu_ids: tuple = (0, ), interval: float = 60., start_now=True): + """ + :param log_dir: the directory for saving logs with tensorboard. If it is None, this module will print logs + :type log_dir: str + :param gpu_ids: the id of GPUs to be monitored, e.g., `(0, 1, 2, 3)`. The default value is `(0, )` + :type gpu_ids: tuple + :param interval: the recording interval (in seconds) + :type interval: float + :param start_now: if true, the monitor will start to record now. Otherwise, it will start after the user call `start()` manually + :type start_now: + + The GPU monitor, which starts a new thread to record the utilization and memory used of `gpu_ids` every `interval` seconds. + + .. admonition:: Warning + :class: warning + + Do not forget to call `stop()` after the main thread finishes its job, otherwise the main thread will never stop! + + Codes example: + + .. 
code-block:: python + + import time + + gm = GPUMonitor(interval=1) + time.sleep(2) # make the main thread sleep + gm.stop() + + # The outputs are: + + # 2022-04-28 10:52:25 + # utilization.gpu [%], memory.used [MiB] + # 0 %, 376 MiB + """ + super().__init__() + self.gpu_ids = gpu_ids + self.interval = interval + self.stopped = False + self.cmds = 'nvidia-smi --query-gpu=utilization.gpu,memory.used --format=csv' + self.cmds += ' -i ' + id_str = [] + for gpu_id in self.gpu_ids: + id_str.append(str(gpu_id)) + self.cmds += ','.join(id_str) + self.step = 0 + + if log_dir is None: + self.writer = None + else: + self.writer = [] + for i in range(self.gpu_ids.__len__()): + self.writer.append(SummaryWriter(os.path.join(log_dir, f'gpu_{id_str[i]}'))) + + if start_now: + self.start() + + def stop(self): + self.stopped = True + + def run(self): + while not self.stopped: + with os.popen(self.cmds) as fp: + outputs = fp.read() + if self.writer is not None: + outputs = outputs.split('\n')[1:-1] + # skip the first row 'utilization.gpu [%], memory.used [MiB]' and the last row ('\n') + for i in range(outputs.__len__()): + utilization_memory = re.findall(r'\d+', outputs[i]) + utilization = int(utilization_memory[0]) + memory_used = int(utilization_memory[1]) + self.writer[i].add_scalar('utilization', utilization, self.step) + self.writer[i].add_scalar('memory_used', memory_used, self.step) + else: + print(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")) + print(outputs) + ''' + 2022-04-20 18:14:26 + utilization.gpu [%], memory.used [MiB] + 4 %, 1816 MiB + 0 %, 1840 MiB + 0 %, 1840 MiB + 0 %, 1720 MiB + ''' + time.sleep(self.interval) + self.step += 1 + + +class DeviceEnvironment: + def __init__(self, device: int): + """ + This module is used as a context to make CuPy use the specific device, and avoids `torch.cuda.current_device()` is changed by CuPy. + Refer to https://github.com/cupy/cupy/issues/6569 for more details. + """ + self.device = device + self.previous_device = None + + def __enter__(self): + current_device = torch.cuda.current_device() + if current_device != self.device: + torch.cuda.set_device(self.device) + self.previous_device = current_device + + def __exit__(self, exc_type, exc_val, exc_tb): + if self.previous_device is not None: + torch.cuda.set_device(self.previous_device) + diff --git a/spikingjelly/clock_driven/encoding.py b/spikingjelly/clock_driven/encoding.py index 36c8776..3bb011e 100644 --- a/spikingjelly/clock_driven/encoding.py +++ b/spikingjelly/clock_driven/encoding.py @@ -312,14 +312,14 @@ class PoissonEncoder(StatelessEncoder): return out_spike class WeightedPhaseEncoder(StatefulEncoder): - def __init__(self, T: int): + def __init__(self, K: int): """ * :ref:`API in English ` .. _WeightedPhaseEncoder.__init__-cn: - :param T: 编码周期。通常情况下,与SNN的仿真周期(总步长一致) - :type T: int + :param K: 编码周期。通常情况下,与SNN的仿真周期(总步长一致) + :type K: int Kim J, Kim H, Huh S, et al. Deep neural networks with weighted spikes[J]. Neurocomputing, 2018, 311: 373-386. @@ -346,8 +346,8 @@ class WeightedPhaseEncoder(StatefulEncoder): .. _WeightedPhaseEncoder.__init__-en: - :param T: the encoding period. It is usually same with the total simulation time-steps of SNN - :type T: int + :param K: the encoding period. It is usually same with the total simulation time-steps of SNN + :type K: int The weighted phase encoder, which is based on binary system. It will flatten ``x`` as a binary number. When ``T=k``, it can encode :math:`x \in [0, 1-2^{-K}]` to different spikes. 
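
# Usage sketch for the DeviceEnvironment context manager defined above (assumes at least two
# visible GPUs). Inside the block torch's current device is switched so that CuPy kernels are
# launched on the intended GPU; on exit the previous device is restored.
import torch
from spikingjelly.clock_driven.cu_kernel_opt import DeviceEnvironment

if torch.cuda.is_available() and torch.cuda.device_count() > 1:
    print(torch.cuda.current_device())       # e.g. 0
    with DeviceEnvironment(1):
        print(torch.cuda.current_device())   # 1: launch device-1 CuPy kernels here
    print(torch.cuda.current_device())       # back to 0
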
Here is the example from the origin paper: @@ -368,14 +368,14 @@ class WeightedPhaseEncoder(StatefulEncoder): """ - super().__init__(T) + super().__init__(K) def encode(self, x: torch.Tensor): - assert (x >= 0).all() and (x <= 1 - 2 ** (-self.phase)).all() + assert (x >= 0).all() and (x <= 1 - 2 ** (-self.T)).all() inputs = x.clone() - self.spike = torch.empty((self.phase,) + x.shape, device=x.device) # 编码为[phase, batch_size, *] + self.spike = torch.empty((self.T,) + x.shape, device=x.device) # Encoding to [T, batch_size, *] w = 0.5 - for i in range(self.phase): + for i in range(self.T): self.spike[i] = inputs >= w inputs -= w * self.spike[i] w *= 0.5 diff --git a/spikingjelly/clock_driven/examples/DQN_state.py b/spikingjelly/clock_driven/examples/DQN_state.py index eb3b6e0..d54053a 100644 --- a/spikingjelly/clock_driven/examples/DQN_state.py +++ b/spikingjelly/clock_driven/examples/DQN_state.py @@ -69,7 +69,7 @@ if __name__ == '__main__': device = torch.device("cuda" if args.use_cuda else "cpu") - writer = SummaryWriter(logdir='./log') + writer = SummaryWriter(log_dir='./log') env = gym.make(env_name).unwrapped env.seed(args.seed) diff --git a/spikingjelly/clock_driven/examples/Spiking_DQN_state.py b/spikingjelly/clock_driven/examples/Spiking_DQN_state.py index 6bc0fd2..21392ab 100644 --- a/spikingjelly/clock_driven/examples/Spiking_DQN_state.py +++ b/spikingjelly/clock_driven/examples/Spiking_DQN_state.py @@ -38,15 +38,15 @@ class ReplayMemory(object): return len(self.memory) - class NonSpikingLIFNode(neuron.LIFNode): - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - - def forward(self, dv: torch.Tensor): - self.neuronal_charge(dv) - # self.neuronal_fire() - # self.neuronal_reset() - return self.v +class NonSpikingLIFNode(neuron.LIFNode): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + def forward(self, dv: torch.Tensor): + self.neuronal_charge(dv) + # self.neuronal_fire() + # self.neuronal_reset() + return self.v # Spiking DQN algorithm diff --git a/spikingjelly/clock_driven/examples/lif_fc_mnist.py b/spikingjelly/clock_driven/examples/lif_fc_mnist.py index 75ba8ba..0c32249 100644 --- a/spikingjelly/clock_driven/examples/lif_fc_mnist.py +++ b/spikingjelly/clock_driven/examples/lif_fc_mnist.py @@ -179,6 +179,17 @@ def main(): # 保存绘图用数据 net.eval() + # 注册钩子 + output_layer = net[-1] # 输出层 + output_layer.v_seq = [] + output_layer.s_seq = [] + def save_hook(m, x, y): + m.v_seq.append(m.v.unsqueeze(0)) + m.s_seq.append(y.unsqueeze(0)) + + output_layer.register_forward_hook(save_hook) + + with torch.no_grad(): img, label = test_dataset[0] img = img.to(device) @@ -189,10 +200,12 @@ def main(): out_spikes_counter += net(encoder(img).float()) out_spikes_counter_frequency = (out_spikes_counter / T).cpu().numpy() print(f'Firing rate: {out_spikes_counter_frequency}') - output_layer = net[-1] # 输出层 - v_t_array = output_layer.v.cpu().numpy().squeeze().T # v_t_array[i][j]表示神经元i在j时刻的电压值 + + output_layer.v_seq = torch.cat(output_layer.v_seq) + output_layer.s_seq = torch.cat(output_layer.s_seq) + v_t_array = output_layer.v_seq.cpu().numpy().squeeze().T # v_t_array[i][j]表示神经元i在j时刻的电压值 np.save("v_t_array.npy",v_t_array) - s_t_array = output_layer.spike.cpu().numpy().squeeze().T # s_t_array[i][j]表示神经元i在j时刻释放的脉冲,为0或1 + s_t_array = output_layer.s_seq.cpu().numpy().squeeze().T # s_t_array[i][j]表示神经元i在j时刻释放的脉冲,为0或1 np.save("s_t_array.npy",s_t_array) train_accs = np.array(train_accs) diff --git a/spikingjelly/clock_driven/functional.py 
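
# Worked example (illustrative values) of the weighted-phase encoding implemented in encode():
# step i carries weight 2^{-(i+1)}, so over K steps the spikes are the binary expansion of the
# input, and any x in [0, 1 - 2^{-K}] is reconstructed exactly from its spike train.
import torch

x = torch.tensor([0.6875])   # 0.1011 in binary
K = 4
spikes = []
w = 0.5
inputs = x.clone()
for i in range(K):
    s = (inputs >= w).float()
    spikes.append(s)
    inputs -= w * s
    w *= 0.5
print([int(s.item()) for s in spikes])            # [1, 0, 1, 1]
weights = torch.tensor([2.0 ** -(i + 1) for i in range(K)])
print(torch.sum(weights * torch.cat(spikes)))     # tensor(0.6875)
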
b/spikingjelly/clock_driven/functional.py index 8a3dc74..97b3f27 100644 --- a/spikingjelly/clock_driven/functional.py +++ b/spikingjelly/clock_driven/functional.py @@ -4,6 +4,8 @@ import torch.nn.functional as F import math from . import neuron +from torch import Tensor + def reset_net(net: nn.Module): ''' * :ref:`API in English ` @@ -30,7 +32,7 @@ def reset_net(net: nn.Module): if hasattr(m, 'reset'): m.reset() -def spike_cluster(v: torch.Tensor, v_threshold, T_in: int): +def spike_cluster(v: Tensor, v_threshold, T_in: int): ''' * :ref:`API in English ` @@ -180,7 +182,7 @@ def spike_cluster(v: torch.Tensor, v_threshold, T_in: int): return N_o, k_positive, k_negative -def spike_similar_loss(spikes:torch.Tensor, labels:torch.Tensor, kernel_type='linear', loss_type='mse', *args): +def spike_similar_loss(spikes:Tensor, labels:Tensor, kernel_type='linear', loss_type='mse', *args): ''' * :ref:`API in English ` @@ -285,7 +287,7 @@ def spike_similar_loss(spikes:torch.Tensor, labels:torch.Tensor, kernel_type='li else: raise NotImplementedError -def kernel_dot_product(x:torch.Tensor, y:torch.Tensor, kernel='linear', *args): +def kernel_dot_product(x:Tensor, y:Tensor, kernel='linear', *args): ''' * :ref:`API in English ` @@ -349,7 +351,7 @@ def kernel_dot_product(x:torch.Tensor, y:torch.Tensor, kernel='linear', *args): else: raise NotImplementedError -def set_threshold_margin(output_layer:neuron.BaseNode, label_one_hot:torch.Tensor, +def set_threshold_margin(output_layer:neuron.BaseNode, label_one_hot:Tensor, eval_threshold=1.0, threshold0=0.9, threshold1=1.1): ''' * :ref:`API in English ` @@ -391,7 +393,7 @@ def set_threshold_margin(output_layer:neuron.BaseNode, label_one_hot:torch.Tenso else: output_layer.v_threshold = eval_threshold -def redundant_one_hot(labels:torch.Tensor, num_classes:int, n:int): +def redundant_one_hot(labels:Tensor, num_classes:int, n:int): ''' * :ref:`API in English ` @@ -453,7 +455,7 @@ def redundant_one_hot(labels:torch.Tensor, num_classes:int, n:int): codes += F.one_hot(labels * n + i, redundant_classes) return codes -def first_spike_index(spikes: torch.Tensor): +def first_spike_index(spikes: Tensor): ''' * :ref:`API in English ` @@ -522,47 +524,46 @@ def first_spike_index(spikes: torch.Tensor): # 在时间维度上,2次cumsum后,元素为1的位置,即为首次发放脉冲的位置 return spikes.cumsum(dim=-1).cumsum(dim=-1) == 1 -def multi_step_forward(x_seq: torch.Tensor, multi_step_module: nn.Module or list or tuple): +def multi_step_forward(x_seq: Tensor, single_step_module: nn.Module or list or tuple or nn.Sequential): """ :param x_seq: shape=[T, batch_size, ...] - :type x_seq: torch.Tensor - :param multi_step_module: a multi-step module, or a list/tuple that contains multi-step modules - :type multi_step_module: torch.nn.Module or list or tuple + :type x_seq: Tensor + :param single_step_module: a single-step module, or a list/tuple that contains single-step modules + :type single_step_module: torch.nn.Module or list or tuple or torch.nn.Sequential :return: y_seq, shape=[T, batch_size, ...] - :rtype: torch.Tensor + :rtype: Tensor See :class:`spikingjelly.clock_driven.layer.MultiStepContainer` for more details. 
""" y_seq = [] - if isinstance(multi_step_module, (list, tuple)): + if isinstance(single_step_module, (list, tuple, nn.Sequential)): for t in range(x_seq.shape[0]): x_seq_t = x_seq[t] - for m in multi_step_module: + for m in single_step_module: x_seq_t = m(x_seq_t) y_seq.append(x_seq_t) else: for t in range(x_seq.shape[0]): - y_seq.append(multi_step_module(x_seq[t])) + y_seq.append(single_step_module(x_seq[t])) for t in range(y_seq.__len__()): - # y_seq[t].unsqueeze_(0) y_seq[t] = y_seq[t].unsqueeze(0) return torch.cat(y_seq, 0) -def seq_to_ann_forward(x_seq: torch.Tensor, stateless_module: nn.Module or list or tuple): +def seq_to_ann_forward(x_seq: Tensor, stateless_module: nn.Module or list or tuple or nn.Sequential): """ :param x_seq: shape=[T, batch_size, ...] - :type x_seq: torch.Tensor - :param multi_step_module: a stateless module, e.g., 'torch.nn.Conv2d' or a list contains stateless modules, e.g., '[torch.nn.Conv2d, torch.nn.BatchNorm2d] - :type multi_step_module: torch.nn.Module or list or tuple + :type x_seq: Tensor + :param stateless_module: a stateless module, e.g., 'torch.nn.Conv2d' or a list contains stateless modules, e.g., '[torch.nn.Conv2d, torch.nn.BatchNorm2d] + :type stateless_module: torch.nn.Module or list or tuple or torch.nn.Sequential :return: y_seq, shape=[T, batch_size, ...] - :rtype: torch.Tensor + :rtype: Tensor See :class:`spikingjelly.clock_driven.layer.SeqToANNContainer` for more details. """ y_shape = [x_seq.shape[0], x_seq.shape[1]] y = x_seq.flatten(0, 1) - if isinstance(stateless_module, (list, tuple)): + if isinstance(stateless_module, (list, tuple, nn.Sequential)): for m in stateless_module: y = m(y) else: @@ -577,7 +578,7 @@ def fused_conv2d_weight_of_convbn2d(conv2d: nn.Conv2d, bn2d: nn.BatchNorm2d): :param bn2d: a BatchNorm2d layer :type bn2d: torch.nn.BatchNorm2d :return: the weight of this fused module - :rtype: torch.Tensor + :rtype: Tensor A {Conv2d-BatchNorm2d} can be fused to a {Conv2d} module with BatchNorm2d's parameters being absorbed into Conv2d. This function returns the weight of this fused module. @@ -600,7 +601,7 @@ def fused_conv2d_bias_of_convbn2d(conv2d: nn.Conv2d, bn2d: nn.BatchNorm2d): :param bn2d: a BatchNorm2d layer :type bn2d: torch.nn.BatchNorm2d :return: the bias of this fused module - :rtype: torch.Tensor + :rtype: Tensor A {Conv2d-BatchNorm2d} can be fused to a {Conv2d} module with BatchNorm2d's parameters being absorbed into Conv2d. This function returns the bias of this fused module. @@ -690,14 +691,14 @@ def fuse_convbn2d(conv2d: nn.Conv2d, bn2d: nn.BatchNorm2d, k=None, b=None): fused_conv.bias.data = fused_conv2d_bias_of_convbn2d(conv2d, bn2d) return fused_conv -def temporal_efficient_training_cross_entropy(x_seq: torch.Tensor, target: torch.LongTensor): +def temporal_efficient_training_cross_entropy(x_seq: Tensor, target: torch.LongTensor): """ :param x_seq: ``shape=[T, N, C, *]``, where ``C`` is the number of classes - :type x_seq: torch.Tensor + :type x_seq: Tensor :param target: ``shape=[N]``, where ``0 <= target[i] <= C-1`` :type target: torch.LongTensor :return: the temporal efficient training cross entropy - :rtype: torch.Tensor + :rtype: Tensor The temporal efficient training (TET) cross entropy, which is the mean of cross entropy of each time-step. @@ -705,7 +706,7 @@ def temporal_efficient_training_cross_entropy(x_seq: torch.Tensor, target: torch .. 
code-block:: python - def tet_ce_for_loop_version(x_seq: torch.Tensor, target: torch.LongTensor): + def tet_ce_for_loop_version(x_seq: Tensor, target: torch.LongTensor): loss = 0. for t in range(x_seq.shape[0]): loss += F.cross_entropy(x_seq[t], target) @@ -761,9 +762,9 @@ def kaiming_normal_conv_linear_weight(net: nn.Module): :return: None - initialize all weights (not including bias) of :class:`torch.nn._ConvNd` and `:class:`torch.nn.Linear` in `net` by the kaiming normal. See :class:`torch.nn.init.kaiming_normal_` + initialize all weights (not including bias) of :class:`torch.nn._ConvNd` and :class:`torch.nn.Linear` in `net` by the kaiming normal. See :class:`torch.nn.init.kaiming_normal_` for more details. ''' for m in net.modules(): if isinstance(m, (nn.Conv1d, nn.Conv2d, nn.Conv3d, nn.Linear)): - nn.init.kaiming_normal_(m.weight, a=math.sqrt(5)) + nn.init.kaiming_normal_(m.weight, a=math.sqrt(5)) \ No newline at end of file diff --git a/spikingjelly/clock_driven/lava_exchange.py b/spikingjelly/clock_driven/lava_exchange.py new file mode 100644 index 0000000..1dc255e --- /dev/null +++ b/spikingjelly/clock_driven/lava_exchange.py @@ -0,0 +1,303 @@ +import torch +import torch.nn as nn +import logging +from . import neuron + +try: + import lava.lib.dl.slayer as slayer + +except BaseException as e: + logging.info(f'spikingjelly.clock_driven.lava_exchange: {e}') + slayer = None + +# ---------------------------------------- +# data reshape function + +def TNX_to_NXT(x_seq: torch.Tensor): + # x_seq.shape = [T, N, *] + permute_args = list(range(1, x_seq.dim())) + permute_args.append(0) + return x_seq.permute(permute_args) + +def NXT_to_TNX(x_seq: torch.Tensor): + # x_seq.shape = [N, *, T] + permute_args = list(range(x_seq.dim() - 1)) + permute_args.insert(0, x_seq.dim() - 1) + return x_seq.permute(permute_args) + + +def lava_neuron_forward(lava_neuron: nn.Module, x_seq: torch.Tensor, v: torch.Tensor or float): + # x_seq.shape = [T, N, *] + # lave uses shape = [*, T], while SJ uses shape = [T, *] + unsqueeze_flag = False + if x_seq.dim() == 2: + x_seq = x_seq.unsqueeze(1) + # lave needs input with shape [N, ... ,T] + unsqueeze_flag = True + + if isinstance(v, float): + v_init = v + v = torch.zeros_like(x_seq[0]) + if v_init != 0.: + torch.fill_(v, v_init) + + x_seq_shape = x_seq.shape + x_seq = x_seq.flatten(2).permute(1, 2, 0) + # [T, N, *] -> [N, *, T] + + lava_neuron.voltage_state = v + spike = lava_neuron(x_seq).permute(2, 0, 1) + + v = lava_neuron.voltage_state.reshape(x_seq_shape[1:]) + spike = spike.reshape(x_seq_shape) + if unsqueeze_flag: + v = v.squeeze(1) + spike = spike.squeeze(1) + + return spike, v + +# ---------------------------------------- +# quantize function + +class _step_quantize(torch.autograd.Function): + @staticmethod + def forward(ctx, x, step): + return torch.round(x / step) * step + + @staticmethod + def backward(ctx, grad_output): + return grad_output, None + +def step_quantize(x: torch.Tensor, step: float = 1.): + """ + :param x: the input tensor + :type x: torch.Tensor + :param step: the quantize step + :type step: float + :return: quantized tensor + :rtype: torch.Tensor + + The step quantize function. Here is an example: + + .. 
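
# Round-trip sketch for the layout converters defined above: SpikingJelly stores sequences as
# [T, N, ...] while Lava expects [N, ..., T], and the two permutes are inverses of each other.
# The tensor shape below is an illustrative assumption.
import torch
from spikingjelly.clock_driven.lava_exchange import TNX_to_NXT, NXT_to_TNX

x_seq = torch.rand(4, 2, 3, 28, 28)            # [T, N, C, H, W]
x_lava = TNX_to_NXT(x_seq)                     # [N, C, H, W, T]
print(x_lava.shape)
print(torch.equal(NXT_to_TNX(x_lava), x_seq))  # True
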
code-block:: python + + # plt.style.use(['science', 'muted', 'grid']) + fig = plt.figure(dpi=200, figsize=(6, 4)) + x = torch.arange(-4, 4, 0.001) + plt.plot(x, lava_exchange.step_quantize(x, 2.), label='quantize(x, step=2)') + plt.plot(x, x, label='y=x', ls='-.') + plt.legend() + plt.grid(ls='--') + plt.title('step quantize') + plt.xlabel('Input') + plt.ylabel('Output') + plt.savefig('./docs/source/_static/API/clock_driven/lava_exchange/step_quantize.svg') + plt.savefig('./docs/source/_static/API/clock_driven/lava_exchange/step_quantize.pdf') + + .. image:: ./_static/API/clock_driven/lava_exchange/step_quantize.* + :width: 100% + + """ + return _step_quantize.apply(x, step) + + +def quantize_8bit(x: torch.Tensor, scale, descale=False): + if descale: + return step_quantize(x, 2. / scale).clamp(-256. / scale, 255. / scale) * scale + else: + return step_quantize(x, 2. / scale).clamp(-256. / scale, 255. / scale) + +# ---------------------------------------- +# convert function +def check_conv2d(conv2d_nn: nn.Conv2d): + if not isinstance(conv2d_nn, nn.Conv2d): + raise ValueError(f'expected conv2d_nn with type torch.nn.Conv2d, but got conv2d_nn with type {type(conv2d_nn)}!') + + if conv2d_nn.bias is not None: + raise ValueError('lava does not support for convolutional synapse with bias!') + +def check_fc(fc: nn.Linear): + if not isinstance(fc, nn.Linear): + raise ValueError(f'expected fc with type torch.nn.Linear, but got fc with type {type(fc)}!') + + if fc.bias is not None: + raise ValueError('lava does not support for dense synapse with bias!') + +def to_lava_neuron_param_dict(sj_ms_neuron: nn.Module): + if isinstance(sj_ms_neuron, neuron.MultiStepIFNode): + if sj_ms_neuron.v_reset != 0.: + raise ValueError('lava only supports for v_reset == 0!') + return { + 'threshold': sj_ms_neuron.v_threshold, + 'current_decay': 1., + 'voltage_decay': 0., + 'tau_grad': 1, 'scale_grad': 1, 'scale': sj_ms_neuron.lava_s_cale, + 'norm': None, 'dropout': None, + 'shared_param': True, 'persistent_state': True, 'requires_grad': False, + 'graded_spike': False + } + + elif isinstance(sj_ms_neuron, neuron.MultiStepLIFNode): + if sj_ms_neuron.v_reset != 0.: + raise ValueError('lava only supports for v_reset == 0!') + if sj_ms_neuron.decay_input: + raise ValueError('lava only supports for decay_input == False!') + return { + 'threshold': sj_ms_neuron.v_threshold, + 'current_decay': 1., + 'voltage_decay': 1. / sj_ms_neuron.tau, + 'tau_grad': 1, 'scale_grad': 1, 'scale': sj_ms_neuron.lava_s_cale, + 'norm': None, 'dropout': None, + 'shared_param': True, 'persistent_state': True, 'requires_grad': False, + 'graded_spike': False + } + else: + raise NotImplementedError(sj_ms_neuron) + + +def to_lava_neuron(sj_ms_neuron: nn.Module): + if isinstance(sj_ms_neuron, (neuron.MultiStepIFNode, neuron.MultiStepLIFNode)): + return slayer.neuron.cuba.Neuron( + **to_lava_neuron_param_dict(sj_ms_neuron) + ) + else: + raise NotImplementedError(sj_ms_neuron) + +def linear_to_lava_synapse_dense(fc: nn.Linear): + """ + :param fc: a pytorch linear layer without bias + :type fc: nn.Linear + :return: a lava slayer dense synapse + :rtype: slayer.synapse.Dense + + Codes example: + + .. 
code-block:: python + + T = 4 + N = 2 + layer_nn = nn.Linear(8, 4, bias=False) + layer_sl = lava_exchange.linear_to_lava_synapse_dense(layer_nn) + x_seq = torch.rand([T, N, 8]) + with torch.no_grad(): + y_nn = functional.seq_to_ann_forward(x_seq, layer_nn) + y_sl = lava_exchange.NXT_to_TNX(layer_sl(lava_exchange.TNX_to_NXT(x_seq))) + print('max error:', (y_nn - y_sl).abs().max()) + """ + check_fc(fc) + + dense_slayer = slayer.synapse.Dense(fc.in_features, fc.out_features) + + # `dense_slayer` is a `torch.torch.nn.Conv3d`. Its weight has shape [out_features, in_features, 1, 1, 1] + dense_slayer.weight.data[:, :, 0, 0, 0] = fc.weight.data.clone() + + return dense_slayer + +def conv2d_to_lava_synapse_conv(conv2d_nn: nn.Conv2d): + """ + :param conv2d_nn: a pytorch conv2d layer without bias + :type conv2d_nn: nn.Conv2d + :return: a lava slayer conv synapse + :rtype: slayer.synapse.Conv + + Codes example: + + .. code-block:: python + + T = 4 + N = 2 + layer_nn = nn.Conv2d(3, 8, kernel_size=3, stride=1, padding=1, bias=False) + layer_sl = lava_exchange.conv2d_to_lava_synapse_conv(layer_nn) + x_seq = torch.rand([T, N, 3, 28, 28]) + with torch.no_grad(): + y_nn = functional.seq_to_ann_forward(x_seq, layer_nn) + y_sl = lava_exchange.NXT_to_TNX(layer_sl(lava_exchange.TNX_to_NXT(x_seq))) + print('max error:', (y_nn - y_sl).abs().max()) + """ + check_conv2d(conv2d_nn) + + conv_slayer = slayer.synapse.Conv(in_features=conv2d_nn.in_channels, out_features=conv2d_nn.out_channels, kernel_size=conv2d_nn.kernel_size, stride=conv2d_nn.stride, padding=conv2d_nn.padding, dilation=conv2d_nn.dilation, groups=conv2d_nn.groups) + # `conv_slayer` is a `torch.torch.nn.Conv3d`. + conv_slayer.weight.data[:, :, :, :, 0] = conv2d_nn.weight.data.clone() + + return conv_slayer + +def avgpool2d_to_lava_synapse_pool(pool2d_nn: nn.AvgPool2d): + """ + :param pool2d_nn: a pytorch AvgPool2d layer + :type pool2d_nn: nn.AvgPool2d + :return: a lava slayer pool layer + :rtype: slayer.synapse.Pool + + .. admonition:: Warning + :class: warning + + The lava slayer pool layer applies sum pooling, rather than average pooling. + + .. code-block:: python + + T = 4 + N = 2 + layer_nn = nn.AvgPool2d(kernel_size=2, stride=2) + layer_sl = lava_exchange.avgpool2d_to_lava_synapse_pool(layer_nn) + x_seq = torch.rand([T, N, 3, 28, 28]) + with torch.no_grad(): + y_nn = functional.seq_to_ann_forward(x_seq, layer_nn) + y_sl = lava_exchange.NXT_to_TNX(layer_sl(lava_exchange.TNX_to_NXT(x_seq))) / 4. 
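+            # dividing by 4 (= kernel_size ** 2 for the 2x2 window used above) converts the
+            # sum-pooling output of the slayer Pool layer back into an average, so that
+            # y_sl can be compared directly with y_nn; this assumes kernel_size=2 as in layer_nn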
+ print('max error:', (y_nn - y_sl).abs().max()) + """ + if not isinstance(pool2d_nn, nn.AvgPool2d): + raise ValueError(f'expected pool2d_nn with type torch.nn.Conv2d, but got pool2d_nn with type {type(pool2d_nn)}!') + + return slayer.synapse.Pool(pool2d_nn.kernel_size, pool2d_nn.stride, pool2d_nn.padding) + +def to_lava_block_dense(fc: nn.Linear, sj_ms_neuron: nn.Module, quantize_to_8bit: bool = True): + + check_fc(fc) + + neuron_params = to_lava_neuron_param_dict(sj_ms_neuron) + if isinstance(sj_ms_neuron, (neuron.MultiStepIFNode, neuron.MultiStepLIFNode)): + block_init = slayer.block.cuba.Dense + else: + raise NotImplementedError(sj_ms_neuron) + + + if quantize_to_8bit: + # if 'pre_hook_fx' not in kwargs.keys(), then `pre_hook_fx` will be set to `quantize_8bit` by default + block_lava = block_init(neuron_params, fc.in_features, fc.out_features, delay_shift=False) + else: + block_lava = block_init(neuron_params, fc.in_features, fc.out_features, delay_shift=False, pre_hook_fx=None) + + block_lava.synapse.weight.data[:, :, 0, 0, 0] = fc.weight.data.clone() + + return block_lava + + +def to_lava_block_conv(conv2d_nn: nn.Conv2d, sj_ms_neuron: nn.Module, quantize_to_8bit: bool = True): + + check_conv2d(conv2d_nn) + + neuron_params = to_lava_neuron_param_dict(sj_ms_neuron) + if isinstance(sj_ms_neuron, (neuron.MultiStepIFNode, neuron.MultiStepLIFNode)): + block_init = slayer.block.cuba.Conv + else: + raise NotImplementedError(sj_ms_neuron) + + if quantize_to_8bit: + # if 'pre_hook_fx' not in kwargs.keys(), then `pre_hook_fx` will be set to `quantize_8bit` by default + block_lava = block_init(neuron_params, in_features=conv2d_nn.in_channels, out_features=conv2d_nn.out_channels, kernel_size=conv2d_nn.kernel_size, stride=conv2d_nn.stride, padding=conv2d_nn.padding, dilation=conv2d_nn.dilation, groups=conv2d_nn.groups, delay_shift=False) + else: + block_lava = block_init(neuron_params, in_features=conv2d_nn.in_channels, out_features=conv2d_nn.out_channels, kernel_size=conv2d_nn.kernel_size, stride=conv2d_nn.stride, padding=conv2d_nn.padding, dilation=conv2d_nn.dilation, groups=conv2d_nn.groups, delay_shift=False, pre_hook_fx=None) + + block_lava.synapse.weight.data[:, :, :, :, 0] = conv2d_nn.weight.data.clone() + + return block_lava + + +def to_lava_block_flatten(flatten_nn: nn.Flatten): + if flatten_nn.start_dim != 1: + raise ValueError('lava only supports for flatten_nn.start_dim == 1!') + return slayer.block.cuba.Flatten() + + + diff --git a/spikingjelly/clock_driven/layer.py b/spikingjelly/clock_driven/layer.py index d7642f3..5330d17 100644 --- a/spikingjelly/clock_driven/layer.py +++ b/spikingjelly/clock_driven/layer.py @@ -3,8 +3,11 @@ import torch.nn as nn import torch.nn.functional as F import math from . import base, functional +from torch import Tensor from torch.nn.common_types import _size_2_t from typing import Callable +from torch.nn.modules.batchnorm import _BatchNorm + class NeuNorm(base.MemoryModule): def __init__(self, in_channels, height, width, k=0.9, shared_across_channels=False): @@ -71,12 +74,12 @@ class NeuNorm(base.MemoryModule): self.k0 = k self.k1 = (1. 
- self.k0) / in_channels ** 2 if shared_across_channels: - self.w = nn.Parameter(torch.Tensor(1, height, width)) + self.w = nn.Parameter(Tensor(1, height, width)) else: - self.w = nn.Parameter(torch.Tensor(in_channels, height, width)) + self.w = nn.Parameter(Tensor(in_channels, height, width)) nn.init.kaiming_uniform_(self.w, a=math.sqrt(5)) - def forward(self, in_spikes: torch.Tensor): + def forward(self, in_spikes: Tensor): self.x = self.k0 * self.x + self.k1 * in_spikes.sum(dim=1, keepdim=True) # x.shape = [batch_size, 1, height, width] return in_spikes - self.w * self.x @@ -119,7 +122,7 @@ class DCT(nn.Module): else: self.kernel[i][j] = math.sqrt(2 / kernel_size) * math.cos((j + 0.5) * math.pi * i / kernel_size) - def forward(self, x: torch.Tensor): + def forward(self, x: Tensor): if self.kernel.device != x.device: self.kernel = self.kernel.to(x.device) x_shape = x.shape @@ -160,10 +163,10 @@ class AXAT(nn.Module): The input will be regarded as a batch of tensors with ``shape = [in_features, in_features]``. """ super().__init__() - self.A = nn.Parameter(torch.Tensor(out_features, in_features)) + self.A = nn.Parameter(Tensor(out_features, in_features)) nn.init.kaiming_uniform_(self.A, a=math.sqrt(5)) - def forward(self, x: torch.Tensor): + def forward(self, x: Tensor): x_shape = list(x.shape) x = x.view(-1, x_shape[-2], x_shape[-1]) x = self.A.matmul(x).matmul(self.A.t()) @@ -241,10 +244,10 @@ class Dropout(base.MemoryModule): def extra_repr(self): return f'p={self.p}' - def create_mask(self, x: torch.Tensor): + def create_mask(self, x: Tensor): self.mask = F.dropout(torch.ones_like(x.data), self.p, training=True) - def forward(self, x: torch.Tensor): + def forward(self, x: Tensor): if self.training: if self.mask is None: self.create_mask(x) @@ -284,7 +287,7 @@ class Dropout2d(Dropout): """ super().__init__(p) - def create_mask(self, x: torch.Tensor): + def create_mask(self, x: Tensor): self.mask = F.dropout2d(torch.ones_like(x.data), self.p, training=True) @@ -321,7 +324,7 @@ class MultiStepDropout(Dropout): """ super().__init__(p) - def forward(self, x_seq: torch.Tensor): + def forward(self, x_seq: Tensor): if self.training: if self.mask is None: self.create_mask(x_seq[0]) @@ -364,7 +367,7 @@ class MultiStepDropout2d(Dropout2d): """ super().__init__(p) - def forward(self, x_seq: torch.Tensor): + def forward(self, x_seq: Tensor): if self.training: if self.mask is None: self.create_mask(x_seq[0]) @@ -526,7 +529,7 @@ class SynapseFilter(base.MemoryModule): return f'tau={tau}, learnable={self.learnable}' - def forward(self, in_spikes: torch.Tensor): + def forward(self, in_spikes: Tensor): if self.learnable: inv_tau = self.w.sigmoid() else: @@ -536,6 +539,7 @@ class SynapseFilter(base.MemoryModule): return self.out_i + class ChannelsPool(nn.Module): def __init__(self, pool: nn.MaxPool1d or nn.AvgPool1d): """ @@ -578,7 +582,7 @@ class ChannelsPool(nn.Module): super().__init__() self.pool = pool - def forward(self, x: torch.Tensor): + def forward(self, x: Tensor): x_shape = x.shape return self.pool(x.flatten(2).permute(0, 2, 1)).permute(0, 2, 1).view((x_shape[0], -1) + x_shape[2:]) @@ -660,9 +664,9 @@ class DropConnectLinear(base.MemoryModule): super().__init__() self.in_features = in_features self.out_features = out_features - self.weight = nn.Parameter(torch.Tensor(out_features, in_features)) + self.weight = nn.Parameter(Tensor(out_features, in_features)) if bias: - self.bias = nn.Parameter(torch.Tensor(out_features)) + self.bias = nn.Parameter(Tensor(out_features)) else: 
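            # bias is disabled: register 'bias' as None so the module still exposes a bias attribute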
self.register_parameter('bias', None) @@ -738,7 +742,7 @@ class DropConnectLinear(base.MemoryModule): # self.dropped_b = mask_b.to(self.bias) * self.bias self.dropped_b = self.bias * mask_b - def forward(self, input: torch.Tensor) -> torch.Tensor: + def forward(self, input: Tensor) -> Tensor: if self.training: if self.invariant: if self.dropped_w is None: @@ -809,21 +813,15 @@ class MultiStepContainer(nn.Sequential): """ super().__init__(*args) - def forward(self, x_seq: torch.Tensor): + def forward(self, x_seq: Tensor): """ :param x_seq: shape=[T, batch_size, ...] - :type x_seq: torch.Tensor + :type x_seq: Tensor :return: y_seq, shape=[T, batch_size, ...] - :rtype: torch.Tensor + :rtype: Tensor """ - y_seq = [] - for t in range(x_seq.shape[0]): - y_seq.append(super().forward(x_seq[t])) - for t in range(y_seq.__len__()): - # y_seq[t].unsqueeze_(0) - y_seq[t] = y_seq[t].unsqueeze(0) - return torch.cat(y_seq, 0) + return functional.multi_step_forward(x_seq, self) class SeqToANNContainer(nn.Sequential): @@ -869,17 +867,14 @@ class SeqToANNContainer(nn.Sequential): """ super().__init__(*args) - def forward(self, x_seq: torch.Tensor): + def forward(self, x_seq: Tensor): """ :param x_seq: shape=[T, batch_size, ...] - :type x_seq: torch.Tensor + :type x_seq: Tensor :return: y_seq, shape=[T, batch_size, ...] - :rtype: torch.Tensor + :rtype: Tensor """ - y_shape = [x_seq.shape[0], x_seq.shape[1]] - y_seq = super().forward(x_seq.flatten(0, 1)) - y_shape.extend(y_seq.shape[1:]) - return y_seq.view(y_shape) + return functional.seq_to_ann_forward(x_seq, self) class STDPLearner(base.MemoryModule): @@ -970,7 +965,7 @@ class STDPLearner(base.MemoryModule): self.f_post = f_post @torch.no_grad() - def stdp(self, s_pre: torch.Tensor, s_post: torch.Tensor, module: nn.Module, learning_rate: float): + def stdp(self, s_pre: Tensor, s_post: Tensor, module: nn.Module, learning_rate: float): if isinstance(module, nn.Linear): # update trace self.trace_pre += - self.trace_pre / self.tau_pre + s_pre @@ -1009,7 +1004,7 @@ class PrintShapeModule(nn.Module): super().__init__() self.ext_str = ext_str - def forward(self, x: torch.Tensor): + def forward(self, x: Tensor): print(self.ext_str, x.shape) return x @@ -1064,20 +1059,20 @@ class ConvBatchNorm2d(nn.Module): self.bn = nn.BatchNorm2d(num_features=out_channels, eps=eps, momentum=momentum, affine=affine, track_running_stats=track_running_stats) - def forward(self, x: torch.Tensor): + def forward(self, x: Tensor): return self.bn(self.conv(x)) def get_fused_weight(self): """ :return: the weight of this fused module - :rtype: torch.Tensor + :rtype: Tensor """ return functional.fused_conv2d_weight_of_convbn2d(self.conv, self.bn) def get_fused_bias(self): """ :return: the bias of this fused module - :rtype: torch.Tensor + :rtype: Tensor """ return functional.fused_conv2d_bias_of_convbn2d(self.conv, self.bn) @@ -1108,6 +1103,7 @@ class ConvBatchNorm2d(nn.Module): def get_fused_conv(self): return functional.fuse_convbn2d(self.conv, self.bn) + class ElementWiseRecurrentContainer(base.MemoryModule): def __init__(self, sub_module: nn.Module, element_wise_function: Callable): """ @@ -1150,7 +1146,7 @@ class ElementWiseRecurrentContainer(base.MemoryModule): self.element_wise_function = element_wise_function self.register_memory('y', None) - def forward(self, x: torch.Tensor): + def forward(self, x: Tensor): if self.y is None: self.y = torch.zeros_like(x.data) self.y = self.sub_module(self.element_wise_function(self.y, x)) @@ -1159,6 +1155,7 @@ class 
ElementWiseRecurrentContainer(base.MemoryModule): def extra_repr(self) -> str: return f'element-wise function={self.element_wise_function}' + class LinearRecurrentContainer(base.MemoryModule): def __init__(self, sub_module: nn.Module, in_features: int, out_features: int, bias: bool = True) -> None: """ @@ -1214,7 +1211,7 @@ class LinearRecurrentContainer(base.MemoryModule): self.sub_module = sub_module self.register_memory('y', None) - def forward(self, x: torch.Tensor): + def forward(self, x: Tensor): if self.y is None: if x.ndim == 2: self.y = torch.zeros([x.shape[0], self.sub_module_out_features]).to(x) @@ -1225,4 +1222,211 @@ class LinearRecurrentContainer(base.MemoryModule): self.y = torch.zeros(out_shape).to(x) x = torch.cat((x, self.y), dim=-1) self.y = self.sub_module(self.rc(x)) - return self.y \ No newline at end of file + return self.y +class _MultiStepThresholdDependentBatchNormBase(_BatchNorm): + def __init__(self, alpha: float, v_th: float, *args, **kwargs): + super().__init__(*args, **kwargs) + self.alpha = alpha + self.v_th = v_th + assert self.affine, "ThresholdDependentBatchNorm needs to set `affine = True`!" + torch.nn.init.constant_(self.weight, alpha * v_th) + + def forward(self, x_seq): + y_shape = [x_seq.shape[0], x_seq.shape[1]] + y = x_seq.flatten(0, 1) + y = super().forward(y) + y_shape.extend(y.shape[1:]) + return y.view(y_shape) + + +class MultiStepThresholdDependentBatchNorm1d(_MultiStepThresholdDependentBatchNormBase): + def __init__(self, alpha: float, v_th: float, *args, **kwargs): + """ + * :ref:`API in English ` + + .. _MultiStepThresholdDependentBatchNorm1d.__init__-cn: + + :param alpha: 由网络结构决定的超参数 + :type alpha: float + :param v_th: 下一个脉冲神经元层的阈值 + :type v_th: float + + ``*args, **kwargs`` 中的参数与 :class:`torch.nn.BatchNorm1d` 的参数相同。 + + `Going Deeper With Directly-Trained Larger Spiking Neural Networks `_ 一文提出 + 的Threshold-Dependent Batch Normalization (tdBN)。 + + * :ref:`中文API ` + + .. _MultiStepThresholdDependentBatchNorm1d.__init__-en: + + :param alpha: the hyper-parameter depending on network structure + :type alpha: float + :param v_th: the threshold of next spiking neurons layer + :type v_th: float + + Other parameters in ``*args, **kwargs`` are same with those of :class:`torch.nn.BatchNorm1d`. + + The Threshold-Dependent Batch Normalization (tdBN) proposed in `Going Deeper With Directly-Trained Larger Spiking Neural Networks `_. + """ + super().__init__(alpha, v_th, *args, **kwargs) + + def _check_input_dim(self, x): + if x.dim() != 2 and x.dim() != 3: + raise ValueError( + f'expected 3D or 4D input with shape [T, N, C] or [T, N, C, M], but got input with shape {x.shape}') + + +class MultiStepThresholdDependentBatchNorm2d(_MultiStepThresholdDependentBatchNormBase): + def __init__(self, alpha: float, v_th: float, *args, **kwargs): + """ + * :ref:`API in English ` + + .. _MultiStepThresholdDependentBatchNorm2d.__init__-cn: + + :param alpha: 由网络结构决定的超参数 + :type alpha: float + :param v_th: 下一个脉冲神经元层的阈值 + :type v_th: float + + ``*args, **kwargs`` 中的参数与 :class:`torch.nn.BatchNorm2d` 的参数相同。 + + `Going Deeper With Directly-Trained Larger Spiking Neural Networks `_ 一文提出 + 的Threshold-Dependent Batch Normalization (tdBN)。 + + * :ref:`中文API ` + + .. 
_MultiStepThresholdDependentBatchNorm2d.__init__-en: + + :param alpha: the hyper-parameter depending on network structure + :type alpha: float + :param v_th: the threshold of next spiking neurons layer + :type v_th: float + + Other parameters in ``*args, **kwargs`` are same with those of :class:`torch.nn.BatchNorm2d`. + + The Threshold-Dependent Batch Normalization (tdBN) proposed in `Going Deeper With Directly-Trained Larger Spiking Neural Networks `_. + """ + super().__init__(alpha, v_th, *args, **kwargs) + + def _check_input_dim(self, x): + if x.dim() != 4: + raise ValueError(f'expected 5D input with shape [T, N, C, H, W], but got input with shape {x.shape}') + + +class MultiStepThresholdDependentBatchNorm3d(_MultiStepThresholdDependentBatchNormBase): + def __init__(self, alpha: float, v_th: float, *args, **kwargs): + """ + * :ref:`API in English ` + + .. _MultiStepThresholdDependentBatchNorm3d.__init__-cn: + + :param alpha: 由网络结构决定的超参数 + :type alpha: float + :param v_th: 下一个脉冲神经元层的阈值 + :type v_th: float + + ``*args, **kwargs`` 中的参数与 :class:`torch.nn.BatchNorm3d` 的参数相同。 + + `Going Deeper With Directly-Trained Larger Spiking Neural Networks `_ 一文提出 + 的Threshold-Dependent Batch Normalization (tdBN)。 + + * :ref:`中文API ` + + .. _MultiStepThresholdDependentBatchNorm3d.__init__-en: + + :param alpha: the hyper-parameter depending on network structure + :type alpha: float + :param v_th: the threshold of next spiking neurons layer + :type v_th: float + + Other parameters in ``*args, **kwargs`` are same with those of :class:`torch.nn.BatchNorm3d`. + + The Threshold-Dependent Batch Normalization (tdBN) proposed in `Going Deeper With Directly-Trained Larger Spiking Neural Networks `_. + """ + super().__init__(alpha, v_th, *args, **kwargs) + + def _check_input_dim(self, x): + if x.dim() != 5: + raise ValueError(f'expected 6D input with shape [T, N, C, D, H, W], but got input with shape {x.shape}') + + +class MultiStepTemporalWiseAttention(nn.Module): + def __init__(self, T: int, reduction: int = 16, dimension: int = 4): + """ + * :ref:`API in English ` + + .. _MultiStepTemporalWiseAttention.__init__-cn: + + :param T: 输入数据的时间步长 + + :param reduction: 压缩比 + + :param dimension: 输入数据的维度。当输入数据为[T, N, C, H, W]时, dimension = 4;输入数据维度为[T, N, L]时,dimension = 2。 + + `Temporal-Wise Attention Spiking Neural Networks for Event Streams Classification `_ 中提出 + 的MultiStepTemporalWiseAttention层。MultiStepTemporalWiseAttention层必须放在二维卷积层之后脉冲神经元之前,例如: + + ``Conv2d -> MultiStepTemporalWiseAttention -> LIF`` + + 输入的尺寸是 ``[T, N, C, H, W]`` 或者 ``[T, N, L]`` ,经过MultiStepTemporalWiseAttention层,输出为 ``[T, N, C, H, W]`` 或者 ``[T, N, L]`` 。 + + ``reduction`` 是压缩比,相当于论文中的 :math:`r`。 + + * :ref:`中文API ` + + .. _MultiStepTemporalWiseAttention.__init__-en: + + :param T: timewindows of input + + :param reduction: reduction ratio + + :param dimension: Dimensions of input. If the input dimension is [T, N, C, H, W], dimension = 4; when the input dimension is [T, N, L], dimension = 2. + + The MultiStepTemporalWiseAttention layer is proposed in `Temporal-Wise Attention Spiking Neural Networks for Event Streams Classification `_. + + It should be placed after the convolution layer and before the spiking neurons, e.g., + + ``Conv2d -> MultiStepTemporalWiseAttention -> LIF`` + + The dimension of the input is ``[T, N, C, H, W]`` or ``[T, N, L]`` , after the MultiStepTemporalWiseAttention layer, the output dimension is ``[T, N, C, H, W]`` or ``[T, N, L]`` . + + ``reduction`` is the reduction ratio,which is :math:`r` in the paper. 
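+
+        A minimal usage sketch (the shapes and ``reduction`` value below are illustrative
+        assumptions, not values taken from the paper):
+
+        .. code-block:: python
+
+            T, N, C, H, W = 8, 4, 16, 32, 32
+            ta = MultiStepTemporalWiseAttention(T=T, reduction=4, dimension=4)
+            x_seq = torch.rand([T, N, C, H, W])
+            y_seq = ta(x_seq)  # y_seq.shape == [T, N, C, H, W]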
+ + """ + super().__init__() + assert dimension == 4 or dimension == 2, 'dimension must be 4 or 2' + + self.dimension = dimension + + # Sequence + if self.dimension == 2: + self.avg_pool = nn.AdaptiveAvgPool1d(1) + self.max_pool = nn.AdaptiveMaxPool1d(1) + elif self.dimension == 4: + self.avg_pool = nn.AdaptiveAvgPool3d(1) + self.max_pool = nn.AdaptiveMaxPool3d(1) + + assert T >= reduction, 'reduction cannot be greater than T' + + # Excitation + self.sharedMLP = nn.Sequential( + nn.Linear(T, T // reduction, bias=False), + nn.ReLU(), + nn.Linear(T // reduction, T, bias=False) + ) + + self.sigmoid = nn.Sigmoid() + + def forward(self, x_seq: torch.Tensor): + assert x_seq.dim() == 3 or x_seq.dim() == 5, ValueError(f'expected 3D or 5D input with shape [T, N, M] or [T, N, C, H, W], but got input with shape {x_seq.shape}') + x_seq = x_seq.transpose(0, 1) + avgout = self.sharedMLP(self.avg_pool(x_seq).view([x_seq.shape[0], x_seq.shape[1]])) + maxout = self.sharedMLP(self.max_pool(x_seq).view([x_seq.shape[0], x_seq.shape[1]])) + scores = self.sigmoid(avgout + maxout) + if self.dimension == 2: + y_seq = x_seq * scores[:, :, None] + elif self.dimension == 4: + y_seq = x_seq * scores[:, :, None, None, None] + y_seq = y_seq.transpose(0, 1) + return y_seq diff --git a/spikingjelly/clock_driven/model/parametric_lif_net.py b/spikingjelly/clock_driven/model/parametric_lif_net.py index 1361831..7d8394d 100644 --- a/spikingjelly/clock_driven/model/parametric_lif_net.py +++ b/spikingjelly/clock_driven/model/parametric_lif_net.py @@ -15,9 +15,7 @@ class VotingLayer(nn.Module): self.voting_size = voting_size def forward(self, x: torch.Tensor): - x.unsqueeze_(1) # [N, C] -> [N, 1, C] - y = F.avg_pool1d(x, self.voting_size, self.voting_size) - y.squeeze_(1) + y = F.avg_pool1d(x.unsqueeze(1), self.voting_size, self.voting_size).squeeze(1) return y diff --git a/spikingjelly/clock_driven/model/sew_resnet.py b/spikingjelly/clock_driven/model/sew_resnet.py index ee2bd01..b8b992a 100644 --- a/spikingjelly/clock_driven/model/sew_resnet.py +++ b/spikingjelly/clock_driven/model/sew_resnet.py @@ -1,8 +1,11 @@ import torch import torch.nn as nn from .. import functional -from torchvision.models.utils import load_state_dict_from_url - +try: + from torchvision.models.utils import load_state_dict_from_url +except ImportError: + from torchvision._internally_replaced_utils import load_state_dict_from_url + __all__ = ['SEWResNet', 'sew_resnet18', 'sew_resnet34', 'sew_resnet50', 'sew_resnet101', 'sew_resnet152', 'sew_resnext50_32x4d', 'sew_resnext101_32x8d', 'sew_wide_resnet50_2', 'sew_wide_resnet101_2', diff --git a/spikingjelly/clock_driven/model/spiking_resnet.py b/spikingjelly/clock_driven/model/spiking_resnet.py index 357bb24..95ba28f 100644 --- a/spikingjelly/clock_driven/model/spiking_resnet.py +++ b/spikingjelly/clock_driven/model/spiking_resnet.py @@ -1,7 +1,10 @@ import torch import torch.nn as nn from .. 
import functional -from torchvision.models.utils import load_state_dict_from_url +try: + from torchvision.models.utils import load_state_dict_from_url +except ImportError: + from torchvision._internally_replaced_utils import load_state_dict_from_url __all__ = ['SpikingResNet', 'spiking_resnet18', 'spiking_resnet34', 'spiking_resnet50', 'spiking_resnet101', 'spiking_resnet152', 'spiking_resnext50_32x4d', 'spiking_resnext101_32x8d', diff --git a/spikingjelly/clock_driven/model/spiking_vgg.py b/spikingjelly/clock_driven/model/spiking_vgg.py index 3069e6b..757db89 100644 --- a/spikingjelly/clock_driven/model/spiking_vgg.py +++ b/spikingjelly/clock_driven/model/spiking_vgg.py @@ -1,8 +1,10 @@ import torch import torch.nn as nn from spikingjelly.clock_driven import functional, neuron -from torchvision.models.utils import load_state_dict_from_url - +try: + from torchvision.models.utils import load_state_dict_from_url +except ImportError: + from torchvision._internally_replaced_utils import load_state_dict_from_url __all__ = [ 'SpikingVGG', 'MultiStepSpikingVGG', @@ -356,7 +358,7 @@ def spiking_vgg16(pretrained=False, progress=True, single_step_neuron: callable A spiking version of VGG-16 model from `"Very Deep Convolutional Networks for Large-Scale Image Recognition" `_ """ - return _spiking_vgg('vgg16', 'C', False, pretrained, progress, None, single_step_neuron, **kwargs) + return _spiking_vgg('vgg16', 'D', False, pretrained, progress, None, single_step_neuron, **kwargs) def multi_step_spiking_vgg16(pretrained=False, progress=True, T: int = None, multi_step_neuron: callable = None, **kwargs): @@ -377,7 +379,7 @@ def multi_step_spiking_vgg16(pretrained=False, progress=True, T: int = None, mul A multi-step spiking version of VGG-16 model from `"Very Deep Convolutional Networks for Large-Scale Image Recognition" `_ """ - return _multi_step_spiking_vgg('vgg16', 'C', False, pretrained, progress, None, T, multi_step_neuron, **kwargs) + return _multi_step_spiking_vgg('vgg16', 'D', False, pretrained, progress, None, T, multi_step_neuron, **kwargs) def spiking_vgg16_bn(pretrained=False, progress=True, norm_layer: callable = None, single_step_neuron: callable = None, **kwargs): @@ -398,7 +400,7 @@ def spiking_vgg16_bn(pretrained=False, progress=True, norm_layer: callable = Non A spiking version of VGG-16-BN model from `"Very Deep Convolutional Networks for Large-Scale Image Recognition" `_ """ - return _spiking_vgg('vgg16', 'C', True, pretrained, progress, norm_layer, single_step_neuron, **kwargs) + return _spiking_vgg('vgg16', 'D', True, pretrained, progress, norm_layer, single_step_neuron, **kwargs) def multi_step_spiking_vgg16_bn(pretrained=False, progress=True, norm_layer: callable = None, T: int = None, multi_step_neuron: callable = None, **kwargs): @@ -421,7 +423,7 @@ def multi_step_spiking_vgg16_bn(pretrained=False, progress=True, norm_layer: cal A multi-step spiking version of VGG-16-BN model from `"Very Deep Convolutional Networks for Large-Scale Image Recognition" `_ """ - return _multi_step_spiking_vgg('vgg16', 'C', True, pretrained, progress, norm_layer, T, multi_step_neuron, **kwargs) + return _multi_step_spiking_vgg('vgg16', 'D', True, pretrained, progress, norm_layer, T, multi_step_neuron, **kwargs) def spiking_vgg19(pretrained=False, progress=True, single_step_neuron: callable = None, **kwargs): @@ -440,7 +442,7 @@ def spiking_vgg19(pretrained=False, progress=True, single_step_neuron: callable A spiking version of VGG-19 model from `"Very Deep Convolutional Networks for Large-Scale Image 
Recognition" `_ """ - return _spiking_vgg('vgg19', 'D', False, pretrained, progress, None, single_step_neuron, **kwargs) + return _spiking_vgg('vgg19', 'E', False, pretrained, progress, None, single_step_neuron, **kwargs) def multi_step_spiking_vgg19(pretrained=False, progress=True, T: int = None, multi_step_neuron: callable = None, **kwargs): @@ -461,7 +463,7 @@ def multi_step_spiking_vgg19(pretrained=False, progress=True, T: int = None, mul A multi-step spiking version of VGG-19 model from `"Very Deep Convolutional Networks for Large-Scale Image Recognition" `_ """ - return _multi_step_spiking_vgg('vgg19', 'D', False, pretrained, progress, None, T, multi_step_neuron, **kwargs) + return _multi_step_spiking_vgg('vgg19', 'E', False, pretrained, progress, None, T, multi_step_neuron, **kwargs) def spiking_vgg19_bn(pretrained=False, progress=True, norm_layer: callable = None, single_step_neuron: callable = None, **kwargs): @@ -482,7 +484,7 @@ def spiking_vgg19_bn(pretrained=False, progress=True, norm_layer: callable = Non A spiking version of VGG-19-BN model from `"Very Deep Convolutional Networks for Large-Scale Image Recognition" `_ """ - return _spiking_vgg('vgg19', 'D', True, pretrained, progress, norm_layer, single_step_neuron, **kwargs) + return _spiking_vgg('vgg19', 'E', True, pretrained, progress, norm_layer, single_step_neuron, **kwargs) def multi_step_spiking_vgg19_bn(pretrained=False, progress=True, norm_layer: callable = None, T: int = None, multi_step_neuron: callable = None, **kwargs): @@ -505,5 +507,5 @@ def multi_step_spiking_vgg19_bn(pretrained=False, progress=True, norm_layer: cal A multi-step spiking version of VGG-19-BN model from `"Very Deep Convolutional Networks for Large-Scale Image Recognition" `_ """ - return _multi_step_spiking_vgg('vgg19', 'D', True, pretrained, progress, norm_layer, T, multi_step_neuron, **kwargs) + return _multi_step_spiking_vgg('vgg19', 'E', True, pretrained, progress, norm_layer, T, multi_step_neuron, **kwargs) diff --git a/spikingjelly/clock_driven/neuron.py b/spikingjelly/clock_driven/neuron.py index 72da7fc..a32005f 100644 --- a/spikingjelly/clock_driven/neuron.py +++ b/spikingjelly/clock_driven/neuron.py @@ -1,15 +1,37 @@ from abc import abstractmethod -from typing import Callable +from typing import Callable, overload import torch import torch.nn as nn -from . import surrogate, base +from . import surrogate, base, lava_exchange +from .. import configure import math +import numpy as np +import logging try: import cupy from . import neuron_kernel, cu_kernel_opt -except ImportError: +except BaseException as e: + logging.info(f'spikingjelly.clock_driven.neuron: {e}') + cupy = None neuron_kernel = None + cu_kernel_opt = None +try: + import lava.lib.dl.slayer as slayer + +except BaseException as e: + logging.info(f'spikingjelly.clock_driven.neuron: {e}') + slayer = None + +def check_backend(backend: str): + if backend == 'torch': + return + elif backend == 'cupy': + assert cupy is not None, 'CuPy is not installed! You can install it from "https://github.com/cupy/cupy".' + elif backend == 'lava': + assert slayer is not None, 'Lava-DL is not installed! You can install it from "https://github.com/lava-nc/lava-dl".' + else: + raise NotImplementedError(backend) class BaseNode(base.MemoryModule): def __init__(self, v_threshold: float = 1., v_reset: float = 0., @@ -60,13 +82,11 @@ class BaseNode(base.MemoryModule): if v_reset is None: self.register_memory('v', 0.) - self.register_memory('spike', 0.) 
else: self.register_memory('v', v_reset) - self.register_memory('spike', 0.) - self.v_threshold = v_threshold - self.v_reset = v_reset + self.register_memory('v_threshold', v_threshold) + self.register_memory('v_reset', v_reset) self.detach_reset = detach_reset self.surrogate_function = surrogate_function @@ -105,9 +125,9 @@ class BaseNode(base.MemoryModule): Calculate out spikes of neurons by their current membrane potential and threshold voltage. """ - self.spike = self.surrogate_function(self.v - self.v_threshold) + return self.surrogate_function(self.v - self.v_threshold) - def neuronal_reset(self): + def neuronal_reset(self, spike): """ * :ref:`API in English ` @@ -123,17 +143,17 @@ class BaseNode(base.MemoryModule): Reset the membrane potential according to neurons' output spikes. """ if self.detach_reset: - spike = self.spike.detach() + spike_d = spike.detach() else: - spike = self.spike + spike_d = spike if self.v_reset is None: # soft reset - self.v = self.v - spike * self.v_threshold + self.v = self.v - spike_d * self.v_threshold else: # hard reset - self.v = (1. - spike) * self.v + spike * self.v_reset + self.v = (1. - spike_d) * self.v + spike_d * self.v_reset def extra_repr(self): return f'v_threshold={self.v_threshold}, v_reset={self.v_reset}, detach_reset={self.detach_reset}' @@ -167,14 +187,50 @@ class BaseNode(base.MemoryModule): """ self.neuronal_charge(x) - self.neuronal_fire() - self.neuronal_reset() - return self.spike + spike = self.neuronal_fire() + self.neuronal_reset(spike) + return spike + +class AdaptiveBaseNode(BaseNode): + def __init__(self, v_threshold: float = 1., v_reset: float = 0., + v_rest: float = 0., w_rest: float = 0, tau_w: float = 2., a: float = 0., b: float = 0., + surrogate_function: Callable = surrogate.Sigmoid(), detach_reset: bool = False): + # b: jump amplitudes + # a: subthreshold coupling + assert isinstance(w_rest, float) + assert isinstance(v_rest, float) + assert isinstance(tau_w, float) + assert isinstance(a, float) + assert isinstance(b, float) + + super.__init__(v_threshold, v_reset, surrogate_function, detach_reset) + + self.register_memory('w', w_rest) + self.w_rest = w_rest + self.v_rest = v_rest + self.tau_w = tau_w + self.a = a + self.b = b + + + def neuronal_adaptation(self, spike): + self.w = self.w + 1. / self.tau_w * (self.a * (self.v - self.v_rest) - self.w) + self.b * spike + + def extra_repr(self): + return super().extra_repr() + f', v_rest={self.v_rest}, w_rest={self.w_rest}, tau_w={self.tau_w}, a={self.a}, b={self.b}' + + @overload + def forward(self, x: torch.Tensor): + self.neuronal_charge(x) + spike = self.neuronal_fire() + self.neuronal_adaptation(spike) + self.neuronal_reset(spike) + return spike class IFNode(BaseNode): def __init__(self, v_threshold: float = 1., v_reset: float = 0., - surrogate_function: Callable = surrogate.Sigmoid(), detach_reset: bool = False): + surrogate_function: Callable = surrogate.Sigmoid(), detach_reset: bool = False, cupy_fp32_inference=False): """ * :ref:`API in English ` @@ -193,6 +249,9 @@ class IFNode(BaseNode): :param detach_reset: 是否将reset过程的计算图分离 :type detach_reset: bool + :param cupy_fp32_inference: 若为 `True`,在 `eval` 模式下,使用float32,却在GPU上运行,并且 `cupy` 已经安装,则会自动使用 `cupy` 进行加速 + :type cupy_fp32_inference: bool + Integrate-and-Fire 神经元模型,可以看作理想积分器,无输入时电压保持恒定,不会像LIF神经元那样衰减。其阈下神经动力学方程为: .. 
math:: @@ -215,21 +274,108 @@ class IFNode(BaseNode): :param detach_reset: whether detach the computation graph of reset :type detach_reset: bool + :param cupy_fp32_inference: If `True`, if this module is in `eval` mode, using float32, running on GPU, and `cupy` is installed, then this + module will use `cupy` to accelerate + :type cupy_fp32_inference: bool + The Integrate-and-Fire neuron, which can be seen as a ideal integrator. The voltage of the IF neuron will not decay as that of the LIF neuron. The subthreshold neural dynamics of it is as followed: .. math:: V[t] = V[t-1] + X[t] + """ super().__init__(v_threshold, v_reset, surrogate_function, detach_reset) + if cupy_fp32_inference: + check_backend('cupy') + self.cupy_fp32_inference = cupy_fp32_inference + def neuronal_charge(self, x: torch.Tensor): self.v = self.v + x + def forward(self, x: torch.Tensor): + if self.cupy_fp32_inference and cupy is not None and not self.training and x.dtype == torch.float32: + # cupy is installed && eval mode && fp32 + device_id = x.get_device() + if device_id < 0: + return super().forward(x) + + # use cupy to accelerate + if isinstance(self.v, float): + v = torch.zeros_like(x) + if self.v != 0.: + torch.fill_(v, self.v) + self.v = v + + if self.v_reset is None: + hard_reset = False + else: + hard_reset = True + + code = rf''' + extern "C" __global__ + void IFNode_{'hard' if hard_reset else 'soft'}_reset_inference_forward( + const float * x, const float & v_threshold, {'const float & v_reset,' if hard_reset else ''} + float * spike, float * v, + const int & numel) + ''' + + code += r''' + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index < numel) + { + v[index] += x[index]; + spike[index] = (float) (v[index] >= v_threshold); + ''' + + code += rf''' + {'v[index] = (1.0f - spike[index]) * v[index] + spike[index] * v_reset;' if hard_reset else 'v[index] -= spike[index] * v_threshold;'} + ''' + + code += r''' + } + } + ''' + if hasattr(self, 'cp_kernel'): + if self.cp_kernel.code != code: + # replace codes + del self.cp_kernel + self.cp_kernel = cupy.RawKernel(code, f"IFNode_{'hard' if hard_reset else 'soft'}_reset_inference_forward", options=configure.cuda_compiler_options, backend=configure.cuda_compiler_backend) + else: + self.cp_kernel = cupy.RawKernel(code, f"IFNode_{'hard' if hard_reset else 'soft'}_reset_inference_forward", options=configure.cuda_compiler_options, backend=configure.cuda_compiler_backend) + + with cu_kernel_opt.DeviceEnvironment(device_id): + numel = x.numel() + threads = configure.cuda_threads + blocks = cu_kernel_opt.cal_blocks(numel) + cp_numel = cupy.asarray(numel) + cp_v_threshold = cupy.asarray(self.v_threshold, dtype=np.float32) + if hard_reset: + cp_v_reset = cupy.asarray(self.v_reset, dtype=np.float32) + + spike = torch.zeros_like(x) + if hard_reset: + x, cp_v_threshold, cp_v_reset, spike, self.v, cp_numel = cu_kernel_opt.get_contiguous(x, cp_v_threshold, cp_v_reset, spike, self.v, cp_numel) + kernel_args = [x, cp_v_threshold, cp_v_reset, spike, self.v, cp_numel] + else: + x, cp_v_threshold, spike, self.v, cp_numel = cu_kernel_opt.get_contiguous(x, cp_v_threshold, spike, self.v, cp_numel) + kernel_args = [x, cp_v_threshold, spike, self.v, cp_numel] + self.cp_kernel( + (blocks,), (threads,), + cu_kernel_opt.wrap_args_to_raw_kernel( + device_id, + *kernel_args + ) + ) + return spike + else: + return super().forward(x) class MultiStepIFNode(IFNode): def __init__(self, v_threshold: float = 1., v_reset: float = 0., - surrogate_function: Callable = 
surrogate.Sigmoid(), detach_reset: bool = False, backend='torch'): + surrogate_function: Callable = surrogate.Sigmoid(), detach_reset: bool = False, backend='torch', lava_s_cale=1 << 6): """ * :ref:`API in English ` @@ -301,24 +447,32 @@ class MultiStepIFNode(IFNode): super().__init__(v_threshold, v_reset, surrogate_function, detach_reset) self.register_memory('v_seq', None) - self.register_memory('spike_seq', None) - assert backend == 'torch' or backend == 'cupy' - assert not (backend == 'cupy' and neuron_kernel is None), 'cupy is not installed' + check_backend(backend) self.backend = backend + self.lava_s_cale = lava_s_cale + + if backend == 'lava': + self.lava_neuron = self.to_lava() + else: + self.lava_neuron = None + + def forward(self, x_seq: torch.Tensor): assert x_seq.dim() > 1 # x_seq.shape = [T, *] - self.v_seq = torch.zeros_like(x_seq.data) - self.spike_seq = torch.zeros_like(x_seq.data) if self.backend == 'torch': + spike_seq = [] + self.v_seq = [] for t in range(x_seq.shape[0]): - self.spike_seq[t] = super().forward(x_seq[t]) - self.v_seq[t] = self.v - return self.spike_seq + spike_seq.append(super().forward(x_seq[t]).unsqueeze(0)) + self.v_seq.append(self.v.unsqueeze(0)) + spike_seq = torch.cat(spike_seq, 0) + self.v_seq = torch.cat(self.v_seq, 0) + return spike_seq elif self.backend == 'cupy': if isinstance(self.v, float): @@ -327,27 +481,43 @@ class MultiStepIFNode(IFNode): if v_init != 0.: torch.fill_(self.v, v_init) - self.spike_seq, self.v_seq = neuron_kernel.MultiStepIFNodePTT.apply( + spike_seq, self.v_seq = neuron_kernel.MultiStepIFNodePTT.apply( x_seq.flatten(1), self.v.flatten(0), self.v_threshold, self.v_reset, self.detach_reset, self.surrogate_function.cuda_code) - self.spike_seq = self.spike_seq.reshape(x_seq.shape) + spike_seq = spike_seq.reshape(x_seq.shape) self.v_seq = self.v_seq.reshape(x_seq.shape) - - self.spike = self.spike_seq[-1].clone() self.v = self.v_seq[-1].clone() - return self.spike_seq + return spike_seq + + elif self.backend == 'lava': + if self.lava_neuron is None: + self.lava_neuron = self.to_lava() + + spike, self.v = lava_exchange.lava_neuron_forward(self.lava_neuron, x_seq, self.v) + + return spike + else: - raise NotImplementedError + raise NotImplementedError(self.backend) def extra_repr(self): return super().extra_repr() + f', backend={self.backend}' + def to_lava(self): + return lava_exchange.to_lava_neuron(self) + + def reset(self): + super().reset() + if self.lava_neuron is not None: + self.lava_neuron.current_state.zero_() + self.lava_neuron.voltage_state.zero_() + class LIFNode(BaseNode): - def __init__(self, tau: float = 2., v_threshold: float = 1., + def __init__(self, tau: float = 2., decay_input: bool = True, v_threshold: float = 1., v_reset: float = 0., surrogate_function: Callable = surrogate.Sigmoid(), - detach_reset: bool = False): + detach_reset: bool = False, cupy_fp32_inference=False): """ * :ref:`API in English ` @@ -356,6 +526,9 @@ class LIFNode(BaseNode): :param tau: 膜电位时间常数 :type tau: float + :param decay_input: 输入是否会衰减 + :type decay_input: bool + :param v_threshold: 神经元的阈值电压 :type v_threshold: float @@ -369,11 +542,24 @@ class LIFNode(BaseNode): :param detach_reset: 是否将reset过程的计算图分离 :type detach_reset: bool + :param cupy_fp32_inference: 若为 `True`,在 `eval` 模式下,使用float32,却在GPU上运行,并且 `cupy` 已经安装,则会自动使用 `cupy` 进行加速 + :type cupy_fp32_inference: bool Leaky Integrate-and-Fire 神经元模型,可以看作是带漏电的积分器。其阈下神经动力学方程为: - .. math:: - V[t] = V[t-1] + \\frac{1}{\\tau}(X[t] - (V[t-1] - V_{reset}) + 若 ``decay_input == True``: + + .. 
math:: + V[t] = V[t-1] + \\frac{1}{\\tau}(X[t] - (V[t-1] - V_{reset})) + + 若 ``decay_input == False``: + + .. math:: + V[t] = V[t-1] - \\frac{1}{\\tau}(V[t-1] - V_{reset}) + X[t] + + .. tip:: + + 在 `eval` 模式下,使用float32,却在GPU上运行,并且 `cupy` 已经安装,则会自动使用 `cupy` 进行加速。 * :ref:`中文API ` @@ -382,6 +568,9 @@ class LIFNode(BaseNode): :param tau: membrane time constant :type tau: float + :param decay_input: whether the input will decay + :type decay_input: bool + :param v_threshold: threshold voltage of neurons :type v_threshold: float @@ -395,34 +584,159 @@ class LIFNode(BaseNode): :param detach_reset: whether detach the computation graph of reset :type detach_reset: bool + :param cupy_fp32_inference: If `True`, if this module is in `eval` mode, using float32, running on GPU, and `cupy` is installed, then this + module will use `cupy` to accelerate + :type cupy_fp32_inference: bool + The Leaky Integrate-and-Fire neuron, which can be seen as a leaky integrator. The subthreshold neural dynamics of it is as followed: - .. math:: - V[t] = V[t-1] + \\frac{1}{\\tau}(X[t] - (V[t-1] - V_{reset}) + IF ``decay_input == True``: + + .. math:: + V[t] = V[t-1] + \\frac{1}{\\tau}(X[t] - (V[t-1] - V_{reset})) + + IF ``decay_input == False``: + + .. math:: + V[t] = V[t-1] - \\frac{1}{\\tau}(V[t-1] - V_{reset}) + X[t] + + .. admonition:: Tip + :class: tip + + If this module is in `eval` mode, using float32, running on GPU, and `cupy` is installed, then this + module will use `cupy` to accelerate. + """ assert isinstance(tau, float) and tau > 1. super().__init__(v_threshold, v_reset, surrogate_function, detach_reset) self.tau = tau + self.decay_input = decay_input + + if cupy_fp32_inference: + check_backend('cupy') + self.cupy_fp32_inference = cupy_fp32_inference def extra_repr(self): return super().extra_repr() + f', tau={self.tau}' def neuronal_charge(self, x: torch.Tensor): - if self.v_reset is None: - self.v = self.v + (x - self.v) / self.tau - - else: - if isinstance(self.v_reset, float) and self.v_reset == 0.: + if self.decay_input: + if self.v_reset is None or self.v_reset == 0.: self.v = self.v + (x - self.v) / self.tau else: self.v = self.v + (x - (self.v - self.v_reset)) / self.tau + else: + if self.v_reset is None or self.v_reset == 0.: + self.v = self.v * (1. - 1. 
/ self.tau) + x + else: + self.v = self.v - (self.v - self.v_reset) / self.tau + x + + def forward(self, x: torch.Tensor): + if self.cupy_fp32_inference and cupy is not None and not self.training and x.dtype == torch.float32: + # cupy is installed && eval mode && fp32 + device_id = x.get_device() + if device_id < 0: + return super().forward(x) + + # use cupy to accelerate + if isinstance(self.v, float): + v = torch.zeros_like(x) + if self.v != 0.: + torch.fill_(v, self.v) + self.v = v + + if self.v_reset is None: + hard_reset = False + else: + hard_reset = True + + code = rf''' + extern "C" __global__ + void LIFNode_{'hard' if hard_reset else 'soft'}_reset_decayInput_{self.decay_input}_inference_forward( + const float * x, const float & v_threshold, {'const float & v_reset,' if hard_reset else ''} const float & tau, + float * spike, float * v, + const int & numel) + ''' + + code += r''' + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index < numel) + { + + ''' + + if self.decay_input: + if hard_reset: + code += r''' + v[index] += (x[index] - (v[index] - v_reset)) / tau; + ''' + else: + code += r''' + v[index] += (x[index] - v[index]) / tau; + ''' + else: + if hard_reset: + code += r''' + v[index] = x[index] + v[index] - (v[index] - v_reset) / tau; + ''' + else: + code += r''' + v[index] = x[index] + v[index] * (1.0f - 1.0f / tau); + ''' + + code += rf''' + spike[index] = (float) (v[index] >= v_threshold); + {'v[index] = (1.0f - spike[index]) * v[index] + spike[index] * v_reset;' if hard_reset else 'v[index] -= spike[index] * v_threshold;'} + ''' + + code += r''' + } + } + ''' + if hasattr(self, 'cp_kernel'): + if self.cp_kernel.code != code: + # replace codes + del self.cp_kernel + self.cp_kernel = cupy.RawKernel(code, f"LIFNode_{'hard' if hard_reset else 'soft'}_reset_decayInput_{self.decay_input}_inference_forward", options=configure.cuda_compiler_options, backend=configure.cuda_compiler_backend) + else: + self.cp_kernel = cupy.RawKernel(code, f"LIFNode_{'hard' if hard_reset else 'soft'}_reset_decayInput_{self.decay_input}_inference_forward", options=configure.cuda_compiler_options, backend=configure.cuda_compiler_backend) + + with cu_kernel_opt.DeviceEnvironment(device_id): + numel = x.numel() + threads = configure.cuda_threads + blocks = cu_kernel_opt.cal_blocks(numel) + cp_numel = cupy.asarray(numel) + cp_v_threshold = cupy.asarray(self.v_threshold, dtype=np.float32) + if hard_reset: + cp_v_reset = cupy.asarray(self.v_reset, dtype=np.float32) + cp_tau = cupy.asarray(self.tau, dtype=np.float32) + spike = torch.zeros_like(x) + if hard_reset: + x, cp_v_threshold, cp_v_reset, cp_tau, spike, self.v, cp_numel = cu_kernel_opt.get_contiguous(x, cp_v_threshold, cp_v_reset, cp_tau, spike, self.v, cp_numel) + kernel_args = [x, cp_v_threshold, cp_v_reset, cp_tau, spike, self.v, cp_numel] + else: + x, cp_v_threshold, cp_tau, spike, self.v, cp_numel = cu_kernel_opt.get_contiguous(x, cp_v_threshold, cp_tau, spike, self.v, cp_numel) + kernel_args = [x, cp_v_threshold, cp_tau, spike, self.v, cp_numel] + + self.cp_kernel( + (blocks,), (threads,), + cu_kernel_opt.wrap_args_to_raw_kernel( + device_id, + *kernel_args + ) + ) + return spike + else: + return super().forward(x) + class MultiStepLIFNode(LIFNode): - def __init__(self, tau: float = 2., v_threshold: float = 1., + def __init__(self, tau: float = 2., decay_input: bool = True, v_threshold: float = 1., v_reset: float = 0., surrogate_function: Callable = surrogate.Sigmoid(), - detach_reset: bool = False, backend='torch'): + 
detach_reset: bool = False, backend='torch', lava_s_cale=1 << 6): """ * :ref:`API in English ` @@ -431,6 +745,9 @@ class MultiStepLIFNode(LIFNode): :param tau: 膜电位时间常数 :type tau: float + :param decay_input: 输入是否会衰减 + :type decay_input: bool + :param v_threshold: 神经元的阈值电压 :type v_threshold: float @@ -465,6 +782,9 @@ class MultiStepLIFNode(LIFNode): :param tau: membrane time constant :type tau: float + :param decay_input: whether the input will decay + :type decay_input: bool + :param v_threshold: threshold voltage of neurons :type v_threshold: float @@ -497,25 +817,33 @@ class MultiStepLIFNode(LIFNode): and multi-step propagation. """ - super().__init__(tau, v_threshold, v_reset, surrogate_function, detach_reset) + super().__init__(tau, decay_input, v_threshold, v_reset, surrogate_function, detach_reset) self.register_memory('v_seq', None) - self.register_memory('spike_seq', None) - assert backend == 'torch' or backend == 'cupy' - assert not (backend == 'cupy' and neuron_kernel is None), 'cupy is not installed' + check_backend(backend) + self.backend = backend + self.lava_s_cale = lava_s_cale + + if backend == 'lava': + self.lava_neuron = self.to_lava() + else: + self.lava_neuron = None + def forward(self, x_seq: torch.Tensor): assert x_seq.dim() > 1 # x_seq.shape = [T, *] - self.v_seq = torch.zeros_like(x_seq.data) - self.spike_seq = torch.zeros_like(x_seq.data) if self.backend == 'torch': + spike_seq = [] + self.v_seq = [] for t in range(x_seq.shape[0]): - self.spike_seq[t] = super().forward(x_seq[t]) - self.v_seq[t] = self.v - return self.spike_seq + spike_seq.append(super().forward(x_seq[t]).unsqueeze(0)) + self.v_seq.append(self.v.unsqueeze(0)) + spike_seq = torch.cat(spike_seq, 0) + self.v_seq = torch.cat(self.v_seq, 0) + return spike_seq elif self.backend == 'cupy': if isinstance(self.v, float): @@ -524,25 +852,41 @@ class MultiStepLIFNode(LIFNode): if v_init != 0.: torch.fill_(self.v, v_init) + spike_seq, self.v_seq = neuron_kernel.MultiStepLIFNodePTT.apply( + x_seq.flatten(1), self.v.flatten(0), self.decay_input, self.tau, self.v_threshold, self.v_reset, self.detach_reset, self.surrogate_function.cuda_code) - self.spike_seq, self.v_seq = neuron_kernel.MultiStepLIFNodePTT.apply( - x_seq.flatten(1), self.v.flatten(0), self.tau, self.v_threshold, self.v_reset, self.detach_reset, self.surrogate_function.cuda_code) - - self.spike_seq = self.spike_seq.reshape(x_seq.shape) + spike_seq = spike_seq.reshape(x_seq.shape) self.v_seq = self.v_seq.reshape(x_seq.shape) - self.spike = self.spike_seq[-1].clone() self.v = self.v_seq[-1].clone() - return self.spike_seq + return spike_seq + + elif self.backend == 'lava': + if self.lava_neuron is None: + self.lava_neuron = self.to_lava() + + spike, self.v = lava_exchange.lava_neuron_forward(self.lava_neuron, x_seq, self.v) + + return spike + else: - raise NotImplementedError + raise NotImplementedError(self.backend) def extra_repr(self): return super().extra_repr() + f', backend={self.backend}' + def to_lava(self): + return lava_exchange.to_lava_neuron(self) + + def reset(self): + super().reset() + if self.lava_neuron is not None: + self.lava_neuron.current_state.zero_() + self.lava_neuron.voltage_state.zero_() + class ParametricLIFNode(BaseNode): - def __init__(self, init_tau: float = 2.0, v_threshold: float = 1., + def __init__(self, init_tau: float = 2.0, decay_input: bool = True, v_threshold: float = 1., v_reset: float = 0., surrogate_function: Callable = surrogate.Sigmoid(), detach_reset: bool = False): """ @@ -553,6 +897,9 @@ class 
ParametricLIFNode(BaseNode): :param init_tau: 膜电位时间常数的初始值 :type init_tau: float + :param decay_input: 输入是否会衰减 + :type decay_input: bool + :param v_threshold: 神经元的阈值电压 :type v_threshold: float @@ -569,8 +916,15 @@ class ParametricLIFNode(BaseNode): `Incorporating Learnable Membrane Time Constant to Enhance Learning of Spiking Neural Networks `_ 提出的 Parametric Leaky Integrate-and-Fire (PLIF)神经元模型,可以看作是带漏电的积分器。其阈下神经动力学方程为: - .. math:: - V[t] = V[t-1] + \\frac{1}{\\tau}(X[t] - (V[t-1] - V_{reset}) + 若 ``decay_input == True``: + + .. math:: + V[t] = V[t-1] + \\frac{1}{\\tau}(X[t] - (V[t-1] - V_{reset})) + + 若 ``decay_input == False``: + + .. math:: + V[t] = V[t-1] - \\frac{1}{\\tau}(V[t-1] - V_{reset}) + X[t] 其中 :math:`\\frac{1}{\\tau} = {\\rm Sigmoid}(w)`,:math:`w` 是可学习的参数。 @@ -581,6 +935,9 @@ class ParametricLIFNode(BaseNode): :param init_tau: the initial value of membrane time constant :type init_tau: float + :param decay_input: whether the input will decay + :type decay_input: bool + :param v_threshold: threshold voltage of neurons :type v_threshold: float @@ -597,14 +954,22 @@ class ParametricLIFNode(BaseNode): The Parametric Leaky Integrate-and-Fire (PLIF) neuron, which is proposed by `Incorporating Learnable Membrane Time Constant to Enhance Learning of Spiking Neural Networks `_ and can be seen as a leaky integrator. The subthreshold neural dynamics of it is as followed: - .. math:: - V[t] = V[t-1] + \\frac{1}{\\tau}(X[t] - (V[t-1] - V_{reset}) + IF ``decay_input == True``: + + .. math:: + V[t] = V[t-1] + \\frac{1}{\\tau}(X[t] - (V[t-1] - V_{reset})) + + IF ``decay_input == False``: + + .. math:: + V[t] = V[t-1] - \\frac{1}{\\tau}(V[t-1] - V_{reset}) + X[t] where :math:`\\frac{1}{\\tau} = {\\rm Sigmoid}(w)`, :math:`w` is a learnable parameter. """ assert isinstance(init_tau, float) and init_tau > 1. super().__init__(v_threshold, v_reset, surrogate_function, detach_reset) + self.decay_input = decay_input init_w = - math.log(init_tau - 1.) self.w = nn.Parameter(torch.as_tensor(init_w)) @@ -614,17 +979,19 @@ class ParametricLIFNode(BaseNode): return super().extra_repr() + f', tau={tau}' def neuronal_charge(self, x: torch.Tensor): - if self.v_reset is None: - self.v = self.v + (x - self.v) * self.w.sigmoid() - else: - if self.v_reset == 0.: + if self.decay_input: + if self.v_reset is None or self.v_reset == 0.: self.v = self.v + (x - self.v) * self.w.sigmoid() else: self.v = self.v + (x - (self.v - self.v_reset)) * self.w.sigmoid() - + else: + if self.v_reset is None or self.v_reset == 0.: + self.v = self.v * (1. 
- self.w.sigmoid()) + x + else: + self.v = self.v - (self.v - self.v_reset) * self.w.sigmoid() + x class MultiStepParametricLIFNode(ParametricLIFNode): - def __init__(self, init_tau: float = 2., v_threshold: float = 1., + def __init__(self, init_tau: float = 2., decay_input: bool = True, v_threshold: float = 1., v_reset: float = 0., surrogate_function: Callable = surrogate.Sigmoid(), detach_reset: bool = False, backend='torch'): """ @@ -635,6 +1002,9 @@ class MultiStepParametricLIFNode(ParametricLIFNode): :param init_tau: 膜电位时间常数的初始值 :type init_tau: float + :param decay_input: 输入是否会衰减 + :type decay_input: bool + :param v_threshold: 神经元的阈值电压 :type v_threshold: float @@ -672,6 +1042,9 @@ class MultiStepParametricLIFNode(ParametricLIFNode): :param init_tau: the initial value of membrane time constant :type init_tau: float + :param decay_input: whether the input will decay + :type decay_input: bool + :param v_threshold: threshold voltage of neurons :type v_threshold: float @@ -709,25 +1082,26 @@ class MultiStepParametricLIFNode(ParametricLIFNode): Read :doc:`Propagation Pattern <./clock_driven_en/10_propagation_pattern>` for more details about single-step and multi-step propagation. """ - super().__init__(init_tau, v_threshold, v_reset, surrogate_function, detach_reset) + super().__init__(init_tau, decay_input, v_threshold, v_reset, surrogate_function, detach_reset) self.register_memory('v_seq', None) - self.register_memory('spike_seq', None) - assert backend == 'torch' or backend == 'cupy' - assert not (backend == 'cupy' and neuron_kernel is None), 'cupy is not installed' + check_backend(backend) + self.backend = backend def forward(self, x_seq: torch.Tensor): assert x_seq.dim() > 1 # x_seq.shape = [T, *] - self.v_seq = torch.zeros_like(x_seq.data) - self.spike_seq = torch.zeros_like(x_seq.data) if self.backend == 'torch': + spike_seq = [] + self.v_seq = [] for t in range(x_seq.shape[0]): - self.spike_seq[t] = super().forward(x_seq[t]) - self.v_seq[t] = self.v - return self.spike_seq + spike_seq.append(super().forward(x_seq[t]).unsqueeze(0)) + self.v_seq.append(self.v.unsqueeze(0)) + spike_seq = torch.cat(spike_seq, 0) + self.v_seq = torch.cat(self.v_seq, 0) + return spike_seq elif self.backend == 'cupy': if isinstance(self.v, float): @@ -737,16 +1111,15 @@ class MultiStepParametricLIFNode(ParametricLIFNode): torch.fill_(self.v, v_init) - self.spike_seq, self.v_seq = neuron_kernel.MultiStepParametricLIFNodePTT.apply( - x_seq.flatten(1), self.v.flatten(0), self.w.sigmoid(), self.v_threshold, self.v_reset, self.detach_reset, self.surrogate_function.cuda_code) + spike_seq, self.v_seq = neuron_kernel.MultiStepParametricLIFNodePTT.apply( + x_seq.flatten(1), self.v.flatten(0), self.w.sigmoid(), self.decay_input, self.v_threshold, self.v_reset, self.detach_reset, self.surrogate_function.cuda_code) - self.spike_seq = self.spike_seq.reshape(x_seq.shape) + spike_seq = spike_seq.reshape(x_seq.shape) self.v_seq = self.v_seq.reshape(x_seq.shape) - self.spike = self.spike_seq[-1].clone() self.v = self.v_seq[-1].clone() - return self.spike_seq + return spike_seq else: raise NotImplementedError @@ -1037,23 +1410,24 @@ class MultiStepEIFNode(EIFNode): super().__init__(tau, delta_T, theta_rh, v_threshold, v_rest, v_reset, surrogate_function, detach_reset) self.register_memory('v_seq', None) - self.register_memory('spike_seq', None) - assert backend == 'torch' or backend == 'cupy' - assert not (backend == 'cupy' and neuron_kernel is None), 'cupy is not installed' + check_backend(backend) + self.backend = 
backend def forward(self, x_seq: torch.Tensor): assert x_seq.dim() > 1 # x_seq.shape = [T, *] - self.v_seq = torch.zeros_like(x_seq.data) - self.spike_seq = torch.zeros_like(x_seq.data) if self.backend == 'torch': + spike_seq = [] + self.v_seq = [] for t in range(x_seq.shape[0]): - self.spike_seq[t] = super().forward(x_seq[t]) - self.v_seq[t] = self.v - return self.spike_seq + spike_seq.append(super().forward(x_seq[t]).unsqueeze(0)) + self.v_seq.append(self.v.unsqueeze(0)) + spike_seq = torch.cat(spike_seq, 0) + self.v_seq = torch.cat(self.v_seq, 0) + return spike_seq elif self.backend == 'cupy': if isinstance(self.v, float): @@ -1063,18 +1437,111 @@ class MultiStepEIFNode(EIFNode): torch.fill_(self.v, v_init) - self.spike_seq, self.v_seq = neuron_kernel.MultiStepEIFNodePTT.apply( + spike_seq, self.v_seq = neuron_kernel.MultiStepEIFNodePTT.apply( x_seq.flatten(1), self.v.flatten(0), self.tau, self.v_threshold, self.v_reset, self.v_rest, self.theta_rh, self.delta_T, self.detach_reset, self.surrogate_function.cuda_code) - self.spike_seq = self.spike_seq.reshape(x_seq.shape) + spike_seq = spike_seq.reshape(x_seq.shape) + self.v_seq = self.v_seq.reshape(x_seq.shape) + + self.v = self.v_seq[-1].clone() + + return spike_seq + else: + raise NotImplementedError + + def extra_repr(self): + return super().extra_repr() + f', backend={self.backend}' + +class GeneralNode(BaseNode): + def __init__(self, a: float or torch.Tensor, b: float or torch.Tensor, c: float or torch.Tensor = 0., v_threshold: float = 1., v_reset: float = 0., + surrogate_function: Callable = surrogate.Sigmoid(), detach_reset: bool = False): + super().__init__(v_threshold, v_reset, surrogate_function, detach_reset) + self.a = self.register_buffer('a', torch.as_tensor(a)) + self.b = self.register_buffer('b', torch.as_tensor(b)) + self.c = self.register_buffer('c', torch.as_tensor(c)) + + def neuronal_charge(self, x: torch.Tensor): + self.v = self.a * self.v + self.b * x + self.c + +class MultiStepGeneralNode(GeneralNode): + def __init__(self, a: float, b: float, c: float, v_threshold: float = 1., v_reset: float = 0., + surrogate_function: Callable = surrogate.Sigmoid(), detach_reset: bool = False, backend='torch'): + + super().__init__(v_threshold, v_reset, surrogate_function, detach_reset) + + self.register_memory('v_seq', None) + + check_backend(backend) + + self.backend = backend + + def forward(self, x_seq: torch.Tensor): + assert x_seq.dim() > 1 + # x_seq.shape = [T, *] + + if self.backend == 'torch': + spike_seq = [] + self.v_seq = [] + for t in range(x_seq.shape[0]): + spike_seq.append(super().forward(x_seq[t]).unsqueeze(0)) + self.v_seq.append(self.v.unsqueeze(0)) + spike_seq = torch.cat(spike_seq, 0) + self.v_seq = torch.cat(self.v_seq, 0) + return spike_seq + + elif self.backend == 'cupy': + if isinstance(self.v, float): + v_init = self.v + self.v = torch.zeros_like(x_seq[0].data) + if v_init != 0.: + torch.fill_(self.v, v_init) + + raise NotImplementedError + + spike_seq = spike_seq.reshape(x_seq.shape) self.v_seq = self.v_seq.reshape(x_seq.shape) - self.spike = self.spike_seq[-1].clone() self.v = self.v_seq[-1].clone() - return self.spike_seq + return spike_seq else: raise NotImplementedError def extra_repr(self): return super().extra_repr() + f', backend={self.backend}' + + +class LIAFNode(LIFNode): + def __init__(self, act: Callable, threshold_related: bool, *args, **kwargs): + """ + :param act: the activation function + :type act: Callable + :param threshold_related: whether the neuron uses threshold related (TR mode). 
If true, `y = act(h - v_th)`, + otherwise `y = act(h)` + :type threshold_related: bool + + Other parameters in `*args, **kwargs` are same with :class:`LIFNode`. + + The LIAF neuron proposed in `LIAF-Net: Leaky Integrate and Analog Fire Network for Lightweight and Efficient Spatiotemporal Information Processing `_. + + .. admonition:: Warning + :class: warning + + The outputs of this neuron are not binary spikes. + + """ + super().__init__(*args, **kwargs) + self.act = act + self.threshold_related = threshold_related + + def forward(self, x: torch.Tensor): + self.neuronal_charge(x) + if self.threshold_related: + y = self.act(self.v - self.v_threshold) + else: + y = self.act(self.v) + spike = self.neuronal_fire() + self.neuronal_reset(spike) + return y + + diff --git a/spikingjelly/clock_driven/neuron_kernel.cu b/spikingjelly/clock_driven/neuron_kernel.cu index 6e289dc..acb429d 100644 --- a/spikingjelly/clock_driven/neuron_kernel.cu +++ b/spikingjelly/clock_driven/neuron_kernel.cu @@ -1,4 +1,5 @@ // This file is created by spikingjelly.clock_driven.neuron_kernel.save_cuda_codes. +// Note that codes in this file will not be executed This file is just created for reading. // MultiStepIFNodePTT @@ -66,7 +67,7 @@ // grad_h = fmaf(grad_spike_seq[t], grad_s_to_h, (grad_v_seq[t] + grad_h) * grad_v_to_h); grad_x_seq[t] = grad_h; } - grad_v_last[index] = grad_x_seq[index]; + grad_v_last[index] = grad_h; } } @@ -103,7 +104,7 @@ // grad_h = fmaf(grad_spike_seq[t], grad_s_to_h, (grad_v_seq[t] + grad_h) * grad_v_to_h); grad_x_seq[t] = grad_h; } - grad_v_last[index] = grad_x_seq[index]; + grad_v_last[index] = grad_h; } } @@ -176,7 +177,7 @@ grad_h = __hfma2(__hadd2(grad_v_seq[t], grad_h), grad_v_to_h, __hmul2(grad_spike_seq[t], grad_s_to_h)); grad_x_seq[t] = grad_h; } - grad_v_last[index] = grad_x_seq[index]; + grad_v_last[index] = grad_h; } } @@ -218,7 +219,7 @@ grad_h = __hfma2(__hadd2(grad_v_seq[t], grad_h), grad_v_to_h, __hmul2(grad_spike_seq[t], grad_s_to_h)); grad_x_seq[t] = grad_h; } - grad_v_last[index] = grad_x_seq[index]; + grad_v_last[index] = grad_h; } } @@ -286,7 +287,7 @@ // grad_h = fmaf(grad_spike_seq[t], grad_s_to_h, (grad_v_seq[t] + grad_h) * grad_v_to_h); grad_x_seq[t] = grad_h; } - grad_v_last[index] = grad_x_seq[index]; + grad_v_last[index] = grad_h; } } @@ -323,7 +324,7 @@ // grad_h = fmaf(grad_spike_seq[t], grad_s_to_h, (grad_v_seq[t] + grad_h) * grad_v_to_h); grad_x_seq[t] = grad_h; } - grad_v_last[index] = grad_x_seq[index]; + grad_v_last[index] = grad_h; } } @@ -392,7 +393,7 @@ grad_h = __hfma2(__hadd2(grad_v_seq[t], grad_h), grad_v_to_h, __hmul2(grad_spike_seq[t], grad_s_to_h)); grad_x_seq[t] = grad_h; } - grad_v_last[index] = grad_x_seq[index]; + grad_v_last[index] = grad_h; } } @@ -432,7 +433,7 @@ grad_h = __hfma2(__hadd2(grad_v_seq[t], grad_h), grad_v_to_h, __hmul2(grad_spike_seq[t], grad_s_to_h)); grad_x_seq[t] = grad_h; } - grad_v_last[index] = grad_x_seq[index]; + grad_v_last[index] = grad_h; } } @@ -489,8 +490,8 @@ // start: spikingjelly.clock_driven.surrogate.Sigmoid.cuda_code - const float sg_Sigmoid_sigmoid_ax = 1.0f / (1.0f + expf(- 1.0f * over_th)); - const float grad_s_to_h = (1.0f - sg_Sigmoid_sigmoid_ax) * sg_Sigmoid_sigmoid_ax * 1.0f; + const float sg_Sigmoid_sigmoid_ax = 1.0f / (1.0f + expf(- 4.0f * over_th)); + const float grad_s_to_h = (1.0f - sg_Sigmoid_sigmoid_ax) * sg_Sigmoid_sigmoid_ax * 4.0f; // end: spikingjelly.clock_driven.surrogate.Sigmoid.cuda_code @@ -500,7 +501,7 @@ // grad_h = fmaf(grad_spike_seq[t], grad_s_to_h, (grad_v_seq[t] + grad_h) * 
grad_v_to_h); grad_x_seq[t] = grad_h; } - grad_v_last[index] = grad_x_seq[index]; + grad_v_last[index] = grad_h; } } @@ -525,8 +526,8 @@ // start: spikingjelly.clock_driven.surrogate.Sigmoid.cuda_code - const float sg_Sigmoid_sigmoid_ax = 1.0f / (1.0f + expf(- 1.0f * over_th)); - const float grad_s_to_h = (1.0f - sg_Sigmoid_sigmoid_ax) * sg_Sigmoid_sigmoid_ax * 1.0f; + const float sg_Sigmoid_sigmoid_ax = 1.0f / (1.0f + expf(- 4.0f * over_th)); + const float grad_s_to_h = (1.0f - sg_Sigmoid_sigmoid_ax) * sg_Sigmoid_sigmoid_ax * 4.0f; // end: spikingjelly.clock_driven.surrogate.Sigmoid.cuda_code @@ -537,7 +538,7 @@ // grad_h = fmaf(grad_spike_seq[t], grad_s_to_h, (grad_v_seq[t] + grad_h) * grad_v_to_h); grad_x_seq[t] = grad_h; } - grad_v_last[index] = grad_x_seq[index]; + grad_v_last[index] = grad_h; } } @@ -599,7 +600,7 @@ // start: spikingjelly.clock_driven.surrogate.Sigmoid.cuda_code - const half2 sg_Sigmoid_alpha = __float2half2_rn(1.0f); + const half2 sg_Sigmoid_alpha = __float2half2_rn(4.0f); const half2 sg_Sigmoid_sigmoid_ax = __h2div(__float2half2_rn(1.0f), __hadd2(h2exp(__hneg2(__hmul2(sg_Sigmoid_alpha, over_th))), __float2half2_rn(1.0f))); const half2 grad_s_to_h = __hmul2(__hmul2(__hsub2(__float2half2_rn(1.0f), sg_Sigmoid_sigmoid_ax), sg_Sigmoid_sigmoid_ax), sg_Sigmoid_alpha); @@ -610,7 +611,7 @@ grad_h = __hfma2(__hadd2(grad_v_seq[t], grad_h), grad_v_to_h, __hmul2(grad_spike_seq[t], grad_s_to_h)); grad_x_seq[t] = grad_h; } - grad_v_last[index] = grad_x_seq[index]; + grad_v_last[index] = grad_h; } } @@ -641,7 +642,7 @@ // start: spikingjelly.clock_driven.surrogate.Sigmoid.cuda_code - const half2 sg_Sigmoid_alpha = __float2half2_rn(1.0f); + const half2 sg_Sigmoid_alpha = __float2half2_rn(4.0f); const half2 sg_Sigmoid_sigmoid_ax = __h2div(__float2half2_rn(1.0f), __hadd2(h2exp(__hneg2(__hmul2(sg_Sigmoid_alpha, over_th))), __float2half2_rn(1.0f))); const half2 grad_s_to_h = __hmul2(__hmul2(__hsub2(__float2half2_rn(1.0f), sg_Sigmoid_sigmoid_ax), sg_Sigmoid_sigmoid_ax), sg_Sigmoid_alpha); @@ -652,7 +653,7 @@ grad_h = __hfma2(__hadd2(grad_v_seq[t], grad_h), grad_v_to_h, __hmul2(grad_spike_seq[t], grad_s_to_h)); grad_x_seq[t] = grad_h; } - grad_v_last[index] = grad_x_seq[index]; + grad_v_last[index] = grad_h; } } @@ -709,8 +710,8 @@ // start: spikingjelly.clock_driven.surrogate.Sigmoid.cuda_code - const float sg_Sigmoid_sigmoid_ax = 1.0f / (1.0f + expf(- 1.0f * over_th)); - const float grad_s_to_h = (1.0f - sg_Sigmoid_sigmoid_ax) * sg_Sigmoid_sigmoid_ax * 1.0f; + const float sg_Sigmoid_sigmoid_ax = 1.0f / (1.0f + expf(- 4.0f * over_th)); + const float grad_s_to_h = (1.0f - sg_Sigmoid_sigmoid_ax) * sg_Sigmoid_sigmoid_ax * 4.0f; // end: spikingjelly.clock_driven.surrogate.Sigmoid.cuda_code @@ -720,7 +721,7 @@ // grad_h = fmaf(grad_spike_seq[t], grad_s_to_h, (grad_v_seq[t] + grad_h) * grad_v_to_h); grad_x_seq[t] = grad_h; } - grad_v_last[index] = grad_x_seq[index]; + grad_v_last[index] = grad_h; } } @@ -745,8 +746,8 @@ // start: spikingjelly.clock_driven.surrogate.Sigmoid.cuda_code - const float sg_Sigmoid_sigmoid_ax = 1.0f / (1.0f + expf(- 1.0f * over_th)); - const float grad_s_to_h = (1.0f - sg_Sigmoid_sigmoid_ax) * sg_Sigmoid_sigmoid_ax * 1.0f; + const float sg_Sigmoid_sigmoid_ax = 1.0f / (1.0f + expf(- 4.0f * over_th)); + const float grad_s_to_h = (1.0f - sg_Sigmoid_sigmoid_ax) * sg_Sigmoid_sigmoid_ax * 4.0f; // end: spikingjelly.clock_driven.surrogate.Sigmoid.cuda_code @@ -757,7 +758,7 @@ // grad_h = fmaf(grad_spike_seq[t], grad_s_to_h, (grad_v_seq[t] + grad_h) * grad_v_to_h); 
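// ---- Editor's sketch (illustrative, not generated by save_cuda_codes) ----
// The change repeated throughout these backward kernels replaces
//     grad_v_last[index] = grad_x_seq[index];
// with
//     grad_v_last[index] = grad_h;
// Below is a minimal single-neuron, fp32 reference of the IF backward recursion that
// these hunks implement (hard reset, detach_reset=True, Sigmoid surrogate with alpha = 4),
// assuming T time steps stored contiguously. The function name `if_bptt_reference` is
// hypothetical; the point is that once the loop reaches t = 0, `grad_h` already holds the
// gradient to propagate into the initial membrane potential, so it should be written to
// grad_v_last directly rather than read back from grad_x_seq.
#include <math.h>

static void if_bptt_reference(const float* grad_spike_seq, const float* grad_v_seq,
                              const float* h_seq, const float* spike_seq,
                              float* grad_x_seq, float* grad_v_last,
                              float v_threshold, int T)
{
    float grad_h = 0.0f;  // dL/dH[t], carried backwards through time
    for (int t = T - 1; t >= 0; t--)
    {
        const float over_th = h_seq[t] - v_threshold;
        // surrogate gradient of the spike w.r.t. H[t] (Sigmoid, alpha = 4)
        const float sig = 1.0f / (1.0f + expf(-4.0f * over_th));
        const float grad_s_to_h = (1.0f - sig) * sig * 4.0f;
        // dV[t]/dH[t] with a detached hard reset
        const float grad_v_to_h = 1.0f - spike_seq[t];
        grad_h = grad_spike_seq[t] * grad_s_to_h + (grad_v_seq[t] + grad_h) * grad_v_to_h;
        grad_x_seq[t] = grad_h;  // dH[t]/dX[t] = 1 for the IF neuron
    }
    // dH[0]/dV_init = 1, so the gradient w.r.t. the initial membrane potential is grad_h itself
    grad_v_last[0] = grad_h;
}
// ---- end of editor's sketch ----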
grad_x_seq[t] = grad_h; } - grad_v_last[index] = grad_x_seq[index]; + grad_v_last[index] = grad_h; } } @@ -815,7 +816,7 @@ // start: spikingjelly.clock_driven.surrogate.Sigmoid.cuda_code - const half2 sg_Sigmoid_alpha = __float2half2_rn(1.0f); + const half2 sg_Sigmoid_alpha = __float2half2_rn(4.0f); const half2 sg_Sigmoid_sigmoid_ax = __h2div(__float2half2_rn(1.0f), __hadd2(h2exp(__hneg2(__hmul2(sg_Sigmoid_alpha, over_th))), __float2half2_rn(1.0f))); const half2 grad_s_to_h = __hmul2(__hmul2(__hsub2(__float2half2_rn(1.0f), sg_Sigmoid_sigmoid_ax), sg_Sigmoid_sigmoid_ax), sg_Sigmoid_alpha); @@ -826,7 +827,7 @@ grad_h = __hfma2(__hadd2(grad_v_seq[t], grad_h), grad_v_to_h, __hmul2(grad_spike_seq[t], grad_s_to_h)); grad_x_seq[t] = grad_h; } - grad_v_last[index] = grad_x_seq[index]; + grad_v_last[index] = grad_h; } } @@ -855,7 +856,7 @@ // start: spikingjelly.clock_driven.surrogate.Sigmoid.cuda_code - const half2 sg_Sigmoid_alpha = __float2half2_rn(1.0f); + const half2 sg_Sigmoid_alpha = __float2half2_rn(4.0f); const half2 sg_Sigmoid_sigmoid_ax = __h2div(__float2half2_rn(1.0f), __hadd2(h2exp(__hneg2(__hmul2(sg_Sigmoid_alpha, over_th))), __float2half2_rn(1.0f))); const half2 grad_s_to_h = __hmul2(__hmul2(__hsub2(__float2half2_rn(1.0f), sg_Sigmoid_sigmoid_ax), sg_Sigmoid_sigmoid_ax), sg_Sigmoid_alpha); @@ -866,7 +867,7 @@ grad_h = __hfma2(__hadd2(grad_v_seq[t], grad_h), grad_v_to_h, __hmul2(grad_spike_seq[t], grad_s_to_h)); grad_x_seq[t] = grad_h; } - grad_v_last[index] = grad_x_seq[index]; + grad_v_last[index] = grad_h; } } @@ -942,7 +943,7 @@ // grad_h = fmaf(grad_spike_seq[t], grad_s_to_h, (grad_v_seq[t] + grad_h) * grad_v_to_h); grad_x_seq[t] = grad_h; } - grad_v_last[index] = grad_x_seq[index]; + grad_v_last[index] = grad_h; } } @@ -987,7 +988,7 @@ // grad_h = fmaf(grad_spike_seq[t], grad_s_to_h, (grad_v_seq[t] + grad_h) * grad_v_to_h); grad_x_seq[t] = grad_h; } - grad_v_last[index] = grad_x_seq[index]; + grad_v_last[index] = grad_h; } } @@ -1060,7 +1061,7 @@ grad_h = __hfma2(__hadd2(grad_v_seq[t], grad_h), grad_v_to_h, __hmul2(grad_spike_seq[t], grad_s_to_h)); grad_x_seq[t] = grad_h; } - grad_v_last[index] = grad_x_seq[index]; + grad_v_last[index] = grad_h; } } @@ -1102,7 +1103,7 @@ grad_h = __hfma2(__hadd2(grad_v_seq[t], grad_h), grad_v_to_h, __hmul2(grad_spike_seq[t], grad_s_to_h)); grad_x_seq[t] = grad_h; } - grad_v_last[index] = grad_x_seq[index]; + grad_v_last[index] = grad_h; } } @@ -1178,7 +1179,7 @@ // grad_h = fmaf(grad_spike_seq[t], grad_s_to_h, (grad_v_seq[t] + grad_h) * grad_v_to_h); grad_x_seq[t] = grad_h; } - grad_v_last[index] = grad_x_seq[index]; + grad_v_last[index] = grad_h; } } @@ -1223,7 +1224,7 @@ // grad_h = fmaf(grad_spike_seq[t], grad_s_to_h, (grad_v_seq[t] + grad_h) * grad_v_to_h); grad_x_seq[t] = grad_h; } - grad_v_last[index] = grad_x_seq[index]; + grad_v_last[index] = grad_h; } } @@ -1292,7 +1293,7 @@ grad_h = __hfma2(__hadd2(grad_v_seq[t], grad_h), grad_v_to_h, __hmul2(grad_spike_seq[t], grad_s_to_h)); grad_x_seq[t] = grad_h; } - grad_v_last[index] = grad_x_seq[index]; + grad_v_last[index] = grad_h; } } @@ -1332,19 +1333,16 @@ grad_h = __hfma2(__hadd2(grad_v_seq[t], grad_h), grad_v_to_h, __hmul2(grad_spike_seq[t], grad_s_to_h)); grad_x_seq[t] = grad_h; } - grad_v_last[index] = grad_x_seq[index]; + grad_v_last[index] = grad_h; } } -// MultiStepLIFNodePTT - -// MultiStepLIFNodePTT fptt ATan, hard_reset=True, dtype=fp32 +// MultiStepIFNodePTT fptt S2NN, hard_reset=True, dtype=fp32 extern "C" __global__ - void LIFNode_fptt_hardReset_fp32(const float* x_seq, float* 
v_v_seq, float* h_seq, float* spike_seq, - const float & reciprocal_tau, + void IFNode_fptt_hardReset_fp32(const float* x_seq, float* v_v_seq, float* h_seq, float* spike_seq, const float & v_threshold, const float & v_reset, - const int & neuron_num, const int & numel) + const int & neuron_num, const int & numel) { const int index = blockIdx.x * blockDim.x + threadIdx.x; @@ -1354,9 +1352,9 @@ for(int mem_offset = 0; mem_offset < numel; mem_offset += neuron_num) { const int t = index + mem_offset; - - h_seq[t] = v_v_seq[t] + reciprocal_tau * (x_seq[t] - v_v_seq[t] + v_reset); + h_seq[t] = v_v_seq[t] + x_seq[t]; if (h_seq[t] >= v_threshold) + { spike_seq[t] = 1.0f; v_v_seq[t + dt] = v_reset; @@ -1367,18 +1365,16 @@ spike_seq[t] = 0.0f; v_v_seq[t + dt] = h_seq[t]; } - } } } -// MultiStepLIFNodePTT bptt ATan, hard_reset=True, dtype=fp32, detach_reset=True +// MultiStepIFNodePTT bptt S2NN, hard_reset=True, dtype=fp32, detach_reset=True extern "C" __global__ - void LIFNode_bptt_hardReset_detachReset_fp32( + void IFNode_bptt_hardReset_detachReset_fp32( const float* grad_spike_seq, const float* grad_v_seq, const float* h_seq, const float* spike_seq, float* grad_x_seq, float* grad_v_last, - const float & reciprocal_tau, const float & one_sub_reciprocal_tau, const float & v_threshold, const float & v_reset, const int & neuron_num, const int & numel) @@ -1392,30 +1388,30 @@ const int t = index + mem_offset; const float over_th = h_seq[t] - v_threshold; - // start: spikingjelly.clock_driven.surrogate.ATan.cuda_code + // start: spikingjelly.clock_driven.surrogate.S2NN.cuda_code - const float sg_ATan_M_PI_2__alpha__x = ((float) 1.57079632679489661923) * 2.0f * over_th; - const float grad_s_to_h = 2.0f / 2.0f / (1.0f + sg_ATan_M_PI_2__alpha__x * sg_ATan_M_PI_2__alpha__x); + const float sg_S2NN_sigmoid_ax = 1.0f / (1.0f + expf(- 4.0f * over_th)); + const float sg_S2NN_mask_l = (float)(over_th < 0.0f); + const float grad_s_to_h = (1.0f - sg_S2NN_sigmoid_ax) * sg_S2NN_sigmoid_ax * 4.0f * sg_S2NN_mask_l + 1.0f / (over_th + 1.0f) * (1.0f - sg_S2NN_mask_l); - // end: spikingjelly.clock_driven.surrogate.ATan.cuda_code + // end: spikingjelly.clock_driven.surrogate.S2NN.cuda_code const float grad_v_to_h = 1.0f - spike_seq[t]; - grad_h = grad_spike_seq[t] * grad_s_to_h + (grad_v_seq[t] + grad_h * one_sub_reciprocal_tau) * grad_v_to_h; - // grad_h = fmaf(grad_spike_seq[t], grad_s_to_h, fmaf(grad_h, one_sub_reciprocal_tau, grad_v_seq[t]) * grad_v_to_h); - grad_x_seq[t] = grad_h * reciprocal_tau; + grad_h = grad_spike_seq[t] * grad_s_to_h + (grad_v_seq[t] + grad_h) * grad_v_to_h; + // grad_h = fmaf(grad_spike_seq[t], grad_s_to_h, (grad_v_seq[t] + grad_h) * grad_v_to_h); + grad_x_seq[t] = grad_h; } - grad_v_last[index] = grad_x_seq[index] * one_sub_reciprocal_tau; + grad_v_last[index] = grad_h; } } -// MultiStepLIFNodePTT bptt ATan, hard_reset=True, dtype=fp32, detach_reset=False +// MultiStepIFNodePTT bptt S2NN, hard_reset=True, dtype=fp32, detach_reset=False extern "C" __global__ - void LIFNode_bptt_hardReset__fp32( + void IFNode_bptt_hardReset__fp32( const float* grad_spike_seq, const float* grad_v_seq, const float* h_seq, const float* spike_seq, float* grad_x_seq, float* grad_v_last, - const float & reciprocal_tau, const float & one_sub_reciprocal_tau, const float & v_threshold, const float & v_reset, const int & neuron_num, const int & numel) @@ -1429,30 +1425,30 @@ const int t = index + mem_offset; const float over_th = h_seq[t] - v_threshold; - // start: spikingjelly.clock_driven.surrogate.ATan.cuda_code + // 
start: spikingjelly.clock_driven.surrogate.S2NN.cuda_code - const float sg_ATan_M_PI_2__alpha__x = ((float) 1.57079632679489661923) * 2.0f * over_th; - const float grad_s_to_h = 2.0f / 2.0f / (1.0f + sg_ATan_M_PI_2__alpha__x * sg_ATan_M_PI_2__alpha__x); + const float sg_S2NN_sigmoid_ax = 1.0f / (1.0f + expf(- 4.0f * over_th)); + const float sg_S2NN_mask_l = (float)(over_th < 0.0f); + const float grad_s_to_h = (1.0f - sg_S2NN_sigmoid_ax) * sg_S2NN_sigmoid_ax * 4.0f * sg_S2NN_mask_l + 1.0f / (over_th + 1.0f) * (1.0f - sg_S2NN_mask_l); - // end: spikingjelly.clock_driven.surrogate.ATan.cuda_code + // end: spikingjelly.clock_driven.surrogate.S2NN.cuda_code const float grad_v_to_h = 1.0f - spike_seq[t] + (v_reset - h_seq[t]) * grad_s_to_h; - // const float grad_v_to_h = fmaf(v_reset - h_seq[t], grad_s_to_h, 1.0f - spike_seq[t]); + // const float grad_v_to_h = fmaf(grad_s_to_h, v_reset - h_seq[t], 1.0f - spike_seq[t]); - grad_h = grad_spike_seq[t] * grad_s_to_h + (grad_v_seq[t] + grad_h * one_sub_reciprocal_tau) * grad_v_to_h; - // grad_h = fmaf(grad_spike_seq[t], grad_s_to_h, fmaf(grad_h, one_sub_reciprocal_tau, grad_v_seq[t]) * grad_v_to_h); - grad_x_seq[t] = grad_h * reciprocal_tau; + grad_h = grad_spike_seq[t] * grad_s_to_h + (grad_v_seq[t] + grad_h) * grad_v_to_h; + // grad_h = fmaf(grad_spike_seq[t], grad_s_to_h, (grad_v_seq[t] + grad_h) * grad_v_to_h); + grad_x_seq[t] = grad_h; } - grad_v_last[index] = grad_x_seq[index] * one_sub_reciprocal_tau; + grad_v_last[index] = grad_h; } } -// MultiStepLIFNodePTT fptt ATan, hard_reset=True, dtype=fp16 +// MultiStepIFNodePTT fptt S2NN, hard_reset=True, dtype=fp16 #include extern "C" __global__ - void LIFNode_fptt_hardReset_fp16(const half2* x_seq, half2* v_v_seq, half2* h_seq, half2* spike_seq, - const half & reciprocal_tau, + void IFNode_fptt_hardReset_fp16(const half2* x_seq, half2* v_v_seq, half2* h_seq, half2* spike_seq, const half & v_threshold, const half & v_reset, const int & neuron_num, const int & numel) @@ -1462,7 +1458,6 @@ if (index < stride) { const int numel_2 = numel >> 1; - const half2 reciprocal_tau_half2 = __half2half2(reciprocal_tau); const half2 v_threshold_half2 = __half2half2(v_threshold); const half2 v_reset_half2 = __half2half2(v_reset); @@ -1470,23 +1465,23 @@ for(int mem_offset = 0; mem_offset < numel_2; mem_offset += stride) { const int t = index + mem_offset; - - h_seq[t] = __hfma2(__hadd2(__hsub2(x_seq[t], v_v_seq[t]), v_reset_half2), reciprocal_tau_half2, v_v_seq[t]); + h_seq[t] = __hadd2(v_v_seq[t], x_seq[t]); + spike_seq[t] = __hgeu2(h_seq[t], v_threshold_half2); + v_v_seq[t + stride] = __hadd2(__hmul2(spike_seq[t], v_reset_half2), __hmul2(__hsub2(__float2half2_rn(1.0f), spike_seq[t]), h_seq[t])); } } } -// MultiStepLIFNodePTT bptt ATan, hard_reset=True, dtype=fp16, detach_reset=True +// MultiStepIFNodePTT bptt S2NN, hard_reset=True, dtype=fp16, detach_reset=True #include extern "C" __global__ - void LIFNode_bptt_hardReset_detachReset_fp16( + void IFNode_bptt_hardReset_detachReset_fp16( const half2* grad_spike_seq, const half2* grad_v_seq, const half2* h_seq, const half2* spike_seq, half2* grad_x_seq, half2* grad_v_last, - const half & reciprocal_tau, const half & one_sub_reciprocal_tau, const half & v_threshold, const half & v_reset, const int & neuron_num, const int & numel) @@ -1495,8 +1490,6 @@ const int stride = neuron_num >> 1; if (index < stride) { - const half2 reciprocal_tau_half2 = __half2half2(reciprocal_tau); - const half2 one_sub_reciprocal_tau_half2 = __half2half2(one_sub_reciprocal_tau); const half2 
v_threshold_half2 = __half2half2(v_threshold); const half2 v_reset_half2 = __half2half2(v_reset); @@ -1505,34 +1498,33 @@ for(int mem_offset = (numel >> 1) - stride; mem_offset >= 0; mem_offset -= stride) { const int t = index + mem_offset; - const half2 over_th = __hsub2(h_seq[t], v_threshold_half2); - // start: spikingjelly.clock_driven.surrogate.ATan.cuda_code + // start: spikingjelly.clock_driven.surrogate.S2NN.cuda_code - const half2 sg_ATan_alpha = __float2half2_rn(2.0f); - const half2 sg_ATan_M_PI_2__alpha__x = __hmul2(__hmul2(__float2half2_rn((float) 1.57079632679489661923), sg_ATan_alpha), over_th); - const half2 grad_s_to_h = __h2div(__h2div(sg_ATan_alpha, __float2half2_rn(2.0f)), __hfma2(sg_ATan_M_PI_2__alpha__x, sg_ATan_M_PI_2__alpha__x, __float2half2_rn(1.0f))); + const half2 sg_S2NN_alpha = __float2half2_rn(4.0f); + const half2 sg_S2NN_sigmoid_ax = __h2div(__float2half2_rn(1.0f), __hadd2(h2exp(__hneg2(__hmul2(sg_S2NN_alpha, over_th))), __float2half2_rn(1.0f))); + const half2 sg_S2NN_mask_l = __hlt2(over_th, __float2half2_rn(0.0f)); + const half2 grad_s_to_h = __hadd2(__hmul2(__hmul2(__hmul2(__hsub2(__float2half2_rn(1.0f), sg_S2NN_sigmoid_ax), sg_S2NN_sigmoid_ax), sg_S2NN_alpha), sg_S2NN_mask_l), __hmul2(__h2div(__float2half2_rn(1.0f), __hadd2(over_th, __float2half2_rn(1.0f))), __hsub2(__float2half2_rn(1.0f), sg_S2NN_mask_l))); - // end: spikingjelly.clock_driven.surrogate.ATan.cuda_code + // end: spikingjelly.clock_driven.surrogate.S2NN.cuda_code const half2 grad_v_to_h = __hsub2(__float2half2_rn(1.0f), spike_seq[t]); - - grad_h = __hfma2(__hfma2(grad_h, one_sub_reciprocal_tau_half2, grad_v_seq[t]), grad_v_to_h, __hmul2(grad_spike_seq[t], grad_s_to_h)); - grad_x_seq[t] = __hmul2(grad_h, reciprocal_tau_half2); - } - grad_v_last[index] = __hmul2(grad_x_seq[index], one_sub_reciprocal_tau_half2); + + grad_h = __hfma2(__hadd2(grad_v_seq[t], grad_h), grad_v_to_h, __hmul2(grad_spike_seq[t], grad_s_to_h)); + grad_x_seq[t] = grad_h; + } + grad_v_last[index] = grad_h; } } -// MultiStepLIFNodePTT bptt ATan, hard_reset=True, dtype=fp16, detach_reset=False +// MultiStepIFNodePTT bptt S2NN, hard_reset=True, dtype=fp16, detach_reset=False #include extern "C" __global__ - void LIFNode_bptt_hardReset__fp16( + void IFNode_bptt_hardReset__fp16( const half2* grad_spike_seq, const half2* grad_v_seq, const half2* h_seq, const half2* spike_seq, half2* grad_x_seq, half2* grad_v_last, - const half & reciprocal_tau, const half & one_sub_reciprocal_tau, const half & v_threshold, const half & v_reset, const int & neuron_num, const int & numel) @@ -1541,8 +1533,6 @@ const int stride = neuron_num >> 1; if (index < stride) { - const half2 reciprocal_tau_half2 = __half2half2(reciprocal_tau); - const half2 one_sub_reciprocal_tau_half2 = __half2half2(one_sub_reciprocal_tau); const half2 v_threshold_half2 = __half2half2(v_threshold); const half2 v_reset_half2 = __half2half2(v_reset); @@ -1551,33 +1541,32 @@ for(int mem_offset = (numel >> 1) - stride; mem_offset >= 0; mem_offset -= stride) { const int t = index + mem_offset; - const half2 over_th = __hsub2(h_seq[t], v_threshold_half2); - // start: spikingjelly.clock_driven.surrogate.ATan.cuda_code + // start: spikingjelly.clock_driven.surrogate.S2NN.cuda_code - const half2 sg_ATan_alpha = __float2half2_rn(2.0f); - const half2 sg_ATan_M_PI_2__alpha__x = __hmul2(__hmul2(__float2half2_rn((float) 1.57079632679489661923), sg_ATan_alpha), over_th); - const half2 grad_s_to_h = __h2div(__h2div(sg_ATan_alpha, __float2half2_rn(2.0f)), __hfma2(sg_ATan_M_PI_2__alpha__x, 
sg_ATan_M_PI_2__alpha__x, __float2half2_rn(1.0f))); + const half2 sg_S2NN_alpha = __float2half2_rn(4.0f); + const half2 sg_S2NN_sigmoid_ax = __h2div(__float2half2_rn(1.0f), __hadd2(h2exp(__hneg2(__hmul2(sg_S2NN_alpha, over_th))), __float2half2_rn(1.0f))); + const half2 sg_S2NN_mask_l = __hlt2(over_th, __float2half2_rn(0.0f)); + const half2 grad_s_to_h = __hadd2(__hmul2(__hmul2(__hmul2(__hsub2(__float2half2_rn(1.0f), sg_S2NN_sigmoid_ax), sg_S2NN_sigmoid_ax), sg_S2NN_alpha), sg_S2NN_mask_l), __hmul2(__h2div(__float2half2_rn(1.0f), __hadd2(over_th, __float2half2_rn(1.0f))), __hsub2(__float2half2_rn(1.0f), sg_S2NN_mask_l))); - // end: spikingjelly.clock_driven.surrogate.ATan.cuda_code + // end: spikingjelly.clock_driven.surrogate.S2NN.cuda_code - const half2 grad_v_to_h = __hfma2(__hsub2(v_reset_half2, h_seq[t]), grad_s_to_h, __hsub2(__float2half2_rn(1.0f), spike_seq[t])); - - grad_h = __hfma2(__hfma2(grad_h, one_sub_reciprocal_tau_half2, grad_v_seq[t]), grad_v_to_h, __hmul2(grad_spike_seq[t], grad_s_to_h)); - grad_x_seq[t] = __hmul2(grad_h, reciprocal_tau_half2); - } - grad_v_last[index] = __hmul2(grad_x_seq[index], one_sub_reciprocal_tau_half2); + const half2 grad_v_to_h = __hfma2(__hsub2(v_reset_half2, h_seq[t]), grad_s_to_h, __hsub2(__float2half2_rn(1.0f), spike_seq[t])); + + grad_h = __hfma2(__hadd2(grad_v_seq[t], grad_h), grad_v_to_h, __hmul2(grad_spike_seq[t], grad_s_to_h)); + grad_x_seq[t] = grad_h; + } + grad_v_last[index] = grad_h; } } -// MultiStepLIFNodePTT fptt ATan, hard_reset=False, dtype=fp32 +// MultiStepIFNodePTT fptt S2NN, hard_reset=False, dtype=fp32 extern "C" __global__ - void LIFNode_fptt_softReset_fp32(const float* x_seq, float* v_v_seq, float* h_seq, float* spike_seq, - const float & reciprocal_tau, + void IFNode_fptt_softReset_fp32(const float* x_seq, float* v_v_seq, float* h_seq, float* spike_seq, const float & v_threshold, - const int & neuron_num, const int & numel) + const int & neuron_num, const int & numel) { const int index = blockIdx.x * blockDim.x + threadIdx.x; @@ -1587,9 +1576,9 @@ for(int mem_offset = 0; mem_offset < numel; mem_offset += neuron_num) { const int t = index + mem_offset; - - h_seq[t] = v_v_seq[t] + reciprocal_tau * (x_seq[t] - v_v_seq[t]); + h_seq[t] = v_v_seq[t] + x_seq[t]; if (h_seq[t] >= v_threshold) + { spike_seq[t] = 1.0f; v_v_seq[t + dt] = h_seq[t] - v_threshold; @@ -1600,18 +1589,16 @@ spike_seq[t] = 0.0f; v_v_seq[t + dt] = h_seq[t]; } - } } } -// MultiStepLIFNodePTT bptt ATan, hard_reset=False, dtype=fp32, detach_reset=True +// MultiStepIFNodePTT bptt S2NN, hard_reset=False, dtype=fp32, detach_reset=True extern "C" __global__ - void LIFNode_bptt_softReset_detachReset_fp32( + void IFNode_bptt_softReset_detachReset_fp32( const float* grad_spike_seq, const float* grad_v_seq, const float* h_seq, const float* spike_seq, float* grad_x_seq, float* grad_v_last, - const float & reciprocal_tau, const float & one_sub_reciprocal_tau, const float & v_threshold, const int & neuron_num, const int & numel) @@ -1625,30 +1612,30 @@ const int t = index + mem_offset; const float over_th = h_seq[t] - v_threshold; - // start: spikingjelly.clock_driven.surrogate.ATan.cuda_code + // start: spikingjelly.clock_driven.surrogate.S2NN.cuda_code - const float sg_ATan_M_PI_2__alpha__x = ((float) 1.57079632679489661923) * 2.0f * over_th; - const float grad_s_to_h = 2.0f / 2.0f / (1.0f + sg_ATan_M_PI_2__alpha__x * sg_ATan_M_PI_2__alpha__x); + const float sg_S2NN_sigmoid_ax = 1.0f / (1.0f + expf(- 4.0f * over_th)); + const float sg_S2NN_mask_l = (float)(over_th < 
0.0f); + const float grad_s_to_h = (1.0f - sg_S2NN_sigmoid_ax) * sg_S2NN_sigmoid_ax * 4.0f * sg_S2NN_mask_l + 1.0f / (over_th + 1.0f) * (1.0f - sg_S2NN_mask_l); - // end: spikingjelly.clock_driven.surrogate.ATan.cuda_code + // end: spikingjelly.clock_driven.surrogate.S2NN.cuda_code const float grad_v_to_h = 1.0f; - grad_h = grad_spike_seq[t] * grad_s_to_h + (grad_v_seq[t] + grad_h * one_sub_reciprocal_tau) * grad_v_to_h; - // grad_h = fmaf(grad_spike_seq[t], grad_s_to_h, fmaf(grad_h, one_sub_reciprocal_tau, grad_v_seq[t]) * grad_v_to_h); - grad_x_seq[t] = grad_h * reciprocal_tau; + grad_h = grad_spike_seq[t] * grad_s_to_h + (grad_v_seq[t] + grad_h) * grad_v_to_h; + // grad_h = fmaf(grad_spike_seq[t], grad_s_to_h, (grad_v_seq[t] + grad_h) * grad_v_to_h); + grad_x_seq[t] = grad_h; } - grad_v_last[index] = grad_x_seq[index] * one_sub_reciprocal_tau; + grad_v_last[index] = grad_h; } } -// MultiStepLIFNodePTT bptt ATan, hard_reset=False, dtype=fp32, detach_reset=False +// MultiStepIFNodePTT bptt S2NN, hard_reset=False, dtype=fp32, detach_reset=False extern "C" __global__ - void LIFNode_bptt_softReset__fp32( + void IFNode_bptt_softReset__fp32( const float* grad_spike_seq, const float* grad_v_seq, const float* h_seq, const float* spike_seq, float* grad_x_seq, float* grad_v_last, - const float & reciprocal_tau, const float & one_sub_reciprocal_tau, const float & v_threshold, const int & neuron_num, const int & numel) @@ -1662,30 +1649,30 @@ const int t = index + mem_offset; const float over_th = h_seq[t] - v_threshold; - // start: spikingjelly.clock_driven.surrogate.ATan.cuda_code + // start: spikingjelly.clock_driven.surrogate.S2NN.cuda_code - const float sg_ATan_M_PI_2__alpha__x = ((float) 1.57079632679489661923) * 2.0f * over_th; - const float grad_s_to_h = 2.0f / 2.0f / (1.0f + sg_ATan_M_PI_2__alpha__x * sg_ATan_M_PI_2__alpha__x); + const float sg_S2NN_sigmoid_ax = 1.0f / (1.0f + expf(- 4.0f * over_th)); + const float sg_S2NN_mask_l = (float)(over_th < 0.0f); + const float grad_s_to_h = (1.0f - sg_S2NN_sigmoid_ax) * sg_S2NN_sigmoid_ax * 4.0f * sg_S2NN_mask_l + 1.0f / (over_th + 1.0f) * (1.0f - sg_S2NN_mask_l); - // end: spikingjelly.clock_driven.surrogate.ATan.cuda_code + // end: spikingjelly.clock_driven.surrogate.S2NN.cuda_code const float grad_v_to_h = 1.0f - v_threshold * grad_s_to_h; // const float grad_v_to_h = fmaf(-grad_s_to_h, v_threshold, 1.0f); - grad_h = grad_spike_seq[t] * grad_s_to_h + (grad_v_seq[t] + grad_h * one_sub_reciprocal_tau) * grad_v_to_h; - // grad_h = fmaf(grad_spike_seq[t], grad_s_to_h, fmaf(grad_h, one_sub_reciprocal_tau, grad_v_seq[t]) * grad_v_to_h); - grad_x_seq[t] = grad_h * reciprocal_tau; + grad_h = grad_spike_seq[t] * grad_s_to_h + (grad_v_seq[t] + grad_h) * grad_v_to_h; + // grad_h = fmaf(grad_spike_seq[t], grad_s_to_h, (grad_v_seq[t] + grad_h) * grad_v_to_h); + grad_x_seq[t] = grad_h; } - grad_v_last[index] = grad_x_seq[index] * one_sub_reciprocal_tau; + grad_v_last[index] = grad_h; } } -// MultiStepLIFNodePTT fptt ATan, hard_reset=False, dtype=fp16 +// MultiStepIFNodePTT fptt S2NN, hard_reset=False, dtype=fp16 #include extern "C" __global__ - void LIFNode_fptt_softReset_fp16(const half2* x_seq, half2* v_v_seq, half2* h_seq, half2* spike_seq, - const half & reciprocal_tau, + void IFNode_fptt_softReset_fp16(const half2* x_seq, half2* v_v_seq, half2* h_seq, half2* spike_seq, const half & v_threshold, const int & neuron_num, const int & numel) @@ -1695,29 +1682,28 @@ if (index < stride) { const int numel_2 = numel >> 1; - const half2 reciprocal_tau_half2 = 
__half2half2(reciprocal_tau); const half2 v_threshold_half2 = __half2half2(v_threshold); for(int mem_offset = 0; mem_offset < numel_2; mem_offset += stride) { const int t = index + mem_offset; - - h_seq[t] = __hfma2(__hsub2(x_seq[t], v_v_seq[t]), reciprocal_tau_half2, v_v_seq[t]); + h_seq[t] = __hadd2(v_v_seq[t], x_seq[t]); + spike_seq[t] = __hgeu2(h_seq[t], v_threshold_half2); + v_v_seq[t + stride] = __hadd2(__hmul2(spike_seq[t], __hsub2(h_seq[t], v_threshold_half2)), __hmul2(__hsub2(__float2half2_rn(1.0f), spike_seq[t]), h_seq[t])); } } } -// MultiStepLIFNodePTT bptt ATan, hard_reset=False, dtype=fp16, detach_reset=True +// MultiStepIFNodePTT bptt S2NN, hard_reset=False, dtype=fp16, detach_reset=True #include extern "C" __global__ - void LIFNode_bptt_softReset_detachReset_fp16( + void IFNode_bptt_softReset_detachReset_fp16( const half2* grad_spike_seq, const half2* grad_v_seq, const half2* h_seq, const half2* spike_seq, half2* grad_x_seq, half2* grad_v_last, - const half & reciprocal_tau, const half & one_sub_reciprocal_tau, const half & v_threshold, const int & neuron_num, const int & numel) @@ -1726,42 +1712,39 @@ const int stride = neuron_num >> 1; if (index < stride) { - const half2 reciprocal_tau_half2 = __half2half2(reciprocal_tau); - const half2 one_sub_reciprocal_tau_half2 = __half2half2(one_sub_reciprocal_tau); const half2 v_threshold_half2 = __half2half2(v_threshold); half2 grad_h = __float2half2_rn(0.0f); // grad_h will be used recursively for(int mem_offset = (numel >> 1) - stride; mem_offset >= 0; mem_offset -= stride) { const int t = index + mem_offset; - const half2 over_th = __hsub2(h_seq[t], v_threshold_half2); - // start: spikingjelly.clock_driven.surrogate.ATan.cuda_code + // start: spikingjelly.clock_driven.surrogate.S2NN.cuda_code - const half2 sg_ATan_alpha = __float2half2_rn(2.0f); - const half2 sg_ATan_M_PI_2__alpha__x = __hmul2(__hmul2(__float2half2_rn((float) 1.57079632679489661923), sg_ATan_alpha), over_th); - const half2 grad_s_to_h = __h2div(__h2div(sg_ATan_alpha, __float2half2_rn(2.0f)), __hfma2(sg_ATan_M_PI_2__alpha__x, sg_ATan_M_PI_2__alpha__x, __float2half2_rn(1.0f))); + const half2 sg_S2NN_alpha = __float2half2_rn(4.0f); + const half2 sg_S2NN_sigmoid_ax = __h2div(__float2half2_rn(1.0f), __hadd2(h2exp(__hneg2(__hmul2(sg_S2NN_alpha, over_th))), __float2half2_rn(1.0f))); + const half2 sg_S2NN_mask_l = __hlt2(over_th, __float2half2_rn(0.0f)); + const half2 grad_s_to_h = __hadd2(__hmul2(__hmul2(__hmul2(__hsub2(__float2half2_rn(1.0f), sg_S2NN_sigmoid_ax), sg_S2NN_sigmoid_ax), sg_S2NN_alpha), sg_S2NN_mask_l), __hmul2(__h2div(__float2half2_rn(1.0f), __hadd2(over_th, __float2half2_rn(1.0f))), __hsub2(__float2half2_rn(1.0f), sg_S2NN_mask_l))); - // end: spikingjelly.clock_driven.surrogate.ATan.cuda_code + // end: spikingjelly.clock_driven.surrogate.S2NN.cuda_code const half2 grad_v_to_h = __float2half2_rn(1.0f); - - grad_h = __hfma2(__hfma2(grad_h, one_sub_reciprocal_tau_half2, grad_v_seq[t]), grad_v_to_h, __hmul2(grad_spike_seq[t], grad_s_to_h)); - grad_x_seq[t] = __hmul2(grad_h, reciprocal_tau_half2); - } - grad_v_last[index] = __hmul2(grad_x_seq[index], one_sub_reciprocal_tau_half2); + + grad_h = __hfma2(__hadd2(grad_v_seq[t], grad_h), grad_v_to_h, __hmul2(grad_spike_seq[t], grad_s_to_h)); + grad_x_seq[t] = grad_h; + } + grad_v_last[index] = grad_h; } } -// MultiStepLIFNodePTT bptt ATan, hard_reset=False, dtype=fp16, detach_reset=False +// MultiStepIFNodePTT bptt S2NN, hard_reset=False, dtype=fp16, detach_reset=False #include extern "C" __global__ - void 
LIFNode_bptt_softReset__fp16( + void IFNode_bptt_softReset__fp16( const half2* grad_spike_seq, const half2* grad_v_seq, const half2* h_seq, const half2* spike_seq, half2* grad_x_seq, half2* grad_v_last, - const half & reciprocal_tau, const half & one_sub_reciprocal_tau, const half & v_threshold, const int & neuron_num, const int & numel) @@ -1770,41 +1753,38 @@ const int stride = neuron_num >> 1; if (index < stride) { - const half2 reciprocal_tau_half2 = __half2half2(reciprocal_tau); - const half2 one_sub_reciprocal_tau_half2 = __half2half2(one_sub_reciprocal_tau); const half2 v_threshold_half2 = __half2half2(v_threshold); half2 grad_h = __float2half2_rn(0.0f); // grad_h will be used recursively for(int mem_offset = (numel >> 1) - stride; mem_offset >= 0; mem_offset -= stride) { const int t = index + mem_offset; - const half2 over_th = __hsub2(h_seq[t], v_threshold_half2); - // start: spikingjelly.clock_driven.surrogate.ATan.cuda_code + // start: spikingjelly.clock_driven.surrogate.S2NN.cuda_code - const half2 sg_ATan_alpha = __float2half2_rn(2.0f); - const half2 sg_ATan_M_PI_2__alpha__x = __hmul2(__hmul2(__float2half2_rn((float) 1.57079632679489661923), sg_ATan_alpha), over_th); - const half2 grad_s_to_h = __h2div(__h2div(sg_ATan_alpha, __float2half2_rn(2.0f)), __hfma2(sg_ATan_M_PI_2__alpha__x, sg_ATan_M_PI_2__alpha__x, __float2half2_rn(1.0f))); + const half2 sg_S2NN_alpha = __float2half2_rn(4.0f); + const half2 sg_S2NN_sigmoid_ax = __h2div(__float2half2_rn(1.0f), __hadd2(h2exp(__hneg2(__hmul2(sg_S2NN_alpha, over_th))), __float2half2_rn(1.0f))); + const half2 sg_S2NN_mask_l = __hlt2(over_th, __float2half2_rn(0.0f)); + const half2 grad_s_to_h = __hadd2(__hmul2(__hmul2(__hmul2(__hsub2(__float2half2_rn(1.0f), sg_S2NN_sigmoid_ax), sg_S2NN_sigmoid_ax), sg_S2NN_alpha), sg_S2NN_mask_l), __hmul2(__h2div(__float2half2_rn(1.0f), __hadd2(over_th, __float2half2_rn(1.0f))), __hsub2(__float2half2_rn(1.0f), sg_S2NN_mask_l))); - // end: spikingjelly.clock_driven.surrogate.ATan.cuda_code + // end: spikingjelly.clock_driven.surrogate.S2NN.cuda_code const half2 grad_v_to_h = __hsub2(__float2half2_rn(1.0f), __hmul2(v_threshold_half2, grad_s_to_h)); - - grad_h = __hfma2(__hfma2(grad_h, one_sub_reciprocal_tau_half2, grad_v_seq[t]), grad_v_to_h, __hmul2(grad_spike_seq[t], grad_s_to_h)); - grad_x_seq[t] = __hmul2(grad_h, reciprocal_tau_half2); - } - grad_v_last[index] = __hmul2(grad_x_seq[index], one_sub_reciprocal_tau_half2); + + grad_h = __hfma2(__hadd2(grad_v_seq[t], grad_h), grad_v_to_h, __hmul2(grad_spike_seq[t], grad_s_to_h)); + grad_x_seq[t] = grad_h; + } + grad_v_last[index] = grad_h; } } -// MultiStepLIFNodePTT fptt Sigmoid, hard_reset=True, dtype=fp32 +// MultiStepIFNodePTT fptt QPseudoSpike, hard_reset=True, dtype=fp32 extern "C" __global__ - void LIFNode_fptt_hardReset_fp32(const float* x_seq, float* v_v_seq, float* h_seq, float* spike_seq, - const float & reciprocal_tau, + void IFNode_fptt_hardReset_fp32(const float* x_seq, float* v_v_seq, float* h_seq, float* spike_seq, const float & v_threshold, const float & v_reset, - const int & neuron_num, const int & numel) + const int & neuron_num, const int & numel) { const int index = blockIdx.x * blockDim.x + threadIdx.x; @@ -1814,9 +1794,9 @@ for(int mem_offset = 0; mem_offset < numel; mem_offset += neuron_num) { const int t = index + mem_offset; - - h_seq[t] = v_v_seq[t] + reciprocal_tau * (x_seq[t] - v_v_seq[t] + v_reset); + h_seq[t] = v_v_seq[t] + x_seq[t]; if (h_seq[t] >= v_threshold) + { spike_seq[t] = 1.0f; v_v_seq[t + dt] = v_reset; @@ -1827,18 
+1807,16 @@ spike_seq[t] = 0.0f; v_v_seq[t + dt] = h_seq[t]; } - } } } -// MultiStepLIFNodePTT bptt Sigmoid, hard_reset=True, dtype=fp32, detach_reset=True +// MultiStepIFNodePTT bptt QPseudoSpike, hard_reset=True, dtype=fp32, detach_reset=True extern "C" __global__ - void LIFNode_bptt_hardReset_detachReset_fp32( + void IFNode_bptt_hardReset_detachReset_fp32( const float* grad_spike_seq, const float* grad_v_seq, const float* h_seq, const float* spike_seq, float* grad_x_seq, float* grad_v_last, - const float & reciprocal_tau, const float & one_sub_reciprocal_tau, const float & v_threshold, const float & v_reset, const int & neuron_num, const int & numel) @@ -1852,30 +1830,29 @@ const int t = index + mem_offset; const float over_th = h_seq[t] - v_threshold; - // start: spikingjelly.clock_driven.surrogate.Sigmoid.cuda_code + // start: spikingjelly.clock_driven.surrogate.QPseudoSpike.cuda_code - const float sg_Sigmoid_sigmoid_ax = 1.0f / (1.0f + expf(- 1.0f * over_th)); - const float grad_s_to_h = (1.0f - sg_Sigmoid_sigmoid_ax) * sg_Sigmoid_sigmoid_ax * 1.0f; + const float sg_QPseudoSpike_base = 1.0f + 2.0f / (2.0f - 1.0f) * fabsf(over_th); + const float grad_s_to_h = powf(sg_QPseudoSpike_base, -2.0f); - // end: spikingjelly.clock_driven.surrogate.Sigmoid.cuda_code + // end: spikingjelly.clock_driven.surrogate.QPseudoSpike.cuda_code const float grad_v_to_h = 1.0f - spike_seq[t]; - grad_h = grad_spike_seq[t] * grad_s_to_h + (grad_v_seq[t] + grad_h * one_sub_reciprocal_tau) * grad_v_to_h; - // grad_h = fmaf(grad_spike_seq[t], grad_s_to_h, fmaf(grad_h, one_sub_reciprocal_tau, grad_v_seq[t]) * grad_v_to_h); - grad_x_seq[t] = grad_h * reciprocal_tau; + grad_h = grad_spike_seq[t] * grad_s_to_h + (grad_v_seq[t] + grad_h) * grad_v_to_h; + // grad_h = fmaf(grad_spike_seq[t], grad_s_to_h, (grad_v_seq[t] + grad_h) * grad_v_to_h); + grad_x_seq[t] = grad_h; } - grad_v_last[index] = grad_x_seq[index] * one_sub_reciprocal_tau; + grad_v_last[index] = grad_h; } } -// MultiStepLIFNodePTT bptt Sigmoid, hard_reset=True, dtype=fp32, detach_reset=False +// MultiStepIFNodePTT bptt QPseudoSpike, hard_reset=True, dtype=fp32, detach_reset=False extern "C" __global__ - void LIFNode_bptt_hardReset__fp32( + void IFNode_bptt_hardReset__fp32( const float* grad_spike_seq, const float* grad_v_seq, const float* h_seq, const float* spike_seq, float* grad_x_seq, float* grad_v_last, - const float & reciprocal_tau, const float & one_sub_reciprocal_tau, const float & v_threshold, const float & v_reset, const int & neuron_num, const int & numel) @@ -1889,30 +1866,29 @@ const int t = index + mem_offset; const float over_th = h_seq[t] - v_threshold; - // start: spikingjelly.clock_driven.surrogate.Sigmoid.cuda_code + // start: spikingjelly.clock_driven.surrogate.QPseudoSpike.cuda_code - const float sg_Sigmoid_sigmoid_ax = 1.0f / (1.0f + expf(- 1.0f * over_th)); - const float grad_s_to_h = (1.0f - sg_Sigmoid_sigmoid_ax) * sg_Sigmoid_sigmoid_ax * 1.0f; + const float sg_QPseudoSpike_base = 1.0f + 2.0f / (2.0f - 1.0f) * fabsf(over_th); + const float grad_s_to_h = powf(sg_QPseudoSpike_base, -2.0f); - // end: spikingjelly.clock_driven.surrogate.Sigmoid.cuda_code + // end: spikingjelly.clock_driven.surrogate.QPseudoSpike.cuda_code const float grad_v_to_h = 1.0f - spike_seq[t] + (v_reset - h_seq[t]) * grad_s_to_h; - // const float grad_v_to_h = fmaf(v_reset - h_seq[t], grad_s_to_h, 1.0f - spike_seq[t]); + // const float grad_v_to_h = fmaf(grad_s_to_h, v_reset - h_seq[t], 1.0f - spike_seq[t]); - grad_h = grad_spike_seq[t] * grad_s_to_h + 
(grad_v_seq[t] + grad_h * one_sub_reciprocal_tau) * grad_v_to_h; - // grad_h = fmaf(grad_spike_seq[t], grad_s_to_h, fmaf(grad_h, one_sub_reciprocal_tau, grad_v_seq[t]) * grad_v_to_h); - grad_x_seq[t] = grad_h * reciprocal_tau; + grad_h = grad_spike_seq[t] * grad_s_to_h + (grad_v_seq[t] + grad_h) * grad_v_to_h; + // grad_h = fmaf(grad_spike_seq[t], grad_s_to_h, (grad_v_seq[t] + grad_h) * grad_v_to_h); + grad_x_seq[t] = grad_h; } - grad_v_last[index] = grad_x_seq[index] * one_sub_reciprocal_tau; + grad_v_last[index] = grad_h; } } -// MultiStepLIFNodePTT fptt Sigmoid, hard_reset=True, dtype=fp16 +// MultiStepIFNodePTT fptt QPseudoSpike, hard_reset=True, dtype=fp16 #include extern "C" __global__ - void LIFNode_fptt_hardReset_fp16(const half2* x_seq, half2* v_v_seq, half2* h_seq, half2* spike_seq, - const half & reciprocal_tau, + void IFNode_fptt_hardReset_fp16(const half2* x_seq, half2* v_v_seq, half2* h_seq, half2* spike_seq, const half & v_threshold, const half & v_reset, const int & neuron_num, const int & numel) @@ -1922,7 +1898,6 @@ if (index < stride) { const int numel_2 = numel >> 1; - const half2 reciprocal_tau_half2 = __half2half2(reciprocal_tau); const half2 v_threshold_half2 = __half2half2(v_threshold); const half2 v_reset_half2 = __half2half2(v_reset); @@ -1930,23 +1905,23 @@ for(int mem_offset = 0; mem_offset < numel_2; mem_offset += stride) { const int t = index + mem_offset; - - h_seq[t] = __hfma2(__hadd2(__hsub2(x_seq[t], v_v_seq[t]), v_reset_half2), reciprocal_tau_half2, v_v_seq[t]); + h_seq[t] = __hadd2(v_v_seq[t], x_seq[t]); + spike_seq[t] = __hgeu2(h_seq[t], v_threshold_half2); + v_v_seq[t + stride] = __hadd2(__hmul2(spike_seq[t], v_reset_half2), __hmul2(__hsub2(__float2half2_rn(1.0f), spike_seq[t]), h_seq[t])); } } } -// MultiStepLIFNodePTT bptt Sigmoid, hard_reset=True, dtype=fp16, detach_reset=True +// MultiStepIFNodePTT bptt QPseudoSpike, hard_reset=True, dtype=fp16, detach_reset=True #include extern "C" __global__ - void LIFNode_bptt_hardReset_detachReset_fp16( + void IFNode_bptt_hardReset_detachReset_fp16( const half2* grad_spike_seq, const half2* grad_v_seq, const half2* h_seq, const half2* spike_seq, half2* grad_x_seq, half2* grad_v_last, - const half & reciprocal_tau, const half & one_sub_reciprocal_tau, const half & v_threshold, const half & v_reset, const int & neuron_num, const int & numel) @@ -1955,8 +1930,6 @@ const int stride = neuron_num >> 1; if (index < stride) { - const half2 reciprocal_tau_half2 = __half2half2(reciprocal_tau); - const half2 one_sub_reciprocal_tau_half2 = __half2half2(one_sub_reciprocal_tau); const half2 v_threshold_half2 = __half2half2(v_threshold); const half2 v_reset_half2 = __half2half2(v_reset); @@ -1965,34 +1938,32 @@ for(int mem_offset = (numel >> 1) - stride; mem_offset >= 0; mem_offset -= stride) { const int t = index + mem_offset; - const half2 over_th = __hsub2(h_seq[t], v_threshold_half2); - // start: spikingjelly.clock_driven.surrogate.Sigmoid.cuda_code + // start: spikingjelly.clock_driven.surrogate.QPseudoSpike.cuda_code - const half2 sg_Sigmoid_alpha = __float2half2_rn(1.0f); - const half2 sg_Sigmoid_sigmoid_ax = __h2div(__float2half2_rn(1.0f), __hadd2(h2exp(__hneg2(__hmul2(sg_Sigmoid_alpha, over_th))), __float2half2_rn(1.0f))); - const half2 grad_s_to_h = __hmul2(__hmul2(__hsub2(__float2half2_rn(1.0f), sg_Sigmoid_sigmoid_ax), sg_Sigmoid_sigmoid_ax), sg_Sigmoid_alpha); + const half2 sg_QPseudoSpike_alpha = __float2half2_rn(2.0f); + const half2 sg_QPseudoSpike_base = __hadd2(__float2half2_rn(1.0f), 
__h2div(__hmul2(__float2half2_rn(2.0f), __habs2(over_th)), __hsub2(sg_QPseudoSpike_alpha, __float2half2_rn(1.0f)))); + const half2 grad_s_to_h = h2exp2(__hmul2(h2log2(sg_QPseudoSpike_base), __hneg2(sg_QPseudoSpike_alpha))); // Replace power with combination of log and exp, since CUDA has no power function for FP16. - // end: spikingjelly.clock_driven.surrogate.Sigmoid.cuda_code + // end: spikingjelly.clock_driven.surrogate.QPseudoSpike.cuda_code const half2 grad_v_to_h = __hsub2(__float2half2_rn(1.0f), spike_seq[t]); - - grad_h = __hfma2(__hfma2(grad_h, one_sub_reciprocal_tau_half2, grad_v_seq[t]), grad_v_to_h, __hmul2(grad_spike_seq[t], grad_s_to_h)); - grad_x_seq[t] = __hmul2(grad_h, reciprocal_tau_half2); - } - grad_v_last[index] = __hmul2(grad_x_seq[index], one_sub_reciprocal_tau_half2); + + grad_h = __hfma2(__hadd2(grad_v_seq[t], grad_h), grad_v_to_h, __hmul2(grad_spike_seq[t], grad_s_to_h)); + grad_x_seq[t] = grad_h; + } + grad_v_last[index] = grad_h; } } -// MultiStepLIFNodePTT bptt Sigmoid, hard_reset=True, dtype=fp16, detach_reset=False +// MultiStepIFNodePTT bptt QPseudoSpike, hard_reset=True, dtype=fp16, detach_reset=False #include extern "C" __global__ - void LIFNode_bptt_hardReset__fp16( + void IFNode_bptt_hardReset__fp16( const half2* grad_spike_seq, const half2* grad_v_seq, const half2* h_seq, const half2* spike_seq, half2* grad_x_seq, half2* grad_v_last, - const half & reciprocal_tau, const half & one_sub_reciprocal_tau, const half & v_threshold, const half & v_reset, const int & neuron_num, const int & numel) @@ -2001,8 +1972,6 @@ const int stride = neuron_num >> 1; if (index < stride) { - const half2 reciprocal_tau_half2 = __half2half2(reciprocal_tau); - const half2 one_sub_reciprocal_tau_half2 = __half2half2(one_sub_reciprocal_tau); const half2 v_threshold_half2 = __half2half2(v_threshold); const half2 v_reset_half2 = __half2half2(v_reset); @@ -2011,33 +1980,31 @@ for(int mem_offset = (numel >> 1) - stride; mem_offset >= 0; mem_offset -= stride) { const int t = index + mem_offset; - const half2 over_th = __hsub2(h_seq[t], v_threshold_half2); - // start: spikingjelly.clock_driven.surrogate.Sigmoid.cuda_code + // start: spikingjelly.clock_driven.surrogate.QPseudoSpike.cuda_code - const half2 sg_Sigmoid_alpha = __float2half2_rn(1.0f); - const half2 sg_Sigmoid_sigmoid_ax = __h2div(__float2half2_rn(1.0f), __hadd2(h2exp(__hneg2(__hmul2(sg_Sigmoid_alpha, over_th))), __float2half2_rn(1.0f))); - const half2 grad_s_to_h = __hmul2(__hmul2(__hsub2(__float2half2_rn(1.0f), sg_Sigmoid_sigmoid_ax), sg_Sigmoid_sigmoid_ax), sg_Sigmoid_alpha); + const half2 sg_QPseudoSpike_alpha = __float2half2_rn(2.0f); + const half2 sg_QPseudoSpike_base = __hadd2(__float2half2_rn(1.0f), __h2div(__hmul2(__float2half2_rn(2.0f), __habs2(over_th)), __hsub2(sg_QPseudoSpike_alpha, __float2half2_rn(1.0f)))); + const half2 grad_s_to_h = h2exp2(__hmul2(h2log2(sg_QPseudoSpike_base), __hneg2(sg_QPseudoSpike_alpha))); // Replace power with combination of log and exp, since CUDA has no power function for FP16. 
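// ---- Editor's sketch (illustrative, not part of the generated kernels) ----
// The fp16 QPseudoSpike kernels above have no half-precision pow, so they evaluate
// base^(-alpha) through h2exp2/h2log2. The plain-C fp32 function below checks the same
// identity, base^(-alpha) = exp2(-alpha * log2(base)), for the alpha = 2 used here;
// the name `qpseudospike_grad_reference` is hypothetical and only mirrors the surrogate
// gradient written out by the fp32 kernels.
#include <math.h>

static float qpseudospike_grad_reference(float over_th, float alpha)
{
    const float base = 1.0f + 2.0f / (alpha - 1.0f) * fabsf(over_th);
    const float direct  = powf(base, -alpha);           // form used by the fp32 kernels
    const float via_log = exp2f(-alpha * log2f(base));  // form mirrored by the fp16 kernels
    (void)direct;  // the two forms agree up to floating-point rounding
    return via_log;
}
// ---- end of editor's sketch ----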
- // end: spikingjelly.clock_driven.surrogate.Sigmoid.cuda_code + // end: spikingjelly.clock_driven.surrogate.QPseudoSpike.cuda_code - const half2 grad_v_to_h = __hfma2(__hsub2(v_reset_half2, h_seq[t]), grad_s_to_h, __hsub2(__float2half2_rn(1.0f), spike_seq[t])); - - grad_h = __hfma2(__hfma2(grad_h, one_sub_reciprocal_tau_half2, grad_v_seq[t]), grad_v_to_h, __hmul2(grad_spike_seq[t], grad_s_to_h)); - grad_x_seq[t] = __hmul2(grad_h, reciprocal_tau_half2); - } - grad_v_last[index] = __hmul2(grad_x_seq[index], one_sub_reciprocal_tau_half2); + const half2 grad_v_to_h = __hfma2(__hsub2(v_reset_half2, h_seq[t]), grad_s_to_h, __hsub2(__float2half2_rn(1.0f), spike_seq[t])); + + grad_h = __hfma2(__hadd2(grad_v_seq[t], grad_h), grad_v_to_h, __hmul2(grad_spike_seq[t], grad_s_to_h)); + grad_x_seq[t] = grad_h; + } + grad_v_last[index] = grad_h; } } -// MultiStepLIFNodePTT fptt Sigmoid, hard_reset=False, dtype=fp32 +// MultiStepIFNodePTT fptt QPseudoSpike, hard_reset=False, dtype=fp32 extern "C" __global__ - void LIFNode_fptt_softReset_fp32(const float* x_seq, float* v_v_seq, float* h_seq, float* spike_seq, - const float & reciprocal_tau, + void IFNode_fptt_softReset_fp32(const float* x_seq, float* v_v_seq, float* h_seq, float* spike_seq, const float & v_threshold, - const int & neuron_num, const int & numel) + const int & neuron_num, const int & numel) { const int index = blockIdx.x * blockDim.x + threadIdx.x; @@ -2047,9 +2014,9 @@ for(int mem_offset = 0; mem_offset < numel; mem_offset += neuron_num) { const int t = index + mem_offset; - - h_seq[t] = v_v_seq[t] + reciprocal_tau * (x_seq[t] - v_v_seq[t]); + h_seq[t] = v_v_seq[t] + x_seq[t]; if (h_seq[t] >= v_threshold) + { spike_seq[t] = 1.0f; v_v_seq[t + dt] = h_seq[t] - v_threshold; @@ -2060,18 +2027,16 @@ spike_seq[t] = 0.0f; v_v_seq[t + dt] = h_seq[t]; } - } } } -// MultiStepLIFNodePTT bptt Sigmoid, hard_reset=False, dtype=fp32, detach_reset=True +// MultiStepIFNodePTT bptt QPseudoSpike, hard_reset=False, dtype=fp32, detach_reset=True extern "C" __global__ - void LIFNode_bptt_softReset_detachReset_fp32( + void IFNode_bptt_softReset_detachReset_fp32( const float* grad_spike_seq, const float* grad_v_seq, const float* h_seq, const float* spike_seq, float* grad_x_seq, float* grad_v_last, - const float & reciprocal_tau, const float & one_sub_reciprocal_tau, const float & v_threshold, const int & neuron_num, const int & numel) @@ -2085,30 +2050,29 @@ const int t = index + mem_offset; const float over_th = h_seq[t] - v_threshold; - // start: spikingjelly.clock_driven.surrogate.Sigmoid.cuda_code + // start: spikingjelly.clock_driven.surrogate.QPseudoSpike.cuda_code - const float sg_Sigmoid_sigmoid_ax = 1.0f / (1.0f + expf(- 1.0f * over_th)); - const float grad_s_to_h = (1.0f - sg_Sigmoid_sigmoid_ax) * sg_Sigmoid_sigmoid_ax * 1.0f; + const float sg_QPseudoSpike_base = 1.0f + 2.0f / (2.0f - 1.0f) * fabsf(over_th); + const float grad_s_to_h = powf(sg_QPseudoSpike_base, -2.0f); - // end: spikingjelly.clock_driven.surrogate.Sigmoid.cuda_code + // end: spikingjelly.clock_driven.surrogate.QPseudoSpike.cuda_code const float grad_v_to_h = 1.0f; - grad_h = grad_spike_seq[t] * grad_s_to_h + (grad_v_seq[t] + grad_h * one_sub_reciprocal_tau) * grad_v_to_h; - // grad_h = fmaf(grad_spike_seq[t], grad_s_to_h, fmaf(grad_h, one_sub_reciprocal_tau, grad_v_seq[t]) * grad_v_to_h); - grad_x_seq[t] = grad_h * reciprocal_tau; + grad_h = grad_spike_seq[t] * grad_s_to_h + (grad_v_seq[t] + grad_h) * grad_v_to_h; + // grad_h = fmaf(grad_spike_seq[t], grad_s_to_h, (grad_v_seq[t] + 
grad_h) * grad_v_to_h); + grad_x_seq[t] = grad_h; } - grad_v_last[index] = grad_x_seq[index] * one_sub_reciprocal_tau; + grad_v_last[index] = grad_h; } } -// MultiStepLIFNodePTT bptt Sigmoid, hard_reset=False, dtype=fp32, detach_reset=False +// MultiStepIFNodePTT bptt QPseudoSpike, hard_reset=False, dtype=fp32, detach_reset=False extern "C" __global__ - void LIFNode_bptt_softReset__fp32( + void IFNode_bptt_softReset__fp32( const float* grad_spike_seq, const float* grad_v_seq, const float* h_seq, const float* spike_seq, float* grad_x_seq, float* grad_v_last, - const float & reciprocal_tau, const float & one_sub_reciprocal_tau, const float & v_threshold, const int & neuron_num, const int & numel) @@ -2122,30 +2086,29 @@ const int t = index + mem_offset; const float over_th = h_seq[t] - v_threshold; - // start: spikingjelly.clock_driven.surrogate.Sigmoid.cuda_code + // start: spikingjelly.clock_driven.surrogate.QPseudoSpike.cuda_code - const float sg_Sigmoid_sigmoid_ax = 1.0f / (1.0f + expf(- 1.0f * over_th)); - const float grad_s_to_h = (1.0f - sg_Sigmoid_sigmoid_ax) * sg_Sigmoid_sigmoid_ax * 1.0f; + const float sg_QPseudoSpike_base = 1.0f + 2.0f / (2.0f - 1.0f) * fabsf(over_th); + const float grad_s_to_h = powf(sg_QPseudoSpike_base, -2.0f); - // end: spikingjelly.clock_driven.surrogate.Sigmoid.cuda_code + // end: spikingjelly.clock_driven.surrogate.QPseudoSpike.cuda_code const float grad_v_to_h = 1.0f - v_threshold * grad_s_to_h; // const float grad_v_to_h = fmaf(-grad_s_to_h, v_threshold, 1.0f); - grad_h = grad_spike_seq[t] * grad_s_to_h + (grad_v_seq[t] + grad_h * one_sub_reciprocal_tau) * grad_v_to_h; - // grad_h = fmaf(grad_spike_seq[t], grad_s_to_h, fmaf(grad_h, one_sub_reciprocal_tau, grad_v_seq[t]) * grad_v_to_h); - grad_x_seq[t] = grad_h * reciprocal_tau; - } - grad_v_last[index] = grad_x_seq[index] * one_sub_reciprocal_tau; - } + grad_h = grad_spike_seq[t] * grad_s_to_h + (grad_v_seq[t] + grad_h) * grad_v_to_h; + // grad_h = fmaf(grad_spike_seq[t], grad_s_to_h, (grad_v_seq[t] + grad_h) * grad_v_to_h); + grad_x_seq[t] = grad_h; + } + grad_v_last[index] = grad_h; + } } -// MultiStepLIFNodePTT fptt Sigmoid, hard_reset=False, dtype=fp16 +// MultiStepIFNodePTT fptt QPseudoSpike, hard_reset=False, dtype=fp16 #include extern "C" __global__ - void LIFNode_fptt_softReset_fp16(const half2* x_seq, half2* v_v_seq, half2* h_seq, half2* spike_seq, - const half & reciprocal_tau, + void IFNode_fptt_softReset_fp16(const half2* x_seq, half2* v_v_seq, half2* h_seq, half2* spike_seq, const half & v_threshold, const int & neuron_num, const int & numel) @@ -2155,29 +2118,28 @@ if (index < stride) { const int numel_2 = numel >> 1; - const half2 reciprocal_tau_half2 = __half2half2(reciprocal_tau); const half2 v_threshold_half2 = __half2half2(v_threshold); for(int mem_offset = 0; mem_offset < numel_2; mem_offset += stride) { const int t = index + mem_offset; - - h_seq[t] = __hfma2(__hsub2(x_seq[t], v_v_seq[t]), reciprocal_tau_half2, v_v_seq[t]); + h_seq[t] = __hadd2(v_v_seq[t], x_seq[t]); + spike_seq[t] = __hgeu2(h_seq[t], v_threshold_half2); + v_v_seq[t + stride] = __hadd2(__hmul2(spike_seq[t], __hsub2(h_seq[t], v_threshold_half2)), __hmul2(__hsub2(__float2half2_rn(1.0f), spike_seq[t]), h_seq[t])); } } } -// MultiStepLIFNodePTT bptt Sigmoid, hard_reset=False, dtype=fp16, detach_reset=True +// MultiStepIFNodePTT bptt QPseudoSpike, hard_reset=False, dtype=fp16, detach_reset=True #include extern "C" __global__ - void LIFNode_bptt_softReset_detachReset_fp16( + void IFNode_bptt_softReset_detachReset_fp16( 
const half2* grad_spike_seq, const half2* grad_v_seq, const half2* h_seq, const half2* spike_seq, half2* grad_x_seq, half2* grad_v_last, - const half & reciprocal_tau, const half & one_sub_reciprocal_tau, const half & v_threshold, const int & neuron_num, const int & numel) @@ -2186,42 +2148,38 @@ const int stride = neuron_num >> 1; if (index < stride) { - const half2 reciprocal_tau_half2 = __half2half2(reciprocal_tau); - const half2 one_sub_reciprocal_tau_half2 = __half2half2(one_sub_reciprocal_tau); const half2 v_threshold_half2 = __half2half2(v_threshold); half2 grad_h = __float2half2_rn(0.0f); // grad_h will be used recursively for(int mem_offset = (numel >> 1) - stride; mem_offset >= 0; mem_offset -= stride) { const int t = index + mem_offset; - const half2 over_th = __hsub2(h_seq[t], v_threshold_half2); - // start: spikingjelly.clock_driven.surrogate.Sigmoid.cuda_code + // start: spikingjelly.clock_driven.surrogate.QPseudoSpike.cuda_code - const half2 sg_Sigmoid_alpha = __float2half2_rn(1.0f); - const half2 sg_Sigmoid_sigmoid_ax = __h2div(__float2half2_rn(1.0f), __hadd2(h2exp(__hneg2(__hmul2(sg_Sigmoid_alpha, over_th))), __float2half2_rn(1.0f))); - const half2 grad_s_to_h = __hmul2(__hmul2(__hsub2(__float2half2_rn(1.0f), sg_Sigmoid_sigmoid_ax), sg_Sigmoid_sigmoid_ax), sg_Sigmoid_alpha); + const half2 sg_QPseudoSpike_alpha = __float2half2_rn(2.0f); + const half2 sg_QPseudoSpike_base = __hadd2(__float2half2_rn(1.0f), __h2div(__hmul2(__float2half2_rn(2.0f), __habs2(over_th)), __hsub2(sg_QPseudoSpike_alpha, __float2half2_rn(1.0f)))); + const half2 grad_s_to_h = h2exp2(__hmul2(h2log2(sg_QPseudoSpike_base), __hneg2(sg_QPseudoSpike_alpha))); // Replace power with combination of log and exp, since CUDA has no power function for FP16. - // end: spikingjelly.clock_driven.surrogate.Sigmoid.cuda_code + // end: spikingjelly.clock_driven.surrogate.QPseudoSpike.cuda_code const half2 grad_v_to_h = __float2half2_rn(1.0f); - - grad_h = __hfma2(__hfma2(grad_h, one_sub_reciprocal_tau_half2, grad_v_seq[t]), grad_v_to_h, __hmul2(grad_spike_seq[t], grad_s_to_h)); - grad_x_seq[t] = __hmul2(grad_h, reciprocal_tau_half2); - } - grad_v_last[index] = __hmul2(grad_x_seq[index], one_sub_reciprocal_tau_half2); + + grad_h = __hfma2(__hadd2(grad_v_seq[t], grad_h), grad_v_to_h, __hmul2(grad_spike_seq[t], grad_s_to_h)); + grad_x_seq[t] = grad_h; + } + grad_v_last[index] = grad_h; } } -// MultiStepLIFNodePTT bptt Sigmoid, hard_reset=False, dtype=fp16, detach_reset=False +// MultiStepIFNodePTT bptt QPseudoSpike, hard_reset=False, dtype=fp16, detach_reset=False #include extern "C" __global__ - void LIFNode_bptt_softReset__fp16( + void IFNode_bptt_softReset__fp16( const half2* grad_spike_seq, const half2* grad_v_seq, const half2* h_seq, const half2* spike_seq, half2* grad_x_seq, half2* grad_v_last, - const half & reciprocal_tau, const half & one_sub_reciprocal_tau, const half & v_threshold, const int & neuron_num, const int & numel) @@ -2230,38 +2188,151 @@ const int stride = neuron_num >> 1; if (index < stride) { - const half2 reciprocal_tau_half2 = __half2half2(reciprocal_tau); - const half2 one_sub_reciprocal_tau_half2 = __half2half2(one_sub_reciprocal_tau); const half2 v_threshold_half2 = __half2half2(v_threshold); half2 grad_h = __float2half2_rn(0.0f); // grad_h will be used recursively for(int mem_offset = (numel >> 1) - stride; mem_offset >= 0; mem_offset -= stride) { const int t = index + mem_offset; - const half2 over_th = __hsub2(h_seq[t], v_threshold_half2); - // start: 
spikingjelly.clock_driven.surrogate.Sigmoid.cuda_code + // start: spikingjelly.clock_driven.surrogate.QPseudoSpike.cuda_code - const half2 sg_Sigmoid_alpha = __float2half2_rn(1.0f); - const half2 sg_Sigmoid_sigmoid_ax = __h2div(__float2half2_rn(1.0f), __hadd2(h2exp(__hneg2(__hmul2(sg_Sigmoid_alpha, over_th))), __float2half2_rn(1.0f))); - const half2 grad_s_to_h = __hmul2(__hmul2(__hsub2(__float2half2_rn(1.0f), sg_Sigmoid_sigmoid_ax), sg_Sigmoid_sigmoid_ax), sg_Sigmoid_alpha); + const half2 sg_QPseudoSpike_alpha = __float2half2_rn(2.0f); + const half2 sg_QPseudoSpike_base = __hadd2(__float2half2_rn(1.0f), __h2div(__hmul2(__float2half2_rn(2.0f), __habs2(over_th)), __hsub2(sg_QPseudoSpike_alpha, __float2half2_rn(1.0f)))); + const half2 grad_s_to_h = h2exp2(__hmul2(h2log2(sg_QPseudoSpike_base), __hneg2(sg_QPseudoSpike_alpha))); // Replace power with combination of log and exp, since CUDA has no power function for FP16. - // end: spikingjelly.clock_driven.surrogate.Sigmoid.cuda_code + // end: spikingjelly.clock_driven.surrogate.QPseudoSpike.cuda_code const half2 grad_v_to_h = __hsub2(__float2half2_rn(1.0f), __hmul2(v_threshold_half2, grad_s_to_h)); - - grad_h = __hfma2(__hfma2(grad_h, one_sub_reciprocal_tau_half2, grad_v_seq[t]), grad_v_to_h, __hmul2(grad_spike_seq[t], grad_s_to_h)); - grad_x_seq[t] = __hmul2(grad_h, reciprocal_tau_half2); + + grad_h = __hfma2(__hadd2(grad_v_seq[t], grad_h), grad_v_to_h, __hmul2(grad_spike_seq[t], grad_s_to_h)); + grad_x_seq[t] = grad_h; + } + grad_v_last[index] = grad_h; + } + } + +// MultiStepLIFNodePTT + +// MultiStepLIFNodePTT fptt ATan, decay_input=True, hard_reset=True, dtype=fp32 + + extern "C" __global__ + void LIFNode_fptt_decayInputTrue_hardReset_fp32(const float* x_seq, float* v_v_seq, float* h_seq, float* spike_seq, + const float & reciprocal_tau, + const float & v_threshold, const float & v_reset, + const int & neuron_num, const int & numel) + + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index < neuron_num) + { + const int dt = neuron_num; + for(int mem_offset = 0; mem_offset < numel; mem_offset += neuron_num) + { + const int t = index + mem_offset; + + h_seq[t] = v_v_seq[t] + reciprocal_tau * (x_seq[t] - v_v_seq[t] + v_reset); + + if (h_seq[t] >= v_threshold) + { + spike_seq[t] = 1.0f; + v_v_seq[t + dt] = v_reset; + } + + else + { + spike_seq[t] = 0.0f; + v_v_seq[t + dt] = h_seq[t]; + } + + } + } + } + +// MultiStepLIFNodePTT bptt ATan, decay_input=True, hard_reset=True, dtype=fp32, detach_reset=True + + extern "C" __global__ + void LIFNode_bptt_decayInputTrue_hardReset_detachReset_fp32( + const float* grad_spike_seq, const float* grad_v_seq, const float* h_seq, const float* spike_seq, + float* grad_x_seq, float* grad_v_last, + const float & reciprocal_tau, const float & one_sub_reciprocal_tau, + const float & v_threshold, const float & v_reset, + const int & neuron_num, const int & numel) + + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index < neuron_num) + { + float grad_h = 0.0f; // grad_h will be used recursively + for(int mem_offset = numel - neuron_num; mem_offset >= 0; mem_offset -= neuron_num) + { + const int t = index + mem_offset; + const float over_th = h_seq[t] - v_threshold; + + // start: spikingjelly.clock_driven.surrogate.ATan.cuda_code + + const float sg_ATan_M_PI_2__alpha__x = ((float) 1.57079632679489661923) * 2.0f * over_th; + const float grad_s_to_h = 2.0f / 2.0f / (1.0f + sg_ATan_M_PI_2__alpha__x * sg_ATan_M_PI_2__alpha__x); + + // end: 
spikingjelly.clock_driven.surrogate.ATan.cuda_code + + const float grad_v_to_h = 1.0f - spike_seq[t]; + + grad_h = grad_spike_seq[t] * grad_s_to_h + (grad_v_seq[t] + grad_h * one_sub_reciprocal_tau) * grad_v_to_h; + // grad_h = fmaf(grad_spike_seq[t], grad_s_to_h, fmaf(grad_h, one_sub_reciprocal_tau, grad_v_seq[t]) * grad_v_to_h); + + grad_x_seq[t] = grad_h * reciprocal_tau; + + } + grad_v_last[index] = grad_h * one_sub_reciprocal_tau; + } + } + +// MultiStepLIFNodePTT bptt ATan, decay_input=True, hard_reset=True, dtype=fp32, detach_reset=False + + extern "C" __global__ + void LIFNode_bptt_decayInputTrue_hardReset__fp32( + const float* grad_spike_seq, const float* grad_v_seq, const float* h_seq, const float* spike_seq, + float* grad_x_seq, float* grad_v_last, + const float & reciprocal_tau, const float & one_sub_reciprocal_tau, + const float & v_threshold, const float & v_reset, + const int & neuron_num, const int & numel) + + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index < neuron_num) + { + float grad_h = 0.0f; // grad_h will be used recursively + for(int mem_offset = numel - neuron_num; mem_offset >= 0; mem_offset -= neuron_num) + { + const int t = index + mem_offset; + const float over_th = h_seq[t] - v_threshold; + + // start: spikingjelly.clock_driven.surrogate.ATan.cuda_code + + const float sg_ATan_M_PI_2__alpha__x = ((float) 1.57079632679489661923) * 2.0f * over_th; + const float grad_s_to_h = 2.0f / 2.0f / (1.0f + sg_ATan_M_PI_2__alpha__x * sg_ATan_M_PI_2__alpha__x); + + // end: spikingjelly.clock_driven.surrogate.ATan.cuda_code + + const float grad_v_to_h = 1.0f - spike_seq[t] + (v_reset - h_seq[t]) * grad_s_to_h; + // const float grad_v_to_h = fmaf(v_reset - h_seq[t], grad_s_to_h, 1.0f - spike_seq[t]); + + grad_h = grad_spike_seq[t] * grad_s_to_h + (grad_v_seq[t] + grad_h * one_sub_reciprocal_tau) * grad_v_to_h; + // grad_h = fmaf(grad_spike_seq[t], grad_s_to_h, fmaf(grad_h, one_sub_reciprocal_tau, grad_v_seq[t]) * grad_v_to_h); + + grad_x_seq[t] = grad_h * reciprocal_tau; + } - grad_v_last[index] = __hmul2(grad_x_seq[index], one_sub_reciprocal_tau_half2); + grad_v_last[index] = grad_h * one_sub_reciprocal_tau; } } -// MultiStepLIFNodePTT fptt PiecewiseLeakyReLU, hard_reset=True, dtype=fp32 +// MultiStepLIFNodePTT fptt ATan, decay_input=False, hard_reset=True, dtype=fp32 extern "C" __global__ - void LIFNode_fptt_hardReset_fp32(const float* x_seq, float* v_v_seq, float* h_seq, float* spike_seq, + void LIFNode_fptt_decayInputFalse_hardReset_fp32(const float* x_seq, float* v_v_seq, float* h_seq, float* spike_seq, const float & reciprocal_tau, const float & v_threshold, const float & v_reset, const int & neuron_num, const int & numel) @@ -2275,7 +2346,8 @@ { const int t = index + mem_offset; - h_seq[t] = v_v_seq[t] + reciprocal_tau * (x_seq[t] - v_v_seq[t] + v_reset); + h_seq[t] = v_v_seq[t] - reciprocal_tau * (v_v_seq[t] - v_reset) + x_seq[t]; + if (h_seq[t] >= v_threshold) { spike_seq[t] = 1.0f; @@ -2292,10 +2364,10 @@ } } -// MultiStepLIFNodePTT bptt PiecewiseLeakyReLU, hard_reset=True, dtype=fp32, detach_reset=True +// MultiStepLIFNodePTT bptt ATan, decay_input=False, hard_reset=True, dtype=fp32, detach_reset=True extern "C" __global__ - void LIFNode_bptt_hardReset_detachReset_fp32( + void LIFNode_bptt_decayInputFalse_hardReset_detachReset_fp32( const float* grad_spike_seq, const float* grad_v_seq, const float* h_seq, const float* spike_seq, float* grad_x_seq, float* grad_v_last, const float & reciprocal_tau, const float & one_sub_reciprocal_tau, @@ 
-2312,35 +2384,29 @@ const int t = index + mem_offset; const float over_th = h_seq[t] - v_threshold; - // start: spikingjelly.clock_driven.surrogate.PiecewiseLeakyReLU.cuda_code + // start: spikingjelly.clock_driven.surrogate.ATan.cuda_code - const float sg_PiecewiseLeakyReLU_x_abs = fabsf(over_th); - float grad_s_to_h; - if (sg_PiecewiseLeakyReLU_x_abs > 1.0f) - { - grad_s_to_h = 0.01f; - } - else - { - grad_s_to_h = 1.0f; - } + const float sg_ATan_M_PI_2__alpha__x = ((float) 1.57079632679489661923) * 2.0f * over_th; + const float grad_s_to_h = 2.0f / 2.0f / (1.0f + sg_ATan_M_PI_2__alpha__x * sg_ATan_M_PI_2__alpha__x); - // end: spikingjelly.clock_driven.surrogate.PiecewiseLeakyReLU.cuda_code + // end: spikingjelly.clock_driven.surrogate.ATan.cuda_code const float grad_v_to_h = 1.0f - spike_seq[t]; grad_h = grad_spike_seq[t] * grad_s_to_h + (grad_v_seq[t] + grad_h * one_sub_reciprocal_tau) * grad_v_to_h; // grad_h = fmaf(grad_spike_seq[t], grad_s_to_h, fmaf(grad_h, one_sub_reciprocal_tau, grad_v_seq[t]) * grad_v_to_h); - grad_x_seq[t] = grad_h * reciprocal_tau; + + grad_x_seq[t] = grad_h; + } - grad_v_last[index] = grad_x_seq[index] * one_sub_reciprocal_tau; + grad_v_last[index] = grad_h * one_sub_reciprocal_tau; } } -// MultiStepLIFNodePTT bptt PiecewiseLeakyReLU, hard_reset=True, dtype=fp32, detach_reset=False +// MultiStepLIFNodePTT bptt ATan, decay_input=False, hard_reset=True, dtype=fp32, detach_reset=False extern "C" __global__ - void LIFNode_bptt_hardReset__fp32( + void LIFNode_bptt_decayInputFalse_hardReset__fp32( const float* grad_spike_seq, const float* grad_v_seq, const float* h_seq, const float* spike_seq, float* grad_x_seq, float* grad_v_last, const float & reciprocal_tau, const float & one_sub_reciprocal_tau, @@ -2357,37 +2423,31 @@ const int t = index + mem_offset; const float over_th = h_seq[t] - v_threshold; - // start: spikingjelly.clock_driven.surrogate.PiecewiseLeakyReLU.cuda_code + // start: spikingjelly.clock_driven.surrogate.ATan.cuda_code - const float sg_PiecewiseLeakyReLU_x_abs = fabsf(over_th); - float grad_s_to_h; - if (sg_PiecewiseLeakyReLU_x_abs > 1.0f) - { - grad_s_to_h = 0.01f; - } - else - { - grad_s_to_h = 1.0f; - } + const float sg_ATan_M_PI_2__alpha__x = ((float) 1.57079632679489661923) * 2.0f * over_th; + const float grad_s_to_h = 2.0f / 2.0f / (1.0f + sg_ATan_M_PI_2__alpha__x * sg_ATan_M_PI_2__alpha__x); - // end: spikingjelly.clock_driven.surrogate.PiecewiseLeakyReLU.cuda_code + // end: spikingjelly.clock_driven.surrogate.ATan.cuda_code const float grad_v_to_h = 1.0f - spike_seq[t] + (v_reset - h_seq[t]) * grad_s_to_h; // const float grad_v_to_h = fmaf(v_reset - h_seq[t], grad_s_to_h, 1.0f - spike_seq[t]); grad_h = grad_spike_seq[t] * grad_s_to_h + (grad_v_seq[t] + grad_h * one_sub_reciprocal_tau) * grad_v_to_h; // grad_h = fmaf(grad_spike_seq[t], grad_s_to_h, fmaf(grad_h, one_sub_reciprocal_tau, grad_v_seq[t]) * grad_v_to_h); - grad_x_seq[t] = grad_h * reciprocal_tau; + + grad_x_seq[t] = grad_h; + } - grad_v_last[index] = grad_x_seq[index] * one_sub_reciprocal_tau; + grad_v_last[index] = grad_h * one_sub_reciprocal_tau; } } -// MultiStepLIFNodePTT fptt PiecewiseLeakyReLU, hard_reset=True, dtype=fp16 +// MultiStepLIFNodePTT fptt ATan, decay_input=True, hard_reset=True, dtype=fp16 #include extern "C" __global__ - void LIFNode_fptt_hardReset_fp16(const half2* x_seq, half2* v_v_seq, half2* h_seq, half2* spike_seq, + void LIFNode_fptt_decayInputTrue_hardReset_fp16(const half2* x_seq, half2* v_v_seq, half2* h_seq, half2* spike_seq, const half & 
reciprocal_tau, const half & v_threshold, const half & v_reset, const int & neuron_num, const int & numel) @@ -2407,7 +2467,8 @@ { const int t = index + mem_offset; - h_seq[t] = __hfma2(__hadd2(__hsub2(x_seq[t], v_v_seq[t]), v_reset_half2), reciprocal_tau_half2, v_v_seq[t]); + h_seq[t] = __hfma2(__hadd2(__hsub2(x_seq[t], v_v_seq[t]), v_reset_half2), reciprocal_tau_half2, v_v_seq[t]); + spike_seq[t] = __hgeu2(h_seq[t], v_threshold_half2); v_v_seq[t + stride] = __hadd2(__hmul2(spike_seq[t], v_reset_half2), __hmul2(__hsub2(__float2half2_rn(1.0f), spike_seq[t]), h_seq[t])); @@ -2415,11 +2476,11 @@ } } -// MultiStepLIFNodePTT bptt PiecewiseLeakyReLU, hard_reset=True, dtype=fp16, detach_reset=True +// MultiStepLIFNodePTT bptt ATan, decay_input=True, hard_reset=True, dtype=fp16, detach_reset=True #include extern "C" __global__ - void LIFNode_bptt_hardReset_detachReset_fp16( + void LIFNode_bptt_decayInputTrue_hardReset_detachReset_fp16( const half2* grad_spike_seq, const half2* grad_v_seq, const half2* h_seq, const half2* spike_seq, half2* grad_x_seq, half2* grad_v_last, const half & reciprocal_tau, const half & one_sub_reciprocal_tau, @@ -2444,28 +2505,30 @@ const half2 over_th = __hsub2(h_seq[t], v_threshold_half2); - // start: spikingjelly.clock_driven.surrogate.PiecewiseLeakyReLU.cuda_code + // start: spikingjelly.clock_driven.surrogate.ATan.cuda_code - const half2 sg_PiecewiseLeakyReLU_x_abs = __habs2(over_th); - const half2 sg_PiecewiseLeakyReLU_x_abs_ge_w = __hge2(sg_PiecewiseLeakyReLU_x_abs, __float2half2_rn(1.0f)); - half2 grad_s_to_h = __hadd2(__hmul2(__float2half2_rn(0.01f), sg_PiecewiseLeakyReLU_x_abs_ge_w), __hmul2(__hsub2(__float2half2_rn(1.0f), sg_PiecewiseLeakyReLU_x_abs_ge_w), __float2half2_rn(1.0f))); + const half2 sg_ATan_alpha = __float2half2_rn(2.0f); + const half2 sg_ATan_M_PI_2__alpha__x = __hmul2(__hmul2(__float2half2_rn((float) 1.57079632679489661923), sg_ATan_alpha), over_th); + const half2 grad_s_to_h = __h2div(__h2div(sg_ATan_alpha, __float2half2_rn(2.0f)), __hfma2(sg_ATan_M_PI_2__alpha__x, sg_ATan_M_PI_2__alpha__x, __float2half2_rn(1.0f))); - // end: spikingjelly.clock_driven.surrogate.PiecewiseLeakyReLU.cuda_code + // end: spikingjelly.clock_driven.surrogate.ATan.cuda_code const half2 grad_v_to_h = __hsub2(__float2half2_rn(1.0f), spike_seq[t]); grad_h = __hfma2(__hfma2(grad_h, one_sub_reciprocal_tau_half2, grad_v_seq[t]), grad_v_to_h, __hmul2(grad_spike_seq[t], grad_s_to_h)); - grad_x_seq[t] = __hmul2(grad_h, reciprocal_tau_half2); + + grad_x_seq[t] = __hmul2(grad_h, reciprocal_tau_half2); + } - grad_v_last[index] = __hmul2(grad_x_seq[index], one_sub_reciprocal_tau_half2); + grad_v_last[index] = __hmul2(grad_h, one_sub_reciprocal_tau_half2); } } -// MultiStepLIFNodePTT bptt PiecewiseLeakyReLU, hard_reset=True, dtype=fp16, detach_reset=False +// MultiStepLIFNodePTT bptt ATan, decay_input=True, hard_reset=True, dtype=fp16, detach_reset=False #include extern "C" __global__ - void LIFNode_bptt_hardReset__fp16( + void LIFNode_bptt_decayInputTrue_hardReset__fp16( const half2* grad_spike_seq, const half2* grad_v_seq, const half2* h_seq, const half2* spike_seq, half2* grad_x_seq, half2* grad_v_last, const half & reciprocal_tau, const half & one_sub_reciprocal_tau, @@ -2490,68 +2553,12809 @@ const half2 over_th = __hsub2(h_seq[t], v_threshold_half2); - // start: spikingjelly.clock_driven.surrogate.PiecewiseLeakyReLU.cuda_code + // start: spikingjelly.clock_driven.surrogate.ATan.cuda_code - const half2 sg_PiecewiseLeakyReLU_x_abs = __habs2(over_th); - const half2 
sg_PiecewiseLeakyReLU_x_abs_ge_w = __hge2(sg_PiecewiseLeakyReLU_x_abs, __float2half2_rn(1.0f)); - half2 grad_s_to_h = __hadd2(__hmul2(__float2half2_rn(0.01f), sg_PiecewiseLeakyReLU_x_abs_ge_w), __hmul2(__hsub2(__float2half2_rn(1.0f), sg_PiecewiseLeakyReLU_x_abs_ge_w), __float2half2_rn(1.0f))); + const half2 sg_ATan_alpha = __float2half2_rn(2.0f); + const half2 sg_ATan_M_PI_2__alpha__x = __hmul2(__hmul2(__float2half2_rn((float) 1.57079632679489661923), sg_ATan_alpha), over_th); + const half2 grad_s_to_h = __h2div(__h2div(sg_ATan_alpha, __float2half2_rn(2.0f)), __hfma2(sg_ATan_M_PI_2__alpha__x, sg_ATan_M_PI_2__alpha__x, __float2half2_rn(1.0f))); - // end: spikingjelly.clock_driven.surrogate.PiecewiseLeakyReLU.cuda_code + // end: spikingjelly.clock_driven.surrogate.ATan.cuda_code const half2 grad_v_to_h = __hfma2(__hsub2(v_reset_half2, h_seq[t]), grad_s_to_h, __hsub2(__float2half2_rn(1.0f), spike_seq[t])); grad_h = __hfma2(__hfma2(grad_h, one_sub_reciprocal_tau_half2, grad_v_seq[t]), grad_v_to_h, __hmul2(grad_spike_seq[t], grad_s_to_h)); - grad_x_seq[t] = __hmul2(grad_h, reciprocal_tau_half2); + + grad_x_seq[t] = __hmul2(grad_h, reciprocal_tau_half2); + } - grad_v_last[index] = __hmul2(grad_x_seq[index], one_sub_reciprocal_tau_half2); + grad_v_last[index] = __hmul2(grad_h, one_sub_reciprocal_tau_half2); } } -// MultiStepLIFNodePTT fptt PiecewiseLeakyReLU, hard_reset=False, dtype=fp32 +// MultiStepLIFNodePTT fptt ATan, decay_input=False, hard_reset=True, dtype=fp16 + #include extern "C" __global__ - void LIFNode_fptt_softReset_fp32(const float* x_seq, float* v_v_seq, float* h_seq, float* spike_seq, - const float & reciprocal_tau, - const float & v_threshold, - const int & neuron_num, const int & numel) + void LIFNode_fptt_decayInputFalse_hardReset_fp16(const half2* x_seq, half2* v_v_seq, half2* h_seq, half2* spike_seq, + const half & reciprocal_tau, + const half & v_threshold, const half & v_reset, + const int & neuron_num, const int & numel) { const int index = blockIdx.x * blockDim.x + threadIdx.x; - if (index < neuron_num) + const int stride = neuron_num >> 1; + if (index < stride) { - const int dt = neuron_num; - for(int mem_offset = 0; mem_offset < numel; mem_offset += neuron_num) + const int numel_2 = numel >> 1; + const half2 reciprocal_tau_half2 = __half2half2(reciprocal_tau); + const half2 v_threshold_half2 = __half2half2(v_threshold); + + const half2 v_reset_half2 = __half2half2(v_reset); + + for(int mem_offset = 0; mem_offset < numel_2; mem_offset += stride) { const int t = index + mem_offset; - h_seq[t] = v_v_seq[t] + reciprocal_tau * (x_seq[t] - v_v_seq[t]); - if (h_seq[t] >= v_threshold) - { - spike_seq[t] = 1.0f; - v_v_seq[t + dt] = h_seq[t] - v_threshold; - } + // h_seq[t] = v_v_seq[t] - reciprocal_tau * (v_v_seq[t] - v_reset) + x_seq[t]; + // = reciprocal_tau * (v_reset - v_v_seq[t]) + v_v_seq[t] + x_seq[t]; + h_seq[t] = __hadd2(__hfma2(__hsub2(v_reset_half2, v_v_seq[t]), reciprocal_tau_half2, v_v_seq[t]), x_seq[t]); + + spike_seq[t] = __hgeu2(h_seq[t], v_threshold_half2); + v_v_seq[t + stride] = __hadd2(__hmul2(spike_seq[t], v_reset_half2), __hmul2(__hsub2(__float2half2_rn(1.0f), spike_seq[t]), h_seq[t])); - else - { - spike_seq[t] = 0.0f; - v_v_seq[t + dt] = h_seq[t]; - } - } } } -// MultiStepLIFNodePTT bptt PiecewiseLeakyReLU, hard_reset=False, dtype=fp32, detach_reset=True +// MultiStepLIFNodePTT bptt ATan, decay_input=False, hard_reset=True, dtype=fp16, detach_reset=True + #include extern "C" __global__ - void LIFNode_bptt_softReset_detachReset_fp32( - const float* 
grad_spike_seq, const float* grad_v_seq, const float* h_seq, const float* spike_seq, - float* grad_x_seq, float* grad_v_last, - const float & reciprocal_tau, const float & one_sub_reciprocal_tau, - const float & v_threshold, - const int & neuron_num, const int & numel) - - { + void LIFNode_bptt_decayInputFalse_hardReset_detachReset_fp16( + const half2* grad_spike_seq, const half2* grad_v_seq, const half2* h_seq, const half2* spike_seq, + half2* grad_x_seq, half2* grad_v_last, + const half & reciprocal_tau, const half & one_sub_reciprocal_tau, + const half & v_threshold, const half & v_reset, + const int & neuron_num, const int & numel) + + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + const int stride = neuron_num >> 1; + if (index < stride) + { + const half2 reciprocal_tau_half2 = __half2half2(reciprocal_tau); + const half2 one_sub_reciprocal_tau_half2 = __half2half2(one_sub_reciprocal_tau); + const half2 v_threshold_half2 = __half2half2(v_threshold); + + const half2 v_reset_half2 = __half2half2(v_reset); + + half2 grad_h = __float2half2_rn(0.0f); // grad_h will be used recursively + for(int mem_offset = (numel >> 1) - stride; mem_offset >= 0; mem_offset -= stride) + { + const int t = index + mem_offset; + + const half2 over_th = __hsub2(h_seq[t], v_threshold_half2); + + // start: spikingjelly.clock_driven.surrogate.ATan.cuda_code + + const half2 sg_ATan_alpha = __float2half2_rn(2.0f); + const half2 sg_ATan_M_PI_2__alpha__x = __hmul2(__hmul2(__float2half2_rn((float) 1.57079632679489661923), sg_ATan_alpha), over_th); + const half2 grad_s_to_h = __h2div(__h2div(sg_ATan_alpha, __float2half2_rn(2.0f)), __hfma2(sg_ATan_M_PI_2__alpha__x, sg_ATan_M_PI_2__alpha__x, __float2half2_rn(1.0f))); + + // end: spikingjelly.clock_driven.surrogate.ATan.cuda_code + + const half2 grad_v_to_h = __hsub2(__float2half2_rn(1.0f), spike_seq[t]); + + grad_h = __hfma2(__hfma2(grad_h, one_sub_reciprocal_tau_half2, grad_v_seq[t]), grad_v_to_h, __hmul2(grad_spike_seq[t], grad_s_to_h)); + + grad_x_seq[t] = grad_h; + + } + grad_v_last[index] = __hmul2(grad_h, one_sub_reciprocal_tau_half2); + } + } + +// MultiStepLIFNodePTT bptt ATan, decay_input=False, hard_reset=True, dtype=fp16, detach_reset=False + + #include + extern "C" __global__ + void LIFNode_bptt_decayInputFalse_hardReset__fp16( + const half2* grad_spike_seq, const half2* grad_v_seq, const half2* h_seq, const half2* spike_seq, + half2* grad_x_seq, half2* grad_v_last, + const half & reciprocal_tau, const half & one_sub_reciprocal_tau, + const half & v_threshold, const half & v_reset, + const int & neuron_num, const int & numel) + + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + const int stride = neuron_num >> 1; + if (index < stride) + { + const half2 reciprocal_tau_half2 = __half2half2(reciprocal_tau); + const half2 one_sub_reciprocal_tau_half2 = __half2half2(one_sub_reciprocal_tau); + const half2 v_threshold_half2 = __half2half2(v_threshold); + + const half2 v_reset_half2 = __half2half2(v_reset); + + half2 grad_h = __float2half2_rn(0.0f); // grad_h will be used recursively + for(int mem_offset = (numel >> 1) - stride; mem_offset >= 0; mem_offset -= stride) + { + const int t = index + mem_offset; + + const half2 over_th = __hsub2(h_seq[t], v_threshold_half2); + + // start: spikingjelly.clock_driven.surrogate.ATan.cuda_code + + const half2 sg_ATan_alpha = __float2half2_rn(2.0f); + const half2 sg_ATan_M_PI_2__alpha__x = __hmul2(__hmul2(__float2half2_rn((float) 1.57079632679489661923), sg_ATan_alpha), over_th); + const half2 
grad_s_to_h = __h2div(__h2div(sg_ATan_alpha, __float2half2_rn(2.0f)), __hfma2(sg_ATan_M_PI_2__alpha__x, sg_ATan_M_PI_2__alpha__x, __float2half2_rn(1.0f))); + + // end: spikingjelly.clock_driven.surrogate.ATan.cuda_code + + const half2 grad_v_to_h = __hfma2(__hsub2(v_reset_half2, h_seq[t]), grad_s_to_h, __hsub2(__float2half2_rn(1.0f), spike_seq[t])); + + grad_h = __hfma2(__hfma2(grad_h, one_sub_reciprocal_tau_half2, grad_v_seq[t]), grad_v_to_h, __hmul2(grad_spike_seq[t], grad_s_to_h)); + + grad_x_seq[t] = grad_h; + + } + grad_v_last[index] = __hmul2(grad_h, one_sub_reciprocal_tau_half2); + } + } + +// MultiStepLIFNodePTT fptt ATan, decay_input=True, hard_reset=False, dtype=fp32 + + extern "C" __global__ + void LIFNode_fptt_decayInputTrue_softReset_fp32(const float* x_seq, float* v_v_seq, float* h_seq, float* spike_seq, + const float & reciprocal_tau, + const float & v_threshold, + const int & neuron_num, const int & numel) + + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index < neuron_num) + { + const int dt = neuron_num; + for(int mem_offset = 0; mem_offset < numel; mem_offset += neuron_num) + { + const int t = index + mem_offset; + + h_seq[t] = v_v_seq[t] + reciprocal_tau * (x_seq[t] - v_v_seq[t]); + + if (h_seq[t] >= v_threshold) + { + spike_seq[t] = 1.0f; + v_v_seq[t + dt] = h_seq[t] - v_threshold; + } + + else + { + spike_seq[t] = 0.0f; + v_v_seq[t + dt] = h_seq[t]; + } + + } + } + } + +// MultiStepLIFNodePTT bptt ATan, decay_input=True, hard_reset=False, dtype=fp32, detach_reset=True + + extern "C" __global__ + void LIFNode_bptt_decayInputTrue_softReset_detachReset_fp32( + const float* grad_spike_seq, const float* grad_v_seq, const float* h_seq, const float* spike_seq, + float* grad_x_seq, float* grad_v_last, + const float & reciprocal_tau, const float & one_sub_reciprocal_tau, + const float & v_threshold, + const int & neuron_num, const int & numel) + + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index < neuron_num) + { + float grad_h = 0.0f; // grad_h will be used recursively + for(int mem_offset = numel - neuron_num; mem_offset >= 0; mem_offset -= neuron_num) + { + const int t = index + mem_offset; + const float over_th = h_seq[t] - v_threshold; + + // start: spikingjelly.clock_driven.surrogate.ATan.cuda_code + + const float sg_ATan_M_PI_2__alpha__x = ((float) 1.57079632679489661923) * 2.0f * over_th; + const float grad_s_to_h = 2.0f / 2.0f / (1.0f + sg_ATan_M_PI_2__alpha__x * sg_ATan_M_PI_2__alpha__x); + + // end: spikingjelly.clock_driven.surrogate.ATan.cuda_code + + const float grad_v_to_h = 1.0f; + + grad_h = grad_spike_seq[t] * grad_s_to_h + (grad_v_seq[t] + grad_h * one_sub_reciprocal_tau) * grad_v_to_h; + // grad_h = fmaf(grad_spike_seq[t], grad_s_to_h, fmaf(grad_h, one_sub_reciprocal_tau, grad_v_seq[t]) * grad_v_to_h); + + grad_x_seq[t] = grad_h * reciprocal_tau; + + } + grad_v_last[index] = grad_h * one_sub_reciprocal_tau; + } + } + +// MultiStepLIFNodePTT bptt ATan, decay_input=True, hard_reset=False, dtype=fp32, detach_reset=False + + extern "C" __global__ + void LIFNode_bptt_decayInputTrue_softReset__fp32( + const float* grad_spike_seq, const float* grad_v_seq, const float* h_seq, const float* spike_seq, + float* grad_x_seq, float* grad_v_last, + const float & reciprocal_tau, const float & one_sub_reciprocal_tau, + const float & v_threshold, + const int & neuron_num, const int & numel) + + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index < neuron_num) + { + float grad_h = 0.0f; // grad_h will be used 
recursively
+            for(int mem_offset = numel - neuron_num; mem_offset >= 0; mem_offset -= neuron_num)
+            {
+                const int t = index + mem_offset;
+                const float over_th = h_seq[t] - v_threshold;
+
+                // start: spikingjelly.clock_driven.surrogate.ATan.cuda_code
+
+                const float sg_ATan_M_PI_2__alpha__x = ((float) 1.57079632679489661923) * 2.0f * over_th;
+                const float grad_s_to_h = 2.0f / 2.0f / (1.0f + sg_ATan_M_PI_2__alpha__x * sg_ATan_M_PI_2__alpha__x);
+
+                // end: spikingjelly.clock_driven.surrogate.ATan.cuda_code
+
+                const float grad_v_to_h = 1.0f - v_threshold * grad_s_to_h;
+                // const float grad_v_to_h = fmaf(-grad_s_to_h, v_threshold, 1.0f);
+
+                grad_h = grad_spike_seq[t] * grad_s_to_h + (grad_v_seq[t] + grad_h * one_sub_reciprocal_tau) * grad_v_to_h;
+                // grad_h = fmaf(grad_spike_seq[t], grad_s_to_h, fmaf(grad_h, one_sub_reciprocal_tau, grad_v_seq[t]) * grad_v_to_h);
+
+                grad_x_seq[t] = grad_h * reciprocal_tau;
+
+            }
+            grad_v_last[index] = grad_h * one_sub_reciprocal_tau;
+        }
+    }
+
+// MultiStepLIFNodePTT fptt ATan, decay_input=False, hard_reset=False, dtype=fp32
+
+    extern "C" __global__
+    void LIFNode_fptt_decayInputFalse_softReset_fp32(const float* x_seq, float* v_v_seq, float* h_seq, float* spike_seq,
+    const float & reciprocal_tau,
+    const float & v_threshold,
+    const int & neuron_num, const int & numel)
+
+    {
+        const int index = blockIdx.x * blockDim.x + threadIdx.x;
+        if (index < neuron_num)
+        {
+            const int dt = neuron_num;
+            for(int mem_offset = 0; mem_offset < numel; mem_offset += neuron_num)
+            {
+                const int t = index + mem_offset;
+
+                h_seq[t] = v_v_seq[t] * (1.0f - reciprocal_tau) + x_seq[t];
+
+                if (h_seq[t] >= v_threshold)
+                {
+                    spike_seq[t] = 1.0f;
+                    v_v_seq[t + dt] = h_seq[t] - v_threshold;
+                }
+
+                else
+                {
+                    spike_seq[t] = 0.0f;
+                    v_v_seq[t + dt] = h_seq[t];
+                }
+
+            }
+        }
+    }
+
+// MultiStepLIFNodePTT bptt ATan, decay_input=False, hard_reset=False, dtype=fp32, detach_reset=True
+
+    extern "C" __global__
+    void LIFNode_bptt_decayInputFalse_softReset_detachReset_fp32(
+    const float* grad_spike_seq, const float* grad_v_seq, const float* h_seq, const float* spike_seq,
+    float* grad_x_seq, float* grad_v_last,
+    const float & reciprocal_tau, const float & one_sub_reciprocal_tau,
+    const float & v_threshold,
+    const int & neuron_num, const int & numel)
+
+    {
+        const int index = blockIdx.x * blockDim.x + threadIdx.x;
+        if (index < neuron_num)
+        {
+            float grad_h = 0.0f; // grad_h will be used recursively
+            for(int mem_offset = numel - neuron_num; mem_offset >= 0; mem_offset -= neuron_num)
+            {
+                const int t = index + mem_offset;
+                const float over_th = h_seq[t] - v_threshold;
+
+                // start: spikingjelly.clock_driven.surrogate.ATan.cuda_code
+
+                const float sg_ATan_M_PI_2__alpha__x = ((float) 1.57079632679489661923) * 2.0f * over_th;
+                const float grad_s_to_h = 2.0f / 2.0f / (1.0f + sg_ATan_M_PI_2__alpha__x * sg_ATan_M_PI_2__alpha__x);
+
+                // end: spikingjelly.clock_driven.surrogate.ATan.cuda_code
+
+                const float grad_v_to_h = 1.0f;
+
+                grad_h = grad_spike_seq[t] * grad_s_to_h + (grad_v_seq[t] + grad_h * one_sub_reciprocal_tau) * grad_v_to_h;
+                // grad_h = fmaf(grad_spike_seq[t], grad_s_to_h, fmaf(grad_h, one_sub_reciprocal_tau, grad_v_seq[t]) * grad_v_to_h);
+
+                grad_x_seq[t] = grad_h;
+
+            }
+            grad_v_last[index] = grad_h * one_sub_reciprocal_tau;
+        }
+    }
+
+// MultiStepLIFNodePTT bptt ATan, decay_input=False, hard_reset=False, dtype=fp32, detach_reset=False
+
+    extern "C" __global__
+    void LIFNode_bptt_decayInputFalse_softReset__fp32(
+    const float* grad_spike_seq, const float*
grad_v_seq, const float* h_seq, const float* spike_seq, + float* grad_x_seq, float* grad_v_last, + const float & reciprocal_tau, const float & one_sub_reciprocal_tau, + const float & v_threshold, + const int & neuron_num, const int & numel) + + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index < neuron_num) + { + float grad_h = 0.0f; // grad_h will be used recursively + for(int mem_offset = numel - neuron_num; mem_offset >= 0; mem_offset -= neuron_num) + { + const int t = index + mem_offset; + const float over_th = h_seq[t] - v_threshold; + + // start: spikingjelly.clock_driven.surrogate.ATan.cuda_code + + const float sg_ATan_M_PI_2__alpha__x = ((float) 1.57079632679489661923) * 2.0f * over_th; + const float grad_s_to_h = 2.0f / 2.0f / (1.0f + sg_ATan_M_PI_2__alpha__x * sg_ATan_M_PI_2__alpha__x); + + // end: spikingjelly.clock_driven.surrogate.ATan.cuda_code + + const float grad_v_to_h = 1.0f - v_threshold * grad_s_to_h; + // const float grad_v_to_h = fmaf(-grad_s_to_h, v_threshold, 1.0f); + + grad_h = grad_spike_seq[t] * grad_s_to_h + (grad_v_seq[t] + grad_h * one_sub_reciprocal_tau) * grad_v_to_h; + // grad_h = fmaf(grad_spike_seq[t], grad_s_to_h, fmaf(grad_h, one_sub_reciprocal_tau, grad_v_seq[t]) * grad_v_to_h); + + grad_x_seq[t] = grad_h; + + } + grad_v_last[index] = grad_h * one_sub_reciprocal_tau; + } + } + +// MultiStepLIFNodePTT fptt ATan, decay_input=True, hard_reset=False, dtype=fp16 + + #include + extern "C" __global__ + void LIFNode_fptt_decayInputTrue_softReset_fp16(const half2* x_seq, half2* v_v_seq, half2* h_seq, half2* spike_seq, + const half & reciprocal_tau, + const half & v_threshold, + const int & neuron_num, const int & numel) + + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + const int stride = neuron_num >> 1; + if (index < stride) + { + const int numel_2 = numel >> 1; + const half2 reciprocal_tau_half2 = __half2half2(reciprocal_tau); + const half2 v_threshold_half2 = __half2half2(v_threshold); + + for(int mem_offset = 0; mem_offset < numel_2; mem_offset += stride) + { + const int t = index + mem_offset; + + h_seq[t] = __hfma2(__hsub2(x_seq[t], v_v_seq[t]), reciprocal_tau_half2, v_v_seq[t]); + + spike_seq[t] = __hgeu2(h_seq[t], v_threshold_half2); + v_v_seq[t + stride] = __hadd2(__hmul2(spike_seq[t], __hsub2(h_seq[t], v_threshold_half2)), __hmul2(__hsub2(__float2half2_rn(1.0f), spike_seq[t]), h_seq[t])); + + } + } + } + +// MultiStepLIFNodePTT bptt ATan, decay_input=True, hard_reset=False, dtype=fp16, detach_reset=True + + #include + extern "C" __global__ + void LIFNode_bptt_decayInputTrue_softReset_detachReset_fp16( + const half2* grad_spike_seq, const half2* grad_v_seq, const half2* h_seq, const half2* spike_seq, + half2* grad_x_seq, half2* grad_v_last, + const half & reciprocal_tau, const half & one_sub_reciprocal_tau, + const half & v_threshold, + const int & neuron_num, const int & numel) + + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + const int stride = neuron_num >> 1; + if (index < stride) + { + const half2 reciprocal_tau_half2 = __half2half2(reciprocal_tau); + const half2 one_sub_reciprocal_tau_half2 = __half2half2(one_sub_reciprocal_tau); + const half2 v_threshold_half2 = __half2half2(v_threshold); + + half2 grad_h = __float2half2_rn(0.0f); // grad_h will be used recursively + for(int mem_offset = (numel >> 1) - stride; mem_offset >= 0; mem_offset -= stride) + { + const int t = index + mem_offset; + + const half2 over_th = __hsub2(h_seq[t], v_threshold_half2); + + // start: 
spikingjelly.clock_driven.surrogate.ATan.cuda_code + + const half2 sg_ATan_alpha = __float2half2_rn(2.0f); + const half2 sg_ATan_M_PI_2__alpha__x = __hmul2(__hmul2(__float2half2_rn((float) 1.57079632679489661923), sg_ATan_alpha), over_th); + const half2 grad_s_to_h = __h2div(__h2div(sg_ATan_alpha, __float2half2_rn(2.0f)), __hfma2(sg_ATan_M_PI_2__alpha__x, sg_ATan_M_PI_2__alpha__x, __float2half2_rn(1.0f))); + + // end: spikingjelly.clock_driven.surrogate.ATan.cuda_code + + const half2 grad_v_to_h = __float2half2_rn(1.0f); + + grad_h = __hfma2(__hfma2(grad_h, one_sub_reciprocal_tau_half2, grad_v_seq[t]), grad_v_to_h, __hmul2(grad_spike_seq[t], grad_s_to_h)); + + grad_x_seq[t] = __hmul2(grad_h, reciprocal_tau_half2); + + } + grad_v_last[index] = __hmul2(grad_h, one_sub_reciprocal_tau_half2); + } + } + +// MultiStepLIFNodePTT bptt ATan, decay_input=True, hard_reset=False, dtype=fp16, detach_reset=False + + #include + extern "C" __global__ + void LIFNode_bptt_decayInputTrue_softReset__fp16( + const half2* grad_spike_seq, const half2* grad_v_seq, const half2* h_seq, const half2* spike_seq, + half2* grad_x_seq, half2* grad_v_last, + const half & reciprocal_tau, const half & one_sub_reciprocal_tau, + const half & v_threshold, + const int & neuron_num, const int & numel) + + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + const int stride = neuron_num >> 1; + if (index < stride) + { + const half2 reciprocal_tau_half2 = __half2half2(reciprocal_tau); + const half2 one_sub_reciprocal_tau_half2 = __half2half2(one_sub_reciprocal_tau); + const half2 v_threshold_half2 = __half2half2(v_threshold); + + half2 grad_h = __float2half2_rn(0.0f); // grad_h will be used recursively + for(int mem_offset = (numel >> 1) - stride; mem_offset >= 0; mem_offset -= stride) + { + const int t = index + mem_offset; + + const half2 over_th = __hsub2(h_seq[t], v_threshold_half2); + + // start: spikingjelly.clock_driven.surrogate.ATan.cuda_code + + const half2 sg_ATan_alpha = __float2half2_rn(2.0f); + const half2 sg_ATan_M_PI_2__alpha__x = __hmul2(__hmul2(__float2half2_rn((float) 1.57079632679489661923), sg_ATan_alpha), over_th); + const half2 grad_s_to_h = __h2div(__h2div(sg_ATan_alpha, __float2half2_rn(2.0f)), __hfma2(sg_ATan_M_PI_2__alpha__x, sg_ATan_M_PI_2__alpha__x, __float2half2_rn(1.0f))); + + // end: spikingjelly.clock_driven.surrogate.ATan.cuda_code + + const half2 grad_v_to_h = __hsub2(__float2half2_rn(1.0f), __hmul2(v_threshold_half2, grad_s_to_h)); + + grad_h = __hfma2(__hfma2(grad_h, one_sub_reciprocal_tau_half2, grad_v_seq[t]), grad_v_to_h, __hmul2(grad_spike_seq[t], grad_s_to_h)); + + grad_x_seq[t] = __hmul2(grad_h, reciprocal_tau_half2); + + } + grad_v_last[index] = __hmul2(grad_h, one_sub_reciprocal_tau_half2); + } + } + +// MultiStepLIFNodePTT fptt ATan, decay_input=False, hard_reset=False, dtype=fp16 + + #include + extern "C" __global__ + void LIFNode_fptt_decayInputFalse_softReset_fp16(const half2* x_seq, half2* v_v_seq, half2* h_seq, half2* spike_seq, + const half & reciprocal_tau, + const half & v_threshold, + const int & neuron_num, const int & numel) + + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + const int stride = neuron_num >> 1; + if (index < stride) + { + const int numel_2 = numel >> 1; + const half2 reciprocal_tau_half2 = __half2half2(reciprocal_tau); + const half2 v_threshold_half2 = __half2half2(v_threshold); + + for(int mem_offset = 0; mem_offset < numel_2; mem_offset += stride) + { + const int t = index + mem_offset; + + // h_seq[t] = v_v_seq[t] * (1.0f - 
reciprocal_tau) + x_seq[t]; + h_seq[t] = __hfma2(__hsub2(__float2half2_rn(1.0f), reciprocal_tau_half2), v_v_seq[t], x_seq[t]); + + spike_seq[t] = __hgeu2(h_seq[t], v_threshold_half2); + v_v_seq[t + stride] = __hadd2(__hmul2(spike_seq[t], __hsub2(h_seq[t], v_threshold_half2)), __hmul2(__hsub2(__float2half2_rn(1.0f), spike_seq[t]), h_seq[t])); + + } + } + } + +// MultiStepLIFNodePTT bptt ATan, decay_input=False, hard_reset=False, dtype=fp16, detach_reset=True + + #include + extern "C" __global__ + void LIFNode_bptt_decayInputFalse_softReset_detachReset_fp16( + const half2* grad_spike_seq, const half2* grad_v_seq, const half2* h_seq, const half2* spike_seq, + half2* grad_x_seq, half2* grad_v_last, + const half & reciprocal_tau, const half & one_sub_reciprocal_tau, + const half & v_threshold, + const int & neuron_num, const int & numel) + + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + const int stride = neuron_num >> 1; + if (index < stride) + { + const half2 reciprocal_tau_half2 = __half2half2(reciprocal_tau); + const half2 one_sub_reciprocal_tau_half2 = __half2half2(one_sub_reciprocal_tau); + const half2 v_threshold_half2 = __half2half2(v_threshold); + + half2 grad_h = __float2half2_rn(0.0f); // grad_h will be used recursively + for(int mem_offset = (numel >> 1) - stride; mem_offset >= 0; mem_offset -= stride) + { + const int t = index + mem_offset; + + const half2 over_th = __hsub2(h_seq[t], v_threshold_half2); + + // start: spikingjelly.clock_driven.surrogate.ATan.cuda_code + + const half2 sg_ATan_alpha = __float2half2_rn(2.0f); + const half2 sg_ATan_M_PI_2__alpha__x = __hmul2(__hmul2(__float2half2_rn((float) 1.57079632679489661923), sg_ATan_alpha), over_th); + const half2 grad_s_to_h = __h2div(__h2div(sg_ATan_alpha, __float2half2_rn(2.0f)), __hfma2(sg_ATan_M_PI_2__alpha__x, sg_ATan_M_PI_2__alpha__x, __float2half2_rn(1.0f))); + + // end: spikingjelly.clock_driven.surrogate.ATan.cuda_code + + const half2 grad_v_to_h = __float2half2_rn(1.0f); + + grad_h = __hfma2(__hfma2(grad_h, one_sub_reciprocal_tau_half2, grad_v_seq[t]), grad_v_to_h, __hmul2(grad_spike_seq[t], grad_s_to_h)); + + grad_x_seq[t] = grad_h; + + } + grad_v_last[index] = __hmul2(grad_h, one_sub_reciprocal_tau_half2); + } + } + +// MultiStepLIFNodePTT bptt ATan, decay_input=False, hard_reset=False, dtype=fp16, detach_reset=False + + #include + extern "C" __global__ + void LIFNode_bptt_decayInputFalse_softReset__fp16( + const half2* grad_spike_seq, const half2* grad_v_seq, const half2* h_seq, const half2* spike_seq, + half2* grad_x_seq, half2* grad_v_last, + const half & reciprocal_tau, const half & one_sub_reciprocal_tau, + const half & v_threshold, + const int & neuron_num, const int & numel) + + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + const int stride = neuron_num >> 1; + if (index < stride) + { + const half2 reciprocal_tau_half2 = __half2half2(reciprocal_tau); + const half2 one_sub_reciprocal_tau_half2 = __half2half2(one_sub_reciprocal_tau); + const half2 v_threshold_half2 = __half2half2(v_threshold); + + half2 grad_h = __float2half2_rn(0.0f); // grad_h will be used recursively + for(int mem_offset = (numel >> 1) - stride; mem_offset >= 0; mem_offset -= stride) + { + const int t = index + mem_offset; + + const half2 over_th = __hsub2(h_seq[t], v_threshold_half2); + + // start: spikingjelly.clock_driven.surrogate.ATan.cuda_code + + const half2 sg_ATan_alpha = __float2half2_rn(2.0f); + const half2 sg_ATan_M_PI_2__alpha__x = __hmul2(__hmul2(__float2half2_rn((float) 1.57079632679489661923), 
sg_ATan_alpha), over_th); + const half2 grad_s_to_h = __h2div(__h2div(sg_ATan_alpha, __float2half2_rn(2.0f)), __hfma2(sg_ATan_M_PI_2__alpha__x, sg_ATan_M_PI_2__alpha__x, __float2half2_rn(1.0f))); + + // end: spikingjelly.clock_driven.surrogate.ATan.cuda_code + + const half2 grad_v_to_h = __hsub2(__float2half2_rn(1.0f), __hmul2(v_threshold_half2, grad_s_to_h)); + + grad_h = __hfma2(__hfma2(grad_h, one_sub_reciprocal_tau_half2, grad_v_seq[t]), grad_v_to_h, __hmul2(grad_spike_seq[t], grad_s_to_h)); + + grad_x_seq[t] = grad_h; + + } + grad_v_last[index] = __hmul2(grad_h, one_sub_reciprocal_tau_half2); + } + } + +// MultiStepLIFNodePTT fptt Sigmoid, decay_input=True, hard_reset=True, dtype=fp32 + + extern "C" __global__ + void LIFNode_fptt_decayInputTrue_hardReset_fp32(const float* x_seq, float* v_v_seq, float* h_seq, float* spike_seq, + const float & reciprocal_tau, + const float & v_threshold, const float & v_reset, + const int & neuron_num, const int & numel) + + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index < neuron_num) + { + const int dt = neuron_num; + for(int mem_offset = 0; mem_offset < numel; mem_offset += neuron_num) + { + const int t = index + mem_offset; + + h_seq[t] = v_v_seq[t] + reciprocal_tau * (x_seq[t] - v_v_seq[t] + v_reset); + + if (h_seq[t] >= v_threshold) + { + spike_seq[t] = 1.0f; + v_v_seq[t + dt] = v_reset; + } + + else + { + spike_seq[t] = 0.0f; + v_v_seq[t + dt] = h_seq[t]; + } + + } + } + } + +// MultiStepLIFNodePTT bptt Sigmoid, decay_input=True, hard_reset=True, dtype=fp32, detach_reset=True + + extern "C" __global__ + void LIFNode_bptt_decayInputTrue_hardReset_detachReset_fp32( + const float* grad_spike_seq, const float* grad_v_seq, const float* h_seq, const float* spike_seq, + float* grad_x_seq, float* grad_v_last, + const float & reciprocal_tau, const float & one_sub_reciprocal_tau, + const float & v_threshold, const float & v_reset, + const int & neuron_num, const int & numel) + + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index < neuron_num) + { + float grad_h = 0.0f; // grad_h will be used recursively + for(int mem_offset = numel - neuron_num; mem_offset >= 0; mem_offset -= neuron_num) + { + const int t = index + mem_offset; + const float over_th = h_seq[t] - v_threshold; + + // start: spikingjelly.clock_driven.surrogate.Sigmoid.cuda_code + + const float sg_Sigmoid_sigmoid_ax = 1.0f / (1.0f + expf(- 4.0f * over_th)); + const float grad_s_to_h = (1.0f - sg_Sigmoid_sigmoid_ax) * sg_Sigmoid_sigmoid_ax * 4.0f; + + // end: spikingjelly.clock_driven.surrogate.Sigmoid.cuda_code + + const float grad_v_to_h = 1.0f - spike_seq[t]; + + grad_h = grad_spike_seq[t] * grad_s_to_h + (grad_v_seq[t] + grad_h * one_sub_reciprocal_tau) * grad_v_to_h; + // grad_h = fmaf(grad_spike_seq[t], grad_s_to_h, fmaf(grad_h, one_sub_reciprocal_tau, grad_v_seq[t]) * grad_v_to_h); + + grad_x_seq[t] = grad_h * reciprocal_tau; + + } + grad_v_last[index] = grad_h * one_sub_reciprocal_tau; + } + } + +// MultiStepLIFNodePTT bptt Sigmoid, decay_input=True, hard_reset=True, dtype=fp32, detach_reset=False + + extern "C" __global__ + void LIFNode_bptt_decayInputTrue_hardReset__fp32( + const float* grad_spike_seq, const float* grad_v_seq, const float* h_seq, const float* spike_seq, + float* grad_x_seq, float* grad_v_last, + const float & reciprocal_tau, const float & one_sub_reciprocal_tau, + const float & v_threshold, const float & v_reset, + const int & neuron_num, const int & numel) + + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + 
if (index < neuron_num) + { + float grad_h = 0.0f; // grad_h will be used recursively + for(int mem_offset = numel - neuron_num; mem_offset >= 0; mem_offset -= neuron_num) + { + const int t = index + mem_offset; + const float over_th = h_seq[t] - v_threshold; + + // start: spikingjelly.clock_driven.surrogate.Sigmoid.cuda_code + + const float sg_Sigmoid_sigmoid_ax = 1.0f / (1.0f + expf(- 4.0f * over_th)); + const float grad_s_to_h = (1.0f - sg_Sigmoid_sigmoid_ax) * sg_Sigmoid_sigmoid_ax * 4.0f; + + // end: spikingjelly.clock_driven.surrogate.Sigmoid.cuda_code + + const float grad_v_to_h = 1.0f - spike_seq[t] + (v_reset - h_seq[t]) * grad_s_to_h; + // const float grad_v_to_h = fmaf(v_reset - h_seq[t], grad_s_to_h, 1.0f - spike_seq[t]); + + grad_h = grad_spike_seq[t] * grad_s_to_h + (grad_v_seq[t] + grad_h * one_sub_reciprocal_tau) * grad_v_to_h; + // grad_h = fmaf(grad_spike_seq[t], grad_s_to_h, fmaf(grad_h, one_sub_reciprocal_tau, grad_v_seq[t]) * grad_v_to_h); + + grad_x_seq[t] = grad_h * reciprocal_tau; + + } + grad_v_last[index] = grad_h * one_sub_reciprocal_tau; + } + } + +// MultiStepLIFNodePTT fptt Sigmoid, decay_input=False, hard_reset=True, dtype=fp32 + + extern "C" __global__ + void LIFNode_fptt_decayInputFalse_hardReset_fp32(const float* x_seq, float* v_v_seq, float* h_seq, float* spike_seq, + const float & reciprocal_tau, + const float & v_threshold, const float & v_reset, + const int & neuron_num, const int & numel) + + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index < neuron_num) + { + const int dt = neuron_num; + for(int mem_offset = 0; mem_offset < numel; mem_offset += neuron_num) + { + const int t = index + mem_offset; + + h_seq[t] = v_v_seq[t] - reciprocal_tau * (v_v_seq[t] - v_reset) + x_seq[t]; + + if (h_seq[t] >= v_threshold) + { + spike_seq[t] = 1.0f; + v_v_seq[t + dt] = v_reset; + } + + else + { + spike_seq[t] = 0.0f; + v_v_seq[t + dt] = h_seq[t]; + } + + } + } + } + +// MultiStepLIFNodePTT bptt Sigmoid, decay_input=False, hard_reset=True, dtype=fp32, detach_reset=True + + extern "C" __global__ + void LIFNode_bptt_decayInputFalse_hardReset_detachReset_fp32( + const float* grad_spike_seq, const float* grad_v_seq, const float* h_seq, const float* spike_seq, + float* grad_x_seq, float* grad_v_last, + const float & reciprocal_tau, const float & one_sub_reciprocal_tau, + const float & v_threshold, const float & v_reset, + const int & neuron_num, const int & numel) + + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index < neuron_num) + { + float grad_h = 0.0f; // grad_h will be used recursively + for(int mem_offset = numel - neuron_num; mem_offset >= 0; mem_offset -= neuron_num) + { + const int t = index + mem_offset; + const float over_th = h_seq[t] - v_threshold; + + // start: spikingjelly.clock_driven.surrogate.Sigmoid.cuda_code + + const float sg_Sigmoid_sigmoid_ax = 1.0f / (1.0f + expf(- 4.0f * over_th)); + const float grad_s_to_h = (1.0f - sg_Sigmoid_sigmoid_ax) * sg_Sigmoid_sigmoid_ax * 4.0f; + + // end: spikingjelly.clock_driven.surrogate.Sigmoid.cuda_code + + const float grad_v_to_h = 1.0f - spike_seq[t]; + + grad_h = grad_spike_seq[t] * grad_s_to_h + (grad_v_seq[t] + grad_h * one_sub_reciprocal_tau) * grad_v_to_h; + // grad_h = fmaf(grad_spike_seq[t], grad_s_to_h, fmaf(grad_h, one_sub_reciprocal_tau, grad_v_seq[t]) * grad_v_to_h); + + grad_x_seq[t] = grad_h; + + } + grad_v_last[index] = grad_h * one_sub_reciprocal_tau; + } + } + +// MultiStepLIFNodePTT bptt Sigmoid, decay_input=False, hard_reset=True, dtype=fp32, 
detach_reset=False + + extern "C" __global__ + void LIFNode_bptt_decayInputFalse_hardReset__fp32( + const float* grad_spike_seq, const float* grad_v_seq, const float* h_seq, const float* spike_seq, + float* grad_x_seq, float* grad_v_last, + const float & reciprocal_tau, const float & one_sub_reciprocal_tau, + const float & v_threshold, const float & v_reset, + const int & neuron_num, const int & numel) + + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index < neuron_num) + { + float grad_h = 0.0f; // grad_h will be used recursively + for(int mem_offset = numel - neuron_num; mem_offset >= 0; mem_offset -= neuron_num) + { + const int t = index + mem_offset; + const float over_th = h_seq[t] - v_threshold; + + // start: spikingjelly.clock_driven.surrogate.Sigmoid.cuda_code + + const float sg_Sigmoid_sigmoid_ax = 1.0f / (1.0f + expf(- 4.0f * over_th)); + const float grad_s_to_h = (1.0f - sg_Sigmoid_sigmoid_ax) * sg_Sigmoid_sigmoid_ax * 4.0f; + + // end: spikingjelly.clock_driven.surrogate.Sigmoid.cuda_code + + const float grad_v_to_h = 1.0f - spike_seq[t] + (v_reset - h_seq[t]) * grad_s_to_h; + // const float grad_v_to_h = fmaf(v_reset - h_seq[t], grad_s_to_h, 1.0f - spike_seq[t]); + + grad_h = grad_spike_seq[t] * grad_s_to_h + (grad_v_seq[t] + grad_h * one_sub_reciprocal_tau) * grad_v_to_h; + // grad_h = fmaf(grad_spike_seq[t], grad_s_to_h, fmaf(grad_h, one_sub_reciprocal_tau, grad_v_seq[t]) * grad_v_to_h); + + grad_x_seq[t] = grad_h; + + } + grad_v_last[index] = grad_h * one_sub_reciprocal_tau; + } + } + +// MultiStepLIFNodePTT fptt Sigmoid, decay_input=True, hard_reset=True, dtype=fp16 + + #include + extern "C" __global__ + void LIFNode_fptt_decayInputTrue_hardReset_fp16(const half2* x_seq, half2* v_v_seq, half2* h_seq, half2* spike_seq, + const half & reciprocal_tau, + const half & v_threshold, const half & v_reset, + const int & neuron_num, const int & numel) + + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + const int stride = neuron_num >> 1; + if (index < stride) + { + const int numel_2 = numel >> 1; + const half2 reciprocal_tau_half2 = __half2half2(reciprocal_tau); + const half2 v_threshold_half2 = __half2half2(v_threshold); + + const half2 v_reset_half2 = __half2half2(v_reset); + + for(int mem_offset = 0; mem_offset < numel_2; mem_offset += stride) + { + const int t = index + mem_offset; + + h_seq[t] = __hfma2(__hadd2(__hsub2(x_seq[t], v_v_seq[t]), v_reset_half2), reciprocal_tau_half2, v_v_seq[t]); + + spike_seq[t] = __hgeu2(h_seq[t], v_threshold_half2); + v_v_seq[t + stride] = __hadd2(__hmul2(spike_seq[t], v_reset_half2), __hmul2(__hsub2(__float2half2_rn(1.0f), spike_seq[t]), h_seq[t])); + + } + } + } + +// MultiStepLIFNodePTT bptt Sigmoid, decay_input=True, hard_reset=True, dtype=fp16, detach_reset=True + + #include + extern "C" __global__ + void LIFNode_bptt_decayInputTrue_hardReset_detachReset_fp16( + const half2* grad_spike_seq, const half2* grad_v_seq, const half2* h_seq, const half2* spike_seq, + half2* grad_x_seq, half2* grad_v_last, + const half & reciprocal_tau, const half & one_sub_reciprocal_tau, + const half & v_threshold, const half & v_reset, + const int & neuron_num, const int & numel) + + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + const int stride = neuron_num >> 1; + if (index < stride) + { + const half2 reciprocal_tau_half2 = __half2half2(reciprocal_tau); + const half2 one_sub_reciprocal_tau_half2 = __half2half2(one_sub_reciprocal_tau); + const half2 v_threshold_half2 = __half2half2(v_threshold); + + const 
half2 v_reset_half2 = __half2half2(v_reset); + + half2 grad_h = __float2half2_rn(0.0f); // grad_h will be used recursively + for(int mem_offset = (numel >> 1) - stride; mem_offset >= 0; mem_offset -= stride) + { + const int t = index + mem_offset; + + const half2 over_th = __hsub2(h_seq[t], v_threshold_half2); + + // start: spikingjelly.clock_driven.surrogate.Sigmoid.cuda_code + + const half2 sg_Sigmoid_alpha = __float2half2_rn(4.0f); + const half2 sg_Sigmoid_sigmoid_ax = __h2div(__float2half2_rn(1.0f), __hadd2(h2exp(__hneg2(__hmul2(sg_Sigmoid_alpha, over_th))), __float2half2_rn(1.0f))); + const half2 grad_s_to_h = __hmul2(__hmul2(__hsub2(__float2half2_rn(1.0f), sg_Sigmoid_sigmoid_ax), sg_Sigmoid_sigmoid_ax), sg_Sigmoid_alpha); + + // end: spikingjelly.clock_driven.surrogate.Sigmoid.cuda_code + + const half2 grad_v_to_h = __hsub2(__float2half2_rn(1.0f), spike_seq[t]); + + grad_h = __hfma2(__hfma2(grad_h, one_sub_reciprocal_tau_half2, grad_v_seq[t]), grad_v_to_h, __hmul2(grad_spike_seq[t], grad_s_to_h)); + + grad_x_seq[t] = __hmul2(grad_h, reciprocal_tau_half2); + + } + grad_v_last[index] = __hmul2(grad_h, one_sub_reciprocal_tau_half2); + } + } + +// MultiStepLIFNodePTT bptt Sigmoid, decay_input=True, hard_reset=True, dtype=fp16, detach_reset=False + + #include + extern "C" __global__ + void LIFNode_bptt_decayInputTrue_hardReset__fp16( + const half2* grad_spike_seq, const half2* grad_v_seq, const half2* h_seq, const half2* spike_seq, + half2* grad_x_seq, half2* grad_v_last, + const half & reciprocal_tau, const half & one_sub_reciprocal_tau, + const half & v_threshold, const half & v_reset, + const int & neuron_num, const int & numel) + + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + const int stride = neuron_num >> 1; + if (index < stride) + { + const half2 reciprocal_tau_half2 = __half2half2(reciprocal_tau); + const half2 one_sub_reciprocal_tau_half2 = __half2half2(one_sub_reciprocal_tau); + const half2 v_threshold_half2 = __half2half2(v_threshold); + + const half2 v_reset_half2 = __half2half2(v_reset); + + half2 grad_h = __float2half2_rn(0.0f); // grad_h will be used recursively + for(int mem_offset = (numel >> 1) - stride; mem_offset >= 0; mem_offset -= stride) + { + const int t = index + mem_offset; + + const half2 over_th = __hsub2(h_seq[t], v_threshold_half2); + + // start: spikingjelly.clock_driven.surrogate.Sigmoid.cuda_code + + const half2 sg_Sigmoid_alpha = __float2half2_rn(4.0f); + const half2 sg_Sigmoid_sigmoid_ax = __h2div(__float2half2_rn(1.0f), __hadd2(h2exp(__hneg2(__hmul2(sg_Sigmoid_alpha, over_th))), __float2half2_rn(1.0f))); + const half2 grad_s_to_h = __hmul2(__hmul2(__hsub2(__float2half2_rn(1.0f), sg_Sigmoid_sigmoid_ax), sg_Sigmoid_sigmoid_ax), sg_Sigmoid_alpha); + + // end: spikingjelly.clock_driven.surrogate.Sigmoid.cuda_code + + const half2 grad_v_to_h = __hfma2(__hsub2(v_reset_half2, h_seq[t]), grad_s_to_h, __hsub2(__float2half2_rn(1.0f), spike_seq[t])); + + grad_h = __hfma2(__hfma2(grad_h, one_sub_reciprocal_tau_half2, grad_v_seq[t]), grad_v_to_h, __hmul2(grad_spike_seq[t], grad_s_to_h)); + + grad_x_seq[t] = __hmul2(grad_h, reciprocal_tau_half2); + + } + grad_v_last[index] = __hmul2(grad_h, one_sub_reciprocal_tau_half2); + } + } + +// MultiStepLIFNodePTT fptt Sigmoid, decay_input=False, hard_reset=True, dtype=fp16 + + #include + extern "C" __global__ + void LIFNode_fptt_decayInputFalse_hardReset_fp16(const half2* x_seq, half2* v_v_seq, half2* h_seq, half2* spike_seq, + const half & reciprocal_tau, + const half & v_threshold, const half & v_reset, + 
const int & neuron_num, const int & numel) + + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + const int stride = neuron_num >> 1; + if (index < stride) + { + const int numel_2 = numel >> 1; + const half2 reciprocal_tau_half2 = __half2half2(reciprocal_tau); + const half2 v_threshold_half2 = __half2half2(v_threshold); + + const half2 v_reset_half2 = __half2half2(v_reset); + + for(int mem_offset = 0; mem_offset < numel_2; mem_offset += stride) + { + const int t = index + mem_offset; + + // h_seq[t] = v_v_seq[t] - reciprocal_tau * (v_v_seq[t] - v_reset) + x_seq[t]; + // = reciprocal_tau * (v_reset - v_v_seq[t]) + v_v_seq[t] + x_seq[t]; + h_seq[t] = __hadd2(__hfma2(__hsub2(v_reset_half2, v_v_seq[t]), reciprocal_tau_half2, v_v_seq[t]), x_seq[t]); + + spike_seq[t] = __hgeu2(h_seq[t], v_threshold_half2); + v_v_seq[t + stride] = __hadd2(__hmul2(spike_seq[t], v_reset_half2), __hmul2(__hsub2(__float2half2_rn(1.0f), spike_seq[t]), h_seq[t])); + + } + } + } + +// MultiStepLIFNodePTT bptt Sigmoid, decay_input=False, hard_reset=True, dtype=fp16, detach_reset=True + + #include + extern "C" __global__ + void LIFNode_bptt_decayInputFalse_hardReset_detachReset_fp16( + const half2* grad_spike_seq, const half2* grad_v_seq, const half2* h_seq, const half2* spike_seq, + half2* grad_x_seq, half2* grad_v_last, + const half & reciprocal_tau, const half & one_sub_reciprocal_tau, + const half & v_threshold, const half & v_reset, + const int & neuron_num, const int & numel) + + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + const int stride = neuron_num >> 1; + if (index < stride) + { + const half2 reciprocal_tau_half2 = __half2half2(reciprocal_tau); + const half2 one_sub_reciprocal_tau_half2 = __half2half2(one_sub_reciprocal_tau); + const half2 v_threshold_half2 = __half2half2(v_threshold); + + const half2 v_reset_half2 = __half2half2(v_reset); + + half2 grad_h = __float2half2_rn(0.0f); // grad_h will be used recursively + for(int mem_offset = (numel >> 1) - stride; mem_offset >= 0; mem_offset -= stride) + { + const int t = index + mem_offset; + + const half2 over_th = __hsub2(h_seq[t], v_threshold_half2); + + // start: spikingjelly.clock_driven.surrogate.Sigmoid.cuda_code + + const half2 sg_Sigmoid_alpha = __float2half2_rn(4.0f); + const half2 sg_Sigmoid_sigmoid_ax = __h2div(__float2half2_rn(1.0f), __hadd2(h2exp(__hneg2(__hmul2(sg_Sigmoid_alpha, over_th))), __float2half2_rn(1.0f))); + const half2 grad_s_to_h = __hmul2(__hmul2(__hsub2(__float2half2_rn(1.0f), sg_Sigmoid_sigmoid_ax), sg_Sigmoid_sigmoid_ax), sg_Sigmoid_alpha); + + // end: spikingjelly.clock_driven.surrogate.Sigmoid.cuda_code + + const half2 grad_v_to_h = __hsub2(__float2half2_rn(1.0f), spike_seq[t]); + + grad_h = __hfma2(__hfma2(grad_h, one_sub_reciprocal_tau_half2, grad_v_seq[t]), grad_v_to_h, __hmul2(grad_spike_seq[t], grad_s_to_h)); + + grad_x_seq[t] = grad_h; + + } + grad_v_last[index] = __hmul2(grad_h, one_sub_reciprocal_tau_half2); + } + } + +// MultiStepLIFNodePTT bptt Sigmoid, decay_input=False, hard_reset=True, dtype=fp16, detach_reset=False + + #include + extern "C" __global__ + void LIFNode_bptt_decayInputFalse_hardReset__fp16( + const half2* grad_spike_seq, const half2* grad_v_seq, const half2* h_seq, const half2* spike_seq, + half2* grad_x_seq, half2* grad_v_last, + const half & reciprocal_tau, const half & one_sub_reciprocal_tau, + const half & v_threshold, const half & v_reset, + const int & neuron_num, const int & numel) + + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + const int stride = 
neuron_num >> 1; + if (index < stride) + { + const half2 reciprocal_tau_half2 = __half2half2(reciprocal_tau); + const half2 one_sub_reciprocal_tau_half2 = __half2half2(one_sub_reciprocal_tau); + const half2 v_threshold_half2 = __half2half2(v_threshold); + + const half2 v_reset_half2 = __half2half2(v_reset); + + half2 grad_h = __float2half2_rn(0.0f); // grad_h will be used recursively + for(int mem_offset = (numel >> 1) - stride; mem_offset >= 0; mem_offset -= stride) + { + const int t = index + mem_offset; + + const half2 over_th = __hsub2(h_seq[t], v_threshold_half2); + + // start: spikingjelly.clock_driven.surrogate.Sigmoid.cuda_code + + const half2 sg_Sigmoid_alpha = __float2half2_rn(4.0f); + const half2 sg_Sigmoid_sigmoid_ax = __h2div(__float2half2_rn(1.0f), __hadd2(h2exp(__hneg2(__hmul2(sg_Sigmoid_alpha, over_th))), __float2half2_rn(1.0f))); + const half2 grad_s_to_h = __hmul2(__hmul2(__hsub2(__float2half2_rn(1.0f), sg_Sigmoid_sigmoid_ax), sg_Sigmoid_sigmoid_ax), sg_Sigmoid_alpha); + + // end: spikingjelly.clock_driven.surrogate.Sigmoid.cuda_code + + const half2 grad_v_to_h = __hfma2(__hsub2(v_reset_half2, h_seq[t]), grad_s_to_h, __hsub2(__float2half2_rn(1.0f), spike_seq[t])); + + grad_h = __hfma2(__hfma2(grad_h, one_sub_reciprocal_tau_half2, grad_v_seq[t]), grad_v_to_h, __hmul2(grad_spike_seq[t], grad_s_to_h)); + + grad_x_seq[t] = grad_h; + + } + grad_v_last[index] = __hmul2(grad_h, one_sub_reciprocal_tau_half2); + } + } + +// MultiStepLIFNodePTT fptt Sigmoid, decay_input=True, hard_reset=False, dtype=fp32 + + extern "C" __global__ + void LIFNode_fptt_decayInputTrue_softReset_fp32(const float* x_seq, float* v_v_seq, float* h_seq, float* spike_seq, + const float & reciprocal_tau, + const float & v_threshold, + const int & neuron_num, const int & numel) + + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index < neuron_num) + { + const int dt = neuron_num; + for(int mem_offset = 0; mem_offset < numel; mem_offset += neuron_num) + { + const int t = index + mem_offset; + + h_seq[t] = v_v_seq[t] + reciprocal_tau * (x_seq[t] - v_v_seq[t]); + + if (h_seq[t] >= v_threshold) + { + spike_seq[t] = 1.0f; + v_v_seq[t + dt] = h_seq[t] - v_threshold; + } + + else + { + spike_seq[t] = 0.0f; + v_v_seq[t + dt] = h_seq[t]; + } + + } + } + } + +// MultiStepLIFNodePTT bptt Sigmoid, decay_input=True, hard_reset=False, dtype=fp32, detach_reset=True + + extern "C" __global__ + void LIFNode_bptt_decayInputTrue_softReset_detachReset_fp32( + const float* grad_spike_seq, const float* grad_v_seq, const float* h_seq, const float* spike_seq, + float* grad_x_seq, float* grad_v_last, + const float & reciprocal_tau, const float & one_sub_reciprocal_tau, + const float & v_threshold, + const int & neuron_num, const int & numel) + + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index < neuron_num) + { + float grad_h = 0.0f; // grad_h will be used recursively + for(int mem_offset = numel - neuron_num; mem_offset >= 0; mem_offset -= neuron_num) + { + const int t = index + mem_offset; + const float over_th = h_seq[t] - v_threshold; + + // start: spikingjelly.clock_driven.surrogate.Sigmoid.cuda_code + + const float sg_Sigmoid_sigmoid_ax = 1.0f / (1.0f + expf(- 4.0f * over_th)); + const float grad_s_to_h = (1.0f - sg_Sigmoid_sigmoid_ax) * sg_Sigmoid_sigmoid_ax * 4.0f; + + // end: spikingjelly.clock_driven.surrogate.Sigmoid.cuda_code + + const float grad_v_to_h = 1.0f; + + grad_h = grad_spike_seq[t] * grad_s_to_h + (grad_v_seq[t] + grad_h * one_sub_reciprocal_tau) * grad_v_to_h; + // 
grad_h = fmaf(grad_spike_seq[t], grad_s_to_h, fmaf(grad_h, one_sub_reciprocal_tau, grad_v_seq[t]) * grad_v_to_h); + + grad_x_seq[t] = grad_h * reciprocal_tau; + + } + grad_v_last[index] = grad_h * one_sub_reciprocal_tau; + } + } + +// MultiStepLIFNodePTT bptt Sigmoid, decay_input=True, hard_reset=False, dtype=fp32, detach_reset=False + + extern "C" __global__ + void LIFNode_bptt_decayInputTrue_softReset__fp32( + const float* grad_spike_seq, const float* grad_v_seq, const float* h_seq, const float* spike_seq, + float* grad_x_seq, float* grad_v_last, + const float & reciprocal_tau, const float & one_sub_reciprocal_tau, + const float & v_threshold, + const int & neuron_num, const int & numel) + + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index < neuron_num) + { + float grad_h = 0.0f; // grad_h will be used recursively + for(int mem_offset = numel - neuron_num; mem_offset >= 0; mem_offset -= neuron_num) + { + const int t = index + mem_offset; + const float over_th = h_seq[t] - v_threshold; + + // start: spikingjelly.clock_driven.surrogate.Sigmoid.cuda_code + + const float sg_Sigmoid_sigmoid_ax = 1.0f / (1.0f + expf(- 4.0f * over_th)); + const float grad_s_to_h = (1.0f - sg_Sigmoid_sigmoid_ax) * sg_Sigmoid_sigmoid_ax * 4.0f; + + // end: spikingjelly.clock_driven.surrogate.Sigmoid.cuda_code + + const float grad_v_to_h = 1.0f - v_threshold * grad_s_to_h; + // const float grad_v_to_h = fmaf(-grad_s_to_h, v_threshold, 1.0f); + + grad_h = grad_spike_seq[t] * grad_s_to_h + (grad_v_seq[t] + grad_h * one_sub_reciprocal_tau) * grad_v_to_h; + // grad_h = fmaf(grad_spike_seq[t], grad_s_to_h, fmaf(grad_h, one_sub_reciprocal_tau, grad_v_seq[t]) * grad_v_to_h); + + grad_x_seq[t] = grad_h * reciprocal_tau; + + } + grad_v_last[index] = grad_h * one_sub_reciprocal_tau; + } + } + +// MultiStepLIFNodePTT fptt Sigmoid, decay_input=False, hard_reset=False, dtype=fp32 + + extern "C" __global__ + void LIFNode_fptt_decayInputFalse_softReset_fp32(const float* x_seq, float* v_v_seq, float* h_seq, float* spike_seq, + const float & reciprocal_tau, + const float & v_threshold, + const int & neuron_num, const int & numel) + + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index < neuron_num) + { + const int dt = neuron_num; + for(int mem_offset = 0; mem_offset < numel; mem_offset += neuron_num) + { + const int t = index + mem_offset; + + h_seq[t] = v_v_seq[t] * (1.0f - reciprocal_tau) + x_seq[t]; + + if (h_seq[t] >= v_threshold) + { + spike_seq[t] = 1.0f; + v_v_seq[t + dt] = h_seq[t] - v_threshold; + } + + else + { + spike_seq[t] = 0.0f; + v_v_seq[t + dt] = h_seq[t]; + } + + } + } + } + +// MultiStepLIFNodePTT bptt Sigmoid, decay_input=False, hard_reset=False, dtype=fp32, detach_reset=True + + extern "C" __global__ + void LIFNode_bptt_decayInputFalse_softReset_detachReset_fp32( + const float* grad_spike_seq, const float* grad_v_seq, const float* h_seq, const float* spike_seq, + float* grad_x_seq, float* grad_v_last, + const float & reciprocal_tau, const float & one_sub_reciprocal_tau, + const float & v_threshold, + const int & neuron_num, const int & numel) + + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index < neuron_num) + { + float grad_h = 0.0f; // grad_h will be used recursively + for(int mem_offset = numel - neuron_num; mem_offset >= 0; mem_offset -= neuron_num) + { + const int t = index + mem_offset; + const float over_th = h_seq[t] - v_threshold; + + // start: spikingjelly.clock_driven.surrogate.Sigmoid.cuda_code + + const float 
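// --------------------------------------------------------------------------------------------
// Note: the "bptt" (backward) kernels walk the time loop backwards and carry one running value
// grad_h per neuron:
//   grad_h[t]   = grad_spike[t] * dS/dH + (grad_v[t] + grad_h[t+1] * (1 - 1/tau)) * dV/dH
//   grad_x[t]   = grad_h[t] * (1/tau)  when decay_input=True,  grad_x[t] = grad_h[t]  otherwise
//   grad_v_last = grad_h[0] * (1 - 1/tau)
// where dS/dH is the surrogate gradient (grad_s_to_h) and dV/dH (grad_v_to_h) depends on the
// reset mode and detach_reset:
//   hard reset, detach_reset=True : dV/dH = 1 - S[t]
//   hard reset, detach_reset=False: dV/dH = 1 - S[t] + (V_reset - H[t]) * dS/dH
//   soft reset, detach_reset=True : dV/dH = 1
//   soft reset, detach_reset=False: dV/dH = 1 - V_th * dS/dH
// --------------------------------------------------------------------------------------------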
sg_Sigmoid_sigmoid_ax = 1.0f / (1.0f + expf(- 4.0f * over_th)); + const float grad_s_to_h = (1.0f - sg_Sigmoid_sigmoid_ax) * sg_Sigmoid_sigmoid_ax * 4.0f; + + // end: spikingjelly.clock_driven.surrogate.Sigmoid.cuda_code + + const float grad_v_to_h = 1.0f; + + grad_h = grad_spike_seq[t] * grad_s_to_h + (grad_v_seq[t] + grad_h * one_sub_reciprocal_tau) * grad_v_to_h; + // grad_h = fmaf(grad_spike_seq[t], grad_s_to_h, fmaf(grad_h, one_sub_reciprocal_tau, grad_v_seq[t]) * grad_v_to_h); + + grad_x_seq[t] = grad_h; + + } + grad_v_last[index] = grad_h * one_sub_reciprocal_tau; + } + } + +// MultiStepLIFNodePTT bptt Sigmoid, decay_input=False, hard_reset=False, dtype=fp32, detach_reset=False + + extern "C" __global__ + void LIFNode_bptt_decayInputFalse_softReset__fp32( + const float* grad_spike_seq, const float* grad_v_seq, const float* h_seq, const float* spike_seq, + float* grad_x_seq, float* grad_v_last, + const float & reciprocal_tau, const float & one_sub_reciprocal_tau, + const float & v_threshold, + const int & neuron_num, const int & numel) + + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index < neuron_num) + { + float grad_h = 0.0f; // grad_h will be used recursively + for(int mem_offset = numel - neuron_num; mem_offset >= 0; mem_offset -= neuron_num) + { + const int t = index + mem_offset; + const float over_th = h_seq[t] - v_threshold; + + // start: spikingjelly.clock_driven.surrogate.Sigmoid.cuda_code + + const float sg_Sigmoid_sigmoid_ax = 1.0f / (1.0f + expf(- 4.0f * over_th)); + const float grad_s_to_h = (1.0f - sg_Sigmoid_sigmoid_ax) * sg_Sigmoid_sigmoid_ax * 4.0f; + + // end: spikingjelly.clock_driven.surrogate.Sigmoid.cuda_code + + const float grad_v_to_h = 1.0f - v_threshold * grad_s_to_h; + // const float grad_v_to_h = fmaf(-grad_s_to_h, v_threshold, 1.0f); + + grad_h = grad_spike_seq[t] * grad_s_to_h + (grad_v_seq[t] + grad_h * one_sub_reciprocal_tau) * grad_v_to_h; + // grad_h = fmaf(grad_spike_seq[t], grad_s_to_h, fmaf(grad_h, one_sub_reciprocal_tau, grad_v_seq[t]) * grad_v_to_h); + + grad_x_seq[t] = grad_h; + + } + grad_v_last[index] = grad_h * one_sub_reciprocal_tau; + } + } + +// MultiStepLIFNodePTT fptt Sigmoid, decay_input=True, hard_reset=False, dtype=fp16 + + #include + extern "C" __global__ + void LIFNode_fptt_decayInputTrue_softReset_fp16(const half2* x_seq, half2* v_v_seq, half2* h_seq, half2* spike_seq, + const half & reciprocal_tau, + const half & v_threshold, + const int & neuron_num, const int & numel) + + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + const int stride = neuron_num >> 1; + if (index < stride) + { + const int numel_2 = numel >> 1; + const half2 reciprocal_tau_half2 = __half2half2(reciprocal_tau); + const half2 v_threshold_half2 = __half2half2(v_threshold); + + for(int mem_offset = 0; mem_offset < numel_2; mem_offset += stride) + { + const int t = index + mem_offset; + + h_seq[t] = __hfma2(__hsub2(x_seq[t], v_v_seq[t]), reciprocal_tau_half2, v_v_seq[t]); + + spike_seq[t] = __hgeu2(h_seq[t], v_threshold_half2); + v_v_seq[t + stride] = __hadd2(__hmul2(spike_seq[t], __hsub2(h_seq[t], v_threshold_half2)), __hmul2(__hsub2(__float2half2_rn(1.0f), spike_seq[t]), h_seq[t])); + + } + } + } + +// MultiStepLIFNodePTT bptt Sigmoid, decay_input=True, hard_reset=False, dtype=fp16, detach_reset=True + + #include + extern "C" __global__ + void LIFNode_bptt_decayInputTrue_softReset_detachReset_fp16( + const half2* grad_spike_seq, const half2* grad_v_seq, const half2* h_seq, const half2* spike_seq, + half2* grad_x_seq, 
half2* grad_v_last, + const half & reciprocal_tau, const half & one_sub_reciprocal_tau, + const half & v_threshold, + const int & neuron_num, const int & numel) + + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + const int stride = neuron_num >> 1; + if (index < stride) + { + const half2 reciprocal_tau_half2 = __half2half2(reciprocal_tau); + const half2 one_sub_reciprocal_tau_half2 = __half2half2(one_sub_reciprocal_tau); + const half2 v_threshold_half2 = __half2half2(v_threshold); + + half2 grad_h = __float2half2_rn(0.0f); // grad_h will be used recursively + for(int mem_offset = (numel >> 1) - stride; mem_offset >= 0; mem_offset -= stride) + { + const int t = index + mem_offset; + + const half2 over_th = __hsub2(h_seq[t], v_threshold_half2); + + // start: spikingjelly.clock_driven.surrogate.Sigmoid.cuda_code + + const half2 sg_Sigmoid_alpha = __float2half2_rn(4.0f); + const half2 sg_Sigmoid_sigmoid_ax = __h2div(__float2half2_rn(1.0f), __hadd2(h2exp(__hneg2(__hmul2(sg_Sigmoid_alpha, over_th))), __float2half2_rn(1.0f))); + const half2 grad_s_to_h = __hmul2(__hmul2(__hsub2(__float2half2_rn(1.0f), sg_Sigmoid_sigmoid_ax), sg_Sigmoid_sigmoid_ax), sg_Sigmoid_alpha); + + // end: spikingjelly.clock_driven.surrogate.Sigmoid.cuda_code + + const half2 grad_v_to_h = __float2half2_rn(1.0f); + + grad_h = __hfma2(__hfma2(grad_h, one_sub_reciprocal_tau_half2, grad_v_seq[t]), grad_v_to_h, __hmul2(grad_spike_seq[t], grad_s_to_h)); + + grad_x_seq[t] = __hmul2(grad_h, reciprocal_tau_half2); + + } + grad_v_last[index] = __hmul2(grad_h, one_sub_reciprocal_tau_half2); + } + } + +// MultiStepLIFNodePTT bptt Sigmoid, decay_input=True, hard_reset=False, dtype=fp16, detach_reset=False + + #include + extern "C" __global__ + void LIFNode_bptt_decayInputTrue_softReset__fp16( + const half2* grad_spike_seq, const half2* grad_v_seq, const half2* h_seq, const half2* spike_seq, + half2* grad_x_seq, half2* grad_v_last, + const half & reciprocal_tau, const half & one_sub_reciprocal_tau, + const half & v_threshold, + const int & neuron_num, const int & numel) + + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + const int stride = neuron_num >> 1; + if (index < stride) + { + const half2 reciprocal_tau_half2 = __half2half2(reciprocal_tau); + const half2 one_sub_reciprocal_tau_half2 = __half2half2(one_sub_reciprocal_tau); + const half2 v_threshold_half2 = __half2half2(v_threshold); + + half2 grad_h = __float2half2_rn(0.0f); // grad_h will be used recursively + for(int mem_offset = (numel >> 1) - stride; mem_offset >= 0; mem_offset -= stride) + { + const int t = index + mem_offset; + + const half2 over_th = __hsub2(h_seq[t], v_threshold_half2); + + // start: spikingjelly.clock_driven.surrogate.Sigmoid.cuda_code + + const half2 sg_Sigmoid_alpha = __float2half2_rn(4.0f); + const half2 sg_Sigmoid_sigmoid_ax = __h2div(__float2half2_rn(1.0f), __hadd2(h2exp(__hneg2(__hmul2(sg_Sigmoid_alpha, over_th))), __float2half2_rn(1.0f))); + const half2 grad_s_to_h = __hmul2(__hmul2(__hsub2(__float2half2_rn(1.0f), sg_Sigmoid_sigmoid_ax), sg_Sigmoid_sigmoid_ax), sg_Sigmoid_alpha); + + // end: spikingjelly.clock_driven.surrogate.Sigmoid.cuda_code + + const half2 grad_v_to_h = __hsub2(__float2half2_rn(1.0f), __hmul2(v_threshold_half2, grad_s_to_h)); + + grad_h = __hfma2(__hfma2(grad_h, one_sub_reciprocal_tau_half2, grad_v_seq[t]), grad_v_to_h, __hmul2(grad_spike_seq[t], grad_s_to_h)); + + grad_x_seq[t] = __hmul2(grad_h, reciprocal_tau_half2); + + } + grad_v_last[index] = __hmul2(grad_h, one_sub_reciprocal_tau_half2); + } + 
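// --------------------------------------------------------------------------------------------
// Note: the Sigmoid surrogate used in these kernels evaluates, with x = H[t] - V_th and
// alpha = 4 in this build:
//   sigma(x) = 1 / (1 + exp(-alpha * x))
//   dS/dH    = alpha * sigma(x) * (1 - sigma(x))
// The fp16 kernels compute the same expression with half2 intrinsics (h2exp, __h2div, __hfma2).
// --------------------------------------------------------------------------------------------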
} + +// MultiStepLIFNodePTT fptt Sigmoid, decay_input=False, hard_reset=False, dtype=fp16 + + #include + extern "C" __global__ + void LIFNode_fptt_decayInputFalse_softReset_fp16(const half2* x_seq, half2* v_v_seq, half2* h_seq, half2* spike_seq, + const half & reciprocal_tau, + const half & v_threshold, + const int & neuron_num, const int & numel) + + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + const int stride = neuron_num >> 1; + if (index < stride) + { + const int numel_2 = numel >> 1; + const half2 reciprocal_tau_half2 = __half2half2(reciprocal_tau); + const half2 v_threshold_half2 = __half2half2(v_threshold); + + for(int mem_offset = 0; mem_offset < numel_2; mem_offset += stride) + { + const int t = index + mem_offset; + + // h_seq[t] = v_v_seq[t] * (1.0f - reciprocal_tau) + x_seq[t]; + h_seq[t] = __hfma2(__hsub2(__float2half2_rn(1.0f), reciprocal_tau_half2), v_v_seq[t], x_seq[t]); + + spike_seq[t] = __hgeu2(h_seq[t], v_threshold_half2); + v_v_seq[t + stride] = __hadd2(__hmul2(spike_seq[t], __hsub2(h_seq[t], v_threshold_half2)), __hmul2(__hsub2(__float2half2_rn(1.0f), spike_seq[t]), h_seq[t])); + + } + } + } + +// MultiStepLIFNodePTT bptt Sigmoid, decay_input=False, hard_reset=False, dtype=fp16, detach_reset=True + + #include + extern "C" __global__ + void LIFNode_bptt_decayInputFalse_softReset_detachReset_fp16( + const half2* grad_spike_seq, const half2* grad_v_seq, const half2* h_seq, const half2* spike_seq, + half2* grad_x_seq, half2* grad_v_last, + const half & reciprocal_tau, const half & one_sub_reciprocal_tau, + const half & v_threshold, + const int & neuron_num, const int & numel) + + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + const int stride = neuron_num >> 1; + if (index < stride) + { + const half2 reciprocal_tau_half2 = __half2half2(reciprocal_tau); + const half2 one_sub_reciprocal_tau_half2 = __half2half2(one_sub_reciprocal_tau); + const half2 v_threshold_half2 = __half2half2(v_threshold); + + half2 grad_h = __float2half2_rn(0.0f); // grad_h will be used recursively + for(int mem_offset = (numel >> 1) - stride; mem_offset >= 0; mem_offset -= stride) + { + const int t = index + mem_offset; + + const half2 over_th = __hsub2(h_seq[t], v_threshold_half2); + + // start: spikingjelly.clock_driven.surrogate.Sigmoid.cuda_code + + const half2 sg_Sigmoid_alpha = __float2half2_rn(4.0f); + const half2 sg_Sigmoid_sigmoid_ax = __h2div(__float2half2_rn(1.0f), __hadd2(h2exp(__hneg2(__hmul2(sg_Sigmoid_alpha, over_th))), __float2half2_rn(1.0f))); + const half2 grad_s_to_h = __hmul2(__hmul2(__hsub2(__float2half2_rn(1.0f), sg_Sigmoid_sigmoid_ax), sg_Sigmoid_sigmoid_ax), sg_Sigmoid_alpha); + + // end: spikingjelly.clock_driven.surrogate.Sigmoid.cuda_code + + const half2 grad_v_to_h = __float2half2_rn(1.0f); + + grad_h = __hfma2(__hfma2(grad_h, one_sub_reciprocal_tau_half2, grad_v_seq[t]), grad_v_to_h, __hmul2(grad_spike_seq[t], grad_s_to_h)); + + grad_x_seq[t] = grad_h; + + } + grad_v_last[index] = __hmul2(grad_h, one_sub_reciprocal_tau_half2); + } + } + +// MultiStepLIFNodePTT bptt Sigmoid, decay_input=False, hard_reset=False, dtype=fp16, detach_reset=False + + #include + extern "C" __global__ + void LIFNode_bptt_decayInputFalse_softReset__fp16( + const half2* grad_spike_seq, const half2* grad_v_seq, const half2* h_seq, const half2* spike_seq, + half2* grad_x_seq, half2* grad_v_last, + const half & reciprocal_tau, const half & one_sub_reciprocal_tau, + const half & v_threshold, + const int & neuron_num, const int & numel) + + { + const int index = 
blockIdx.x * blockDim.x + threadIdx.x; + const int stride = neuron_num >> 1; + if (index < stride) + { + const half2 reciprocal_tau_half2 = __half2half2(reciprocal_tau); + const half2 one_sub_reciprocal_tau_half2 = __half2half2(one_sub_reciprocal_tau); + const half2 v_threshold_half2 = __half2half2(v_threshold); + + half2 grad_h = __float2half2_rn(0.0f); // grad_h will be used recursively + for(int mem_offset = (numel >> 1) - stride; mem_offset >= 0; mem_offset -= stride) + { + const int t = index + mem_offset; + + const half2 over_th = __hsub2(h_seq[t], v_threshold_half2); + + // start: spikingjelly.clock_driven.surrogate.Sigmoid.cuda_code + + const half2 sg_Sigmoid_alpha = __float2half2_rn(4.0f); + const half2 sg_Sigmoid_sigmoid_ax = __h2div(__float2half2_rn(1.0f), __hadd2(h2exp(__hneg2(__hmul2(sg_Sigmoid_alpha, over_th))), __float2half2_rn(1.0f))); + const half2 grad_s_to_h = __hmul2(__hmul2(__hsub2(__float2half2_rn(1.0f), sg_Sigmoid_sigmoid_ax), sg_Sigmoid_sigmoid_ax), sg_Sigmoid_alpha); + + // end: spikingjelly.clock_driven.surrogate.Sigmoid.cuda_code + + const half2 grad_v_to_h = __hsub2(__float2half2_rn(1.0f), __hmul2(v_threshold_half2, grad_s_to_h)); + + grad_h = __hfma2(__hfma2(grad_h, one_sub_reciprocal_tau_half2, grad_v_seq[t]), grad_v_to_h, __hmul2(grad_spike_seq[t], grad_s_to_h)); + + grad_x_seq[t] = grad_h; + + } + grad_v_last[index] = __hmul2(grad_h, one_sub_reciprocal_tau_half2); + } + } + +// MultiStepLIFNodePTT fptt PiecewiseLeakyReLU, decay_input=True, hard_reset=True, dtype=fp32 + + extern "C" __global__ + void LIFNode_fptt_decayInputTrue_hardReset_fp32(const float* x_seq, float* v_v_seq, float* h_seq, float* spike_seq, + const float & reciprocal_tau, + const float & v_threshold, const float & v_reset, + const int & neuron_num, const int & numel) + + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index < neuron_num) + { + const int dt = neuron_num; + for(int mem_offset = 0; mem_offset < numel; mem_offset += neuron_num) + { + const int t = index + mem_offset; + + h_seq[t] = v_v_seq[t] + reciprocal_tau * (x_seq[t] - v_v_seq[t] + v_reset); + + if (h_seq[t] >= v_threshold) + { + spike_seq[t] = 1.0f; + v_v_seq[t + dt] = v_reset; + } + + else + { + spike_seq[t] = 0.0f; + v_v_seq[t + dt] = h_seq[t]; + } + + } + } + } + +// MultiStepLIFNodePTT bptt PiecewiseLeakyReLU, decay_input=True, hard_reset=True, dtype=fp32, detach_reset=True + + extern "C" __global__ + void LIFNode_bptt_decayInputTrue_hardReset_detachReset_fp32( + const float* grad_spike_seq, const float* grad_v_seq, const float* h_seq, const float* spike_seq, + float* grad_x_seq, float* grad_v_last, + const float & reciprocal_tau, const float & one_sub_reciprocal_tau, + const float & v_threshold, const float & v_reset, + const int & neuron_num, const int & numel) + + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index < neuron_num) + { + float grad_h = 0.0f; // grad_h will be used recursively + for(int mem_offset = numel - neuron_num; mem_offset >= 0; mem_offset -= neuron_num) + { + const int t = index + mem_offset; + const float over_th = h_seq[t] - v_threshold; + + // start: spikingjelly.clock_driven.surrogate.PiecewiseLeakyReLU.cuda_code + + const float sg_PiecewiseLeakyReLU_x_abs = fabsf(over_th); + float grad_s_to_h; + if (sg_PiecewiseLeakyReLU_x_abs > 1.0f) + { + grad_s_to_h = 0.01f; + } + else + { + grad_s_to_h = 1.0f; + } + + // end: spikingjelly.clock_driven.surrogate.PiecewiseLeakyReLU.cuda_code + + const float grad_v_to_h = 1.0f - spike_seq[t]; + + grad_h = 
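// --------------------------------------------------------------------------------------------
// Note: the PiecewiseLeakyReLU surrogate used in this group has a window of 1 and a leak of 0.01:
//   dS/dH = 1.0   if |H[t] - V_th| <= 1
//         = 0.01  otherwise
// --------------------------------------------------------------------------------------------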
grad_spike_seq[t] * grad_s_to_h + (grad_v_seq[t] + grad_h * one_sub_reciprocal_tau) * grad_v_to_h; + // grad_h = fmaf(grad_spike_seq[t], grad_s_to_h, fmaf(grad_h, one_sub_reciprocal_tau, grad_v_seq[t]) * grad_v_to_h); + + grad_x_seq[t] = grad_h * reciprocal_tau; + + } + grad_v_last[index] = grad_h * one_sub_reciprocal_tau; + } + } + +// MultiStepLIFNodePTT bptt PiecewiseLeakyReLU, decay_input=True, hard_reset=True, dtype=fp32, detach_reset=False + + extern "C" __global__ + void LIFNode_bptt_decayInputTrue_hardReset__fp32( + const float* grad_spike_seq, const float* grad_v_seq, const float* h_seq, const float* spike_seq, + float* grad_x_seq, float* grad_v_last, + const float & reciprocal_tau, const float & one_sub_reciprocal_tau, + const float & v_threshold, const float & v_reset, + const int & neuron_num, const int & numel) + + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index < neuron_num) + { + float grad_h = 0.0f; // grad_h will be used recursively + for(int mem_offset = numel - neuron_num; mem_offset >= 0; mem_offset -= neuron_num) + { + const int t = index + mem_offset; + const float over_th = h_seq[t] - v_threshold; + + // start: spikingjelly.clock_driven.surrogate.PiecewiseLeakyReLU.cuda_code + + const float sg_PiecewiseLeakyReLU_x_abs = fabsf(over_th); + float grad_s_to_h; + if (sg_PiecewiseLeakyReLU_x_abs > 1.0f) + { + grad_s_to_h = 0.01f; + } + else + { + grad_s_to_h = 1.0f; + } + + // end: spikingjelly.clock_driven.surrogate.PiecewiseLeakyReLU.cuda_code + + const float grad_v_to_h = 1.0f - spike_seq[t] + (v_reset - h_seq[t]) * grad_s_to_h; + // const float grad_v_to_h = fmaf(v_reset - h_seq[t], grad_s_to_h, 1.0f - spike_seq[t]); + + grad_h = grad_spike_seq[t] * grad_s_to_h + (grad_v_seq[t] + grad_h * one_sub_reciprocal_tau) * grad_v_to_h; + // grad_h = fmaf(grad_spike_seq[t], grad_s_to_h, fmaf(grad_h, one_sub_reciprocal_tau, grad_v_seq[t]) * grad_v_to_h); + + grad_x_seq[t] = grad_h * reciprocal_tau; + + } + grad_v_last[index] = grad_h * one_sub_reciprocal_tau; + } + } + +// MultiStepLIFNodePTT fptt PiecewiseLeakyReLU, decay_input=False, hard_reset=True, dtype=fp32 + + extern "C" __global__ + void LIFNode_fptt_decayInputFalse_hardReset_fp32(const float* x_seq, float* v_v_seq, float* h_seq, float* spike_seq, + const float & reciprocal_tau, + const float & v_threshold, const float & v_reset, + const int & neuron_num, const int & numel) + + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index < neuron_num) + { + const int dt = neuron_num; + for(int mem_offset = 0; mem_offset < numel; mem_offset += neuron_num) + { + const int t = index + mem_offset; + + h_seq[t] = v_v_seq[t] - reciprocal_tau * (v_v_seq[t] - v_reset) + x_seq[t]; + + if (h_seq[t] >= v_threshold) + { + spike_seq[t] = 1.0f; + v_v_seq[t + dt] = v_reset; + } + + else + { + spike_seq[t] = 0.0f; + v_v_seq[t + dt] = h_seq[t]; + } + + } + } + } + +// MultiStepLIFNodePTT bptt PiecewiseLeakyReLU, decay_input=False, hard_reset=True, dtype=fp32, detach_reset=True + + extern "C" __global__ + void LIFNode_bptt_decayInputFalse_hardReset_detachReset_fp32( + const float* grad_spike_seq, const float* grad_v_seq, const float* h_seq, const float* spike_seq, + float* grad_x_seq, float* grad_v_last, + const float & reciprocal_tau, const float & one_sub_reciprocal_tau, + const float & v_threshold, const float & v_reset, + const int & neuron_num, const int & numel) + + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index < neuron_num) + { + float grad_h = 0.0f; // grad_h will be 
used recursively + for(int mem_offset = numel - neuron_num; mem_offset >= 0; mem_offset -= neuron_num) + { + const int t = index + mem_offset; + const float over_th = h_seq[t] - v_threshold; + + // start: spikingjelly.clock_driven.surrogate.PiecewiseLeakyReLU.cuda_code + + const float sg_PiecewiseLeakyReLU_x_abs = fabsf(over_th); + float grad_s_to_h; + if (sg_PiecewiseLeakyReLU_x_abs > 1.0f) + { + grad_s_to_h = 0.01f; + } + else + { + grad_s_to_h = 1.0f; + } + + // end: spikingjelly.clock_driven.surrogate.PiecewiseLeakyReLU.cuda_code + + const float grad_v_to_h = 1.0f - spike_seq[t]; + + grad_h = grad_spike_seq[t] * grad_s_to_h + (grad_v_seq[t] + grad_h * one_sub_reciprocal_tau) * grad_v_to_h; + // grad_h = fmaf(grad_spike_seq[t], grad_s_to_h, fmaf(grad_h, one_sub_reciprocal_tau, grad_v_seq[t]) * grad_v_to_h); + + grad_x_seq[t] = grad_h; + + } + grad_v_last[index] = grad_h * one_sub_reciprocal_tau; + } + } + +// MultiStepLIFNodePTT bptt PiecewiseLeakyReLU, decay_input=False, hard_reset=True, dtype=fp32, detach_reset=False + + extern "C" __global__ + void LIFNode_bptt_decayInputFalse_hardReset__fp32( + const float* grad_spike_seq, const float* grad_v_seq, const float* h_seq, const float* spike_seq, + float* grad_x_seq, float* grad_v_last, + const float & reciprocal_tau, const float & one_sub_reciprocal_tau, + const float & v_threshold, const float & v_reset, + const int & neuron_num, const int & numel) + + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index < neuron_num) + { + float grad_h = 0.0f; // grad_h will be used recursively + for(int mem_offset = numel - neuron_num; mem_offset >= 0; mem_offset -= neuron_num) + { + const int t = index + mem_offset; + const float over_th = h_seq[t] - v_threshold; + + // start: spikingjelly.clock_driven.surrogate.PiecewiseLeakyReLU.cuda_code + + const float sg_PiecewiseLeakyReLU_x_abs = fabsf(over_th); + float grad_s_to_h; + if (sg_PiecewiseLeakyReLU_x_abs > 1.0f) + { + grad_s_to_h = 0.01f; + } + else + { + grad_s_to_h = 1.0f; + } + + // end: spikingjelly.clock_driven.surrogate.PiecewiseLeakyReLU.cuda_code + + const float grad_v_to_h = 1.0f - spike_seq[t] + (v_reset - h_seq[t]) * grad_s_to_h; + // const float grad_v_to_h = fmaf(v_reset - h_seq[t], grad_s_to_h, 1.0f - spike_seq[t]); + + grad_h = grad_spike_seq[t] * grad_s_to_h + (grad_v_seq[t] + grad_h * one_sub_reciprocal_tau) * grad_v_to_h; + // grad_h = fmaf(grad_spike_seq[t], grad_s_to_h, fmaf(grad_h, one_sub_reciprocal_tau, grad_v_seq[t]) * grad_v_to_h); + + grad_x_seq[t] = grad_h; + + } + grad_v_last[index] = grad_h * one_sub_reciprocal_tau; + } + } + +// MultiStepLIFNodePTT fptt PiecewiseLeakyReLU, decay_input=True, hard_reset=True, dtype=fp16 + + #include + extern "C" __global__ + void LIFNode_fptt_decayInputTrue_hardReset_fp16(const half2* x_seq, half2* v_v_seq, half2* h_seq, half2* spike_seq, + const half & reciprocal_tau, + const half & v_threshold, const half & v_reset, + const int & neuron_num, const int & numel) + + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + const int stride = neuron_num >> 1; + if (index < stride) + { + const int numel_2 = numel >> 1; + const half2 reciprocal_tau_half2 = __half2half2(reciprocal_tau); + const half2 v_threshold_half2 = __half2half2(v_threshold); + + const half2 v_reset_half2 = __half2half2(v_reset); + + for(int mem_offset = 0; mem_offset < numel_2; mem_offset += stride) + { + const int t = index + mem_offset; + + h_seq[t] = __hfma2(__hadd2(__hsub2(x_seq[t], v_v_seq[t]), v_reset_half2), reciprocal_tau_half2, 
v_v_seq[t]); + + spike_seq[t] = __hgeu2(h_seq[t], v_threshold_half2); + v_v_seq[t + stride] = __hadd2(__hmul2(spike_seq[t], v_reset_half2), __hmul2(__hsub2(__float2half2_rn(1.0f), spike_seq[t]), h_seq[t])); + + } + } + } + +// MultiStepLIFNodePTT bptt PiecewiseLeakyReLU, decay_input=True, hard_reset=True, dtype=fp16, detach_reset=True + + #include + extern "C" __global__ + void LIFNode_bptt_decayInputTrue_hardReset_detachReset_fp16( + const half2* grad_spike_seq, const half2* grad_v_seq, const half2* h_seq, const half2* spike_seq, + half2* grad_x_seq, half2* grad_v_last, + const half & reciprocal_tau, const half & one_sub_reciprocal_tau, + const half & v_threshold, const half & v_reset, + const int & neuron_num, const int & numel) + + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + const int stride = neuron_num >> 1; + if (index < stride) + { + const half2 reciprocal_tau_half2 = __half2half2(reciprocal_tau); + const half2 one_sub_reciprocal_tau_half2 = __half2half2(one_sub_reciprocal_tau); + const half2 v_threshold_half2 = __half2half2(v_threshold); + + const half2 v_reset_half2 = __half2half2(v_reset); + + half2 grad_h = __float2half2_rn(0.0f); // grad_h will be used recursively + for(int mem_offset = (numel >> 1) - stride; mem_offset >= 0; mem_offset -= stride) + { + const int t = index + mem_offset; + + const half2 over_th = __hsub2(h_seq[t], v_threshold_half2); + + // start: spikingjelly.clock_driven.surrogate.PiecewiseLeakyReLU.cuda_code + + const half2 sg_PiecewiseLeakyReLU_x_abs = __habs2(over_th); + const half2 sg_PiecewiseLeakyReLU_x_abs_ge_w = __hge2(sg_PiecewiseLeakyReLU_x_abs, __float2half2_rn(1.0f)); + half2 grad_s_to_h = __hadd2(__hmul2(__float2half2_rn(0.01f), sg_PiecewiseLeakyReLU_x_abs_ge_w), __hmul2(__hsub2(__float2half2_rn(1.0f), sg_PiecewiseLeakyReLU_x_abs_ge_w), __float2half2_rn(1.0f))); + + // end: spikingjelly.clock_driven.surrogate.PiecewiseLeakyReLU.cuda_code + + const half2 grad_v_to_h = __hsub2(__float2half2_rn(1.0f), spike_seq[t]); + + grad_h = __hfma2(__hfma2(grad_h, one_sub_reciprocal_tau_half2, grad_v_seq[t]), grad_v_to_h, __hmul2(grad_spike_seq[t], grad_s_to_h)); + + grad_x_seq[t] = __hmul2(grad_h, reciprocal_tau_half2); + + } + grad_v_last[index] = __hmul2(grad_h, one_sub_reciprocal_tau_half2); + } + } + +// MultiStepLIFNodePTT bptt PiecewiseLeakyReLU, decay_input=True, hard_reset=True, dtype=fp16, detach_reset=False + + #include + extern "C" __global__ + void LIFNode_bptt_decayInputTrue_hardReset__fp16( + const half2* grad_spike_seq, const half2* grad_v_seq, const half2* h_seq, const half2* spike_seq, + half2* grad_x_seq, half2* grad_v_last, + const half & reciprocal_tau, const half & one_sub_reciprocal_tau, + const half & v_threshold, const half & v_reset, + const int & neuron_num, const int & numel) + + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + const int stride = neuron_num >> 1; + if (index < stride) + { + const half2 reciprocal_tau_half2 = __half2half2(reciprocal_tau); + const half2 one_sub_reciprocal_tau_half2 = __half2half2(one_sub_reciprocal_tau); + const half2 v_threshold_half2 = __half2half2(v_threshold); + + const half2 v_reset_half2 = __half2half2(v_reset); + + half2 grad_h = __float2half2_rn(0.0f); // grad_h will be used recursively + for(int mem_offset = (numel >> 1) - stride; mem_offset >= 0; mem_offset -= stride) + { + const int t = index + mem_offset; + + const half2 over_th = __hsub2(h_seq[t], v_threshold_half2); + + // start: spikingjelly.clock_driven.surrogate.PiecewiseLeakyReLU.cuda_code + + const 
half2 sg_PiecewiseLeakyReLU_x_abs = __habs2(over_th); + const half2 sg_PiecewiseLeakyReLU_x_abs_ge_w = __hge2(sg_PiecewiseLeakyReLU_x_abs, __float2half2_rn(1.0f)); + half2 grad_s_to_h = __hadd2(__hmul2(__float2half2_rn(0.01f), sg_PiecewiseLeakyReLU_x_abs_ge_w), __hmul2(__hsub2(__float2half2_rn(1.0f), sg_PiecewiseLeakyReLU_x_abs_ge_w), __float2half2_rn(1.0f))); + + // end: spikingjelly.clock_driven.surrogate.PiecewiseLeakyReLU.cuda_code + + const half2 grad_v_to_h = __hfma2(__hsub2(v_reset_half2, h_seq[t]), grad_s_to_h, __hsub2(__float2half2_rn(1.0f), spike_seq[t])); + + grad_h = __hfma2(__hfma2(grad_h, one_sub_reciprocal_tau_half2, grad_v_seq[t]), grad_v_to_h, __hmul2(grad_spike_seq[t], grad_s_to_h)); + + grad_x_seq[t] = __hmul2(grad_h, reciprocal_tau_half2); + + } + grad_v_last[index] = __hmul2(grad_h, one_sub_reciprocal_tau_half2); + } + } + +// MultiStepLIFNodePTT fptt PiecewiseLeakyReLU, decay_input=False, hard_reset=True, dtype=fp16 + + #include + extern "C" __global__ + void LIFNode_fptt_decayInputFalse_hardReset_fp16(const half2* x_seq, half2* v_v_seq, half2* h_seq, half2* spike_seq, + const half & reciprocal_tau, + const half & v_threshold, const half & v_reset, + const int & neuron_num, const int & numel) + + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + const int stride = neuron_num >> 1; + if (index < stride) + { + const int numel_2 = numel >> 1; + const half2 reciprocal_tau_half2 = __half2half2(reciprocal_tau); + const half2 v_threshold_half2 = __half2half2(v_threshold); + + const half2 v_reset_half2 = __half2half2(v_reset); + + for(int mem_offset = 0; mem_offset < numel_2; mem_offset += stride) + { + const int t = index + mem_offset; + + // h_seq[t] = v_v_seq[t] - reciprocal_tau * (v_v_seq[t] - v_reset) + x_seq[t]; + // = reciprocal_tau * (v_reset - v_v_seq[t]) + v_v_seq[t] + x_seq[t]; + h_seq[t] = __hadd2(__hfma2(__hsub2(v_reset_half2, v_v_seq[t]), reciprocal_tau_half2, v_v_seq[t]), x_seq[t]); + + spike_seq[t] = __hgeu2(h_seq[t], v_threshold_half2); + v_v_seq[t + stride] = __hadd2(__hmul2(spike_seq[t], v_reset_half2), __hmul2(__hsub2(__float2half2_rn(1.0f), spike_seq[t]), h_seq[t])); + + } + } + } + +// MultiStepLIFNodePTT bptt PiecewiseLeakyReLU, decay_input=False, hard_reset=True, dtype=fp16, detach_reset=True + + #include + extern "C" __global__ + void LIFNode_bptt_decayInputFalse_hardReset_detachReset_fp16( + const half2* grad_spike_seq, const half2* grad_v_seq, const half2* h_seq, const half2* spike_seq, + half2* grad_x_seq, half2* grad_v_last, + const half & reciprocal_tau, const half & one_sub_reciprocal_tau, + const half & v_threshold, const half & v_reset, + const int & neuron_num, const int & numel) + + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + const int stride = neuron_num >> 1; + if (index < stride) + { + const half2 reciprocal_tau_half2 = __half2half2(reciprocal_tau); + const half2 one_sub_reciprocal_tau_half2 = __half2half2(one_sub_reciprocal_tau); + const half2 v_threshold_half2 = __half2half2(v_threshold); + + const half2 v_reset_half2 = __half2half2(v_reset); + + half2 grad_h = __float2half2_rn(0.0f); // grad_h will be used recursively + for(int mem_offset = (numel >> 1) - stride; mem_offset >= 0; mem_offset -= stride) + { + const int t = index + mem_offset; + + const half2 over_th = __hsub2(h_seq[t], v_threshold_half2); + + // start: spikingjelly.clock_driven.surrogate.PiecewiseLeakyReLU.cuda_code + + const half2 sg_PiecewiseLeakyReLU_x_abs = __habs2(over_th); + const half2 sg_PiecewiseLeakyReLU_x_abs_ge_w = 
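// --------------------------------------------------------------------------------------------
// Note: the fp16 PiecewiseLeakyReLU kernels evaluate the piecewise gradient without branching:
// __hge2 yields a 1.0/0.0 mask per half2 lane and the two cases are blended as
//   grad_s_to_h = 0.01 * mask + 1.0 * (1 - mask),  with mask = (|H[t] - V_th| >= 1).
// --------------------------------------------------------------------------------------------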
__hge2(sg_PiecewiseLeakyReLU_x_abs, __float2half2_rn(1.0f)); + half2 grad_s_to_h = __hadd2(__hmul2(__float2half2_rn(0.01f), sg_PiecewiseLeakyReLU_x_abs_ge_w), __hmul2(__hsub2(__float2half2_rn(1.0f), sg_PiecewiseLeakyReLU_x_abs_ge_w), __float2half2_rn(1.0f))); + + // end: spikingjelly.clock_driven.surrogate.PiecewiseLeakyReLU.cuda_code + + const half2 grad_v_to_h = __hsub2(__float2half2_rn(1.0f), spike_seq[t]); + + grad_h = __hfma2(__hfma2(grad_h, one_sub_reciprocal_tau_half2, grad_v_seq[t]), grad_v_to_h, __hmul2(grad_spike_seq[t], grad_s_to_h)); + + grad_x_seq[t] = grad_h; + + } + grad_v_last[index] = __hmul2(grad_h, one_sub_reciprocal_tau_half2); + } + } + +// MultiStepLIFNodePTT bptt PiecewiseLeakyReLU, decay_input=False, hard_reset=True, dtype=fp16, detach_reset=False + + #include + extern "C" __global__ + void LIFNode_bptt_decayInputFalse_hardReset__fp16( + const half2* grad_spike_seq, const half2* grad_v_seq, const half2* h_seq, const half2* spike_seq, + half2* grad_x_seq, half2* grad_v_last, + const half & reciprocal_tau, const half & one_sub_reciprocal_tau, + const half & v_threshold, const half & v_reset, + const int & neuron_num, const int & numel) + + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + const int stride = neuron_num >> 1; + if (index < stride) + { + const half2 reciprocal_tau_half2 = __half2half2(reciprocal_tau); + const half2 one_sub_reciprocal_tau_half2 = __half2half2(one_sub_reciprocal_tau); + const half2 v_threshold_half2 = __half2half2(v_threshold); + + const half2 v_reset_half2 = __half2half2(v_reset); + + half2 grad_h = __float2half2_rn(0.0f); // grad_h will be used recursively + for(int mem_offset = (numel >> 1) - stride; mem_offset >= 0; mem_offset -= stride) + { + const int t = index + mem_offset; + + const half2 over_th = __hsub2(h_seq[t], v_threshold_half2); + + // start: spikingjelly.clock_driven.surrogate.PiecewiseLeakyReLU.cuda_code + + const half2 sg_PiecewiseLeakyReLU_x_abs = __habs2(over_th); + const half2 sg_PiecewiseLeakyReLU_x_abs_ge_w = __hge2(sg_PiecewiseLeakyReLU_x_abs, __float2half2_rn(1.0f)); + half2 grad_s_to_h = __hadd2(__hmul2(__float2half2_rn(0.01f), sg_PiecewiseLeakyReLU_x_abs_ge_w), __hmul2(__hsub2(__float2half2_rn(1.0f), sg_PiecewiseLeakyReLU_x_abs_ge_w), __float2half2_rn(1.0f))); + + // end: spikingjelly.clock_driven.surrogate.PiecewiseLeakyReLU.cuda_code + + const half2 grad_v_to_h = __hfma2(__hsub2(v_reset_half2, h_seq[t]), grad_s_to_h, __hsub2(__float2half2_rn(1.0f), spike_seq[t])); + + grad_h = __hfma2(__hfma2(grad_h, one_sub_reciprocal_tau_half2, grad_v_seq[t]), grad_v_to_h, __hmul2(grad_spike_seq[t], grad_s_to_h)); + + grad_x_seq[t] = grad_h; + + } + grad_v_last[index] = __hmul2(grad_h, one_sub_reciprocal_tau_half2); + } + } + +// MultiStepLIFNodePTT fptt PiecewiseLeakyReLU, decay_input=True, hard_reset=False, dtype=fp32 + + extern "C" __global__ + void LIFNode_fptt_decayInputTrue_softReset_fp32(const float* x_seq, float* v_v_seq, float* h_seq, float* spike_seq, + const float & reciprocal_tau, + const float & v_threshold, + const int & neuron_num, const int & numel) + + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index < neuron_num) + { + const int dt = neuron_num; + for(int mem_offset = 0; mem_offset < numel; mem_offset += neuron_num) + { + const int t = index + mem_offset; + + h_seq[t] = v_v_seq[t] + reciprocal_tau * (x_seq[t] - v_v_seq[t]); + + if (h_seq[t] >= v_threshold) + { + spike_seq[t] = 1.0f; + v_v_seq[t + dt] = h_seq[t] - v_threshold; + } + + else + { + spike_seq[t] = 0.0f; + 
v_v_seq[t + dt] = h_seq[t]; + } + + } + } + } + +// MultiStepLIFNodePTT bptt PiecewiseLeakyReLU, decay_input=True, hard_reset=False, dtype=fp32, detach_reset=True + + extern "C" __global__ + void LIFNode_bptt_decayInputTrue_softReset_detachReset_fp32( + const float* grad_spike_seq, const float* grad_v_seq, const float* h_seq, const float* spike_seq, + float* grad_x_seq, float* grad_v_last, + const float & reciprocal_tau, const float & one_sub_reciprocal_tau, + const float & v_threshold, + const int & neuron_num, const int & numel) + + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index < neuron_num) + { + float grad_h = 0.0f; // grad_h will be used recursively + for(int mem_offset = numel - neuron_num; mem_offset >= 0; mem_offset -= neuron_num) + { + const int t = index + mem_offset; + const float over_th = h_seq[t] - v_threshold; + + // start: spikingjelly.clock_driven.surrogate.PiecewiseLeakyReLU.cuda_code + + const float sg_PiecewiseLeakyReLU_x_abs = fabsf(over_th); + float grad_s_to_h; + if (sg_PiecewiseLeakyReLU_x_abs > 1.0f) + { + grad_s_to_h = 0.01f; + } + else + { + grad_s_to_h = 1.0f; + } + + // end: spikingjelly.clock_driven.surrogate.PiecewiseLeakyReLU.cuda_code + + const float grad_v_to_h = 1.0f; + + grad_h = grad_spike_seq[t] * grad_s_to_h + (grad_v_seq[t] + grad_h * one_sub_reciprocal_tau) * grad_v_to_h; + // grad_h = fmaf(grad_spike_seq[t], grad_s_to_h, fmaf(grad_h, one_sub_reciprocal_tau, grad_v_seq[t]) * grad_v_to_h); + + grad_x_seq[t] = grad_h * reciprocal_tau; + + } + grad_v_last[index] = grad_h * one_sub_reciprocal_tau; + } + } + +// MultiStepLIFNodePTT bptt PiecewiseLeakyReLU, decay_input=True, hard_reset=False, dtype=fp32, detach_reset=False + + extern "C" __global__ + void LIFNode_bptt_decayInputTrue_softReset__fp32( + const float* grad_spike_seq, const float* grad_v_seq, const float* h_seq, const float* spike_seq, + float* grad_x_seq, float* grad_v_last, + const float & reciprocal_tau, const float & one_sub_reciprocal_tau, + const float & v_threshold, + const int & neuron_num, const int & numel) + + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index < neuron_num) + { + float grad_h = 0.0f; // grad_h will be used recursively + for(int mem_offset = numel - neuron_num; mem_offset >= 0; mem_offset -= neuron_num) + { + const int t = index + mem_offset; + const float over_th = h_seq[t] - v_threshold; + + // start: spikingjelly.clock_driven.surrogate.PiecewiseLeakyReLU.cuda_code + + const float sg_PiecewiseLeakyReLU_x_abs = fabsf(over_th); + float grad_s_to_h; + if (sg_PiecewiseLeakyReLU_x_abs > 1.0f) + { + grad_s_to_h = 0.01f; + } + else + { + grad_s_to_h = 1.0f; + } + + // end: spikingjelly.clock_driven.surrogate.PiecewiseLeakyReLU.cuda_code + + const float grad_v_to_h = 1.0f - v_threshold * grad_s_to_h; + // const float grad_v_to_h = fmaf(-grad_s_to_h, v_threshold, 1.0f); + + grad_h = grad_spike_seq[t] * grad_s_to_h + (grad_v_seq[t] + grad_h * one_sub_reciprocal_tau) * grad_v_to_h; + // grad_h = fmaf(grad_spike_seq[t], grad_s_to_h, fmaf(grad_h, one_sub_reciprocal_tau, grad_v_seq[t]) * grad_v_to_h); + + grad_x_seq[t] = grad_h * reciprocal_tau; + + } + grad_v_last[index] = grad_h * one_sub_reciprocal_tau; + } + } + +// MultiStepLIFNodePTT fptt PiecewiseLeakyReLU, decay_input=False, hard_reset=False, dtype=fp32 + + extern "C" __global__ + void LIFNode_fptt_decayInputFalse_softReset_fp32(const float* x_seq, float* v_v_seq, float* h_seq, float* spike_seq, + const float & reciprocal_tau, + const float & v_threshold, + const int & 
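// --------------------------------------------------------------------------------------------
// Note: each fp32 backward kernel also carries a commented-out fmaf(...) line that restates the
// grad_h update (and, in the non-detached variants, grad_v_to_h) with fused multiply-adds; it is
// kept for reference only and is never executed.
// --------------------------------------------------------------------------------------------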
neuron_num, const int & numel) + + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index < neuron_num) + { + const int dt = neuron_num; + for(int mem_offset = 0; mem_offset < numel; mem_offset += neuron_num) + { + const int t = index + mem_offset; + + h_seq[t] = v_v_seq[t] * (1.0f - reciprocal_tau) + x_seq[t]; + + if (h_seq[t] >= v_threshold) + { + spike_seq[t] = 1.0f; + v_v_seq[t + dt] = h_seq[t] - v_threshold; + } + + else + { + spike_seq[t] = 0.0f; + v_v_seq[t + dt] = h_seq[t]; + } + + } + } + } + +// MultiStepLIFNodePTT bptt PiecewiseLeakyReLU, decay_input=False, hard_reset=False, dtype=fp32, detach_reset=True + + extern "C" __global__ + void LIFNode_bptt_decayInputFalse_softReset_detachReset_fp32( + const float* grad_spike_seq, const float* grad_v_seq, const float* h_seq, const float* spike_seq, + float* grad_x_seq, float* grad_v_last, + const float & reciprocal_tau, const float & one_sub_reciprocal_tau, + const float & v_threshold, + const int & neuron_num, const int & numel) + + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index < neuron_num) + { + float grad_h = 0.0f; // grad_h will be used recursively + for(int mem_offset = numel - neuron_num; mem_offset >= 0; mem_offset -= neuron_num) + { + const int t = index + mem_offset; + const float over_th = h_seq[t] - v_threshold; + + // start: spikingjelly.clock_driven.surrogate.PiecewiseLeakyReLU.cuda_code + + const float sg_PiecewiseLeakyReLU_x_abs = fabsf(over_th); + float grad_s_to_h; + if (sg_PiecewiseLeakyReLU_x_abs > 1.0f) + { + grad_s_to_h = 0.01f; + } + else + { + grad_s_to_h = 1.0f; + } + + // end: spikingjelly.clock_driven.surrogate.PiecewiseLeakyReLU.cuda_code + + const float grad_v_to_h = 1.0f; + + grad_h = grad_spike_seq[t] * grad_s_to_h + (grad_v_seq[t] + grad_h * one_sub_reciprocal_tau) * grad_v_to_h; + // grad_h = fmaf(grad_spike_seq[t], grad_s_to_h, fmaf(grad_h, one_sub_reciprocal_tau, grad_v_seq[t]) * grad_v_to_h); + + grad_x_seq[t] = grad_h; + + } + grad_v_last[index] = grad_h * one_sub_reciprocal_tau; + } + } + +// MultiStepLIFNodePTT bptt PiecewiseLeakyReLU, decay_input=False, hard_reset=False, dtype=fp32, detach_reset=False + + extern "C" __global__ + void LIFNode_bptt_decayInputFalse_softReset__fp32( + const float* grad_spike_seq, const float* grad_v_seq, const float* h_seq, const float* spike_seq, + float* grad_x_seq, float* grad_v_last, + const float & reciprocal_tau, const float & one_sub_reciprocal_tau, + const float & v_threshold, + const int & neuron_num, const int & numel) + + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index < neuron_num) + { + float grad_h = 0.0f; // grad_h will be used recursively + for(int mem_offset = numel - neuron_num; mem_offset >= 0; mem_offset -= neuron_num) + { + const int t = index + mem_offset; + const float over_th = h_seq[t] - v_threshold; + + // start: spikingjelly.clock_driven.surrogate.PiecewiseLeakyReLU.cuda_code + + const float sg_PiecewiseLeakyReLU_x_abs = fabsf(over_th); + float grad_s_to_h; + if (sg_PiecewiseLeakyReLU_x_abs > 1.0f) + { + grad_s_to_h = 0.01f; + } + else + { + grad_s_to_h = 1.0f; + } + + // end: spikingjelly.clock_driven.surrogate.PiecewiseLeakyReLU.cuda_code + + const float grad_v_to_h = 1.0f - v_threshold * grad_s_to_h; + // const float grad_v_to_h = fmaf(-grad_s_to_h, v_threshold, 1.0f); + + grad_h = grad_spike_seq[t] * grad_s_to_h + (grad_v_seq[t] + grad_h * one_sub_reciprocal_tau) * grad_v_to_h; + // grad_h = fmaf(grad_spike_seq[t], grad_s_to_h, fmaf(grad_h, one_sub_reciprocal_tau, 
grad_v_seq[t]) * grad_v_to_h); + + grad_x_seq[t] = grad_h; + + } + grad_v_last[index] = grad_h * one_sub_reciprocal_tau; + } + } + +// MultiStepLIFNodePTT fptt PiecewiseLeakyReLU, decay_input=True, hard_reset=False, dtype=fp16 + + #include + extern "C" __global__ + void LIFNode_fptt_decayInputTrue_softReset_fp16(const half2* x_seq, half2* v_v_seq, half2* h_seq, half2* spike_seq, + const half & reciprocal_tau, + const half & v_threshold, + const int & neuron_num, const int & numel) + + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + const int stride = neuron_num >> 1; + if (index < stride) + { + const int numel_2 = numel >> 1; + const half2 reciprocal_tau_half2 = __half2half2(reciprocal_tau); + const half2 v_threshold_half2 = __half2half2(v_threshold); + + for(int mem_offset = 0; mem_offset < numel_2; mem_offset += stride) + { + const int t = index + mem_offset; + + h_seq[t] = __hfma2(__hsub2(x_seq[t], v_v_seq[t]), reciprocal_tau_half2, v_v_seq[t]); + + spike_seq[t] = __hgeu2(h_seq[t], v_threshold_half2); + v_v_seq[t + stride] = __hadd2(__hmul2(spike_seq[t], __hsub2(h_seq[t], v_threshold_half2)), __hmul2(__hsub2(__float2half2_rn(1.0f), spike_seq[t]), h_seq[t])); + + } + } + } + +// MultiStepLIFNodePTT bptt PiecewiseLeakyReLU, decay_input=True, hard_reset=False, dtype=fp16, detach_reset=True + + #include + extern "C" __global__ + void LIFNode_bptt_decayInputTrue_softReset_detachReset_fp16( + const half2* grad_spike_seq, const half2* grad_v_seq, const half2* h_seq, const half2* spike_seq, + half2* grad_x_seq, half2* grad_v_last, + const half & reciprocal_tau, const half & one_sub_reciprocal_tau, + const half & v_threshold, + const int & neuron_num, const int & numel) + + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + const int stride = neuron_num >> 1; + if (index < stride) + { + const half2 reciprocal_tau_half2 = __half2half2(reciprocal_tau); + const half2 one_sub_reciprocal_tau_half2 = __half2half2(one_sub_reciprocal_tau); + const half2 v_threshold_half2 = __half2half2(v_threshold); + + half2 grad_h = __float2half2_rn(0.0f); // grad_h will be used recursively + for(int mem_offset = (numel >> 1) - stride; mem_offset >= 0; mem_offset -= stride) + { + const int t = index + mem_offset; + + const half2 over_th = __hsub2(h_seq[t], v_threshold_half2); + + // start: spikingjelly.clock_driven.surrogate.PiecewiseLeakyReLU.cuda_code + + const half2 sg_PiecewiseLeakyReLU_x_abs = __habs2(over_th); + const half2 sg_PiecewiseLeakyReLU_x_abs_ge_w = __hge2(sg_PiecewiseLeakyReLU_x_abs, __float2half2_rn(1.0f)); + half2 grad_s_to_h = __hadd2(__hmul2(__float2half2_rn(0.01f), sg_PiecewiseLeakyReLU_x_abs_ge_w), __hmul2(__hsub2(__float2half2_rn(1.0f), sg_PiecewiseLeakyReLU_x_abs_ge_w), __float2half2_rn(1.0f))); + + // end: spikingjelly.clock_driven.surrogate.PiecewiseLeakyReLU.cuda_code + + const half2 grad_v_to_h = __float2half2_rn(1.0f); + + grad_h = __hfma2(__hfma2(grad_h, one_sub_reciprocal_tau_half2, grad_v_seq[t]), grad_v_to_h, __hmul2(grad_spike_seq[t], grad_s_to_h)); + + grad_x_seq[t] = __hmul2(grad_h, reciprocal_tau_half2); + + } + grad_v_last[index] = __hmul2(grad_h, one_sub_reciprocal_tau_half2); + } + } + +// MultiStepLIFNodePTT bptt PiecewiseLeakyReLU, decay_input=True, hard_reset=False, dtype=fp16, detach_reset=False + + #include + extern "C" __global__ + void LIFNode_bptt_decayInputTrue_softReset__fp16( + const half2* grad_spike_seq, const half2* grad_v_seq, const half2* h_seq, const half2* spike_seq, + half2* grad_x_seq, half2* grad_v_last, + const half & 
reciprocal_tau, const half & one_sub_reciprocal_tau, + const half & v_threshold, + const int & neuron_num, const int & numel) + + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + const int stride = neuron_num >> 1; + if (index < stride) + { + const half2 reciprocal_tau_half2 = __half2half2(reciprocal_tau); + const half2 one_sub_reciprocal_tau_half2 = __half2half2(one_sub_reciprocal_tau); + const half2 v_threshold_half2 = __half2half2(v_threshold); + + half2 grad_h = __float2half2_rn(0.0f); // grad_h will be used recursively + for(int mem_offset = (numel >> 1) - stride; mem_offset >= 0; mem_offset -= stride) + { + const int t = index + mem_offset; + + const half2 over_th = __hsub2(h_seq[t], v_threshold_half2); + + // start: spikingjelly.clock_driven.surrogate.PiecewiseLeakyReLU.cuda_code + + const half2 sg_PiecewiseLeakyReLU_x_abs = __habs2(over_th); + const half2 sg_PiecewiseLeakyReLU_x_abs_ge_w = __hge2(sg_PiecewiseLeakyReLU_x_abs, __float2half2_rn(1.0f)); + half2 grad_s_to_h = __hadd2(__hmul2(__float2half2_rn(0.01f), sg_PiecewiseLeakyReLU_x_abs_ge_w), __hmul2(__hsub2(__float2half2_rn(1.0f), sg_PiecewiseLeakyReLU_x_abs_ge_w), __float2half2_rn(1.0f))); + + // end: spikingjelly.clock_driven.surrogate.PiecewiseLeakyReLU.cuda_code + + const half2 grad_v_to_h = __hsub2(__float2half2_rn(1.0f), __hmul2(v_threshold_half2, grad_s_to_h)); + + grad_h = __hfma2(__hfma2(grad_h, one_sub_reciprocal_tau_half2, grad_v_seq[t]), grad_v_to_h, __hmul2(grad_spike_seq[t], grad_s_to_h)); + + grad_x_seq[t] = __hmul2(grad_h, reciprocal_tau_half2); + + } + grad_v_last[index] = __hmul2(grad_h, one_sub_reciprocal_tau_half2); + } + } + +// MultiStepLIFNodePTT fptt PiecewiseLeakyReLU, decay_input=False, hard_reset=False, dtype=fp16 + + #include + extern "C" __global__ + void LIFNode_fptt_decayInputFalse_softReset_fp16(const half2* x_seq, half2* v_v_seq, half2* h_seq, half2* spike_seq, + const half & reciprocal_tau, + const half & v_threshold, + const int & neuron_num, const int & numel) + + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + const int stride = neuron_num >> 1; + if (index < stride) + { + const int numel_2 = numel >> 1; + const half2 reciprocal_tau_half2 = __half2half2(reciprocal_tau); + const half2 v_threshold_half2 = __half2half2(v_threshold); + + for(int mem_offset = 0; mem_offset < numel_2; mem_offset += stride) + { + const int t = index + mem_offset; + + // h_seq[t] = v_v_seq[t] * (1.0f - reciprocal_tau) + x_seq[t]; + h_seq[t] = __hfma2(__hsub2(__float2half2_rn(1.0f), reciprocal_tau_half2), v_v_seq[t], x_seq[t]); + + spike_seq[t] = __hgeu2(h_seq[t], v_threshold_half2); + v_v_seq[t + stride] = __hadd2(__hmul2(spike_seq[t], __hsub2(h_seq[t], v_threshold_half2)), __hmul2(__hsub2(__float2half2_rn(1.0f), spike_seq[t]), h_seq[t])); + + } + } + } + +// MultiStepLIFNodePTT bptt PiecewiseLeakyReLU, decay_input=False, hard_reset=False, dtype=fp16, detach_reset=True + + #include + extern "C" __global__ + void LIFNode_bptt_decayInputFalse_softReset_detachReset_fp16( + const half2* grad_spike_seq, const half2* grad_v_seq, const half2* h_seq, const half2* spike_seq, + half2* grad_x_seq, half2* grad_v_last, + const half & reciprocal_tau, const half & one_sub_reciprocal_tau, + const half & v_threshold, + const int & neuron_num, const int & numel) + + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + const int stride = neuron_num >> 1; + if (index < stride) + { + const half2 reciprocal_tau_half2 = __half2half2(reciprocal_tau); + const half2 one_sub_reciprocal_tau_half2 = 
__half2half2(one_sub_reciprocal_tau); + const half2 v_threshold_half2 = __half2half2(v_threshold); + + half2 grad_h = __float2half2_rn(0.0f); // grad_h will be used recursively + for(int mem_offset = (numel >> 1) - stride; mem_offset >= 0; mem_offset -= stride) + { + const int t = index + mem_offset; + + const half2 over_th = __hsub2(h_seq[t], v_threshold_half2); + + // start: spikingjelly.clock_driven.surrogate.PiecewiseLeakyReLU.cuda_code + + const half2 sg_PiecewiseLeakyReLU_x_abs = __habs2(over_th); + const half2 sg_PiecewiseLeakyReLU_x_abs_ge_w = __hge2(sg_PiecewiseLeakyReLU_x_abs, __float2half2_rn(1.0f)); + half2 grad_s_to_h = __hadd2(__hmul2(__float2half2_rn(0.01f), sg_PiecewiseLeakyReLU_x_abs_ge_w), __hmul2(__hsub2(__float2half2_rn(1.0f), sg_PiecewiseLeakyReLU_x_abs_ge_w), __float2half2_rn(1.0f))); + + // end: spikingjelly.clock_driven.surrogate.PiecewiseLeakyReLU.cuda_code + + const half2 grad_v_to_h = __float2half2_rn(1.0f); + + grad_h = __hfma2(__hfma2(grad_h, one_sub_reciprocal_tau_half2, grad_v_seq[t]), grad_v_to_h, __hmul2(grad_spike_seq[t], grad_s_to_h)); + + grad_x_seq[t] = grad_h; + + } + grad_v_last[index] = __hmul2(grad_h, one_sub_reciprocal_tau_half2); + } + } + +// MultiStepLIFNodePTT bptt PiecewiseLeakyReLU, decay_input=False, hard_reset=False, dtype=fp16, detach_reset=False + + #include + extern "C" __global__ + void LIFNode_bptt_decayInputFalse_softReset__fp16( + const half2* grad_spike_seq, const half2* grad_v_seq, const half2* h_seq, const half2* spike_seq, + half2* grad_x_seq, half2* grad_v_last, + const half & reciprocal_tau, const half & one_sub_reciprocal_tau, + const half & v_threshold, + const int & neuron_num, const int & numel) + + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + const int stride = neuron_num >> 1; + if (index < stride) + { + const half2 reciprocal_tau_half2 = __half2half2(reciprocal_tau); + const half2 one_sub_reciprocal_tau_half2 = __half2half2(one_sub_reciprocal_tau); + const half2 v_threshold_half2 = __half2half2(v_threshold); + + half2 grad_h = __float2half2_rn(0.0f); // grad_h will be used recursively + for(int mem_offset = (numel >> 1) - stride; mem_offset >= 0; mem_offset -= stride) + { + const int t = index + mem_offset; + + const half2 over_th = __hsub2(h_seq[t], v_threshold_half2); + + // start: spikingjelly.clock_driven.surrogate.PiecewiseLeakyReLU.cuda_code + + const half2 sg_PiecewiseLeakyReLU_x_abs = __habs2(over_th); + const half2 sg_PiecewiseLeakyReLU_x_abs_ge_w = __hge2(sg_PiecewiseLeakyReLU_x_abs, __float2half2_rn(1.0f)); + half2 grad_s_to_h = __hadd2(__hmul2(__float2half2_rn(0.01f), sg_PiecewiseLeakyReLU_x_abs_ge_w), __hmul2(__hsub2(__float2half2_rn(1.0f), sg_PiecewiseLeakyReLU_x_abs_ge_w), __float2half2_rn(1.0f))); + + // end: spikingjelly.clock_driven.surrogate.PiecewiseLeakyReLU.cuda_code + + const half2 grad_v_to_h = __hsub2(__float2half2_rn(1.0f), __hmul2(v_threshold_half2, grad_s_to_h)); + + grad_h = __hfma2(__hfma2(grad_h, one_sub_reciprocal_tau_half2, grad_v_seq[t]), grad_v_to_h, __hmul2(grad_spike_seq[t], grad_s_to_h)); + + grad_x_seq[t] = grad_h; + + } + grad_v_last[index] = __hmul2(grad_h, one_sub_reciprocal_tau_half2); + } + } + +// MultiStepLIFNodePTT fptt S2NN, decay_input=True, hard_reset=True, dtype=fp32 + + extern "C" __global__ + void LIFNode_fptt_decayInputTrue_hardReset_fp32(const float* x_seq, float* v_v_seq, float* h_seq, float* spike_seq, + const float & reciprocal_tau, + const float & v_threshold, const float & v_reset, + const int & neuron_num, const int & numel) + + { + const int 
index = blockIdx.x * blockDim.x + threadIdx.x; + if (index < neuron_num) + { + const int dt = neuron_num; + for(int mem_offset = 0; mem_offset < numel; mem_offset += neuron_num) + { + const int t = index + mem_offset; + + h_seq[t] = v_v_seq[t] + reciprocal_tau * (x_seq[t] - v_v_seq[t] + v_reset); + + if (h_seq[t] >= v_threshold) + { + spike_seq[t] = 1.0f; + v_v_seq[t + dt] = v_reset; + } + + else + { + spike_seq[t] = 0.0f; + v_v_seq[t + dt] = h_seq[t]; + } + + } + } + } + +// MultiStepLIFNodePTT bptt S2NN, decay_input=True, hard_reset=True, dtype=fp32, detach_reset=True + + extern "C" __global__ + void LIFNode_bptt_decayInputTrue_hardReset_detachReset_fp32( + const float* grad_spike_seq, const float* grad_v_seq, const float* h_seq, const float* spike_seq, + float* grad_x_seq, float* grad_v_last, + const float & reciprocal_tau, const float & one_sub_reciprocal_tau, + const float & v_threshold, const float & v_reset, + const int & neuron_num, const int & numel) + + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index < neuron_num) + { + float grad_h = 0.0f; // grad_h will be used recursively + for(int mem_offset = numel - neuron_num; mem_offset >= 0; mem_offset -= neuron_num) + { + const int t = index + mem_offset; + const float over_th = h_seq[t] - v_threshold; + + // start: spikingjelly.clock_driven.surrogate.S2NN.cuda_code + + const float sg_S2NN_sigmoid_ax = 1.0f / (1.0f + expf(- 4.0f * over_th)); + const float sg_S2NN_mask_l = (float)(over_th < 0.0f); + const float grad_s_to_h = (1.0f - sg_S2NN_sigmoid_ax) * sg_S2NN_sigmoid_ax * 4.0f * sg_S2NN_mask_l + 1.0f / (over_th + 1.0f) * (1.0f - sg_S2NN_mask_l); + + // end: spikingjelly.clock_driven.surrogate.S2NN.cuda_code + + const float grad_v_to_h = 1.0f - spike_seq[t]; + + grad_h = grad_spike_seq[t] * grad_s_to_h + (grad_v_seq[t] + grad_h * one_sub_reciprocal_tau) * grad_v_to_h; + // grad_h = fmaf(grad_spike_seq[t], grad_s_to_h, fmaf(grad_h, one_sub_reciprocal_tau, grad_v_seq[t]) * grad_v_to_h); + + grad_x_seq[t] = grad_h * reciprocal_tau; + + } + grad_v_last[index] = grad_h * one_sub_reciprocal_tau; + } + } + +// MultiStepLIFNodePTT bptt S2NN, decay_input=True, hard_reset=True, dtype=fp32, detach_reset=False + + extern "C" __global__ + void LIFNode_bptt_decayInputTrue_hardReset__fp32( + const float* grad_spike_seq, const float* grad_v_seq, const float* h_seq, const float* spike_seq, + float* grad_x_seq, float* grad_v_last, + const float & reciprocal_tau, const float & one_sub_reciprocal_tau, + const float & v_threshold, const float & v_reset, + const int & neuron_num, const int & numel) + + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index < neuron_num) + { + float grad_h = 0.0f; // grad_h will be used recursively + for(int mem_offset = numel - neuron_num; mem_offset >= 0; mem_offset -= neuron_num) + { + const int t = index + mem_offset; + const float over_th = h_seq[t] - v_threshold; + + // start: spikingjelly.clock_driven.surrogate.S2NN.cuda_code + + const float sg_S2NN_sigmoid_ax = 1.0f / (1.0f + expf(- 4.0f * over_th)); + const float sg_S2NN_mask_l = (float)(over_th < 0.0f); + const float grad_s_to_h = (1.0f - sg_S2NN_sigmoid_ax) * sg_S2NN_sigmoid_ax * 4.0f * sg_S2NN_mask_l + 1.0f / (over_th + 1.0f) * (1.0f - sg_S2NN_mask_l); + + // end: spikingjelly.clock_driven.surrogate.S2NN.cuda_code + + const float grad_v_to_h = 1.0f - spike_seq[t] + (v_reset - h_seq[t]) * grad_s_to_h; + // const float grad_v_to_h = fmaf(v_reset - h_seq[t], grad_s_to_h, 1.0f - spike_seq[t]); + + grad_h = grad_spike_seq[t] 
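// --------------------------------------------------------------------------------------------
// Note: the S2NN surrogate used in this group combines two branches, with x = H[t] - V_th and
// alpha = 4 in this build:
//   dS/dH = alpha * sigma(alpha * x) * (1 - sigma(alpha * x))   for x <  0
//         = 1 / (x + 1)                                         for x >= 0
// i.e. a sigmoid-shaped gradient below threshold and the derivative of log(x + 1) above it;
// the branch is selected with the (x < 0) mask exactly as in the generated code.
// --------------------------------------------------------------------------------------------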
* grad_s_to_h + (grad_v_seq[t] + grad_h * one_sub_reciprocal_tau) * grad_v_to_h; + // grad_h = fmaf(grad_spike_seq[t], grad_s_to_h, fmaf(grad_h, one_sub_reciprocal_tau, grad_v_seq[t]) * grad_v_to_h); + + grad_x_seq[t] = grad_h * reciprocal_tau; + + } + grad_v_last[index] = grad_h * one_sub_reciprocal_tau; + } + } + +// MultiStepLIFNodePTT fptt S2NN, decay_input=False, hard_reset=True, dtype=fp32 + + extern "C" __global__ + void LIFNode_fptt_decayInputFalse_hardReset_fp32(const float* x_seq, float* v_v_seq, float* h_seq, float* spike_seq, + const float & reciprocal_tau, + const float & v_threshold, const float & v_reset, + const int & neuron_num, const int & numel) + + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index < neuron_num) + { + const int dt = neuron_num; + for(int mem_offset = 0; mem_offset < numel; mem_offset += neuron_num) + { + const int t = index + mem_offset; + + h_seq[t] = v_v_seq[t] - reciprocal_tau * (v_v_seq[t] - v_reset) + x_seq[t]; + + if (h_seq[t] >= v_threshold) + { + spike_seq[t] = 1.0f; + v_v_seq[t + dt] = v_reset; + } + + else + { + spike_seq[t] = 0.0f; + v_v_seq[t + dt] = h_seq[t]; + } + + } + } + } + +// MultiStepLIFNodePTT bptt S2NN, decay_input=False, hard_reset=True, dtype=fp32, detach_reset=True + + extern "C" __global__ + void LIFNode_bptt_decayInputFalse_hardReset_detachReset_fp32( + const float* grad_spike_seq, const float* grad_v_seq, const float* h_seq, const float* spike_seq, + float* grad_x_seq, float* grad_v_last, + const float & reciprocal_tau, const float & one_sub_reciprocal_tau, + const float & v_threshold, const float & v_reset, + const int & neuron_num, const int & numel) + + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index < neuron_num) + { + float grad_h = 0.0f; // grad_h will be used recursively + for(int mem_offset = numel - neuron_num; mem_offset >= 0; mem_offset -= neuron_num) + { + const int t = index + mem_offset; + const float over_th = h_seq[t] - v_threshold; + + // start: spikingjelly.clock_driven.surrogate.S2NN.cuda_code + + const float sg_S2NN_sigmoid_ax = 1.0f / (1.0f + expf(- 4.0f * over_th)); + const float sg_S2NN_mask_l = (float)(over_th < 0.0f); + const float grad_s_to_h = (1.0f - sg_S2NN_sigmoid_ax) * sg_S2NN_sigmoid_ax * 4.0f * sg_S2NN_mask_l + 1.0f / (over_th + 1.0f) * (1.0f - sg_S2NN_mask_l); + + // end: spikingjelly.clock_driven.surrogate.S2NN.cuda_code + + const float grad_v_to_h = 1.0f - spike_seq[t]; + + grad_h = grad_spike_seq[t] * grad_s_to_h + (grad_v_seq[t] + grad_h * one_sub_reciprocal_tau) * grad_v_to_h; + // grad_h = fmaf(grad_spike_seq[t], grad_s_to_h, fmaf(grad_h, one_sub_reciprocal_tau, grad_v_seq[t]) * grad_v_to_h); + + grad_x_seq[t] = grad_h; + + } + grad_v_last[index] = grad_h * one_sub_reciprocal_tau; + } + } + +// MultiStepLIFNodePTT bptt S2NN, decay_input=False, hard_reset=True, dtype=fp32, detach_reset=False + + extern "C" __global__ + void LIFNode_bptt_decayInputFalse_hardReset__fp32( + const float* grad_spike_seq, const float* grad_v_seq, const float* h_seq, const float* spike_seq, + float* grad_x_seq, float* grad_v_last, + const float & reciprocal_tau, const float & one_sub_reciprocal_tau, + const float & v_threshold, const float & v_reset, + const int & neuron_num, const int & numel) + + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index < neuron_num) + { + float grad_h = 0.0f; // grad_h will be used recursively + for(int mem_offset = numel - neuron_num; mem_offset >= 0; mem_offset -= neuron_num) + { + const int t = index + 
mem_offset; + const float over_th = h_seq[t] - v_threshold; + + // start: spikingjelly.clock_driven.surrogate.S2NN.cuda_code + + const float sg_S2NN_sigmoid_ax = 1.0f / (1.0f + expf(- 4.0f * over_th)); + const float sg_S2NN_mask_l = (float)(over_th < 0.0f); + const float grad_s_to_h = (1.0f - sg_S2NN_sigmoid_ax) * sg_S2NN_sigmoid_ax * 4.0f * sg_S2NN_mask_l + 1.0f / (over_th + 1.0f) * (1.0f - sg_S2NN_mask_l); + + // end: spikingjelly.clock_driven.surrogate.S2NN.cuda_code + + const float grad_v_to_h = 1.0f - spike_seq[t] + (v_reset - h_seq[t]) * grad_s_to_h; + // const float grad_v_to_h = fmaf(v_reset - h_seq[t], grad_s_to_h, 1.0f - spike_seq[t]); + + grad_h = grad_spike_seq[t] * grad_s_to_h + (grad_v_seq[t] + grad_h * one_sub_reciprocal_tau) * grad_v_to_h; + // grad_h = fmaf(grad_spike_seq[t], grad_s_to_h, fmaf(grad_h, one_sub_reciprocal_tau, grad_v_seq[t]) * grad_v_to_h); + + grad_x_seq[t] = grad_h; + + } + grad_v_last[index] = grad_h * one_sub_reciprocal_tau; + } + } + +// MultiStepLIFNodePTT fptt S2NN, decay_input=True, hard_reset=True, dtype=fp16 + + #include + extern "C" __global__ + void LIFNode_fptt_decayInputTrue_hardReset_fp16(const half2* x_seq, half2* v_v_seq, half2* h_seq, half2* spike_seq, + const half & reciprocal_tau, + const half & v_threshold, const half & v_reset, + const int & neuron_num, const int & numel) + + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + const int stride = neuron_num >> 1; + if (index < stride) + { + const int numel_2 = numel >> 1; + const half2 reciprocal_tau_half2 = __half2half2(reciprocal_tau); + const half2 v_threshold_half2 = __half2half2(v_threshold); + + const half2 v_reset_half2 = __half2half2(v_reset); + + for(int mem_offset = 0; mem_offset < numel_2; mem_offset += stride) + { + const int t = index + mem_offset; + + h_seq[t] = __hfma2(__hadd2(__hsub2(x_seq[t], v_v_seq[t]), v_reset_half2), reciprocal_tau_half2, v_v_seq[t]); + + spike_seq[t] = __hgeu2(h_seq[t], v_threshold_half2); + v_v_seq[t + stride] = __hadd2(__hmul2(spike_seq[t], v_reset_half2), __hmul2(__hsub2(__float2half2_rn(1.0f), spike_seq[t]), h_seq[t])); + + } + } + } + +// MultiStepLIFNodePTT bptt S2NN, decay_input=True, hard_reset=True, dtype=fp16, detach_reset=True + + #include + extern "C" __global__ + void LIFNode_bptt_decayInputTrue_hardReset_detachReset_fp16( + const half2* grad_spike_seq, const half2* grad_v_seq, const half2* h_seq, const half2* spike_seq, + half2* grad_x_seq, half2* grad_v_last, + const half & reciprocal_tau, const half & one_sub_reciprocal_tau, + const half & v_threshold, const half & v_reset, + const int & neuron_num, const int & numel) + + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + const int stride = neuron_num >> 1; + if (index < stride) + { + const half2 reciprocal_tau_half2 = __half2half2(reciprocal_tau); + const half2 one_sub_reciprocal_tau_half2 = __half2half2(one_sub_reciprocal_tau); + const half2 v_threshold_half2 = __half2half2(v_threshold); + + const half2 v_reset_half2 = __half2half2(v_reset); + + half2 grad_h = __float2half2_rn(0.0f); // grad_h will be used recursively + for(int mem_offset = (numel >> 1) - stride; mem_offset >= 0; mem_offset -= stride) + { + const int t = index + mem_offset; + + const half2 over_th = __hsub2(h_seq[t], v_threshold_half2); + + // start: spikingjelly.clock_driven.surrogate.S2NN.cuda_code + + const half2 sg_S2NN_alpha = __float2half2_rn(4.0f); + const half2 sg_S2NN_sigmoid_ax = __h2div(__float2half2_rn(1.0f), __hadd2(h2exp(__hneg2(__hmul2(sg_S2NN_alpha, over_th))), 
__float2half2_rn(1.0f))); + const half2 sg_S2NN_mask_l = __hlt2(over_th, __float2half2_rn(0.0f)); + const half2 grad_s_to_h = __hadd2(__hmul2(__hmul2(__hmul2(__hsub2(__float2half2_rn(1.0f), sg_S2NN_sigmoid_ax), sg_S2NN_sigmoid_ax), sg_S2NN_alpha), sg_S2NN_mask_l), __hmul2(__h2div(__float2half2_rn(1.0f), __hadd2(over_th, __float2half2_rn(1.0f))), __hsub2(__float2half2_rn(1.0f), sg_S2NN_mask_l))); + + // end: spikingjelly.clock_driven.surrogate.S2NN.cuda_code + + const half2 grad_v_to_h = __hsub2(__float2half2_rn(1.0f), spike_seq[t]); + + grad_h = __hfma2(__hfma2(grad_h, one_sub_reciprocal_tau_half2, grad_v_seq[t]), grad_v_to_h, __hmul2(grad_spike_seq[t], grad_s_to_h)); + + grad_x_seq[t] = __hmul2(grad_h, reciprocal_tau_half2); + + } + grad_v_last[index] = __hmul2(grad_h, one_sub_reciprocal_tau_half2); + } + } + +// MultiStepLIFNodePTT bptt S2NN, decay_input=True, hard_reset=True, dtype=fp16, detach_reset=False + + #include + extern "C" __global__ + void LIFNode_bptt_decayInputTrue_hardReset__fp16( + const half2* grad_spike_seq, const half2* grad_v_seq, const half2* h_seq, const half2* spike_seq, + half2* grad_x_seq, half2* grad_v_last, + const half & reciprocal_tau, const half & one_sub_reciprocal_tau, + const half & v_threshold, const half & v_reset, + const int & neuron_num, const int & numel) + + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + const int stride = neuron_num >> 1; + if (index < stride) + { + const half2 reciprocal_tau_half2 = __half2half2(reciprocal_tau); + const half2 one_sub_reciprocal_tau_half2 = __half2half2(one_sub_reciprocal_tau); + const half2 v_threshold_half2 = __half2half2(v_threshold); + + const half2 v_reset_half2 = __half2half2(v_reset); + + half2 grad_h = __float2half2_rn(0.0f); // grad_h will be used recursively + for(int mem_offset = (numel >> 1) - stride; mem_offset >= 0; mem_offset -= stride) + { + const int t = index + mem_offset; + + const half2 over_th = __hsub2(h_seq[t], v_threshold_half2); + + // start: spikingjelly.clock_driven.surrogate.S2NN.cuda_code + + const half2 sg_S2NN_alpha = __float2half2_rn(4.0f); + const half2 sg_S2NN_sigmoid_ax = __h2div(__float2half2_rn(1.0f), __hadd2(h2exp(__hneg2(__hmul2(sg_S2NN_alpha, over_th))), __float2half2_rn(1.0f))); + const half2 sg_S2NN_mask_l = __hlt2(over_th, __float2half2_rn(0.0f)); + const half2 grad_s_to_h = __hadd2(__hmul2(__hmul2(__hmul2(__hsub2(__float2half2_rn(1.0f), sg_S2NN_sigmoid_ax), sg_S2NN_sigmoid_ax), sg_S2NN_alpha), sg_S2NN_mask_l), __hmul2(__h2div(__float2half2_rn(1.0f), __hadd2(over_th, __float2half2_rn(1.0f))), __hsub2(__float2half2_rn(1.0f), sg_S2NN_mask_l))); + + // end: spikingjelly.clock_driven.surrogate.S2NN.cuda_code + + const half2 grad_v_to_h = __hfma2(__hsub2(v_reset_half2, h_seq[t]), grad_s_to_h, __hsub2(__float2half2_rn(1.0f), spike_seq[t])); + + grad_h = __hfma2(__hfma2(grad_h, one_sub_reciprocal_tau_half2, grad_v_seq[t]), grad_v_to_h, __hmul2(grad_spike_seq[t], grad_s_to_h)); + + grad_x_seq[t] = __hmul2(grad_h, reciprocal_tau_half2); + + } + grad_v_last[index] = __hmul2(grad_h, one_sub_reciprocal_tau_half2); + } + } + +// MultiStepLIFNodePTT fptt S2NN, decay_input=False, hard_reset=True, dtype=fp16 + + #include + extern "C" __global__ + void LIFNode_fptt_decayInputFalse_hardReset_fp16(const half2* x_seq, half2* v_v_seq, half2* h_seq, half2* spike_seq, + const half & reciprocal_tau, + const half & v_threshold, const half & v_reset, + const int & neuron_num, const int & numel) + + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + const int stride = 
neuron_num >> 1; + if (index < stride) + { + const int numel_2 = numel >> 1; + const half2 reciprocal_tau_half2 = __half2half2(reciprocal_tau); + const half2 v_threshold_half2 = __half2half2(v_threshold); + + const half2 v_reset_half2 = __half2half2(v_reset); + + for(int mem_offset = 0; mem_offset < numel_2; mem_offset += stride) + { + const int t = index + mem_offset; + + // h_seq[t] = v_v_seq[t] - reciprocal_tau * (v_v_seq[t] - v_reset) + x_seq[t]; + // = reciprocal_tau * (v_reset - v_v_seq[t]) + v_v_seq[t] + x_seq[t]; + h_seq[t] = __hadd2(__hfma2(__hsub2(v_reset_half2, v_v_seq[t]), reciprocal_tau_half2, v_v_seq[t]), x_seq[t]); + + spike_seq[t] = __hgeu2(h_seq[t], v_threshold_half2); + v_v_seq[t + stride] = __hadd2(__hmul2(spike_seq[t], v_reset_half2), __hmul2(__hsub2(__float2half2_rn(1.0f), spike_seq[t]), h_seq[t])); + + } + } + } + +// MultiStepLIFNodePTT bptt S2NN, decay_input=False, hard_reset=True, dtype=fp16, detach_reset=True + + #include + extern "C" __global__ + void LIFNode_bptt_decayInputFalse_hardReset_detachReset_fp16( + const half2* grad_spike_seq, const half2* grad_v_seq, const half2* h_seq, const half2* spike_seq, + half2* grad_x_seq, half2* grad_v_last, + const half & reciprocal_tau, const half & one_sub_reciprocal_tau, + const half & v_threshold, const half & v_reset, + const int & neuron_num, const int & numel) + + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + const int stride = neuron_num >> 1; + if (index < stride) + { + const half2 reciprocal_tau_half2 = __half2half2(reciprocal_tau); + const half2 one_sub_reciprocal_tau_half2 = __half2half2(one_sub_reciprocal_tau); + const half2 v_threshold_half2 = __half2half2(v_threshold); + + const half2 v_reset_half2 = __half2half2(v_reset); + + half2 grad_h = __float2half2_rn(0.0f); // grad_h will be used recursively + for(int mem_offset = (numel >> 1) - stride; mem_offset >= 0; mem_offset -= stride) + { + const int t = index + mem_offset; + + const half2 over_th = __hsub2(h_seq[t], v_threshold_half2); + + // start: spikingjelly.clock_driven.surrogate.S2NN.cuda_code + + const half2 sg_S2NN_alpha = __float2half2_rn(4.0f); + const half2 sg_S2NN_sigmoid_ax = __h2div(__float2half2_rn(1.0f), __hadd2(h2exp(__hneg2(__hmul2(sg_S2NN_alpha, over_th))), __float2half2_rn(1.0f))); + const half2 sg_S2NN_mask_l = __hlt2(over_th, __float2half2_rn(0.0f)); + const half2 grad_s_to_h = __hadd2(__hmul2(__hmul2(__hmul2(__hsub2(__float2half2_rn(1.0f), sg_S2NN_sigmoid_ax), sg_S2NN_sigmoid_ax), sg_S2NN_alpha), sg_S2NN_mask_l), __hmul2(__h2div(__float2half2_rn(1.0f), __hadd2(over_th, __float2half2_rn(1.0f))), __hsub2(__float2half2_rn(1.0f), sg_S2NN_mask_l))); + + // end: spikingjelly.clock_driven.surrogate.S2NN.cuda_code + + const half2 grad_v_to_h = __hsub2(__float2half2_rn(1.0f), spike_seq[t]); + + grad_h = __hfma2(__hfma2(grad_h, one_sub_reciprocal_tau_half2, grad_v_seq[t]), grad_v_to_h, __hmul2(grad_spike_seq[t], grad_s_to_h)); + + grad_x_seq[t] = grad_h; + + } + grad_v_last[index] = __hmul2(grad_h, one_sub_reciprocal_tau_half2); + } + } + +// MultiStepLIFNodePTT bptt S2NN, decay_input=False, hard_reset=True, dtype=fp16, detach_reset=False + + #include + extern "C" __global__ + void LIFNode_bptt_decayInputFalse_hardReset__fp16( + const half2* grad_spike_seq, const half2* grad_v_seq, const half2* h_seq, const half2* spike_seq, + half2* grad_x_seq, half2* grad_v_last, + const half & reciprocal_tau, const half & one_sub_reciprocal_tau, + const half & v_threshold, const half & v_reset, + const int & neuron_num, const int & numel) + + { 
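+        // Backward (BPTT) kernel for the LIF update with decay_input=False, i.e.
+        // h[t] = v[t] - (v[t] - v_reset) / tau + x[t], hard reset, S2NN surrogate, fp16
+        // (half2 packs two neurons per thread; the __h* intrinsics come from <cuda_fp16.h>),
+        // and detach_reset=False, so the reset keeps its dependence on h in the backward pass.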
+ const int index = blockIdx.x * blockDim.x + threadIdx.x; + const int stride = neuron_num >> 1; + if (index < stride) + { + const half2 reciprocal_tau_half2 = __half2half2(reciprocal_tau); + const half2 one_sub_reciprocal_tau_half2 = __half2half2(one_sub_reciprocal_tau); + const half2 v_threshold_half2 = __half2half2(v_threshold); + + const half2 v_reset_half2 = __half2half2(v_reset); + + half2 grad_h = __float2half2_rn(0.0f); // grad_h will be used recursively + for(int mem_offset = (numel >> 1) - stride; mem_offset >= 0; mem_offset -= stride) + { + const int t = index + mem_offset; + + const half2 over_th = __hsub2(h_seq[t], v_threshold_half2); + + // start: spikingjelly.clock_driven.surrogate.S2NN.cuda_code + + const half2 sg_S2NN_alpha = __float2half2_rn(4.0f); + const half2 sg_S2NN_sigmoid_ax = __h2div(__float2half2_rn(1.0f), __hadd2(h2exp(__hneg2(__hmul2(sg_S2NN_alpha, over_th))), __float2half2_rn(1.0f))); + const half2 sg_S2NN_mask_l = __hlt2(over_th, __float2half2_rn(0.0f)); + const half2 grad_s_to_h = __hadd2(__hmul2(__hmul2(__hmul2(__hsub2(__float2half2_rn(1.0f), sg_S2NN_sigmoid_ax), sg_S2NN_sigmoid_ax), sg_S2NN_alpha), sg_S2NN_mask_l), __hmul2(__h2div(__float2half2_rn(1.0f), __hadd2(over_th, __float2half2_rn(1.0f))), __hsub2(__float2half2_rn(1.0f), sg_S2NN_mask_l))); + + // end: spikingjelly.clock_driven.surrogate.S2NN.cuda_code + + const half2 grad_v_to_h = __hfma2(__hsub2(v_reset_half2, h_seq[t]), grad_s_to_h, __hsub2(__float2half2_rn(1.0f), spike_seq[t])); + + grad_h = __hfma2(__hfma2(grad_h, one_sub_reciprocal_tau_half2, grad_v_seq[t]), grad_v_to_h, __hmul2(grad_spike_seq[t], grad_s_to_h)); + + grad_x_seq[t] = grad_h; + + } + grad_v_last[index] = __hmul2(grad_h, one_sub_reciprocal_tau_half2); + } + } + +// MultiStepLIFNodePTT fptt S2NN, decay_input=True, hard_reset=False, dtype=fp32 + + extern "C" __global__ + void LIFNode_fptt_decayInputTrue_softReset_fp32(const float* x_seq, float* v_v_seq, float* h_seq, float* spike_seq, + const float & reciprocal_tau, + const float & v_threshold, + const int & neuron_num, const int & numel) + + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index < neuron_num) + { + const int dt = neuron_num; + for(int mem_offset = 0; mem_offset < numel; mem_offset += neuron_num) + { + const int t = index + mem_offset; + + h_seq[t] = v_v_seq[t] + reciprocal_tau * (x_seq[t] - v_v_seq[t]); + + if (h_seq[t] >= v_threshold) + { + spike_seq[t] = 1.0f; + v_v_seq[t + dt] = h_seq[t] - v_threshold; + } + + else + { + spike_seq[t] = 0.0f; + v_v_seq[t + dt] = h_seq[t]; + } + + } + } + } + +// MultiStepLIFNodePTT bptt S2NN, decay_input=True, hard_reset=False, dtype=fp32, detach_reset=True + + extern "C" __global__ + void LIFNode_bptt_decayInputTrue_softReset_detachReset_fp32( + const float* grad_spike_seq, const float* grad_v_seq, const float* h_seq, const float* spike_seq, + float* grad_x_seq, float* grad_v_last, + const float & reciprocal_tau, const float & one_sub_reciprocal_tau, + const float & v_threshold, + const int & neuron_num, const int & numel) + + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index < neuron_num) + { + float grad_h = 0.0f; // grad_h will be used recursively + for(int mem_offset = numel - neuron_num; mem_offset >= 0; mem_offset -= neuron_num) + { + const int t = index + mem_offset; + const float over_th = h_seq[t] - v_threshold; + + // start: spikingjelly.clock_driven.surrogate.S2NN.cuda_code + + const float sg_S2NN_sigmoid_ax = 1.0f / (1.0f + expf(- 4.0f * over_th)); + const float sg_S2NN_mask_l = 
(float)(over_th < 0.0f); + const float grad_s_to_h = (1.0f - sg_S2NN_sigmoid_ax) * sg_S2NN_sigmoid_ax * 4.0f * sg_S2NN_mask_l + 1.0f / (over_th + 1.0f) * (1.0f - sg_S2NN_mask_l); + + // end: spikingjelly.clock_driven.surrogate.S2NN.cuda_code + + const float grad_v_to_h = 1.0f; + + grad_h = grad_spike_seq[t] * grad_s_to_h + (grad_v_seq[t] + grad_h * one_sub_reciprocal_tau) * grad_v_to_h; + // grad_h = fmaf(grad_spike_seq[t], grad_s_to_h, fmaf(grad_h, one_sub_reciprocal_tau, grad_v_seq[t]) * grad_v_to_h); + + grad_x_seq[t] = grad_h * reciprocal_tau; + + } + grad_v_last[index] = grad_h * one_sub_reciprocal_tau; + } + } + +// MultiStepLIFNodePTT bptt S2NN, decay_input=True, hard_reset=False, dtype=fp32, detach_reset=False + + extern "C" __global__ + void LIFNode_bptt_decayInputTrue_softReset__fp32( + const float* grad_spike_seq, const float* grad_v_seq, const float* h_seq, const float* spike_seq, + float* grad_x_seq, float* grad_v_last, + const float & reciprocal_tau, const float & one_sub_reciprocal_tau, + const float & v_threshold, + const int & neuron_num, const int & numel) + + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index < neuron_num) + { + float grad_h = 0.0f; // grad_h will be used recursively + for(int mem_offset = numel - neuron_num; mem_offset >= 0; mem_offset -= neuron_num) + { + const int t = index + mem_offset; + const float over_th = h_seq[t] - v_threshold; + + // start: spikingjelly.clock_driven.surrogate.S2NN.cuda_code + + const float sg_S2NN_sigmoid_ax = 1.0f / (1.0f + expf(- 4.0f * over_th)); + const float sg_S2NN_mask_l = (float)(over_th < 0.0f); + const float grad_s_to_h = (1.0f - sg_S2NN_sigmoid_ax) * sg_S2NN_sigmoid_ax * 4.0f * sg_S2NN_mask_l + 1.0f / (over_th + 1.0f) * (1.0f - sg_S2NN_mask_l); + + // end: spikingjelly.clock_driven.surrogate.S2NN.cuda_code + + const float grad_v_to_h = 1.0f - v_threshold * grad_s_to_h; + // const float grad_v_to_h = fmaf(-grad_s_to_h, v_threshold, 1.0f); + + grad_h = grad_spike_seq[t] * grad_s_to_h + (grad_v_seq[t] + grad_h * one_sub_reciprocal_tau) * grad_v_to_h; + // grad_h = fmaf(grad_spike_seq[t], grad_s_to_h, fmaf(grad_h, one_sub_reciprocal_tau, grad_v_seq[t]) * grad_v_to_h); + + grad_x_seq[t] = grad_h * reciprocal_tau; + + } + grad_v_last[index] = grad_h * one_sub_reciprocal_tau; + } + } + +// MultiStepLIFNodePTT fptt S2NN, decay_input=False, hard_reset=False, dtype=fp32 + + extern "C" __global__ + void LIFNode_fptt_decayInputFalse_softReset_fp32(const float* x_seq, float* v_v_seq, float* h_seq, float* spike_seq, + const float & reciprocal_tau, + const float & v_threshold, + const int & neuron_num, const int & numel) + + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index < neuron_num) + { + const int dt = neuron_num; + for(int mem_offset = 0; mem_offset < numel; mem_offset += neuron_num) + { + const int t = index + mem_offset; + + h_seq[t] = v_v_seq[t] * (1.0f - reciprocal_tau) + x_seq[t]; + + if (h_seq[t] >= v_threshold) + { + spike_seq[t] = 1.0f; + v_v_seq[t + dt] = h_seq[t] - v_threshold; + } + + else + { + spike_seq[t] = 0.0f; + v_v_seq[t + dt] = h_seq[t]; + } + + } + } + } + +// MultiStepLIFNodePTT bptt S2NN, decay_input=False, hard_reset=False, dtype=fp32, detach_reset=True + + extern "C" __global__ + void LIFNode_bptt_decayInputFalse_softReset_detachReset_fp32( + const float* grad_spike_seq, const float* grad_v_seq, const float* h_seq, const float* spike_seq, + float* grad_x_seq, float* grad_v_last, + const float & reciprocal_tau, const float & one_sub_reciprocal_tau, + const 
float & v_threshold, + const int & neuron_num, const int & numel) + + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index < neuron_num) + { + float grad_h = 0.0f; // grad_h will be used recursively + for(int mem_offset = numel - neuron_num; mem_offset >= 0; mem_offset -= neuron_num) + { + const int t = index + mem_offset; + const float over_th = h_seq[t] - v_threshold; + + // start: spikingjelly.clock_driven.surrogate.S2NN.cuda_code + + const float sg_S2NN_sigmoid_ax = 1.0f / (1.0f + expf(- 4.0f * over_th)); + const float sg_S2NN_mask_l = (float)(over_th < 0.0f); + const float grad_s_to_h = (1.0f - sg_S2NN_sigmoid_ax) * sg_S2NN_sigmoid_ax * 4.0f * sg_S2NN_mask_l + 1.0f / (over_th + 1.0f) * (1.0f - sg_S2NN_mask_l); + + // end: spikingjelly.clock_driven.surrogate.S2NN.cuda_code + + const float grad_v_to_h = 1.0f; + + grad_h = grad_spike_seq[t] * grad_s_to_h + (grad_v_seq[t] + grad_h * one_sub_reciprocal_tau) * grad_v_to_h; + // grad_h = fmaf(grad_spike_seq[t], grad_s_to_h, fmaf(grad_h, one_sub_reciprocal_tau, grad_v_seq[t]) * grad_v_to_h); + + grad_x_seq[t] = grad_h; + + } + grad_v_last[index] = grad_h * one_sub_reciprocal_tau; + } + } + +// MultiStepLIFNodePTT bptt S2NN, decay_input=False, hard_reset=False, dtype=fp32, detach_reset=False + + extern "C" __global__ + void LIFNode_bptt_decayInputFalse_softReset__fp32( + const float* grad_spike_seq, const float* grad_v_seq, const float* h_seq, const float* spike_seq, + float* grad_x_seq, float* grad_v_last, + const float & reciprocal_tau, const float & one_sub_reciprocal_tau, + const float & v_threshold, + const int & neuron_num, const int & numel) + + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index < neuron_num) + { + float grad_h = 0.0f; // grad_h will be used recursively + for(int mem_offset = numel - neuron_num; mem_offset >= 0; mem_offset -= neuron_num) + { + const int t = index + mem_offset; + const float over_th = h_seq[t] - v_threshold; + + // start: spikingjelly.clock_driven.surrogate.S2NN.cuda_code + + const float sg_S2NN_sigmoid_ax = 1.0f / (1.0f + expf(- 4.0f * over_th)); + const float sg_S2NN_mask_l = (float)(over_th < 0.0f); + const float grad_s_to_h = (1.0f - sg_S2NN_sigmoid_ax) * sg_S2NN_sigmoid_ax * 4.0f * sg_S2NN_mask_l + 1.0f / (over_th + 1.0f) * (1.0f - sg_S2NN_mask_l); + + // end: spikingjelly.clock_driven.surrogate.S2NN.cuda_code + + const float grad_v_to_h = 1.0f - v_threshold * grad_s_to_h; + // const float grad_v_to_h = fmaf(-grad_s_to_h, v_threshold, 1.0f); + + grad_h = grad_spike_seq[t] * grad_s_to_h + (grad_v_seq[t] + grad_h * one_sub_reciprocal_tau) * grad_v_to_h; + // grad_h = fmaf(grad_spike_seq[t], grad_s_to_h, fmaf(grad_h, one_sub_reciprocal_tau, grad_v_seq[t]) * grad_v_to_h); + + grad_x_seq[t] = grad_h; + + } + grad_v_last[index] = grad_h * one_sub_reciprocal_tau; + } + } + +// MultiStepLIFNodePTT fptt S2NN, decay_input=True, hard_reset=False, dtype=fp16 + + #include + extern "C" __global__ + void LIFNode_fptt_decayInputTrue_softReset_fp16(const half2* x_seq, half2* v_v_seq, half2* h_seq, half2* spike_seq, + const half & reciprocal_tau, + const half & v_threshold, + const int & neuron_num, const int & numel) + + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + const int stride = neuron_num >> 1; + if (index < stride) + { + const int numel_2 = numel >> 1; + const half2 reciprocal_tau_half2 = __half2half2(reciprocal_tau); + const half2 v_threshold_half2 = __half2half2(v_threshold); + + for(int mem_offset = 0; mem_offset < numel_2; mem_offset += stride) + 
{ + const int t = index + mem_offset; + + h_seq[t] = __hfma2(__hsub2(x_seq[t], v_v_seq[t]), reciprocal_tau_half2, v_v_seq[t]); + + spike_seq[t] = __hgeu2(h_seq[t], v_threshold_half2); + v_v_seq[t + stride] = __hadd2(__hmul2(spike_seq[t], __hsub2(h_seq[t], v_threshold_half2)), __hmul2(__hsub2(__float2half2_rn(1.0f), spike_seq[t]), h_seq[t])); + + } + } + } + +// MultiStepLIFNodePTT bptt S2NN, decay_input=True, hard_reset=False, dtype=fp16, detach_reset=True + + #include + extern "C" __global__ + void LIFNode_bptt_decayInputTrue_softReset_detachReset_fp16( + const half2* grad_spike_seq, const half2* grad_v_seq, const half2* h_seq, const half2* spike_seq, + half2* grad_x_seq, half2* grad_v_last, + const half & reciprocal_tau, const half & one_sub_reciprocal_tau, + const half & v_threshold, + const int & neuron_num, const int & numel) + + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + const int stride = neuron_num >> 1; + if (index < stride) + { + const half2 reciprocal_tau_half2 = __half2half2(reciprocal_tau); + const half2 one_sub_reciprocal_tau_half2 = __half2half2(one_sub_reciprocal_tau); + const half2 v_threshold_half2 = __half2half2(v_threshold); + + half2 grad_h = __float2half2_rn(0.0f); // grad_h will be used recursively + for(int mem_offset = (numel >> 1) - stride; mem_offset >= 0; mem_offset -= stride) + { + const int t = index + mem_offset; + + const half2 over_th = __hsub2(h_seq[t], v_threshold_half2); + + // start: spikingjelly.clock_driven.surrogate.S2NN.cuda_code + + const half2 sg_S2NN_alpha = __float2half2_rn(4.0f); + const half2 sg_S2NN_sigmoid_ax = __h2div(__float2half2_rn(1.0f), __hadd2(h2exp(__hneg2(__hmul2(sg_S2NN_alpha, over_th))), __float2half2_rn(1.0f))); + const half2 sg_S2NN_mask_l = __hlt2(over_th, __float2half2_rn(0.0f)); + const half2 grad_s_to_h = __hadd2(__hmul2(__hmul2(__hmul2(__hsub2(__float2half2_rn(1.0f), sg_S2NN_sigmoid_ax), sg_S2NN_sigmoid_ax), sg_S2NN_alpha), sg_S2NN_mask_l), __hmul2(__h2div(__float2half2_rn(1.0f), __hadd2(over_th, __float2half2_rn(1.0f))), __hsub2(__float2half2_rn(1.0f), sg_S2NN_mask_l))); + + // end: spikingjelly.clock_driven.surrogate.S2NN.cuda_code + + const half2 grad_v_to_h = __float2half2_rn(1.0f); + + grad_h = __hfma2(__hfma2(grad_h, one_sub_reciprocal_tau_half2, grad_v_seq[t]), grad_v_to_h, __hmul2(grad_spike_seq[t], grad_s_to_h)); + + grad_x_seq[t] = __hmul2(grad_h, reciprocal_tau_half2); + + } + grad_v_last[index] = __hmul2(grad_h, one_sub_reciprocal_tau_half2); + } + } + +// MultiStepLIFNodePTT bptt S2NN, decay_input=True, hard_reset=False, dtype=fp16, detach_reset=False + + #include + extern "C" __global__ + void LIFNode_bptt_decayInputTrue_softReset__fp16( + const half2* grad_spike_seq, const half2* grad_v_seq, const half2* h_seq, const half2* spike_seq, + half2* grad_x_seq, half2* grad_v_last, + const half & reciprocal_tau, const half & one_sub_reciprocal_tau, + const half & v_threshold, + const int & neuron_num, const int & numel) + + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + const int stride = neuron_num >> 1; + if (index < stride) + { + const half2 reciprocal_tau_half2 = __half2half2(reciprocal_tau); + const half2 one_sub_reciprocal_tau_half2 = __half2half2(one_sub_reciprocal_tau); + const half2 v_threshold_half2 = __half2half2(v_threshold); + + half2 grad_h = __float2half2_rn(0.0f); // grad_h will be used recursively + for(int mem_offset = (numel >> 1) - stride; mem_offset >= 0; mem_offset -= stride) + { + const int t = index + mem_offset; + + const half2 over_th = __hsub2(h_seq[t], 
v_threshold_half2); + + // start: spikingjelly.clock_driven.surrogate.S2NN.cuda_code + + const half2 sg_S2NN_alpha = __float2half2_rn(4.0f); + const half2 sg_S2NN_sigmoid_ax = __h2div(__float2half2_rn(1.0f), __hadd2(h2exp(__hneg2(__hmul2(sg_S2NN_alpha, over_th))), __float2half2_rn(1.0f))); + const half2 sg_S2NN_mask_l = __hlt2(over_th, __float2half2_rn(0.0f)); + const half2 grad_s_to_h = __hadd2(__hmul2(__hmul2(__hmul2(__hsub2(__float2half2_rn(1.0f), sg_S2NN_sigmoid_ax), sg_S2NN_sigmoid_ax), sg_S2NN_alpha), sg_S2NN_mask_l), __hmul2(__h2div(__float2half2_rn(1.0f), __hadd2(over_th, __float2half2_rn(1.0f))), __hsub2(__float2half2_rn(1.0f), sg_S2NN_mask_l))); + + // end: spikingjelly.clock_driven.surrogate.S2NN.cuda_code + + const half2 grad_v_to_h = __hsub2(__float2half2_rn(1.0f), __hmul2(v_threshold_half2, grad_s_to_h)); + + grad_h = __hfma2(__hfma2(grad_h, one_sub_reciprocal_tau_half2, grad_v_seq[t]), grad_v_to_h, __hmul2(grad_spike_seq[t], grad_s_to_h)); + + grad_x_seq[t] = __hmul2(grad_h, reciprocal_tau_half2); + + } + grad_v_last[index] = __hmul2(grad_h, one_sub_reciprocal_tau_half2); + } + } + +// MultiStepLIFNodePTT fptt S2NN, decay_input=False, hard_reset=False, dtype=fp16 + + #include + extern "C" __global__ + void LIFNode_fptt_decayInputFalse_softReset_fp16(const half2* x_seq, half2* v_v_seq, half2* h_seq, half2* spike_seq, + const half & reciprocal_tau, + const half & v_threshold, + const int & neuron_num, const int & numel) + + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + const int stride = neuron_num >> 1; + if (index < stride) + { + const int numel_2 = numel >> 1; + const half2 reciprocal_tau_half2 = __half2half2(reciprocal_tau); + const half2 v_threshold_half2 = __half2half2(v_threshold); + + for(int mem_offset = 0; mem_offset < numel_2; mem_offset += stride) + { + const int t = index + mem_offset; + + // h_seq[t] = v_v_seq[t] * (1.0f - reciprocal_tau) + x_seq[t]; + h_seq[t] = __hfma2(__hsub2(__float2half2_rn(1.0f), reciprocal_tau_half2), v_v_seq[t], x_seq[t]); + + spike_seq[t] = __hgeu2(h_seq[t], v_threshold_half2); + v_v_seq[t + stride] = __hadd2(__hmul2(spike_seq[t], __hsub2(h_seq[t], v_threshold_half2)), __hmul2(__hsub2(__float2half2_rn(1.0f), spike_seq[t]), h_seq[t])); + + } + } + } + +// MultiStepLIFNodePTT bptt S2NN, decay_input=False, hard_reset=False, dtype=fp16, detach_reset=True + + #include + extern "C" __global__ + void LIFNode_bptt_decayInputFalse_softReset_detachReset_fp16( + const half2* grad_spike_seq, const half2* grad_v_seq, const half2* h_seq, const half2* spike_seq, + half2* grad_x_seq, half2* grad_v_last, + const half & reciprocal_tau, const half & one_sub_reciprocal_tau, + const half & v_threshold, + const int & neuron_num, const int & numel) + + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + const int stride = neuron_num >> 1; + if (index < stride) + { + const half2 reciprocal_tau_half2 = __half2half2(reciprocal_tau); + const half2 one_sub_reciprocal_tau_half2 = __half2half2(one_sub_reciprocal_tau); + const half2 v_threshold_half2 = __half2half2(v_threshold); + + half2 grad_h = __float2half2_rn(0.0f); // grad_h will be used recursively + for(int mem_offset = (numel >> 1) - stride; mem_offset >= 0; mem_offset -= stride) + { + const int t = index + mem_offset; + + const half2 over_th = __hsub2(h_seq[t], v_threshold_half2); + + // start: spikingjelly.clock_driven.surrogate.S2NN.cuda_code + + const half2 sg_S2NN_alpha = __float2half2_rn(4.0f); + const half2 sg_S2NN_sigmoid_ax = __h2div(__float2half2_rn(1.0f), 
__hadd2(h2exp(__hneg2(__hmul2(sg_S2NN_alpha, over_th))), __float2half2_rn(1.0f))); + const half2 sg_S2NN_mask_l = __hlt2(over_th, __float2half2_rn(0.0f)); + const half2 grad_s_to_h = __hadd2(__hmul2(__hmul2(__hmul2(__hsub2(__float2half2_rn(1.0f), sg_S2NN_sigmoid_ax), sg_S2NN_sigmoid_ax), sg_S2NN_alpha), sg_S2NN_mask_l), __hmul2(__h2div(__float2half2_rn(1.0f), __hadd2(over_th, __float2half2_rn(1.0f))), __hsub2(__float2half2_rn(1.0f), sg_S2NN_mask_l))); + + // end: spikingjelly.clock_driven.surrogate.S2NN.cuda_code + + const half2 grad_v_to_h = __float2half2_rn(1.0f); + + grad_h = __hfma2(__hfma2(grad_h, one_sub_reciprocal_tau_half2, grad_v_seq[t]), grad_v_to_h, __hmul2(grad_spike_seq[t], grad_s_to_h)); + + grad_x_seq[t] = grad_h; + + } + grad_v_last[index] = __hmul2(grad_h, one_sub_reciprocal_tau_half2); + } + } + +// MultiStepLIFNodePTT bptt S2NN, decay_input=False, hard_reset=False, dtype=fp16, detach_reset=False + + #include + extern "C" __global__ + void LIFNode_bptt_decayInputFalse_softReset__fp16( + const half2* grad_spike_seq, const half2* grad_v_seq, const half2* h_seq, const half2* spike_seq, + half2* grad_x_seq, half2* grad_v_last, + const half & reciprocal_tau, const half & one_sub_reciprocal_tau, + const half & v_threshold, + const int & neuron_num, const int & numel) + + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + const int stride = neuron_num >> 1; + if (index < stride) + { + const half2 reciprocal_tau_half2 = __half2half2(reciprocal_tau); + const half2 one_sub_reciprocal_tau_half2 = __half2half2(one_sub_reciprocal_tau); + const half2 v_threshold_half2 = __half2half2(v_threshold); + + half2 grad_h = __float2half2_rn(0.0f); // grad_h will be used recursively + for(int mem_offset = (numel >> 1) - stride; mem_offset >= 0; mem_offset -= stride) + { + const int t = index + mem_offset; + + const half2 over_th = __hsub2(h_seq[t], v_threshold_half2); + + // start: spikingjelly.clock_driven.surrogate.S2NN.cuda_code + + const half2 sg_S2NN_alpha = __float2half2_rn(4.0f); + const half2 sg_S2NN_sigmoid_ax = __h2div(__float2half2_rn(1.0f), __hadd2(h2exp(__hneg2(__hmul2(sg_S2NN_alpha, over_th))), __float2half2_rn(1.0f))); + const half2 sg_S2NN_mask_l = __hlt2(over_th, __float2half2_rn(0.0f)); + const half2 grad_s_to_h = __hadd2(__hmul2(__hmul2(__hmul2(__hsub2(__float2half2_rn(1.0f), sg_S2NN_sigmoid_ax), sg_S2NN_sigmoid_ax), sg_S2NN_alpha), sg_S2NN_mask_l), __hmul2(__h2div(__float2half2_rn(1.0f), __hadd2(over_th, __float2half2_rn(1.0f))), __hsub2(__float2half2_rn(1.0f), sg_S2NN_mask_l))); + + // end: spikingjelly.clock_driven.surrogate.S2NN.cuda_code + + const half2 grad_v_to_h = __hsub2(__float2half2_rn(1.0f), __hmul2(v_threshold_half2, grad_s_to_h)); + + grad_h = __hfma2(__hfma2(grad_h, one_sub_reciprocal_tau_half2, grad_v_seq[t]), grad_v_to_h, __hmul2(grad_spike_seq[t], grad_s_to_h)); + + grad_x_seq[t] = grad_h; + + } + grad_v_last[index] = __hmul2(grad_h, one_sub_reciprocal_tau_half2); + } + } + +// MultiStepLIFNodePTT fptt QPseudoSpike, decay_input=True, hard_reset=True, dtype=fp32 + + extern "C" __global__ + void LIFNode_fptt_decayInputTrue_hardReset_fp32(const float* x_seq, float* v_v_seq, float* h_seq, float* spike_seq, + const float & reciprocal_tau, + const float & v_threshold, const float & v_reset, + const int & neuron_num, const int & numel) + + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index < neuron_num) + { + const int dt = neuron_num; + for(int mem_offset = 0; mem_offset < numel; mem_offset += neuron_num) + { + const int t = index 
+ mem_offset; + + h_seq[t] = v_v_seq[t] + reciprocal_tau * (x_seq[t] - v_v_seq[t] + v_reset); + + if (h_seq[t] >= v_threshold) + { + spike_seq[t] = 1.0f; + v_v_seq[t + dt] = v_reset; + } + + else + { + spike_seq[t] = 0.0f; + v_v_seq[t + dt] = h_seq[t]; + } + + } + } + } + +// MultiStepLIFNodePTT bptt QPseudoSpike, decay_input=True, hard_reset=True, dtype=fp32, detach_reset=True + + extern "C" __global__ + void LIFNode_bptt_decayInputTrue_hardReset_detachReset_fp32( + const float* grad_spike_seq, const float* grad_v_seq, const float* h_seq, const float* spike_seq, + float* grad_x_seq, float* grad_v_last, + const float & reciprocal_tau, const float & one_sub_reciprocal_tau, + const float & v_threshold, const float & v_reset, + const int & neuron_num, const int & numel) + + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index < neuron_num) + { + float grad_h = 0.0f; // grad_h will be used recursively + for(int mem_offset = numel - neuron_num; mem_offset >= 0; mem_offset -= neuron_num) + { + const int t = index + mem_offset; + const float over_th = h_seq[t] - v_threshold; + + // start: spikingjelly.clock_driven.surrogate.QPseudoSpike.cuda_code + + const float sg_QPseudoSpike_base = 1.0f + 2.0f / (2.0f - 1.0f) * fabsf(over_th); + const float grad_s_to_h = powf(sg_QPseudoSpike_base, -2.0f); + + // end: spikingjelly.clock_driven.surrogate.QPseudoSpike.cuda_code + + const float grad_v_to_h = 1.0f - spike_seq[t]; + + grad_h = grad_spike_seq[t] * grad_s_to_h + (grad_v_seq[t] + grad_h * one_sub_reciprocal_tau) * grad_v_to_h; + // grad_h = fmaf(grad_spike_seq[t], grad_s_to_h, fmaf(grad_h, one_sub_reciprocal_tau, grad_v_seq[t]) * grad_v_to_h); + + grad_x_seq[t] = grad_h * reciprocal_tau; + + } + grad_v_last[index] = grad_h * one_sub_reciprocal_tau; + } + } + +// MultiStepLIFNodePTT bptt QPseudoSpike, decay_input=True, hard_reset=True, dtype=fp32, detach_reset=False + + extern "C" __global__ + void LIFNode_bptt_decayInputTrue_hardReset__fp32( + const float* grad_spike_seq, const float* grad_v_seq, const float* h_seq, const float* spike_seq, + float* grad_x_seq, float* grad_v_last, + const float & reciprocal_tau, const float & one_sub_reciprocal_tau, + const float & v_threshold, const float & v_reset, + const int & neuron_num, const int & numel) + + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index < neuron_num) + { + float grad_h = 0.0f; // grad_h will be used recursively + for(int mem_offset = numel - neuron_num; mem_offset >= 0; mem_offset -= neuron_num) + { + const int t = index + mem_offset; + const float over_th = h_seq[t] - v_threshold; + + // start: spikingjelly.clock_driven.surrogate.QPseudoSpike.cuda_code + + const float sg_QPseudoSpike_base = 1.0f + 2.0f / (2.0f - 1.0f) * fabsf(over_th); + const float grad_s_to_h = powf(sg_QPseudoSpike_base, -2.0f); + + // end: spikingjelly.clock_driven.surrogate.QPseudoSpike.cuda_code + + const float grad_v_to_h = 1.0f - spike_seq[t] + (v_reset - h_seq[t]) * grad_s_to_h; + // const float grad_v_to_h = fmaf(v_reset - h_seq[t], grad_s_to_h, 1.0f - spike_seq[t]); + + grad_h = grad_spike_seq[t] * grad_s_to_h + (grad_v_seq[t] + grad_h * one_sub_reciprocal_tau) * grad_v_to_h; + // grad_h = fmaf(grad_spike_seq[t], grad_s_to_h, fmaf(grad_h, one_sub_reciprocal_tau, grad_v_seq[t]) * grad_v_to_h); + + grad_x_seq[t] = grad_h * reciprocal_tau; + + } + grad_v_last[index] = grad_h * one_sub_reciprocal_tau; + } + } + +// MultiStepLIFNodePTT fptt QPseudoSpike, decay_input=False, hard_reset=True, dtype=fp32 + + extern "C" __global__ + 
void LIFNode_fptt_decayInputFalse_hardReset_fp32(const float* x_seq, float* v_v_seq, float* h_seq, float* spike_seq, + const float & reciprocal_tau, + const float & v_threshold, const float & v_reset, + const int & neuron_num, const int & numel) + + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index < neuron_num) + { + const int dt = neuron_num; + for(int mem_offset = 0; mem_offset < numel; mem_offset += neuron_num) + { + const int t = index + mem_offset; + + h_seq[t] = v_v_seq[t] - reciprocal_tau * (v_v_seq[t] - v_reset) + x_seq[t]; + + if (h_seq[t] >= v_threshold) + { + spike_seq[t] = 1.0f; + v_v_seq[t + dt] = v_reset; + } + + else + { + spike_seq[t] = 0.0f; + v_v_seq[t + dt] = h_seq[t]; + } + + } + } + } + +// MultiStepLIFNodePTT bptt QPseudoSpike, decay_input=False, hard_reset=True, dtype=fp32, detach_reset=True + + extern "C" __global__ + void LIFNode_bptt_decayInputFalse_hardReset_detachReset_fp32( + const float* grad_spike_seq, const float* grad_v_seq, const float* h_seq, const float* spike_seq, + float* grad_x_seq, float* grad_v_last, + const float & reciprocal_tau, const float & one_sub_reciprocal_tau, + const float & v_threshold, const float & v_reset, + const int & neuron_num, const int & numel) + + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index < neuron_num) + { + float grad_h = 0.0f; // grad_h will be used recursively + for(int mem_offset = numel - neuron_num; mem_offset >= 0; mem_offset -= neuron_num) + { + const int t = index + mem_offset; + const float over_th = h_seq[t] - v_threshold; + + // start: spikingjelly.clock_driven.surrogate.QPseudoSpike.cuda_code + + const float sg_QPseudoSpike_base = 1.0f + 2.0f / (2.0f - 1.0f) * fabsf(over_th); + const float grad_s_to_h = powf(sg_QPseudoSpike_base, -2.0f); + + // end: spikingjelly.clock_driven.surrogate.QPseudoSpike.cuda_code + + const float grad_v_to_h = 1.0f - spike_seq[t]; + + grad_h = grad_spike_seq[t] * grad_s_to_h + (grad_v_seq[t] + grad_h * one_sub_reciprocal_tau) * grad_v_to_h; + // grad_h = fmaf(grad_spike_seq[t], grad_s_to_h, fmaf(grad_h, one_sub_reciprocal_tau, grad_v_seq[t]) * grad_v_to_h); + + grad_x_seq[t] = grad_h; + + } + grad_v_last[index] = grad_h * one_sub_reciprocal_tau; + } + } + +// MultiStepLIFNodePTT bptt QPseudoSpike, decay_input=False, hard_reset=True, dtype=fp32, detach_reset=False + + extern "C" __global__ + void LIFNode_bptt_decayInputFalse_hardReset__fp32( + const float* grad_spike_seq, const float* grad_v_seq, const float* h_seq, const float* spike_seq, + float* grad_x_seq, float* grad_v_last, + const float & reciprocal_tau, const float & one_sub_reciprocal_tau, + const float & v_threshold, const float & v_reset, + const int & neuron_num, const int & numel) + + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index < neuron_num) + { + float grad_h = 0.0f; // grad_h will be used recursively + for(int mem_offset = numel - neuron_num; mem_offset >= 0; mem_offset -= neuron_num) + { + const int t = index + mem_offset; + const float over_th = h_seq[t] - v_threshold; + + // start: spikingjelly.clock_driven.surrogate.QPseudoSpike.cuda_code + + const float sg_QPseudoSpike_base = 1.0f + 2.0f / (2.0f - 1.0f) * fabsf(over_th); + const float grad_s_to_h = powf(sg_QPseudoSpike_base, -2.0f); + + // end: spikingjelly.clock_driven.surrogate.QPseudoSpike.cuda_code + + const float grad_v_to_h = 1.0f - spike_seq[t] + (v_reset - h_seq[t]) * grad_s_to_h; + // const float grad_v_to_h = fmaf(v_reset - h_seq[t], grad_s_to_h, 1.0f - spike_seq[t]); + + grad_h 
= grad_spike_seq[t] * grad_s_to_h + (grad_v_seq[t] + grad_h * one_sub_reciprocal_tau) * grad_v_to_h; + // grad_h = fmaf(grad_spike_seq[t], grad_s_to_h, fmaf(grad_h, one_sub_reciprocal_tau, grad_v_seq[t]) * grad_v_to_h); + + grad_x_seq[t] = grad_h; + + } + grad_v_last[index] = grad_h * one_sub_reciprocal_tau; + } + } + +// MultiStepLIFNodePTT fptt QPseudoSpike, decay_input=True, hard_reset=True, dtype=fp16 + + #include + extern "C" __global__ + void LIFNode_fptt_decayInputTrue_hardReset_fp16(const half2* x_seq, half2* v_v_seq, half2* h_seq, half2* spike_seq, + const half & reciprocal_tau, + const half & v_threshold, const half & v_reset, + const int & neuron_num, const int & numel) + + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + const int stride = neuron_num >> 1; + if (index < stride) + { + const int numel_2 = numel >> 1; + const half2 reciprocal_tau_half2 = __half2half2(reciprocal_tau); + const half2 v_threshold_half2 = __half2half2(v_threshold); + + const half2 v_reset_half2 = __half2half2(v_reset); + + for(int mem_offset = 0; mem_offset < numel_2; mem_offset += stride) + { + const int t = index + mem_offset; + + h_seq[t] = __hfma2(__hadd2(__hsub2(x_seq[t], v_v_seq[t]), v_reset_half2), reciprocal_tau_half2, v_v_seq[t]); + + spike_seq[t] = __hgeu2(h_seq[t], v_threshold_half2); + v_v_seq[t + stride] = __hadd2(__hmul2(spike_seq[t], v_reset_half2), __hmul2(__hsub2(__float2half2_rn(1.0f), spike_seq[t]), h_seq[t])); + + } + } + } + +// MultiStepLIFNodePTT bptt QPseudoSpike, decay_input=True, hard_reset=True, dtype=fp16, detach_reset=True + + #include + extern "C" __global__ + void LIFNode_bptt_decayInputTrue_hardReset_detachReset_fp16( + const half2* grad_spike_seq, const half2* grad_v_seq, const half2* h_seq, const half2* spike_seq, + half2* grad_x_seq, half2* grad_v_last, + const half & reciprocal_tau, const half & one_sub_reciprocal_tau, + const half & v_threshold, const half & v_reset, + const int & neuron_num, const int & numel) + + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + const int stride = neuron_num >> 1; + if (index < stride) + { + const half2 reciprocal_tau_half2 = __half2half2(reciprocal_tau); + const half2 one_sub_reciprocal_tau_half2 = __half2half2(one_sub_reciprocal_tau); + const half2 v_threshold_half2 = __half2half2(v_threshold); + + const half2 v_reset_half2 = __half2half2(v_reset); + + half2 grad_h = __float2half2_rn(0.0f); // grad_h will be used recursively + for(int mem_offset = (numel >> 1) - stride; mem_offset >= 0; mem_offset -= stride) + { + const int t = index + mem_offset; + + const half2 over_th = __hsub2(h_seq[t], v_threshold_half2); + + // start: spikingjelly.clock_driven.surrogate.QPseudoSpike.cuda_code + + const half2 sg_QPseudoSpike_alpha = __float2half2_rn(2.0f); + const half2 sg_QPseudoSpike_base = __hadd2(__float2half2_rn(1.0f), __h2div(__hmul2(__float2half2_rn(2.0f), __habs2(over_th)), __hsub2(sg_QPseudoSpike_alpha, __float2half2_rn(1.0f)))); + const half2 grad_s_to_h = h2exp2(__hmul2(h2log2(sg_QPseudoSpike_base), __hneg2(sg_QPseudoSpike_alpha))); // Replace power with combination of log and exp, since CUDA has no power function for FP16. 
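+                // QPseudoSpike surrogate with alpha = 2:
+                // ds/dh = (1 + 2 * |h - v_threshold| / (alpha - 1))^(-alpha),
+                // evaluated in half2 as exp2(-alpha * log2(base)) since fp16 has no pow intrinsic.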
+ + // end: spikingjelly.clock_driven.surrogate.QPseudoSpike.cuda_code + + const half2 grad_v_to_h = __hsub2(__float2half2_rn(1.0f), spike_seq[t]); + + grad_h = __hfma2(__hfma2(grad_h, one_sub_reciprocal_tau_half2, grad_v_seq[t]), grad_v_to_h, __hmul2(grad_spike_seq[t], grad_s_to_h)); + + grad_x_seq[t] = __hmul2(grad_h, reciprocal_tau_half2); + + } + grad_v_last[index] = __hmul2(grad_h, one_sub_reciprocal_tau_half2); + } + } + +// MultiStepLIFNodePTT bptt QPseudoSpike, decay_input=True, hard_reset=True, dtype=fp16, detach_reset=False + + #include + extern "C" __global__ + void LIFNode_bptt_decayInputTrue_hardReset__fp16( + const half2* grad_spike_seq, const half2* grad_v_seq, const half2* h_seq, const half2* spike_seq, + half2* grad_x_seq, half2* grad_v_last, + const half & reciprocal_tau, const half & one_sub_reciprocal_tau, + const half & v_threshold, const half & v_reset, + const int & neuron_num, const int & numel) + + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + const int stride = neuron_num >> 1; + if (index < stride) + { + const half2 reciprocal_tau_half2 = __half2half2(reciprocal_tau); + const half2 one_sub_reciprocal_tau_half2 = __half2half2(one_sub_reciprocal_tau); + const half2 v_threshold_half2 = __half2half2(v_threshold); + + const half2 v_reset_half2 = __half2half2(v_reset); + + half2 grad_h = __float2half2_rn(0.0f); // grad_h will be used recursively + for(int mem_offset = (numel >> 1) - stride; mem_offset >= 0; mem_offset -= stride) + { + const int t = index + mem_offset; + + const half2 over_th = __hsub2(h_seq[t], v_threshold_half2); + + // start: spikingjelly.clock_driven.surrogate.QPseudoSpike.cuda_code + + const half2 sg_QPseudoSpike_alpha = __float2half2_rn(2.0f); + const half2 sg_QPseudoSpike_base = __hadd2(__float2half2_rn(1.0f), __h2div(__hmul2(__float2half2_rn(2.0f), __habs2(over_th)), __hsub2(sg_QPseudoSpike_alpha, __float2half2_rn(1.0f)))); + const half2 grad_s_to_h = h2exp2(__hmul2(h2log2(sg_QPseudoSpike_base), __hneg2(sg_QPseudoSpike_alpha))); // Replace power with combination of log and exp, since CUDA has no power function for FP16. 
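+                // The grad_v_to_h below keeps the reset's dependence on h (detach_reset=False):
+                // dv/dh = 1 - spike + (v_reset - h) * ds/dh, whereas the detach_reset=True kernels
+                // above use dv/dh = 1 - spike.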
+ + // end: spikingjelly.clock_driven.surrogate.QPseudoSpike.cuda_code + + const half2 grad_v_to_h = __hfma2(__hsub2(v_reset_half2, h_seq[t]), grad_s_to_h, __hsub2(__float2half2_rn(1.0f), spike_seq[t])); + + grad_h = __hfma2(__hfma2(grad_h, one_sub_reciprocal_tau_half2, grad_v_seq[t]), grad_v_to_h, __hmul2(grad_spike_seq[t], grad_s_to_h)); + + grad_x_seq[t] = __hmul2(grad_h, reciprocal_tau_half2); + + } + grad_v_last[index] = __hmul2(grad_h, one_sub_reciprocal_tau_half2); + } + } + +// MultiStepLIFNodePTT fptt QPseudoSpike, decay_input=False, hard_reset=True, dtype=fp16 + + #include + extern "C" __global__ + void LIFNode_fptt_decayInputFalse_hardReset_fp16(const half2* x_seq, half2* v_v_seq, half2* h_seq, half2* spike_seq, + const half & reciprocal_tau, + const half & v_threshold, const half & v_reset, + const int & neuron_num, const int & numel) + + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + const int stride = neuron_num >> 1; + if (index < stride) + { + const int numel_2 = numel >> 1; + const half2 reciprocal_tau_half2 = __half2half2(reciprocal_tau); + const half2 v_threshold_half2 = __half2half2(v_threshold); + + const half2 v_reset_half2 = __half2half2(v_reset); + + for(int mem_offset = 0; mem_offset < numel_2; mem_offset += stride) + { + const int t = index + mem_offset; + + // h_seq[t] = v_v_seq[t] - reciprocal_tau * (v_v_seq[t] - v_reset) + x_seq[t]; + // = reciprocal_tau * (v_reset - v_v_seq[t]) + v_v_seq[t] + x_seq[t]; + h_seq[t] = __hadd2(__hfma2(__hsub2(v_reset_half2, v_v_seq[t]), reciprocal_tau_half2, v_v_seq[t]), x_seq[t]); + + spike_seq[t] = __hgeu2(h_seq[t], v_threshold_half2); + v_v_seq[t + stride] = __hadd2(__hmul2(spike_seq[t], v_reset_half2), __hmul2(__hsub2(__float2half2_rn(1.0f), spike_seq[t]), h_seq[t])); + + } + } + } + +// MultiStepLIFNodePTT bptt QPseudoSpike, decay_input=False, hard_reset=True, dtype=fp16, detach_reset=True + + #include + extern "C" __global__ + void LIFNode_bptt_decayInputFalse_hardReset_detachReset_fp16( + const half2* grad_spike_seq, const half2* grad_v_seq, const half2* h_seq, const half2* spike_seq, + half2* grad_x_seq, half2* grad_v_last, + const half & reciprocal_tau, const half & one_sub_reciprocal_tau, + const half & v_threshold, const half & v_reset, + const int & neuron_num, const int & numel) + + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + const int stride = neuron_num >> 1; + if (index < stride) + { + const half2 reciprocal_tau_half2 = __half2half2(reciprocal_tau); + const half2 one_sub_reciprocal_tau_half2 = __half2half2(one_sub_reciprocal_tau); + const half2 v_threshold_half2 = __half2half2(v_threshold); + + const half2 v_reset_half2 = __half2half2(v_reset); + + half2 grad_h = __float2half2_rn(0.0f); // grad_h will be used recursively + for(int mem_offset = (numel >> 1) - stride; mem_offset >= 0; mem_offset -= stride) + { + const int t = index + mem_offset; + + const half2 over_th = __hsub2(h_seq[t], v_threshold_half2); + + // start: spikingjelly.clock_driven.surrogate.QPseudoSpike.cuda_code + + const half2 sg_QPseudoSpike_alpha = __float2half2_rn(2.0f); + const half2 sg_QPseudoSpike_base = __hadd2(__float2half2_rn(1.0f), __h2div(__hmul2(__float2half2_rn(2.0f), __habs2(over_th)), __hsub2(sg_QPseudoSpike_alpha, __float2half2_rn(1.0f)))); + const half2 grad_s_to_h = h2exp2(__hmul2(h2log2(sg_QPseudoSpike_base), __hneg2(sg_QPseudoSpike_alpha))); // Replace power with combination of log and exp, since CUDA has no power function for FP16. 
+ + // end: spikingjelly.clock_driven.surrogate.QPseudoSpike.cuda_code + + const half2 grad_v_to_h = __hsub2(__float2half2_rn(1.0f), spike_seq[t]); + + grad_h = __hfma2(__hfma2(grad_h, one_sub_reciprocal_tau_half2, grad_v_seq[t]), grad_v_to_h, __hmul2(grad_spike_seq[t], grad_s_to_h)); + + grad_x_seq[t] = grad_h; + + } + grad_v_last[index] = __hmul2(grad_h, one_sub_reciprocal_tau_half2); + } + } + +// MultiStepLIFNodePTT bptt QPseudoSpike, decay_input=False, hard_reset=True, dtype=fp16, detach_reset=False + + #include + extern "C" __global__ + void LIFNode_bptt_decayInputFalse_hardReset__fp16( + const half2* grad_spike_seq, const half2* grad_v_seq, const half2* h_seq, const half2* spike_seq, + half2* grad_x_seq, half2* grad_v_last, + const half & reciprocal_tau, const half & one_sub_reciprocal_tau, + const half & v_threshold, const half & v_reset, + const int & neuron_num, const int & numel) + + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + const int stride = neuron_num >> 1; + if (index < stride) + { + const half2 reciprocal_tau_half2 = __half2half2(reciprocal_tau); + const half2 one_sub_reciprocal_tau_half2 = __half2half2(one_sub_reciprocal_tau); + const half2 v_threshold_half2 = __half2half2(v_threshold); + + const half2 v_reset_half2 = __half2half2(v_reset); + + half2 grad_h = __float2half2_rn(0.0f); // grad_h will be used recursively + for(int mem_offset = (numel >> 1) - stride; mem_offset >= 0; mem_offset -= stride) + { + const int t = index + mem_offset; + + const half2 over_th = __hsub2(h_seq[t], v_threshold_half2); + + // start: spikingjelly.clock_driven.surrogate.QPseudoSpike.cuda_code + + const half2 sg_QPseudoSpike_alpha = __float2half2_rn(2.0f); + const half2 sg_QPseudoSpike_base = __hadd2(__float2half2_rn(1.0f), __h2div(__hmul2(__float2half2_rn(2.0f), __habs2(over_th)), __hsub2(sg_QPseudoSpike_alpha, __float2half2_rn(1.0f)))); + const half2 grad_s_to_h = h2exp2(__hmul2(h2log2(sg_QPseudoSpike_base), __hneg2(sg_QPseudoSpike_alpha))); // Replace power with combination of log and exp, since CUDA has no power function for FP16. 
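+                // With decay_input=False the input enters h directly (dh/dx = 1), so grad_x_seq[t]
+                // below is simply grad_h, without the 1/tau factor applied in the decay_input=True kernels.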
+ + // end: spikingjelly.clock_driven.surrogate.QPseudoSpike.cuda_code + + const half2 grad_v_to_h = __hfma2(__hsub2(v_reset_half2, h_seq[t]), grad_s_to_h, __hsub2(__float2half2_rn(1.0f), spike_seq[t])); + + grad_h = __hfma2(__hfma2(grad_h, one_sub_reciprocal_tau_half2, grad_v_seq[t]), grad_v_to_h, __hmul2(grad_spike_seq[t], grad_s_to_h)); + + grad_x_seq[t] = grad_h; + + } + grad_v_last[index] = __hmul2(grad_h, one_sub_reciprocal_tau_half2); + } + } + +// MultiStepLIFNodePTT fptt QPseudoSpike, decay_input=True, hard_reset=False, dtype=fp32 + + extern "C" __global__ + void LIFNode_fptt_decayInputTrue_softReset_fp32(const float* x_seq, float* v_v_seq, float* h_seq, float* spike_seq, + const float & reciprocal_tau, + const float & v_threshold, + const int & neuron_num, const int & numel) + + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index < neuron_num) + { + const int dt = neuron_num; + for(int mem_offset = 0; mem_offset < numel; mem_offset += neuron_num) + { + const int t = index + mem_offset; + + h_seq[t] = v_v_seq[t] + reciprocal_tau * (x_seq[t] - v_v_seq[t]); + + if (h_seq[t] >= v_threshold) + { + spike_seq[t] = 1.0f; + v_v_seq[t + dt] = h_seq[t] - v_threshold; + } + + else + { + spike_seq[t] = 0.0f; + v_v_seq[t + dt] = h_seq[t]; + } + + } + } + } + +// MultiStepLIFNodePTT bptt QPseudoSpike, decay_input=True, hard_reset=False, dtype=fp32, detach_reset=True + + extern "C" __global__ + void LIFNode_bptt_decayInputTrue_softReset_detachReset_fp32( + const float* grad_spike_seq, const float* grad_v_seq, const float* h_seq, const float* spike_seq, + float* grad_x_seq, float* grad_v_last, + const float & reciprocal_tau, const float & one_sub_reciprocal_tau, + const float & v_threshold, + const int & neuron_num, const int & numel) + + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index < neuron_num) + { + float grad_h = 0.0f; // grad_h will be used recursively + for(int mem_offset = numel - neuron_num; mem_offset >= 0; mem_offset -= neuron_num) + { + const int t = index + mem_offset; + const float over_th = h_seq[t] - v_threshold; + + // start: spikingjelly.clock_driven.surrogate.QPseudoSpike.cuda_code + + const float sg_QPseudoSpike_base = 1.0f + 2.0f / (2.0f - 1.0f) * fabsf(over_th); + const float grad_s_to_h = powf(sg_QPseudoSpike_base, -2.0f); + + // end: spikingjelly.clock_driven.surrogate.QPseudoSpike.cuda_code + + const float grad_v_to_h = 1.0f; + + grad_h = grad_spike_seq[t] * grad_s_to_h + (grad_v_seq[t] + grad_h * one_sub_reciprocal_tau) * grad_v_to_h; + // grad_h = fmaf(grad_spike_seq[t], grad_s_to_h, fmaf(grad_h, one_sub_reciprocal_tau, grad_v_seq[t]) * grad_v_to_h); + + grad_x_seq[t] = grad_h * reciprocal_tau; + + } + grad_v_last[index] = grad_h * one_sub_reciprocal_tau; + } + } + +// MultiStepLIFNodePTT bptt QPseudoSpike, decay_input=True, hard_reset=False, dtype=fp32, detach_reset=False + + extern "C" __global__ + void LIFNode_bptt_decayInputTrue_softReset__fp32( + const float* grad_spike_seq, const float* grad_v_seq, const float* h_seq, const float* spike_seq, + float* grad_x_seq, float* grad_v_last, + const float & reciprocal_tau, const float & one_sub_reciprocal_tau, + const float & v_threshold, + const int & neuron_num, const int & numel) + + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index < neuron_num) + { + float grad_h = 0.0f; // grad_h will be used recursively + for(int mem_offset = numel - neuron_num; mem_offset >= 0; mem_offset -= neuron_num) + { + const int t = index + mem_offset; + const float 
over_th = h_seq[t] - v_threshold; + + // start: spikingjelly.clock_driven.surrogate.QPseudoSpike.cuda_code + + const float sg_QPseudoSpike_base = 1.0f + 2.0f / (2.0f - 1.0f) * fabsf(over_th); + const float grad_s_to_h = powf(sg_QPseudoSpike_base, -2.0f); + + // end: spikingjelly.clock_driven.surrogate.QPseudoSpike.cuda_code + + const float grad_v_to_h = 1.0f - v_threshold * grad_s_to_h; + // const float grad_v_to_h = fmaf(-grad_s_to_h, v_threshold, 1.0f); + + grad_h = grad_spike_seq[t] * grad_s_to_h + (grad_v_seq[t] + grad_h * one_sub_reciprocal_tau) * grad_v_to_h; + // grad_h = fmaf(grad_spike_seq[t], grad_s_to_h, fmaf(grad_h, one_sub_reciprocal_tau, grad_v_seq[t]) * grad_v_to_h); + + grad_x_seq[t] = grad_h * reciprocal_tau; + + } + grad_v_last[index] = grad_h * one_sub_reciprocal_tau; + } + } + +// MultiStepLIFNodePTT fptt QPseudoSpike, decay_input=False, hard_reset=False, dtype=fp32 + + extern "C" __global__ + void LIFNode_fptt_decayInputFalse_softReset_fp32(const float* x_seq, float* v_v_seq, float* h_seq, float* spike_seq, + const float & reciprocal_tau, + const float & v_threshold, + const int & neuron_num, const int & numel) + + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index < neuron_num) + { + const int dt = neuron_num; + for(int mem_offset = 0; mem_offset < numel; mem_offset += neuron_num) + { + const int t = index + mem_offset; + + h_seq[t] = v_v_seq[t] * (1.0f - reciprocal_tau) + x_seq[t]; + + if (h_seq[t] >= v_threshold) + { + spike_seq[t] = 1.0f; + v_v_seq[t + dt] = h_seq[t] - v_threshold; + } + + else + { + spike_seq[t] = 0.0f; + v_v_seq[t + dt] = h_seq[t]; + } + + } + } + } + +// MultiStepLIFNodePTT bptt QPseudoSpike, decay_input=False, hard_reset=False, dtype=fp32, detach_reset=True + + extern "C" __global__ + void LIFNode_bptt_decayInputFalse_softReset_detachReset_fp32( + const float* grad_spike_seq, const float* grad_v_seq, const float* h_seq, const float* spike_seq, + float* grad_x_seq, float* grad_v_last, + const float & reciprocal_tau, const float & one_sub_reciprocal_tau, + const float & v_threshold, + const int & neuron_num, const int & numel) + + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index < neuron_num) + { + float grad_h = 0.0f; // grad_h will be used recursively + for(int mem_offset = numel - neuron_num; mem_offset >= 0; mem_offset -= neuron_num) + { + const int t = index + mem_offset; + const float over_th = h_seq[t] - v_threshold; + + // start: spikingjelly.clock_driven.surrogate.QPseudoSpike.cuda_code + + const float sg_QPseudoSpike_base = 1.0f + 2.0f / (2.0f - 1.0f) * fabsf(over_th); + const float grad_s_to_h = powf(sg_QPseudoSpike_base, -2.0f); + + // end: spikingjelly.clock_driven.surrogate.QPseudoSpike.cuda_code + + const float grad_v_to_h = 1.0f; + + grad_h = grad_spike_seq[t] * grad_s_to_h + (grad_v_seq[t] + grad_h * one_sub_reciprocal_tau) * grad_v_to_h; + // grad_h = fmaf(grad_spike_seq[t], grad_s_to_h, fmaf(grad_h, one_sub_reciprocal_tau, grad_v_seq[t]) * grad_v_to_h); + + grad_x_seq[t] = grad_h; + + } + grad_v_last[index] = grad_h * one_sub_reciprocal_tau; + } + } + +// MultiStepLIFNodePTT bptt QPseudoSpike, decay_input=False, hard_reset=False, dtype=fp32, detach_reset=False + + extern "C" __global__ + void LIFNode_bptt_decayInputFalse_softReset__fp32( + const float* grad_spike_seq, const float* grad_v_seq, const float* h_seq, const float* spike_seq, + float* grad_x_seq, float* grad_v_last, + const float & reciprocal_tau, const float & one_sub_reciprocal_tau, + const float & v_threshold, + const 
int & neuron_num, const int & numel) + + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index < neuron_num) + { + float grad_h = 0.0f; // grad_h will be used recursively + for(int mem_offset = numel - neuron_num; mem_offset >= 0; mem_offset -= neuron_num) + { + const int t = index + mem_offset; + const float over_th = h_seq[t] - v_threshold; + + // start: spikingjelly.clock_driven.surrogate.QPseudoSpike.cuda_code + + const float sg_QPseudoSpike_base = 1.0f + 2.0f / (2.0f - 1.0f) * fabsf(over_th); + const float grad_s_to_h = powf(sg_QPseudoSpike_base, -2.0f); + + // end: spikingjelly.clock_driven.surrogate.QPseudoSpike.cuda_code + + const float grad_v_to_h = 1.0f - v_threshold * grad_s_to_h; + // const float grad_v_to_h = fmaf(-grad_s_to_h, v_threshold, 1.0f); + + grad_h = grad_spike_seq[t] * grad_s_to_h + (grad_v_seq[t] + grad_h * one_sub_reciprocal_tau) * grad_v_to_h; + // grad_h = fmaf(grad_spike_seq[t], grad_s_to_h, fmaf(grad_h, one_sub_reciprocal_tau, grad_v_seq[t]) * grad_v_to_h); + + grad_x_seq[t] = grad_h; + + } + grad_v_last[index] = grad_h * one_sub_reciprocal_tau; + } + } + +// MultiStepLIFNodePTT fptt QPseudoSpike, decay_input=True, hard_reset=False, dtype=fp16 + + #include + extern "C" __global__ + void LIFNode_fptt_decayInputTrue_softReset_fp16(const half2* x_seq, half2* v_v_seq, half2* h_seq, half2* spike_seq, + const half & reciprocal_tau, + const half & v_threshold, + const int & neuron_num, const int & numel) + + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + const int stride = neuron_num >> 1; + if (index < stride) + { + const int numel_2 = numel >> 1; + const half2 reciprocal_tau_half2 = __half2half2(reciprocal_tau); + const half2 v_threshold_half2 = __half2half2(v_threshold); + + for(int mem_offset = 0; mem_offset < numel_2; mem_offset += stride) + { + const int t = index + mem_offset; + + h_seq[t] = __hfma2(__hsub2(x_seq[t], v_v_seq[t]), reciprocal_tau_half2, v_v_seq[t]); + + spike_seq[t] = __hgeu2(h_seq[t], v_threshold_half2); + v_v_seq[t + stride] = __hadd2(__hmul2(spike_seq[t], __hsub2(h_seq[t], v_threshold_half2)), __hmul2(__hsub2(__float2half2_rn(1.0f), spike_seq[t]), h_seq[t])); + + } + } + } + +// MultiStepLIFNodePTT bptt QPseudoSpike, decay_input=True, hard_reset=False, dtype=fp16, detach_reset=True + + #include + extern "C" __global__ + void LIFNode_bptt_decayInputTrue_softReset_detachReset_fp16( + const half2* grad_spike_seq, const half2* grad_v_seq, const half2* h_seq, const half2* spike_seq, + half2* grad_x_seq, half2* grad_v_last, + const half & reciprocal_tau, const half & one_sub_reciprocal_tau, + const half & v_threshold, + const int & neuron_num, const int & numel) + + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + const int stride = neuron_num >> 1; + if (index < stride) + { + const half2 reciprocal_tau_half2 = __half2half2(reciprocal_tau); + const half2 one_sub_reciprocal_tau_half2 = __half2half2(one_sub_reciprocal_tau); + const half2 v_threshold_half2 = __half2half2(v_threshold); + + half2 grad_h = __float2half2_rn(0.0f); // grad_h will be used recursively + for(int mem_offset = (numel >> 1) - stride; mem_offset >= 0; mem_offset -= stride) + { + const int t = index + mem_offset; + + const half2 over_th = __hsub2(h_seq[t], v_threshold_half2); + + // start: spikingjelly.clock_driven.surrogate.QPseudoSpike.cuda_code + + const half2 sg_QPseudoSpike_alpha = __float2half2_rn(2.0f); + const half2 sg_QPseudoSpike_base = __hadd2(__float2half2_rn(1.0f), __h2div(__hmul2(__float2half2_rn(2.0f), 
__habs2(over_th)), __hsub2(sg_QPseudoSpike_alpha, __float2half2_rn(1.0f)))); + const half2 grad_s_to_h = h2exp2(__hmul2(h2log2(sg_QPseudoSpike_base), __hneg2(sg_QPseudoSpike_alpha))); // Replace power with combination of log and exp, since CUDA has no power function for FP16. + + // end: spikingjelly.clock_driven.surrogate.QPseudoSpike.cuda_code + + const half2 grad_v_to_h = __float2half2_rn(1.0f); + + grad_h = __hfma2(__hfma2(grad_h, one_sub_reciprocal_tau_half2, grad_v_seq[t]), grad_v_to_h, __hmul2(grad_spike_seq[t], grad_s_to_h)); + + grad_x_seq[t] = __hmul2(grad_h, reciprocal_tau_half2); + + } + grad_v_last[index] = __hmul2(grad_h, one_sub_reciprocal_tau_half2); + } + } + +// MultiStepLIFNodePTT bptt QPseudoSpike, decay_input=True, hard_reset=False, dtype=fp16, detach_reset=False + + #include + extern "C" __global__ + void LIFNode_bptt_decayInputTrue_softReset__fp16( + const half2* grad_spike_seq, const half2* grad_v_seq, const half2* h_seq, const half2* spike_seq, + half2* grad_x_seq, half2* grad_v_last, + const half & reciprocal_tau, const half & one_sub_reciprocal_tau, + const half & v_threshold, + const int & neuron_num, const int & numel) + + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + const int stride = neuron_num >> 1; + if (index < stride) + { + const half2 reciprocal_tau_half2 = __half2half2(reciprocal_tau); + const half2 one_sub_reciprocal_tau_half2 = __half2half2(one_sub_reciprocal_tau); + const half2 v_threshold_half2 = __half2half2(v_threshold); + + half2 grad_h = __float2half2_rn(0.0f); // grad_h will be used recursively + for(int mem_offset = (numel >> 1) - stride; mem_offset >= 0; mem_offset -= stride) + { + const int t = index + mem_offset; + + const half2 over_th = __hsub2(h_seq[t], v_threshold_half2); + + // start: spikingjelly.clock_driven.surrogate.QPseudoSpike.cuda_code + + const half2 sg_QPseudoSpike_alpha = __float2half2_rn(2.0f); + const half2 sg_QPseudoSpike_base = __hadd2(__float2half2_rn(1.0f), __h2div(__hmul2(__float2half2_rn(2.0f), __habs2(over_th)), __hsub2(sg_QPseudoSpike_alpha, __float2half2_rn(1.0f)))); + const half2 grad_s_to_h = h2exp2(__hmul2(h2log2(sg_QPseudoSpike_base), __hneg2(sg_QPseudoSpike_alpha))); // Replace power with combination of log and exp, since CUDA has no power function for FP16. 
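+                // i.e. the QPseudoSpike surrogate derivative (1 + 2*|over_th|/(alpha - 1))^(-alpha) with alpha = 2, evaluated in half2 via h2log2/h2exp2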
+ + // end: spikingjelly.clock_driven.surrogate.QPseudoSpike.cuda_code + + const half2 grad_v_to_h = __hsub2(__float2half2_rn(1.0f), __hmul2(v_threshold_half2, grad_s_to_h)); + + grad_h = __hfma2(__hfma2(grad_h, one_sub_reciprocal_tau_half2, grad_v_seq[t]), grad_v_to_h, __hmul2(grad_spike_seq[t], grad_s_to_h)); + + grad_x_seq[t] = __hmul2(grad_h, reciprocal_tau_half2); + + } + grad_v_last[index] = __hmul2(grad_h, one_sub_reciprocal_tau_half2); + } + } + +// MultiStepLIFNodePTT fptt QPseudoSpike, decay_input=False, hard_reset=False, dtype=fp16 + + #include + extern "C" __global__ + void LIFNode_fptt_decayInputFalse_softReset_fp16(const half2* x_seq, half2* v_v_seq, half2* h_seq, half2* spike_seq, + const half & reciprocal_tau, + const half & v_threshold, + const int & neuron_num, const int & numel) + + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + const int stride = neuron_num >> 1; + if (index < stride) + { + const int numel_2 = numel >> 1; + const half2 reciprocal_tau_half2 = __half2half2(reciprocal_tau); + const half2 v_threshold_half2 = __half2half2(v_threshold); + + for(int mem_offset = 0; mem_offset < numel_2; mem_offset += stride) + { + const int t = index + mem_offset; + + // h_seq[t] = v_v_seq[t] * (1.0f - reciprocal_tau) + x_seq[t]; + h_seq[t] = __hfma2(__hsub2(__float2half2_rn(1.0f), reciprocal_tau_half2), v_v_seq[t], x_seq[t]); + + spike_seq[t] = __hgeu2(h_seq[t], v_threshold_half2); + v_v_seq[t + stride] = __hadd2(__hmul2(spike_seq[t], __hsub2(h_seq[t], v_threshold_half2)), __hmul2(__hsub2(__float2half2_rn(1.0f), spike_seq[t]), h_seq[t])); + + } + } + } + +// MultiStepLIFNodePTT bptt QPseudoSpike, decay_input=False, hard_reset=False, dtype=fp16, detach_reset=True + + #include + extern "C" __global__ + void LIFNode_bptt_decayInputFalse_softReset_detachReset_fp16( + const half2* grad_spike_seq, const half2* grad_v_seq, const half2* h_seq, const half2* spike_seq, + half2* grad_x_seq, half2* grad_v_last, + const half & reciprocal_tau, const half & one_sub_reciprocal_tau, + const half & v_threshold, + const int & neuron_num, const int & numel) + + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + const int stride = neuron_num >> 1; + if (index < stride) + { + const half2 reciprocal_tau_half2 = __half2half2(reciprocal_tau); + const half2 one_sub_reciprocal_tau_half2 = __half2half2(one_sub_reciprocal_tau); + const half2 v_threshold_half2 = __half2half2(v_threshold); + + half2 grad_h = __float2half2_rn(0.0f); // grad_h will be used recursively + for(int mem_offset = (numel >> 1) - stride; mem_offset >= 0; mem_offset -= stride) + { + const int t = index + mem_offset; + + const half2 over_th = __hsub2(h_seq[t], v_threshold_half2); + + // start: spikingjelly.clock_driven.surrogate.QPseudoSpike.cuda_code + + const half2 sg_QPseudoSpike_alpha = __float2half2_rn(2.0f); + const half2 sg_QPseudoSpike_base = __hadd2(__float2half2_rn(1.0f), __h2div(__hmul2(__float2half2_rn(2.0f), __habs2(over_th)), __hsub2(sg_QPseudoSpike_alpha, __float2half2_rn(1.0f)))); + const half2 grad_s_to_h = h2exp2(__hmul2(h2log2(sg_QPseudoSpike_base), __hneg2(sg_QPseudoSpike_alpha))); // Replace power with combination of log and exp, since CUDA has no power function for FP16. 
+ + // end: spikingjelly.clock_driven.surrogate.QPseudoSpike.cuda_code + + const half2 grad_v_to_h = __float2half2_rn(1.0f); + + grad_h = __hfma2(__hfma2(grad_h, one_sub_reciprocal_tau_half2, grad_v_seq[t]), grad_v_to_h, __hmul2(grad_spike_seq[t], grad_s_to_h)); + + grad_x_seq[t] = grad_h; + + } + grad_v_last[index] = __hmul2(grad_h, one_sub_reciprocal_tau_half2); + } + } + +// MultiStepLIFNodePTT bptt QPseudoSpike, decay_input=False, hard_reset=False, dtype=fp16, detach_reset=False + + #include + extern "C" __global__ + void LIFNode_bptt_decayInputFalse_softReset__fp16( + const half2* grad_spike_seq, const half2* grad_v_seq, const half2* h_seq, const half2* spike_seq, + half2* grad_x_seq, half2* grad_v_last, + const half & reciprocal_tau, const half & one_sub_reciprocal_tau, + const half & v_threshold, + const int & neuron_num, const int & numel) + + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + const int stride = neuron_num >> 1; + if (index < stride) + { + const half2 reciprocal_tau_half2 = __half2half2(reciprocal_tau); + const half2 one_sub_reciprocal_tau_half2 = __half2half2(one_sub_reciprocal_tau); + const half2 v_threshold_half2 = __half2half2(v_threshold); + + half2 grad_h = __float2half2_rn(0.0f); // grad_h will be used recursively + for(int mem_offset = (numel >> 1) - stride; mem_offset >= 0; mem_offset -= stride) + { + const int t = index + mem_offset; + + const half2 over_th = __hsub2(h_seq[t], v_threshold_half2); + + // start: spikingjelly.clock_driven.surrogate.QPseudoSpike.cuda_code + + const half2 sg_QPseudoSpike_alpha = __float2half2_rn(2.0f); + const half2 sg_QPseudoSpike_base = __hadd2(__float2half2_rn(1.0f), __h2div(__hmul2(__float2half2_rn(2.0f), __habs2(over_th)), __hsub2(sg_QPseudoSpike_alpha, __float2half2_rn(1.0f)))); + const half2 grad_s_to_h = h2exp2(__hmul2(h2log2(sg_QPseudoSpike_base), __hneg2(sg_QPseudoSpike_alpha))); // Replace power with combination of log and exp, since CUDA has no power function for FP16. 
+ + // end: spikingjelly.clock_driven.surrogate.QPseudoSpike.cuda_code + + const half2 grad_v_to_h = __hsub2(__float2half2_rn(1.0f), __hmul2(v_threshold_half2, grad_s_to_h)); + + grad_h = __hfma2(__hfma2(grad_h, one_sub_reciprocal_tau_half2, grad_v_seq[t]), grad_v_to_h, __hmul2(grad_spike_seq[t], grad_s_to_h)); + + grad_x_seq[t] = grad_h; + + } + grad_v_last[index] = __hmul2(grad_h, one_sub_reciprocal_tau_half2); + } + } + +// MultiStepParametricLIFNodePTT + +// MultiStepParametricLIFNodePTT fptt ATan, decay_input=True, hard_reset=True, dtype=fp32 + + extern "C" __global__ + void ParametricLIFNode_fptt_decayInputTrue_hardReset_fp32(const float* x_seq, float* v_v_seq, float* h_seq, float* spike_seq, + const float & reciprocal_tau, + const float & v_threshold, const float & v_reset, + const int & neuron_num, const int & numel) + + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index < neuron_num) + { + const int dt = neuron_num; + for(int mem_offset = 0; mem_offset < numel; mem_offset += neuron_num) + { + const int t = index + mem_offset; + + h_seq[t] = v_v_seq[t] + reciprocal_tau * (x_seq[t] - v_v_seq[t] + v_reset); + + if (h_seq[t] >= v_threshold) + { + spike_seq[t] = 1.0f; + v_v_seq[t + dt] = v_reset; + } + + else + { + spike_seq[t] = 0.0f; + v_v_seq[t + dt] = h_seq[t]; + } + + } + } + } + +// MultiStepParametricLIFNodePTT bptt ATan, decay_input=True, hard_reset=True, dtype=fp32, detach_reset=True + + extern "C" __global__ + void ParametricLIFNode_bptt_decayInputTrue_hardReset_detachReset_fp32( + const float* grad_spike_seq, const float* grad_v_seq, const float* h_seq, const float* spike_seq, const float* v_v_seq, + float* grad_x_seq, float* grad_v_last, float* grad_reciprocal_tau, + const float & reciprocal_tau, const float & one_sub_reciprocal_tau, + const float & v_threshold, const float & v_reset, + const int & neuron_num, const int & numel) + + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + __shared__ float sdata[1024]; + if (index < neuron_num) + { + float grad_h = 0.0f; // grad_h will be used recursively + sdata[threadIdx.x] = 0.0f; + for(int mem_offset = numel - neuron_num; mem_offset >= 0; mem_offset -= neuron_num) + { + const int t = index + mem_offset; + const float over_th = h_seq[t] - v_threshold; + + // start: spikingjelly.clock_driven.surrogate.ATan.cuda_code + + const float sg_ATan_M_PI_2__alpha__x = ((float) 1.57079632679489661923) * 2.0f * over_th; + const float grad_s_to_h = 2.0f / 2.0f / (1.0f + sg_ATan_M_PI_2__alpha__x * sg_ATan_M_PI_2__alpha__x); + + // end: spikingjelly.clock_driven.surrogate.ATan.cuda_code + + const float grad_v_to_h = 1.0f - spike_seq[t]; + + grad_h = grad_spike_seq[t] * grad_s_to_h + (grad_v_seq[t] + grad_h * one_sub_reciprocal_tau) * grad_v_to_h; + // grad_h = fmaf(grad_spike_seq[t], grad_s_to_h, fmaf(grad_h, one_sub_reciprocal_tau, grad_v_seq[t]) * grad_v_to_h); + + grad_x_seq[t] = grad_h * reciprocal_tau; + sdata[threadIdx.x] += grad_h * (h_seq[t] - v_v_seq[t]) / reciprocal_tau; + + } + grad_v_last[index] = grad_h * one_sub_reciprocal_tau; + } + else + { + sdata[threadIdx.x] = 0.0f; + } + int threadx = blockDim.x; + #pragma unroll + for (int stride = threadx >> 1; stride > 0; stride = stride >> 1) + { + // Synchronize all thread before next loop + __syncthreads(); + if (threadIdx.x < stride) + { + sdata[threadIdx.x] += sdata[threadIdx.x + stride]; + } + } + __syncthreads(); + if (threadIdx.x == 0) + { + atomicAdd(grad_reciprocal_tau, sdata[0]); + } + } + +// MultiStepParametricLIFNodePTT bptt ATan, 
decay_input=True, hard_reset=True, dtype=fp32, detach_reset=False + + extern "C" __global__ + void ParametricLIFNode_bptt_decayInputTrue_hardReset__fp32( + const float* grad_spike_seq, const float* grad_v_seq, const float* h_seq, const float* spike_seq, const float* v_v_seq, + float* grad_x_seq, float* grad_v_last, float* grad_reciprocal_tau, + const float & reciprocal_tau, const float & one_sub_reciprocal_tau, + const float & v_threshold, const float & v_reset, + const int & neuron_num, const int & numel) + + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + __shared__ float sdata[1024]; + if (index < neuron_num) + { + float grad_h = 0.0f; // grad_h will be used recursively + sdata[threadIdx.x] = 0.0f; + for(int mem_offset = numel - neuron_num; mem_offset >= 0; mem_offset -= neuron_num) + { + const int t = index + mem_offset; + const float over_th = h_seq[t] - v_threshold; + + // start: spikingjelly.clock_driven.surrogate.ATan.cuda_code + + const float sg_ATan_M_PI_2__alpha__x = ((float) 1.57079632679489661923) * 2.0f * over_th; + const float grad_s_to_h = 2.0f / 2.0f / (1.0f + sg_ATan_M_PI_2__alpha__x * sg_ATan_M_PI_2__alpha__x); + + // end: spikingjelly.clock_driven.surrogate.ATan.cuda_code + + const float grad_v_to_h = 1.0f - spike_seq[t] + (v_reset - h_seq[t]) * grad_s_to_h; + // const float grad_v_to_h = fmaf(v_reset - h_seq[t], grad_s_to_h, 1.0f - spike_seq[t]); + + grad_h = grad_spike_seq[t] * grad_s_to_h + (grad_v_seq[t] + grad_h * one_sub_reciprocal_tau) * grad_v_to_h; + // grad_h = fmaf(grad_spike_seq[t], grad_s_to_h, fmaf(grad_h, one_sub_reciprocal_tau, grad_v_seq[t]) * grad_v_to_h); + + grad_x_seq[t] = grad_h * reciprocal_tau; + sdata[threadIdx.x] += grad_h * (h_seq[t] - v_v_seq[t]) / reciprocal_tau; + + } + grad_v_last[index] = grad_h * one_sub_reciprocal_tau; + } + else + { + sdata[threadIdx.x] = 0.0f; + } + int threadx = blockDim.x; + #pragma unroll + for (int stride = threadx >> 1; stride > 0; stride = stride >> 1) + { + // Synchronize all thread before next loop + __syncthreads(); + if (threadIdx.x < stride) + { + sdata[threadIdx.x] += sdata[threadIdx.x + stride]; + } + } + __syncthreads(); + if (threadIdx.x == 0) + { + atomicAdd(grad_reciprocal_tau, sdata[0]); + } + } + +// MultiStepParametricLIFNodePTT fptt ATan, decay_input=False, hard_reset=True, dtype=fp32 + + extern "C" __global__ + void ParametricLIFNode_fptt_decayInputFalse_hardReset_fp32(const float* x_seq, float* v_v_seq, float* h_seq, float* spike_seq, + const float & reciprocal_tau, + const float & v_threshold, const float & v_reset, + const int & neuron_num, const int & numel) + + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index < neuron_num) + { + const int dt = neuron_num; + for(int mem_offset = 0; mem_offset < numel; mem_offset += neuron_num) + { + const int t = index + mem_offset; + + h_seq[t] = v_v_seq[t] - reciprocal_tau * (v_v_seq[t] - v_reset) + x_seq[t]; + + if (h_seq[t] >= v_threshold) + { + spike_seq[t] = 1.0f; + v_v_seq[t + dt] = v_reset; + } + + else + { + spike_seq[t] = 0.0f; + v_v_seq[t + dt] = h_seq[t]; + } + + } + } + } + +// MultiStepParametricLIFNodePTT bptt ATan, decay_input=False, hard_reset=True, dtype=fp32, detach_reset=True + + extern "C" __global__ + void ParametricLIFNode_bptt_decayInputFalse_hardReset_detachReset_fp32( + const float* grad_spike_seq, const float* grad_v_seq, const float* h_seq, const float* spike_seq, const float* v_v_seq, + float* grad_x_seq, float* grad_v_last, float* grad_reciprocal_tau, + const float & reciprocal_tau, const 
float & one_sub_reciprocal_tau, + const float & v_threshold, const float & v_reset, + const int & neuron_num, const int & numel) + + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + __shared__ float sdata[1024]; + if (index < neuron_num) + { + float grad_h = 0.0f; // grad_h will be used recursively + sdata[threadIdx.x] = 0.0f; + for(int mem_offset = numel - neuron_num; mem_offset >= 0; mem_offset -= neuron_num) + { + const int t = index + mem_offset; + const float over_th = h_seq[t] - v_threshold; + + // start: spikingjelly.clock_driven.surrogate.ATan.cuda_code + + const float sg_ATan_M_PI_2__alpha__x = ((float) 1.57079632679489661923) * 2.0f * over_th; + const float grad_s_to_h = 2.0f / 2.0f / (1.0f + sg_ATan_M_PI_2__alpha__x * sg_ATan_M_PI_2__alpha__x); + + // end: spikingjelly.clock_driven.surrogate.ATan.cuda_code + + const float grad_v_to_h = 1.0f - spike_seq[t]; + + grad_h = grad_spike_seq[t] * grad_s_to_h + (grad_v_seq[t] + grad_h * one_sub_reciprocal_tau) * grad_v_to_h; + // grad_h = fmaf(grad_spike_seq[t], grad_s_to_h, fmaf(grad_h, one_sub_reciprocal_tau, grad_v_seq[t]) * grad_v_to_h); + + grad_x_seq[t] = grad_h; + sdata[threadIdx.x] += grad_h * (v_reset - v_v_seq[t]); + + } + grad_v_last[index] = grad_h * one_sub_reciprocal_tau; + } + else + { + sdata[threadIdx.x] = 0.0f; + } + int threadx = blockDim.x; + #pragma unroll + for (int stride = threadx >> 1; stride > 0; stride = stride >> 1) + { + // Synchronize all thread before next loop + __syncthreads(); + if (threadIdx.x < stride) + { + sdata[threadIdx.x] += sdata[threadIdx.x + stride]; + } + } + __syncthreads(); + if (threadIdx.x == 0) + { + atomicAdd(grad_reciprocal_tau, sdata[0]); + } + } + +// MultiStepParametricLIFNodePTT bptt ATan, decay_input=False, hard_reset=True, dtype=fp32, detach_reset=False + + extern "C" __global__ + void ParametricLIFNode_bptt_decayInputFalse_hardReset__fp32( + const float* grad_spike_seq, const float* grad_v_seq, const float* h_seq, const float* spike_seq, const float* v_v_seq, + float* grad_x_seq, float* grad_v_last, float* grad_reciprocal_tau, + const float & reciprocal_tau, const float & one_sub_reciprocal_tau, + const float & v_threshold, const float & v_reset, + const int & neuron_num, const int & numel) + + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + __shared__ float sdata[1024]; + if (index < neuron_num) + { + float grad_h = 0.0f; // grad_h will be used recursively + sdata[threadIdx.x] = 0.0f; + for(int mem_offset = numel - neuron_num; mem_offset >= 0; mem_offset -= neuron_num) + { + const int t = index + mem_offset; + const float over_th = h_seq[t] - v_threshold; + + // start: spikingjelly.clock_driven.surrogate.ATan.cuda_code + + const float sg_ATan_M_PI_2__alpha__x = ((float) 1.57079632679489661923) * 2.0f * over_th; + const float grad_s_to_h = 2.0f / 2.0f / (1.0f + sg_ATan_M_PI_2__alpha__x * sg_ATan_M_PI_2__alpha__x); + + // end: spikingjelly.clock_driven.surrogate.ATan.cuda_code + + const float grad_v_to_h = 1.0f - spike_seq[t] + (v_reset - h_seq[t]) * grad_s_to_h; + // const float grad_v_to_h = fmaf(v_reset - h_seq[t], grad_s_to_h, 1.0f - spike_seq[t]); + + grad_h = grad_spike_seq[t] * grad_s_to_h + (grad_v_seq[t] + grad_h * one_sub_reciprocal_tau) * grad_v_to_h; + // grad_h = fmaf(grad_spike_seq[t], grad_s_to_h, fmaf(grad_h, one_sub_reciprocal_tau, grad_v_seq[t]) * grad_v_to_h); + + grad_x_seq[t] = grad_h; + sdata[threadIdx.x] += grad_h * (v_reset - v_v_seq[t]); + + } + grad_v_last[index] = grad_h * one_sub_reciprocal_tau; + } + else + { + 
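+            // threads with index >= neuron_num still take part in the block-wide reduction below, so they must zero their shared-memory slot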
sdata[threadIdx.x] = 0.0f; + } + int threadx = blockDim.x; + #pragma unroll + for (int stride = threadx >> 1; stride > 0; stride = stride >> 1) + { + // Synchronize all thread before next loop + __syncthreads(); + if (threadIdx.x < stride) + { + sdata[threadIdx.x] += sdata[threadIdx.x + stride]; + } + } + __syncthreads(); + if (threadIdx.x == 0) + { + atomicAdd(grad_reciprocal_tau, sdata[0]); + } + } + +// MultiStepParametricLIFNodePTT fptt ATan, decay_input=True, hard_reset=True, dtype=fp16 + + #include + extern "C" __global__ + void ParametricLIFNode_fptt_decayInputTrue_hardReset_fp16(const half2* x_seq, half2* v_v_seq, half2* h_seq, half2* spike_seq, + const half & reciprocal_tau, + const half & v_threshold, const half & v_reset, + const int & neuron_num, const int & numel) + + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + const int stride = neuron_num >> 1; + if (index < stride) + { + const int numel_2 = numel >> 1; + const half2 reciprocal_tau_half2 = __half2half2(reciprocal_tau); + const half2 v_threshold_half2 = __half2half2(v_threshold); + + const half2 v_reset_half2 = __half2half2(v_reset); + + for(int mem_offset = 0; mem_offset < numel_2; mem_offset += stride) + { + const int t = index + mem_offset; + + h_seq[t] = __hfma2(__hadd2(__hsub2(x_seq[t], v_v_seq[t]), v_reset_half2), reciprocal_tau_half2, v_v_seq[t]); + + spike_seq[t] = __hgeu2(h_seq[t], v_threshold_half2); + v_v_seq[t + stride] = __hadd2(__hmul2(spike_seq[t], v_reset_half2), __hmul2(__hsub2(__float2half2_rn(1.0f), spike_seq[t]), h_seq[t])); + + } + } + } + +// MultiStepParametricLIFNodePTT bptt ATan, decay_input=True, hard_reset=True, dtype=fp16, detach_reset=True + + #include + extern "C" __global__ + void ParametricLIFNode_bptt_decayInputTrue_hardReset_detachReset_fp16( + const half2* grad_spike_seq, const half2* grad_v_seq, const half2* h_seq, const half2* spike_seq, const half2* v_v_seq, + half2* grad_x_seq, half2* grad_v_last, float* grad_reciprocal_tau, + const half & reciprocal_tau, const half & one_sub_reciprocal_tau, + const half & v_threshold, const half & v_reset, + const int & neuron_num, const int & numel)\ + // note that grad_reciprocal_tau is float to avoid overflow + + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + const int stride = neuron_num >> 1; + + __shared__ float sdata[1024]; + if (index < stride) + { + const half2 reciprocal_tau_half2 = __half2half2(reciprocal_tau); + const half2 one_sub_reciprocal_tau_half2 = __half2half2(one_sub_reciprocal_tau); + const half2 v_threshold_half2 = __half2half2(v_threshold); + + const half2 v_reset_half2 = __half2half2(v_reset); + + + half2 grad_h = __float2half2_rn(0.0f); // grad_h will be used recursively + sdata[threadIdx.x] = 0.0f; + for(int mem_offset = (numel >> 1) - stride; mem_offset >= 0; mem_offset -= stride) + { + const int t = index + mem_offset; + + const half2 over_th = __hsub2(h_seq[t], v_threshold_half2); + + + // start: spikingjelly.clock_driven.surrogate.ATan.cuda_code + + const half2 sg_ATan_alpha = __float2half2_rn(2.0f); + const half2 sg_ATan_M_PI_2__alpha__x = __hmul2(__hmul2(__float2half2_rn((float) 1.57079632679489661923), sg_ATan_alpha), over_th); + const half2 grad_s_to_h = __h2div(__h2div(sg_ATan_alpha, __float2half2_rn(2.0f)), __hfma2(sg_ATan_M_PI_2__alpha__x, sg_ATan_M_PI_2__alpha__x, __float2half2_rn(1.0f))); + + // end: spikingjelly.clock_driven.surrogate.ATan.cuda_code + + const half2 grad_v_to_h = __hsub2(__float2half2_rn(1.0f), spike_seq[t]); + + grad_h = __hfma2(__hfma2(grad_h, 
one_sub_reciprocal_tau_half2, grad_v_seq[t]), grad_v_to_h, __hmul2(grad_spike_seq[t], grad_s_to_h)); + + grad_x_seq[t] = __hmul2(grad_h, reciprocal_tau_half2); + half2 temp_sum = __h2div(__hmul2(grad_h, __hsub2(h_seq[t], v_v_seq[t])), reciprocal_tau_half2); + sdata[threadIdx.x] += __half2float(__hadd(__low2half(temp_sum), __high2half(temp_sum))); + + } + grad_v_last[index] = __hmul2(grad_h, one_sub_reciprocal_tau_half2); + } + else + { + sdata[threadIdx.x] = 0.0f; + } + int threadx = blockDim.x; + #pragma unroll + for (int i = threadx >> 1; i > 0; i = i >> 1) + { + // Synchronize all thread before next loop + __syncthreads(); + if (threadIdx.x < i) + { + sdata[threadIdx.x] += sdata[threadIdx.x + i]; + } + } + __syncthreads(); + if (threadIdx.x == 0) + { + /* + The 32-bit floating-point version of atomicAdd() is only supported by devices of compute capability 2.x and higher. + + The 64-bit floating-point version of atomicAdd() is only supported by devices of compute capability 6.x and higher. + + The 32-bit __half2 floating-point version of atomicAdd() is only supported by devices of compute capability 6.x and higher. The atomicity of the __half2 or __nv_bfloat162 add operation is guaranteed separately for each of the two __half or __nv_bfloat16 elements; the entire __half2 or __nv_bfloat162 is not guaranteed to be atomic as a single 32-bit access. + + The 16-bit __half floating-point version of atomicAdd() is only supported by devices of compute capability 7.x and higher. + + The 16-bit __nv_bfloat16 floating-point version of atomicAdd() is only supported by devices of compute capability 8.x and higher. + */ + + atomicAdd(grad_reciprocal_tau, sdata[0]); + + } + } + +// MultiStepParametricLIFNodePTT bptt ATan, decay_input=True, hard_reset=True, dtype=fp16, detach_reset=False + + #include + extern "C" __global__ + void ParametricLIFNode_bptt_decayInputTrue_hardReset__fp16( + const half2* grad_spike_seq, const half2* grad_v_seq, const half2* h_seq, const half2* spike_seq, const half2* v_v_seq, + half2* grad_x_seq, half2* grad_v_last, float* grad_reciprocal_tau, + const half & reciprocal_tau, const half & one_sub_reciprocal_tau, + const half & v_threshold, const half & v_reset, + const int & neuron_num, const int & numel)\ + // note that grad_reciprocal_tau is float to avoid overflow + + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + const int stride = neuron_num >> 1; + + __shared__ float sdata[1024]; + if (index < stride) + { + const half2 reciprocal_tau_half2 = __half2half2(reciprocal_tau); + const half2 one_sub_reciprocal_tau_half2 = __half2half2(one_sub_reciprocal_tau); + const half2 v_threshold_half2 = __half2half2(v_threshold); + + const half2 v_reset_half2 = __half2half2(v_reset); + + + half2 grad_h = __float2half2_rn(0.0f); // grad_h will be used recursively + sdata[threadIdx.x] = 0.0f; + for(int mem_offset = (numel >> 1) - stride; mem_offset >= 0; mem_offset -= stride) + { + const int t = index + mem_offset; + + const half2 over_th = __hsub2(h_seq[t], v_threshold_half2); + + + // start: spikingjelly.clock_driven.surrogate.ATan.cuda_code + + const half2 sg_ATan_alpha = __float2half2_rn(2.0f); + const half2 sg_ATan_M_PI_2__alpha__x = __hmul2(__hmul2(__float2half2_rn((float) 1.57079632679489661923), sg_ATan_alpha), over_th); + const half2 grad_s_to_h = __h2div(__h2div(sg_ATan_alpha, __float2half2_rn(2.0f)), __hfma2(sg_ATan_M_PI_2__alpha__x, sg_ATan_M_PI_2__alpha__x, __float2half2_rn(1.0f))); + + // end: spikingjelly.clock_driven.surrogate.ATan.cuda_code + + const half2 
grad_v_to_h = __hfma2(__hsub2(v_reset_half2, h_seq[t]), grad_s_to_h, __hsub2(__float2half2_rn(1.0f), spike_seq[t])); + + grad_h = __hfma2(__hfma2(grad_h, one_sub_reciprocal_tau_half2, grad_v_seq[t]), grad_v_to_h, __hmul2(grad_spike_seq[t], grad_s_to_h)); + + grad_x_seq[t] = __hmul2(grad_h, reciprocal_tau_half2); + half2 temp_sum = __h2div(__hmul2(grad_h, __hsub2(h_seq[t], v_v_seq[t])), reciprocal_tau_half2); + sdata[threadIdx.x] += __half2float(__hadd(__low2half(temp_sum), __high2half(temp_sum))); + + } + grad_v_last[index] = __hmul2(grad_h, one_sub_reciprocal_tau_half2); + } + else + { + sdata[threadIdx.x] = 0.0f; + } + int threadx = blockDim.x; + #pragma unroll + for (int i = threadx >> 1; i > 0; i = i >> 1) + { + // Synchronize all thread before next loop + __syncthreads(); + if (threadIdx.x < i) + { + sdata[threadIdx.x] += sdata[threadIdx.x + i]; + } + } + __syncthreads(); + if (threadIdx.x == 0) + { + /* + The 32-bit floating-point version of atomicAdd() is only supported by devices of compute capability 2.x and higher. + + The 64-bit floating-point version of atomicAdd() is only supported by devices of compute capability 6.x and higher. + + The 32-bit __half2 floating-point version of atomicAdd() is only supported by devices of compute capability 6.x and higher. The atomicity of the __half2 or __nv_bfloat162 add operation is guaranteed separately for each of the two __half or __nv_bfloat16 elements; the entire __half2 or __nv_bfloat162 is not guaranteed to be atomic as a single 32-bit access. + + The 16-bit __half floating-point version of atomicAdd() is only supported by devices of compute capability 7.x and higher. + + The 16-bit __nv_bfloat16 floating-point version of atomicAdd() is only supported by devices of compute capability 8.x and higher. 
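+                Only the 32-bit float atomicAdd is used below: sdata[] holds float partial sums even in this fp16 kernel, so the accumulation into grad_reciprocal_tau works on compute capability 2.x and higher.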
+ */ + + atomicAdd(grad_reciprocal_tau, sdata[0]); + + } + } + +// MultiStepParametricLIFNodePTT fptt ATan, decay_input=False, hard_reset=True, dtype=fp16 + + #include + extern "C" __global__ + void ParametricLIFNode_fptt_decayInputFalse_hardReset_fp16(const half2* x_seq, half2* v_v_seq, half2* h_seq, half2* spike_seq, + const half & reciprocal_tau, + const half & v_threshold, const half & v_reset, + const int & neuron_num, const int & numel) + + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + const int stride = neuron_num >> 1; + if (index < stride) + { + const int numel_2 = numel >> 1; + const half2 reciprocal_tau_half2 = __half2half2(reciprocal_tau); + const half2 v_threshold_half2 = __half2half2(v_threshold); + + const half2 v_reset_half2 = __half2half2(v_reset); + + for(int mem_offset = 0; mem_offset < numel_2; mem_offset += stride) + { + const int t = index + mem_offset; + + // h_seq[t] = v_v_seq[t] - reciprocal_tau * (v_v_seq[t] - v_reset) + x_seq[t]; + // = reciprocal_tau * (v_reset - v_v_seq[t]) + v_v_seq[t] + x_seq[t]; + h_seq[t] = __hadd2(__hfma2(__hsub2(v_reset_half2, v_v_seq[t]), reciprocal_tau_half2, v_v_seq[t]), x_seq[t]); + + spike_seq[t] = __hgeu2(h_seq[t], v_threshold_half2); + v_v_seq[t + stride] = __hadd2(__hmul2(spike_seq[t], v_reset_half2), __hmul2(__hsub2(__float2half2_rn(1.0f), spike_seq[t]), h_seq[t])); + + } + } + } + +// MultiStepParametricLIFNodePTT bptt ATan, decay_input=False, hard_reset=True, dtype=fp16, detach_reset=True + + #include + extern "C" __global__ + void ParametricLIFNode_bptt_decayInputFalse_hardReset_detachReset_fp16( + const half2* grad_spike_seq, const half2* grad_v_seq, const half2* h_seq, const half2* spike_seq, const half2* v_v_seq, + half2* grad_x_seq, half2* grad_v_last, float* grad_reciprocal_tau, + const half & reciprocal_tau, const half & one_sub_reciprocal_tau, + const half & v_threshold, const half & v_reset, + const int & neuron_num, const int & numel)\ + // note that grad_reciprocal_tau is float to avoid overflow + + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + const int stride = neuron_num >> 1; + + __shared__ float sdata[1024]; + if (index < stride) + { + const half2 reciprocal_tau_half2 = __half2half2(reciprocal_tau); + const half2 one_sub_reciprocal_tau_half2 = __half2half2(one_sub_reciprocal_tau); + const half2 v_threshold_half2 = __half2half2(v_threshold); + + const half2 v_reset_half2 = __half2half2(v_reset); + + + half2 grad_h = __float2half2_rn(0.0f); // grad_h will be used recursively + sdata[threadIdx.x] = 0.0f; + for(int mem_offset = (numel >> 1) - stride; mem_offset >= 0; mem_offset -= stride) + { + const int t = index + mem_offset; + + const half2 over_th = __hsub2(h_seq[t], v_threshold_half2); + + + // start: spikingjelly.clock_driven.surrogate.ATan.cuda_code + + const half2 sg_ATan_alpha = __float2half2_rn(2.0f); + const half2 sg_ATan_M_PI_2__alpha__x = __hmul2(__hmul2(__float2half2_rn((float) 1.57079632679489661923), sg_ATan_alpha), over_th); + const half2 grad_s_to_h = __h2div(__h2div(sg_ATan_alpha, __float2half2_rn(2.0f)), __hfma2(sg_ATan_M_PI_2__alpha__x, sg_ATan_M_PI_2__alpha__x, __float2half2_rn(1.0f))); + + // end: spikingjelly.clock_driven.surrogate.ATan.cuda_code + + const half2 grad_v_to_h = __hsub2(__float2half2_rn(1.0f), spike_seq[t]); + + grad_h = __hfma2(__hfma2(grad_h, one_sub_reciprocal_tau_half2, grad_v_seq[t]), grad_v_to_h, __hmul2(grad_spike_seq[t], grad_s_to_h)); + + grad_x_seq[t] = grad_h; + half2 temp_sum = __hmul2(grad_h, __hsub2(v_reset_half2, v_v_seq[t])); + 
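+                // with decay_input=False and hard reset, dh/d(1/tau) = v_reset - v[t], so temp_sum = grad_h * (v_reset - v[t]); its two half lanes are summed into the float accumulator on the next line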
sdata[threadIdx.x] += __half2float(__hadd(__low2half(temp_sum), __high2half(temp_sum))); + + } + grad_v_last[index] = __hmul2(grad_h, one_sub_reciprocal_tau_half2); + } + else + { + sdata[threadIdx.x] = 0.0f; + } + int threadx = blockDim.x; + #pragma unroll + for (int i = threadx >> 1; i > 0; i = i >> 1) + { + // Synchronize all thread before next loop + __syncthreads(); + if (threadIdx.x < i) + { + sdata[threadIdx.x] += sdata[threadIdx.x + i]; + } + } + __syncthreads(); + if (threadIdx.x == 0) + { + /* + The 32-bit floating-point version of atomicAdd() is only supported by devices of compute capability 2.x and higher. + + The 64-bit floating-point version of atomicAdd() is only supported by devices of compute capability 6.x and higher. + + The 32-bit __half2 floating-point version of atomicAdd() is only supported by devices of compute capability 6.x and higher. The atomicity of the __half2 or __nv_bfloat162 add operation is guaranteed separately for each of the two __half or __nv_bfloat16 elements; the entire __half2 or __nv_bfloat162 is not guaranteed to be atomic as a single 32-bit access. + + The 16-bit __half floating-point version of atomicAdd() is only supported by devices of compute capability 7.x and higher. + + The 16-bit __nv_bfloat16 floating-point version of atomicAdd() is only supported by devices of compute capability 8.x and higher. + */ + + atomicAdd(grad_reciprocal_tau, sdata[0]); + + } + } + +// MultiStepParametricLIFNodePTT bptt ATan, decay_input=False, hard_reset=True, dtype=fp16, detach_reset=False + + #include + extern "C" __global__ + void ParametricLIFNode_bptt_decayInputFalse_hardReset__fp16( + const half2* grad_spike_seq, const half2* grad_v_seq, const half2* h_seq, const half2* spike_seq, const half2* v_v_seq, + half2* grad_x_seq, half2* grad_v_last, float* grad_reciprocal_tau, + const half & reciprocal_tau, const half & one_sub_reciprocal_tau, + const half & v_threshold, const half & v_reset, + const int & neuron_num, const int & numel)\ + // note that grad_reciprocal_tau is float to avoid overflow + + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + const int stride = neuron_num >> 1; + + __shared__ float sdata[1024]; + if (index < stride) + { + const half2 reciprocal_tau_half2 = __half2half2(reciprocal_tau); + const half2 one_sub_reciprocal_tau_half2 = __half2half2(one_sub_reciprocal_tau); + const half2 v_threshold_half2 = __half2half2(v_threshold); + + const half2 v_reset_half2 = __half2half2(v_reset); + + + half2 grad_h = __float2half2_rn(0.0f); // grad_h will be used recursively + sdata[threadIdx.x] = 0.0f; + for(int mem_offset = (numel >> 1) - stride; mem_offset >= 0; mem_offset -= stride) + { + const int t = index + mem_offset; + + const half2 over_th = __hsub2(h_seq[t], v_threshold_half2); + + + // start: spikingjelly.clock_driven.surrogate.ATan.cuda_code + + const half2 sg_ATan_alpha = __float2half2_rn(2.0f); + const half2 sg_ATan_M_PI_2__alpha__x = __hmul2(__hmul2(__float2half2_rn((float) 1.57079632679489661923), sg_ATan_alpha), over_th); + const half2 grad_s_to_h = __h2div(__h2div(sg_ATan_alpha, __float2half2_rn(2.0f)), __hfma2(sg_ATan_M_PI_2__alpha__x, sg_ATan_M_PI_2__alpha__x, __float2half2_rn(1.0f))); + + // end: spikingjelly.clock_driven.surrogate.ATan.cuda_code + + const half2 grad_v_to_h = __hfma2(__hsub2(v_reset_half2, h_seq[t]), grad_s_to_h, __hsub2(__float2half2_rn(1.0f), spike_seq[t])); + + grad_h = __hfma2(__hfma2(grad_h, one_sub_reciprocal_tau_half2, grad_v_seq[t]), grad_v_to_h, __hmul2(grad_spike_seq[t], grad_s_to_h)); + + 
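+                // with decay_input=False the input x enters h with coefficient 1, so grad_x is grad_h itself (the decay_input=True kernels multiply by reciprocal_tau instead)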
grad_x_seq[t] = grad_h; + half2 temp_sum = __hmul2(grad_h, __hsub2(v_reset_half2, v_v_seq[t])); + sdata[threadIdx.x] += __half2float(__hadd(__low2half(temp_sum), __high2half(temp_sum))); + + } + grad_v_last[index] = __hmul2(grad_h, one_sub_reciprocal_tau_half2); + } + else + { + sdata[threadIdx.x] = 0.0f; + } + int threadx = blockDim.x; + #pragma unroll + for (int i = threadx >> 1; i > 0; i = i >> 1) + { + // Synchronize all thread before next loop + __syncthreads(); + if (threadIdx.x < i) + { + sdata[threadIdx.x] += sdata[threadIdx.x + i]; + } + } + __syncthreads(); + if (threadIdx.x == 0) + { + /* + The 32-bit floating-point version of atomicAdd() is only supported by devices of compute capability 2.x and higher. + + The 64-bit floating-point version of atomicAdd() is only supported by devices of compute capability 6.x and higher. + + The 32-bit __half2 floating-point version of atomicAdd() is only supported by devices of compute capability 6.x and higher. The atomicity of the __half2 or __nv_bfloat162 add operation is guaranteed separately for each of the two __half or __nv_bfloat16 elements; the entire __half2 or __nv_bfloat162 is not guaranteed to be atomic as a single 32-bit access. + + The 16-bit __half floating-point version of atomicAdd() is only supported by devices of compute capability 7.x and higher. + + The 16-bit __nv_bfloat16 floating-point version of atomicAdd() is only supported by devices of compute capability 8.x and higher. + */ + + atomicAdd(grad_reciprocal_tau, sdata[0]); + + } + } + +// MultiStepParametricLIFNodePTT fptt ATan, decay_input=True, hard_reset=False, dtype=fp32 + + extern "C" __global__ + void ParametricLIFNode_fptt_decayInputTrue_softReset_fp32(const float* x_seq, float* v_v_seq, float* h_seq, float* spike_seq, + const float & reciprocal_tau, + const float & v_threshold, + const int & neuron_num, const int & numel) + + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index < neuron_num) + { + const int dt = neuron_num; + for(int mem_offset = 0; mem_offset < numel; mem_offset += neuron_num) + { + const int t = index + mem_offset; + + h_seq[t] = v_v_seq[t] + reciprocal_tau * (x_seq[t] - v_v_seq[t]); + + if (h_seq[t] >= v_threshold) + { + spike_seq[t] = 1.0f; + v_v_seq[t + dt] = h_seq[t] - v_threshold; + } + + else + { + spike_seq[t] = 0.0f; + v_v_seq[t + dt] = h_seq[t]; + } + + } + } + } + +// MultiStepParametricLIFNodePTT bptt ATan, decay_input=True, hard_reset=False, dtype=fp32, detach_reset=True + + extern "C" __global__ + void ParametricLIFNode_bptt_decayInputTrue_softReset_detachReset_fp32( + const float* grad_spike_seq, const float* grad_v_seq, const float* h_seq, const float* spike_seq, const float* v_v_seq, + float* grad_x_seq, float* grad_v_last, float* grad_reciprocal_tau, + const float & reciprocal_tau, const float & one_sub_reciprocal_tau, + const float & v_threshold, + const int & neuron_num, const int & numel) + + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + __shared__ float sdata[1024]; + if (index < neuron_num) + { + float grad_h = 0.0f; // grad_h will be used recursively + sdata[threadIdx.x] = 0.0f; + for(int mem_offset = numel - neuron_num; mem_offset >= 0; mem_offset -= neuron_num) + { + const int t = index + mem_offset; + const float over_th = h_seq[t] - v_threshold; + + // start: spikingjelly.clock_driven.surrogate.ATan.cuda_code + + const float sg_ATan_M_PI_2__alpha__x = ((float) 1.57079632679489661923) * 2.0f * over_th; + const float grad_s_to_h = 2.0f / 2.0f / (1.0f + sg_ATan_M_PI_2__alpha__x * 
sg_ATan_M_PI_2__alpha__x); + + // end: spikingjelly.clock_driven.surrogate.ATan.cuda_code + + const float grad_v_to_h = 1.0f; + + grad_h = grad_spike_seq[t] * grad_s_to_h + (grad_v_seq[t] + grad_h * one_sub_reciprocal_tau) * grad_v_to_h; + // grad_h = fmaf(grad_spike_seq[t], grad_s_to_h, fmaf(grad_h, one_sub_reciprocal_tau, grad_v_seq[t]) * grad_v_to_h); + + grad_x_seq[t] = grad_h * reciprocal_tau; + sdata[threadIdx.x] += grad_h * (h_seq[t] - v_v_seq[t]) / reciprocal_tau; + + } + grad_v_last[index] = grad_h * one_sub_reciprocal_tau; + } + else + { + sdata[threadIdx.x] = 0.0f; + } + int threadx = blockDim.x; + #pragma unroll + for (int stride = threadx >> 1; stride > 0; stride = stride >> 1) + { + // Synchronize all thread before next loop + __syncthreads(); + if (threadIdx.x < stride) + { + sdata[threadIdx.x] += sdata[threadIdx.x + stride]; + } + } + __syncthreads(); + if (threadIdx.x == 0) + { + atomicAdd(grad_reciprocal_tau, sdata[0]); + } + } + +// MultiStepParametricLIFNodePTT bptt ATan, decay_input=True, hard_reset=False, dtype=fp32, detach_reset=False + + extern "C" __global__ + void ParametricLIFNode_bptt_decayInputTrue_softReset__fp32( + const float* grad_spike_seq, const float* grad_v_seq, const float* h_seq, const float* spike_seq, const float* v_v_seq, + float* grad_x_seq, float* grad_v_last, float* grad_reciprocal_tau, + const float & reciprocal_tau, const float & one_sub_reciprocal_tau, + const float & v_threshold, + const int & neuron_num, const int & numel) + + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + __shared__ float sdata[1024]; + if (index < neuron_num) + { + float grad_h = 0.0f; // grad_h will be used recursively + sdata[threadIdx.x] = 0.0f; + for(int mem_offset = numel - neuron_num; mem_offset >= 0; mem_offset -= neuron_num) + { + const int t = index + mem_offset; + const float over_th = h_seq[t] - v_threshold; + + // start: spikingjelly.clock_driven.surrogate.ATan.cuda_code + + const float sg_ATan_M_PI_2__alpha__x = ((float) 1.57079632679489661923) * 2.0f * over_th; + const float grad_s_to_h = 2.0f / 2.0f / (1.0f + sg_ATan_M_PI_2__alpha__x * sg_ATan_M_PI_2__alpha__x); + + // end: spikingjelly.clock_driven.surrogate.ATan.cuda_code + + const float grad_v_to_h = 1.0f - v_threshold * grad_s_to_h; + // const float grad_v_to_h = fmaf(-v_threshold, grad_s_to_h, 1.0f); + + grad_h = grad_spike_seq[t] * grad_s_to_h + (grad_v_seq[t] + grad_h * one_sub_reciprocal_tau) * grad_v_to_h; + // grad_h = fmaf(grad_spike_seq[t], grad_s_to_h, fmaf(grad_h, one_sub_reciprocal_tau, grad_v_seq[t]) * grad_v_to_h); + + grad_x_seq[t] = grad_h * reciprocal_tau; + sdata[threadIdx.x] += grad_h * (h_seq[t] - v_v_seq[t]) / reciprocal_tau; + + } + grad_v_last[index] = grad_h * one_sub_reciprocal_tau; + } + else + { + sdata[threadIdx.x] = 0.0f; + } + int threadx = blockDim.x; + #pragma unroll + for (int stride = threadx >> 1; stride > 0; stride = stride >> 1) + { + // Synchronize all thread before next loop + __syncthreads(); + if (threadIdx.x < stride) + { + sdata[threadIdx.x] += sdata[threadIdx.x + stride]; + } + } + __syncthreads(); + if (threadIdx.x == 0) + { + atomicAdd(grad_reciprocal_tau, sdata[0]); + } + } + +// MultiStepParametricLIFNodePTT fptt ATan, decay_input=False, hard_reset=False, dtype=fp32 + + extern "C" __global__ + void ParametricLIFNode_fptt_decayInputFalse_softReset_fp32(const float* x_seq, float* v_v_seq, float* h_seq, float* spike_seq, + const float & reciprocal_tau, + const float & v_threshold, + const int & neuron_num, const int & numel) + + { + const int 
index = blockIdx.x * blockDim.x + threadIdx.x; + if (index < neuron_num) + { + const int dt = neuron_num; + for(int mem_offset = 0; mem_offset < numel; mem_offset += neuron_num) + { + const int t = index + mem_offset; + + h_seq[t] = v_v_seq[t] * (1.0f - reciprocal_tau) + x_seq[t]; + + if (h_seq[t] >= v_threshold) + { + spike_seq[t] = 1.0f; + v_v_seq[t + dt] = h_seq[t] - v_threshold; + } + + else + { + spike_seq[t] = 0.0f; + v_v_seq[t + dt] = h_seq[t]; + } + + } + } + } + +// MultiStepParametricLIFNodePTT bptt ATan, decay_input=False, hard_reset=False, dtype=fp32, detach_reset=True + + extern "C" __global__ + void ParametricLIFNode_bptt_decayInputFalse_softReset_detachReset_fp32( + const float* grad_spike_seq, const float* grad_v_seq, const float* h_seq, const float* spike_seq, const float* v_v_seq, + float* grad_x_seq, float* grad_v_last, float* grad_reciprocal_tau, + const float & reciprocal_tau, const float & one_sub_reciprocal_tau, + const float & v_threshold, + const int & neuron_num, const int & numel) + + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + __shared__ float sdata[1024]; + if (index < neuron_num) + { + float grad_h = 0.0f; // grad_h will be used recursively + sdata[threadIdx.x] = 0.0f; + for(int mem_offset = numel - neuron_num; mem_offset >= 0; mem_offset -= neuron_num) + { + const int t = index + mem_offset; + const float over_th = h_seq[t] - v_threshold; + + // start: spikingjelly.clock_driven.surrogate.ATan.cuda_code + + const float sg_ATan_M_PI_2__alpha__x = ((float) 1.57079632679489661923) * 2.0f * over_th; + const float grad_s_to_h = 2.0f / 2.0f / (1.0f + sg_ATan_M_PI_2__alpha__x * sg_ATan_M_PI_2__alpha__x); + + // end: spikingjelly.clock_driven.surrogate.ATan.cuda_code + + const float grad_v_to_h = 1.0f; + + grad_h = grad_spike_seq[t] * grad_s_to_h + (grad_v_seq[t] + grad_h * one_sub_reciprocal_tau) * grad_v_to_h; + // grad_h = fmaf(grad_spike_seq[t], grad_s_to_h, fmaf(grad_h, one_sub_reciprocal_tau, grad_v_seq[t]) * grad_v_to_h); + + grad_x_seq[t] = grad_h; + sdata[threadIdx.x] -= grad_h * v_v_seq[t]; + + } + grad_v_last[index] = grad_h * one_sub_reciprocal_tau; + } + else + { + sdata[threadIdx.x] = 0.0f; + } + int threadx = blockDim.x; + #pragma unroll + for (int stride = threadx >> 1; stride > 0; stride = stride >> 1) + { + // Synchronize all thread before next loop + __syncthreads(); + if (threadIdx.x < stride) + { + sdata[threadIdx.x] += sdata[threadIdx.x + stride]; + } + } + __syncthreads(); + if (threadIdx.x == 0) + { + atomicAdd(grad_reciprocal_tau, sdata[0]); + } + } + +// MultiStepParametricLIFNodePTT bptt ATan, decay_input=False, hard_reset=False, dtype=fp32, detach_reset=False + + extern "C" __global__ + void ParametricLIFNode_bptt_decayInputFalse_softReset__fp32( + const float* grad_spike_seq, const float* grad_v_seq, const float* h_seq, const float* spike_seq, const float* v_v_seq, + float* grad_x_seq, float* grad_v_last, float* grad_reciprocal_tau, + const float & reciprocal_tau, const float & one_sub_reciprocal_tau, + const float & v_threshold, + const int & neuron_num, const int & numel) + + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + __shared__ float sdata[1024]; + if (index < neuron_num) + { + float grad_h = 0.0f; // grad_h will be used recursively + sdata[threadIdx.x] = 0.0f; + for(int mem_offset = numel - neuron_num; mem_offset >= 0; mem_offset -= neuron_num) + { + const int t = index + mem_offset; + const float over_th = h_seq[t] - v_threshold; + + // start: spikingjelly.clock_driven.surrogate.ATan.cuda_code + 
+ const float sg_ATan_M_PI_2__alpha__x = ((float) 1.57079632679489661923) * 2.0f * over_th; + const float grad_s_to_h = 2.0f / 2.0f / (1.0f + sg_ATan_M_PI_2__alpha__x * sg_ATan_M_PI_2__alpha__x); + + // end: spikingjelly.clock_driven.surrogate.ATan.cuda_code + + const float grad_v_to_h = 1.0f - v_threshold * grad_s_to_h; + // const float grad_v_to_h = fmaf(-v_threshold, grad_s_to_h, 1.0f); + + grad_h = grad_spike_seq[t] * grad_s_to_h + (grad_v_seq[t] + grad_h * one_sub_reciprocal_tau) * grad_v_to_h; + // grad_h = fmaf(grad_spike_seq[t], grad_s_to_h, fmaf(grad_h, one_sub_reciprocal_tau, grad_v_seq[t]) * grad_v_to_h); + + grad_x_seq[t] = grad_h; + sdata[threadIdx.x] -= grad_h * v_v_seq[t]; + + } + grad_v_last[index] = grad_h * one_sub_reciprocal_tau; + } + else + { + sdata[threadIdx.x] = 0.0f; + } + int threadx = blockDim.x; + #pragma unroll + for (int stride = threadx >> 1; stride > 0; stride = stride >> 1) + { + // Synchronize all thread before next loop + __syncthreads(); + if (threadIdx.x < stride) + { + sdata[threadIdx.x] += sdata[threadIdx.x + stride]; + } + } + __syncthreads(); + if (threadIdx.x == 0) + { + atomicAdd(grad_reciprocal_tau, sdata[0]); + } + } + +// MultiStepParametricLIFNodePTT fptt ATan, decay_input=True, hard_reset=False, dtype=fp16 + + #include + extern "C" __global__ + void ParametricLIFNode_fptt_decayInputTrue_softReset_fp16(const half2* x_seq, half2* v_v_seq, half2* h_seq, half2* spike_seq, + const half & reciprocal_tau, + const half & v_threshold, + const int & neuron_num, const int & numel) + + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + const int stride = neuron_num >> 1; + if (index < stride) + { + const int numel_2 = numel >> 1; + const half2 reciprocal_tau_half2 = __half2half2(reciprocal_tau); + const half2 v_threshold_half2 = __half2half2(v_threshold); + + for(int mem_offset = 0; mem_offset < numel_2; mem_offset += stride) + { + const int t = index + mem_offset; + + h_seq[t] = __hfma2(__hsub2(x_seq[t], v_v_seq[t]), reciprocal_tau_half2, v_v_seq[t]); + + spike_seq[t] = __hgeu2(h_seq[t], v_threshold_half2); + v_v_seq[t + stride] = __hadd2(__hmul2(spike_seq[t], __hsub2(h_seq[t], v_threshold_half2)), __hmul2(__hsub2(__float2half2_rn(1.0f), spike_seq[t]), h_seq[t])); + + } + } + } + +// MultiStepParametricLIFNodePTT bptt ATan, decay_input=True, hard_reset=False, dtype=fp16, detach_reset=True + + #include + extern "C" __global__ + void ParametricLIFNode_bptt_decayInputTrue_softReset_detachReset_fp16( + const half2* grad_spike_seq, const half2* grad_v_seq, const half2* h_seq, const half2* spike_seq, const half2* v_v_seq, + half2* grad_x_seq, half2* grad_v_last, float* grad_reciprocal_tau, + const half & reciprocal_tau, const half & one_sub_reciprocal_tau, + const half & v_threshold, + const int & neuron_num, const int & numel)\ + // note that grad_reciprocal_tau is float to avoid overflow + + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + const int stride = neuron_num >> 1; + + __shared__ float sdata[1024]; + if (index < stride) + { + const half2 reciprocal_tau_half2 = __half2half2(reciprocal_tau); + const half2 one_sub_reciprocal_tau_half2 = __half2half2(one_sub_reciprocal_tau); + const half2 v_threshold_half2 = __half2half2(v_threshold); + + + half2 grad_h = __float2half2_rn(0.0f); // grad_h will be used recursively + sdata[threadIdx.x] = 0.0f; + for(int mem_offset = (numel >> 1) - stride; mem_offset >= 0; mem_offset -= stride) + { + const int t = index + mem_offset; + + const half2 over_th = __hsub2(h_seq[t], 
v_threshold_half2); + + + // start: spikingjelly.clock_driven.surrogate.ATan.cuda_code + + const half2 sg_ATan_alpha = __float2half2_rn(2.0f); + const half2 sg_ATan_M_PI_2__alpha__x = __hmul2(__hmul2(__float2half2_rn((float) 1.57079632679489661923), sg_ATan_alpha), over_th); + const half2 grad_s_to_h = __h2div(__h2div(sg_ATan_alpha, __float2half2_rn(2.0f)), __hfma2(sg_ATan_M_PI_2__alpha__x, sg_ATan_M_PI_2__alpha__x, __float2half2_rn(1.0f))); + + // end: spikingjelly.clock_driven.surrogate.ATan.cuda_code + + const half2 grad_v_to_h = __float2half2_rn(1.0f); + + grad_h = __hfma2(__hfma2(grad_h, one_sub_reciprocal_tau_half2, grad_v_seq[t]), grad_v_to_h, __hmul2(grad_spike_seq[t], grad_s_to_h)); + + grad_x_seq[t] = __hmul2(grad_h, reciprocal_tau_half2); + half2 temp_sum = __h2div(__hmul2(grad_h, __hsub2(h_seq[t], v_v_seq[t])), reciprocal_tau_half2); + sdata[threadIdx.x] += __half2float(__hadd(__low2half(temp_sum), __high2half(temp_sum))); + + } + grad_v_last[index] = __hmul2(grad_h, one_sub_reciprocal_tau_half2); + } + else + { + sdata[threadIdx.x] = 0.0f; + } + int threadx = blockDim.x; + #pragma unroll + for (int i = threadx >> 1; i > 0; i = i >> 1) + { + // Synchronize all thread before next loop + __syncthreads(); + if (threadIdx.x < i) + { + sdata[threadIdx.x] += sdata[threadIdx.x + i]; + } + } + __syncthreads(); + if (threadIdx.x == 0) + { + /* + The 32-bit floating-point version of atomicAdd() is only supported by devices of compute capability 2.x and higher. + + The 64-bit floating-point version of atomicAdd() is only supported by devices of compute capability 6.x and higher. + + The 32-bit __half2 floating-point version of atomicAdd() is only supported by devices of compute capability 6.x and higher. The atomicity of the __half2 or __nv_bfloat162 add operation is guaranteed separately for each of the two __half or __nv_bfloat16 elements; the entire __half2 or __nv_bfloat162 is not guaranteed to be atomic as a single 32-bit access. + + The 16-bit __half floating-point version of atomicAdd() is only supported by devices of compute capability 7.x and higher. + + The 16-bit __nv_bfloat16 floating-point version of atomicAdd() is only supported by devices of compute capability 8.x and higher. 
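+                sdata[0] now holds this thread block's partial sum of the gradient with respect to reciprocal_tau; a single float atomicAdd per block accumulates it into grad_reciprocal_tau.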
+ */ + + atomicAdd(grad_reciprocal_tau, sdata[0]); + + } + } + +// MultiStepParametricLIFNodePTT bptt ATan, decay_input=True, hard_reset=False, dtype=fp16, detach_reset=False + + #include + extern "C" __global__ + void ParametricLIFNode_bptt_decayInputTrue_softReset__fp16( + const half2* grad_spike_seq, const half2* grad_v_seq, const half2* h_seq, const half2* spike_seq, const half2* v_v_seq, + half2* grad_x_seq, half2* grad_v_last, float* grad_reciprocal_tau, + const half & reciprocal_tau, const half & one_sub_reciprocal_tau, + const half & v_threshold, + const int & neuron_num, const int & numel)\ + // note that grad_reciprocal_tau is float to avoid overflow + + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + const int stride = neuron_num >> 1; + + __shared__ float sdata[1024]; + if (index < stride) + { + const half2 reciprocal_tau_half2 = __half2half2(reciprocal_tau); + const half2 one_sub_reciprocal_tau_half2 = __half2half2(one_sub_reciprocal_tau); + const half2 v_threshold_half2 = __half2half2(v_threshold); + + + half2 grad_h = __float2half2_rn(0.0f); // grad_h will be used recursively + sdata[threadIdx.x] = 0.0f; + for(int mem_offset = (numel >> 1) - stride; mem_offset >= 0; mem_offset -= stride) + { + const int t = index + mem_offset; + + const half2 over_th = __hsub2(h_seq[t], v_threshold_half2); + + + // start: spikingjelly.clock_driven.surrogate.ATan.cuda_code + + const half2 sg_ATan_alpha = __float2half2_rn(2.0f); + const half2 sg_ATan_M_PI_2__alpha__x = __hmul2(__hmul2(__float2half2_rn((float) 1.57079632679489661923), sg_ATan_alpha), over_th); + const half2 grad_s_to_h = __h2div(__h2div(sg_ATan_alpha, __float2half2_rn(2.0f)), __hfma2(sg_ATan_M_PI_2__alpha__x, sg_ATan_M_PI_2__alpha__x, __float2half2_rn(1.0f))); + + // end: spikingjelly.clock_driven.surrogate.ATan.cuda_code + + const half2 grad_v_to_h = __hsub2(__float2half2_rn(1.0f), __hmul2(v_threshold_half2, grad_s_to_h)); + + grad_h = __hfma2(__hfma2(grad_h, one_sub_reciprocal_tau_half2, grad_v_seq[t]), grad_v_to_h, __hmul2(grad_spike_seq[t], grad_s_to_h)); + + grad_x_seq[t] = __hmul2(grad_h, reciprocal_tau_half2); + half2 temp_sum = __h2div(__hmul2(grad_h, __hsub2(h_seq[t], v_v_seq[t])), reciprocal_tau_half2); + sdata[threadIdx.x] += __half2float(__hadd(__low2half(temp_sum), __high2half(temp_sum))); + + } + grad_v_last[index] = __hmul2(grad_h, one_sub_reciprocal_tau_half2); + } + else + { + sdata[threadIdx.x] = 0.0f; + } + int threadx = blockDim.x; + #pragma unroll + for (int i = threadx >> 1; i > 0; i = i >> 1) + { + // Synchronize all thread before next loop + __syncthreads(); + if (threadIdx.x < i) + { + sdata[threadIdx.x] += sdata[threadIdx.x + i]; + } + } + __syncthreads(); + if (threadIdx.x == 0) + { + /* + The 32-bit floating-point version of atomicAdd() is only supported by devices of compute capability 2.x and higher. + + The 64-bit floating-point version of atomicAdd() is only supported by devices of compute capability 6.x and higher. + + The 32-bit __half2 floating-point version of atomicAdd() is only supported by devices of compute capability 6.x and higher. The atomicity of the __half2 or __nv_bfloat162 add operation is guaranteed separately for each of the two __half or __nv_bfloat16 elements; the entire __half2 or __nv_bfloat162 is not guaranteed to be atomic as a single 32-bit access. + + The 16-bit __half floating-point version of atomicAdd() is only supported by devices of compute capability 7.x and higher. 
+ + The 16-bit __nv_bfloat16 floating-point version of atomicAdd() is only supported by devices of compute capability 8.x and higher. + */ + + atomicAdd(grad_reciprocal_tau, sdata[0]); + + } + } + +// MultiStepParametricLIFNodePTT fptt ATan, decay_input=False, hard_reset=False, dtype=fp16 + + #include + extern "C" __global__ + void ParametricLIFNode_fptt_decayInputFalse_softReset_fp16(const half2* x_seq, half2* v_v_seq, half2* h_seq, half2* spike_seq, + const half & reciprocal_tau, + const half & v_threshold, + const int & neuron_num, const int & numel) + + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + const int stride = neuron_num >> 1; + if (index < stride) + { + const int numel_2 = numel >> 1; + const half2 reciprocal_tau_half2 = __half2half2(reciprocal_tau); + const half2 v_threshold_half2 = __half2half2(v_threshold); + + for(int mem_offset = 0; mem_offset < numel_2; mem_offset += stride) + { + const int t = index + mem_offset; + + // h_seq[t] = v_v_seq[t] * (1.0f - reciprocal_tau) + x_seq[t]; + h_seq[t] = __hfma2(__hsub2(__float2half2_rn(1.0f), reciprocal_tau_half2), v_v_seq[t], x_seq[t]); + + spike_seq[t] = __hgeu2(h_seq[t], v_threshold_half2); + v_v_seq[t + stride] = __hadd2(__hmul2(spike_seq[t], __hsub2(h_seq[t], v_threshold_half2)), __hmul2(__hsub2(__float2half2_rn(1.0f), spike_seq[t]), h_seq[t])); + + } + } + } + +// MultiStepParametricLIFNodePTT bptt ATan, decay_input=False, hard_reset=False, dtype=fp16, detach_reset=True + + #include + extern "C" __global__ + void ParametricLIFNode_bptt_decayInputFalse_softReset_detachReset_fp16( + const half2* grad_spike_seq, const half2* grad_v_seq, const half2* h_seq, const half2* spike_seq, const half2* v_v_seq, + half2* grad_x_seq, half2* grad_v_last, float* grad_reciprocal_tau, + const half & reciprocal_tau, const half & one_sub_reciprocal_tau, + const half & v_threshold, + const int & neuron_num, const int & numel)\ + // note that grad_reciprocal_tau is float to avoid overflow + + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + const int stride = neuron_num >> 1; + + __shared__ float sdata[1024]; + if (index < stride) + { + const half2 reciprocal_tau_half2 = __half2half2(reciprocal_tau); + const half2 one_sub_reciprocal_tau_half2 = __half2half2(one_sub_reciprocal_tau); + const half2 v_threshold_half2 = __half2half2(v_threshold); + + + half2 grad_h = __float2half2_rn(0.0f); // grad_h will be used recursively + sdata[threadIdx.x] = 0.0f; + for(int mem_offset = (numel >> 1) - stride; mem_offset >= 0; mem_offset -= stride) + { + const int t = index + mem_offset; + + const half2 over_th = __hsub2(h_seq[t], v_threshold_half2); + + + // start: spikingjelly.clock_driven.surrogate.ATan.cuda_code + + const half2 sg_ATan_alpha = __float2half2_rn(2.0f); + const half2 sg_ATan_M_PI_2__alpha__x = __hmul2(__hmul2(__float2half2_rn((float) 1.57079632679489661923), sg_ATan_alpha), over_th); + const half2 grad_s_to_h = __h2div(__h2div(sg_ATan_alpha, __float2half2_rn(2.0f)), __hfma2(sg_ATan_M_PI_2__alpha__x, sg_ATan_M_PI_2__alpha__x, __float2half2_rn(1.0f))); + + // end: spikingjelly.clock_driven.surrogate.ATan.cuda_code + + const half2 grad_v_to_h = __float2half2_rn(1.0f); + + grad_h = __hfma2(__hfma2(grad_h, one_sub_reciprocal_tau_half2, grad_v_seq[t]), grad_v_to_h, __hmul2(grad_spike_seq[t], grad_s_to_h)); + + grad_x_seq[t] = grad_h; + half2 temp_sum = __hmul2(grad_h, __hneg2(v_v_seq[t])); + sdata[threadIdx.x] += __half2float(__hadd(__low2half(temp_sum), __high2half(temp_sum))); + + } + grad_v_last[index] = __hmul2(grad_h, 
one_sub_reciprocal_tau_half2); + } + else + { + sdata[threadIdx.x] = 0.0f; + } + int threadx = blockDim.x; + #pragma unroll + for (int i = threadx >> 1; i > 0; i = i >> 1) + { + // Synchronize all thread before next loop + __syncthreads(); + if (threadIdx.x < i) + { + sdata[threadIdx.x] += sdata[threadIdx.x + i]; + } + } + __syncthreads(); + if (threadIdx.x == 0) + { + /* + The 32-bit floating-point version of atomicAdd() is only supported by devices of compute capability 2.x and higher. + + The 64-bit floating-point version of atomicAdd() is only supported by devices of compute capability 6.x and higher. + + The 32-bit __half2 floating-point version of atomicAdd() is only supported by devices of compute capability 6.x and higher. The atomicity of the __half2 or __nv_bfloat162 add operation is guaranteed separately for each of the two __half or __nv_bfloat16 elements; the entire __half2 or __nv_bfloat162 is not guaranteed to be atomic as a single 32-bit access. + + The 16-bit __half floating-point version of atomicAdd() is only supported by devices of compute capability 7.x and higher. + + The 16-bit __nv_bfloat16 floating-point version of atomicAdd() is only supported by devices of compute capability 8.x and higher. + */ + + atomicAdd(grad_reciprocal_tau, sdata[0]); + + } + } + +// MultiStepParametricLIFNodePTT bptt ATan, decay_input=False, hard_reset=False, dtype=fp16, detach_reset=False + + #include + extern "C" __global__ + void ParametricLIFNode_bptt_decayInputFalse_softReset__fp16( + const half2* grad_spike_seq, const half2* grad_v_seq, const half2* h_seq, const half2* spike_seq, const half2* v_v_seq, + half2* grad_x_seq, half2* grad_v_last, float* grad_reciprocal_tau, + const half & reciprocal_tau, const half & one_sub_reciprocal_tau, + const half & v_threshold, + const int & neuron_num, const int & numel)\ + // note that grad_reciprocal_tau is float to avoid overflow + + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + const int stride = neuron_num >> 1; + + __shared__ float sdata[1024]; + if (index < stride) + { + const half2 reciprocal_tau_half2 = __half2half2(reciprocal_tau); + const half2 one_sub_reciprocal_tau_half2 = __half2half2(one_sub_reciprocal_tau); + const half2 v_threshold_half2 = __half2half2(v_threshold); + + + half2 grad_h = __float2half2_rn(0.0f); // grad_h will be used recursively + sdata[threadIdx.x] = 0.0f; + for(int mem_offset = (numel >> 1) - stride; mem_offset >= 0; mem_offset -= stride) + { + const int t = index + mem_offset; + + const half2 over_th = __hsub2(h_seq[t], v_threshold_half2); + + + // start: spikingjelly.clock_driven.surrogate.ATan.cuda_code + + const half2 sg_ATan_alpha = __float2half2_rn(2.0f); + const half2 sg_ATan_M_PI_2__alpha__x = __hmul2(__hmul2(__float2half2_rn((float) 1.57079632679489661923), sg_ATan_alpha), over_th); + const half2 grad_s_to_h = __h2div(__h2div(sg_ATan_alpha, __float2half2_rn(2.0f)), __hfma2(sg_ATan_M_PI_2__alpha__x, sg_ATan_M_PI_2__alpha__x, __float2half2_rn(1.0f))); + + // end: spikingjelly.clock_driven.surrogate.ATan.cuda_code + + const half2 grad_v_to_h = __hsub2(__float2half2_rn(1.0f), __hmul2(v_threshold_half2, grad_s_to_h)); + + grad_h = __hfma2(__hfma2(grad_h, one_sub_reciprocal_tau_half2, grad_v_seq[t]), grad_v_to_h, __hmul2(grad_spike_seq[t], grad_s_to_h)); + + grad_x_seq[t] = grad_h; + half2 temp_sum = __hmul2(grad_h, __hneg2(v_v_seq[t])); + sdata[threadIdx.x] += __half2float(__hadd(__low2half(temp_sum), __high2half(temp_sum))); + + } + grad_v_last[index] = __hmul2(grad_h, 
one_sub_reciprocal_tau_half2); + } + else + { + sdata[threadIdx.x] = 0.0f; + } + int threadx = blockDim.x; + #pragma unroll + for (int i = threadx >> 1; i > 0; i = i >> 1) + { + // Synchronize all thread before next loop + __syncthreads(); + if (threadIdx.x < i) + { + sdata[threadIdx.x] += sdata[threadIdx.x + i]; + } + } + __syncthreads(); + if (threadIdx.x == 0) + { + /* + The 32-bit floating-point version of atomicAdd() is only supported by devices of compute capability 2.x and higher. + + The 64-bit floating-point version of atomicAdd() is only supported by devices of compute capability 6.x and higher. + + The 32-bit __half2 floating-point version of atomicAdd() is only supported by devices of compute capability 6.x and higher. The atomicity of the __half2 or __nv_bfloat162 add operation is guaranteed separately for each of the two __half or __nv_bfloat16 elements; the entire __half2 or __nv_bfloat162 is not guaranteed to be atomic as a single 32-bit access. + + The 16-bit __half floating-point version of atomicAdd() is only supported by devices of compute capability 7.x and higher. + + The 16-bit __nv_bfloat16 floating-point version of atomicAdd() is only supported by devices of compute capability 8.x and higher. + */ + + atomicAdd(grad_reciprocal_tau, sdata[0]); + + } + } + +// MultiStepParametricLIFNodePTT fptt Sigmoid, decay_input=True, hard_reset=True, dtype=fp32 + + extern "C" __global__ + void ParametricLIFNode_fptt_decayInputTrue_hardReset_fp32(const float* x_seq, float* v_v_seq, float* h_seq, float* spike_seq, + const float & reciprocal_tau, + const float & v_threshold, const float & v_reset, + const int & neuron_num, const int & numel) + + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index < neuron_num) + { + const int dt = neuron_num; + for(int mem_offset = 0; mem_offset < numel; mem_offset += neuron_num) + { + const int t = index + mem_offset; + + h_seq[t] = v_v_seq[t] + reciprocal_tau * (x_seq[t] - v_v_seq[t] + v_reset); + + if (h_seq[t] >= v_threshold) + { + spike_seq[t] = 1.0f; + v_v_seq[t + dt] = v_reset; + } + + else + { + spike_seq[t] = 0.0f; + v_v_seq[t + dt] = h_seq[t]; + } + + } + } + } + +// MultiStepParametricLIFNodePTT bptt Sigmoid, decay_input=True, hard_reset=True, dtype=fp32, detach_reset=True + + extern "C" __global__ + void ParametricLIFNode_bptt_decayInputTrue_hardReset_detachReset_fp32( + const float* grad_spike_seq, const float* grad_v_seq, const float* h_seq, const float* spike_seq, const float* v_v_seq, + float* grad_x_seq, float* grad_v_last, float* grad_reciprocal_tau, + const float & reciprocal_tau, const float & one_sub_reciprocal_tau, + const float & v_threshold, const float & v_reset, + const int & neuron_num, const int & numel) + + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + __shared__ float sdata[1024]; + if (index < neuron_num) + { + float grad_h = 0.0f; // grad_h will be used recursively + sdata[threadIdx.x] = 0.0f; + for(int mem_offset = numel - neuron_num; mem_offset >= 0; mem_offset -= neuron_num) + { + const int t = index + mem_offset; + const float over_th = h_seq[t] - v_threshold; + + // start: spikingjelly.clock_driven.surrogate.Sigmoid.cuda_code + + const float sg_Sigmoid_sigmoid_ax = 1.0f / (1.0f + expf(- 4.0f * over_th)); + const float grad_s_to_h = (1.0f - sg_Sigmoid_sigmoid_ax) * sg_Sigmoid_sigmoid_ax * 4.0f; + + // end: spikingjelly.clock_driven.surrogate.Sigmoid.cuda_code + + const float grad_v_to_h = 1.0f - spike_seq[t]; + + grad_h = grad_spike_seq[t] * grad_s_to_h + (grad_v_seq[t] + 
grad_h * one_sub_reciprocal_tau) * grad_v_to_h; + // grad_h = fmaf(grad_spike_seq[t], grad_s_to_h, fmaf(grad_h, one_sub_reciprocal_tau, grad_v_seq[t]) * grad_v_to_h); + + grad_x_seq[t] = grad_h * reciprocal_tau; + sdata[threadIdx.x] += grad_h * (h_seq[t] - v_v_seq[t]) / reciprocal_tau; + + } + grad_v_last[index] = grad_h * one_sub_reciprocal_tau; + } + else + { + sdata[threadIdx.x] = 0.0f; + } + int threadx = blockDim.x; + #pragma unroll + for (int stride = threadx >> 1; stride > 0; stride = stride >> 1) + { + // Synchronize all thread before next loop + __syncthreads(); + if (threadIdx.x < stride) + { + sdata[threadIdx.x] += sdata[threadIdx.x + stride]; + } + } + __syncthreads(); + if (threadIdx.x == 0) + { + atomicAdd(grad_reciprocal_tau, sdata[0]); + } + } + +// MultiStepParametricLIFNodePTT bptt Sigmoid, decay_input=True, hard_reset=True, dtype=fp32, detach_reset=False + + extern "C" __global__ + void ParametricLIFNode_bptt_decayInputTrue_hardReset__fp32( + const float* grad_spike_seq, const float* grad_v_seq, const float* h_seq, const float* spike_seq, const float* v_v_seq, + float* grad_x_seq, float* grad_v_last, float* grad_reciprocal_tau, + const float & reciprocal_tau, const float & one_sub_reciprocal_tau, + const float & v_threshold, const float & v_reset, + const int & neuron_num, const int & numel) + + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + __shared__ float sdata[1024]; + if (index < neuron_num) + { + float grad_h = 0.0f; // grad_h will be used recursively + sdata[threadIdx.x] = 0.0f; + for(int mem_offset = numel - neuron_num; mem_offset >= 0; mem_offset -= neuron_num) + { + const int t = index + mem_offset; + const float over_th = h_seq[t] - v_threshold; + + // start: spikingjelly.clock_driven.surrogate.Sigmoid.cuda_code + + const float sg_Sigmoid_sigmoid_ax = 1.0f / (1.0f + expf(- 4.0f * over_th)); + const float grad_s_to_h = (1.0f - sg_Sigmoid_sigmoid_ax) * sg_Sigmoid_sigmoid_ax * 4.0f; + + // end: spikingjelly.clock_driven.surrogate.Sigmoid.cuda_code + + const float grad_v_to_h = 1.0f - spike_seq[t] + (v_reset - h_seq[t]) * grad_s_to_h; + // const float grad_v_to_h = fmaf(v_reset - h_seq[t], grad_s_to_h, 1.0f - spike_seq[t]); + + grad_h = grad_spike_seq[t] * grad_s_to_h + (grad_v_seq[t] + grad_h * one_sub_reciprocal_tau) * grad_v_to_h; + // grad_h = fmaf(grad_spike_seq[t], grad_s_to_h, fmaf(grad_h, one_sub_reciprocal_tau, grad_v_seq[t]) * grad_v_to_h); + + grad_x_seq[t] = grad_h * reciprocal_tau; + sdata[threadIdx.x] += grad_h * (h_seq[t] - v_v_seq[t]) / reciprocal_tau; + + } + grad_v_last[index] = grad_h * one_sub_reciprocal_tau; + } + else + { + sdata[threadIdx.x] = 0.0f; + } + int threadx = blockDim.x; + #pragma unroll + for (int stride = threadx >> 1; stride > 0; stride = stride >> 1) + { + // Synchronize all thread before next loop + __syncthreads(); + if (threadIdx.x < stride) + { + sdata[threadIdx.x] += sdata[threadIdx.x + stride]; + } + } + __syncthreads(); + if (threadIdx.x == 0) + { + atomicAdd(grad_reciprocal_tau, sdata[0]); + } + } + +// MultiStepParametricLIFNodePTT fptt Sigmoid, decay_input=False, hard_reset=True, dtype=fp32 + + extern "C" __global__ + void ParametricLIFNode_fptt_decayInputFalse_hardReset_fp32(const float* x_seq, float* v_v_seq, float* h_seq, float* spike_seq, + const float & reciprocal_tau, + const float & v_threshold, const float & v_reset, + const int & neuron_num, const int & numel) + + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index < neuron_num) + { + const int dt = neuron_num; + for(int 
mem_offset = 0; mem_offset < numel; mem_offset += neuron_num) + { + const int t = index + mem_offset; + + h_seq[t] = v_v_seq[t] - reciprocal_tau * (v_v_seq[t] - v_reset) + x_seq[t]; + + if (h_seq[t] >= v_threshold) + { + spike_seq[t] = 1.0f; + v_v_seq[t + dt] = v_reset; + } + + else + { + spike_seq[t] = 0.0f; + v_v_seq[t + dt] = h_seq[t]; + } + + } + } + } + +// MultiStepParametricLIFNodePTT bptt Sigmoid, decay_input=False, hard_reset=True, dtype=fp32, detach_reset=True + + extern "C" __global__ + void ParametricLIFNode_bptt_decayInputFalse_hardReset_detachReset_fp32( + const float* grad_spike_seq, const float* grad_v_seq, const float* h_seq, const float* spike_seq, const float* v_v_seq, + float* grad_x_seq, float* grad_v_last, float* grad_reciprocal_tau, + const float & reciprocal_tau, const float & one_sub_reciprocal_tau, + const float & v_threshold, const float & v_reset, + const int & neuron_num, const int & numel) + + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + __shared__ float sdata[1024]; + if (index < neuron_num) + { + float grad_h = 0.0f; // grad_h will be used recursively + sdata[threadIdx.x] = 0.0f; + for(int mem_offset = numel - neuron_num; mem_offset >= 0; mem_offset -= neuron_num) + { + const int t = index + mem_offset; + const float over_th = h_seq[t] - v_threshold; + + // start: spikingjelly.clock_driven.surrogate.Sigmoid.cuda_code + + const float sg_Sigmoid_sigmoid_ax = 1.0f / (1.0f + expf(- 4.0f * over_th)); + const float grad_s_to_h = (1.0f - sg_Sigmoid_sigmoid_ax) * sg_Sigmoid_sigmoid_ax * 4.0f; + + // end: spikingjelly.clock_driven.surrogate.Sigmoid.cuda_code + + const float grad_v_to_h = 1.0f - spike_seq[t]; + + grad_h = grad_spike_seq[t] * grad_s_to_h + (grad_v_seq[t] + grad_h * one_sub_reciprocal_tau) * grad_v_to_h; + // grad_h = fmaf(grad_spike_seq[t], grad_s_to_h, fmaf(grad_h, one_sub_reciprocal_tau, grad_v_seq[t]) * grad_v_to_h); + + grad_x_seq[t] = grad_h; + sdata[threadIdx.x] += grad_h * (v_reset - v_v_seq[t]); + + } + grad_v_last[index] = grad_h * one_sub_reciprocal_tau; + } + else + { + sdata[threadIdx.x] = 0.0f; + } + int threadx = blockDim.x; + #pragma unroll + for (int stride = threadx >> 1; stride > 0; stride = stride >> 1) + { + // Synchronize all thread before next loop + __syncthreads(); + if (threadIdx.x < stride) + { + sdata[threadIdx.x] += sdata[threadIdx.x + stride]; + } + } + __syncthreads(); + if (threadIdx.x == 0) + { + atomicAdd(grad_reciprocal_tau, sdata[0]); + } + } + +// MultiStepParametricLIFNodePTT bptt Sigmoid, decay_input=False, hard_reset=True, dtype=fp32, detach_reset=False + + extern "C" __global__ + void ParametricLIFNode_bptt_decayInputFalse_hardReset__fp32( + const float* grad_spike_seq, const float* grad_v_seq, const float* h_seq, const float* spike_seq, const float* v_v_seq, + float* grad_x_seq, float* grad_v_last, float* grad_reciprocal_tau, + const float & reciprocal_tau, const float & one_sub_reciprocal_tau, + const float & v_threshold, const float & v_reset, + const int & neuron_num, const int & numel) + + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + __shared__ float sdata[1024]; + if (index < neuron_num) + { + float grad_h = 0.0f; // grad_h will be used recursively + sdata[threadIdx.x] = 0.0f; + for(int mem_offset = numel - neuron_num; mem_offset >= 0; mem_offset -= neuron_num) + { + const int t = index + mem_offset; + const float over_th = h_seq[t] - v_threshold; + + // start: spikingjelly.clock_driven.surrogate.Sigmoid.cuda_code + + const float sg_Sigmoid_sigmoid_ax = 1.0f / (1.0f + 
expf(- 4.0f * over_th)); + const float grad_s_to_h = (1.0f - sg_Sigmoid_sigmoid_ax) * sg_Sigmoid_sigmoid_ax * 4.0f; + + // end: spikingjelly.clock_driven.surrogate.Sigmoid.cuda_code + + const float grad_v_to_h = 1.0f - spike_seq[t] + (v_reset - h_seq[t]) * grad_s_to_h; + // const float grad_v_to_h = fmaf(v_reset - h_seq[t], grad_s_to_h, 1.0f - spike_seq[t]); + + grad_h = grad_spike_seq[t] * grad_s_to_h + (grad_v_seq[t] + grad_h * one_sub_reciprocal_tau) * grad_v_to_h; + // grad_h = fmaf(grad_spike_seq[t], grad_s_to_h, fmaf(grad_h, one_sub_reciprocal_tau, grad_v_seq[t]) * grad_v_to_h); + + grad_x_seq[t] = grad_h; + sdata[threadIdx.x] += grad_h * (v_reset - v_v_seq[t]); + + } + grad_v_last[index] = grad_h * one_sub_reciprocal_tau; + } + else + { + sdata[threadIdx.x] = 0.0f; + } + int threadx = blockDim.x; + #pragma unroll + for (int stride = threadx >> 1; stride > 0; stride = stride >> 1) + { + // Synchronize all thread before next loop + __syncthreads(); + if (threadIdx.x < stride) + { + sdata[threadIdx.x] += sdata[threadIdx.x + stride]; + } + } + __syncthreads(); + if (threadIdx.x == 0) + { + atomicAdd(grad_reciprocal_tau, sdata[0]); + } + } + +// MultiStepParametricLIFNodePTT fptt Sigmoid, decay_input=True, hard_reset=True, dtype=fp16 + + #include + extern "C" __global__ + void ParametricLIFNode_fptt_decayInputTrue_hardReset_fp16(const half2* x_seq, half2* v_v_seq, half2* h_seq, half2* spike_seq, + const half & reciprocal_tau, + const half & v_threshold, const half & v_reset, + const int & neuron_num, const int & numel) + + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + const int stride = neuron_num >> 1; + if (index < stride) + { + const int numel_2 = numel >> 1; + const half2 reciprocal_tau_half2 = __half2half2(reciprocal_tau); + const half2 v_threshold_half2 = __half2half2(v_threshold); + + const half2 v_reset_half2 = __half2half2(v_reset); + + for(int mem_offset = 0; mem_offset < numel_2; mem_offset += stride) + { + const int t = index + mem_offset; + + h_seq[t] = __hfma2(__hadd2(__hsub2(x_seq[t], v_v_seq[t]), v_reset_half2), reciprocal_tau_half2, v_v_seq[t]); + + spike_seq[t] = __hgeu2(h_seq[t], v_threshold_half2); + v_v_seq[t + stride] = __hadd2(__hmul2(spike_seq[t], v_reset_half2), __hmul2(__hsub2(__float2half2_rn(1.0f), spike_seq[t]), h_seq[t])); + + } + } + } + +// MultiStepParametricLIFNodePTT bptt Sigmoid, decay_input=True, hard_reset=True, dtype=fp16, detach_reset=True + + #include + extern "C" __global__ + void ParametricLIFNode_bptt_decayInputTrue_hardReset_detachReset_fp16( + const half2* grad_spike_seq, const half2* grad_v_seq, const half2* h_seq, const half2* spike_seq, const half2* v_v_seq, + half2* grad_x_seq, half2* grad_v_last, float* grad_reciprocal_tau, + const half & reciprocal_tau, const half & one_sub_reciprocal_tau, + const half & v_threshold, const half & v_reset, + const int & neuron_num, const int & numel)\ + // note that grad_reciprocal_tau is float to avoid overflow + + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + const int stride = neuron_num >> 1; + + __shared__ float sdata[1024]; + if (index < stride) + { + const half2 reciprocal_tau_half2 = __half2half2(reciprocal_tau); + const half2 one_sub_reciprocal_tau_half2 = __half2half2(one_sub_reciprocal_tau); + const half2 v_threshold_half2 = __half2half2(v_threshold); + + const half2 v_reset_half2 = __half2half2(v_reset); + + + half2 grad_h = __float2half2_rn(0.0f); // grad_h will be used recursively + sdata[threadIdx.x] = 0.0f; + for(int mem_offset = (numel >> 1) - stride; 
mem_offset >= 0; mem_offset -= stride) + { + const int t = index + mem_offset; + + const half2 over_th = __hsub2(h_seq[t], v_threshold_half2); + + + // start: spikingjelly.clock_driven.surrogate.Sigmoid.cuda_code + + const half2 sg_Sigmoid_alpha = __float2half2_rn(4.0f); + const half2 sg_Sigmoid_sigmoid_ax = __h2div(__float2half2_rn(1.0f), __hadd2(h2exp(__hneg2(__hmul2(sg_Sigmoid_alpha, over_th))), __float2half2_rn(1.0f))); + const half2 grad_s_to_h = __hmul2(__hmul2(__hsub2(__float2half2_rn(1.0f), sg_Sigmoid_sigmoid_ax), sg_Sigmoid_sigmoid_ax), sg_Sigmoid_alpha); + + // end: spikingjelly.clock_driven.surrogate.Sigmoid.cuda_code + + const half2 grad_v_to_h = __hsub2(__float2half2_rn(1.0f), spike_seq[t]); + + grad_h = __hfma2(__hfma2(grad_h, one_sub_reciprocal_tau_half2, grad_v_seq[t]), grad_v_to_h, __hmul2(grad_spike_seq[t], grad_s_to_h)); + + grad_x_seq[t] = __hmul2(grad_h, reciprocal_tau_half2); + half2 temp_sum = __h2div(__hmul2(grad_h, __hsub2(h_seq[t], v_v_seq[t])), reciprocal_tau_half2); + sdata[threadIdx.x] += __half2float(__hadd(__low2half(temp_sum), __high2half(temp_sum))); + + } + grad_v_last[index] = __hmul2(grad_h, one_sub_reciprocal_tau_half2); + } + else + { + sdata[threadIdx.x] = 0.0f; + } + int threadx = blockDim.x; + #pragma unroll + for (int i = threadx >> 1; i > 0; i = i >> 1) + { + // Synchronize all thread before next loop + __syncthreads(); + if (threadIdx.x < i) + { + sdata[threadIdx.x] += sdata[threadIdx.x + i]; + } + } + __syncthreads(); + if (threadIdx.x == 0) + { + /* + The 32-bit floating-point version of atomicAdd() is only supported by devices of compute capability 2.x and higher. + + The 64-bit floating-point version of atomicAdd() is only supported by devices of compute capability 6.x and higher. + + The 32-bit __half2 floating-point version of atomicAdd() is only supported by devices of compute capability 6.x and higher. The atomicity of the __half2 or __nv_bfloat162 add operation is guaranteed separately for each of the two __half or __nv_bfloat16 elements; the entire __half2 or __nv_bfloat162 is not guaranteed to be atomic as a single 32-bit access. + + The 16-bit __half floating-point version of atomicAdd() is only supported by devices of compute capability 7.x and higher. + + The 16-bit __nv_bfloat16 floating-point version of atomicAdd() is only supported by devices of compute capability 8.x and higher. 
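+
+ In the loop above the Sigmoid surrogate gradient is evaluated two neurons at a time: with
+ alpha = 4, grad_s_to_h = alpha * s * (1 - s) where s = sigmoid(alpha * (h - v_threshold)),
+ computed on packed half2 lanes with h2exp and __h2div. Because this variant uses a hard reset
+ with detach_reset=True, grad_v_to_h is simply 1 - spike_seq[t], so the surrogate term reaches
+ grad_h only through grad_spike_seq[t] * grad_s_to_h.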
+ */ + + atomicAdd(grad_reciprocal_tau, sdata[0]); + + } + } + +// MultiStepParametricLIFNodePTT bptt Sigmoid, decay_input=True, hard_reset=True, dtype=fp16, detach_reset=False + + #include + extern "C" __global__ + void ParametricLIFNode_bptt_decayInputTrue_hardReset__fp16( + const half2* grad_spike_seq, const half2* grad_v_seq, const half2* h_seq, const half2* spike_seq, const half2* v_v_seq, + half2* grad_x_seq, half2* grad_v_last, float* grad_reciprocal_tau, + const half & reciprocal_tau, const half & one_sub_reciprocal_tau, + const half & v_threshold, const half & v_reset, + const int & neuron_num, const int & numel)\ + // note that grad_reciprocal_tau is float to avoid overflow + + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + const int stride = neuron_num >> 1; + + __shared__ float sdata[1024]; + if (index < stride) + { + const half2 reciprocal_tau_half2 = __half2half2(reciprocal_tau); + const half2 one_sub_reciprocal_tau_half2 = __half2half2(one_sub_reciprocal_tau); + const half2 v_threshold_half2 = __half2half2(v_threshold); + + const half2 v_reset_half2 = __half2half2(v_reset); + + + half2 grad_h = __float2half2_rn(0.0f); // grad_h will be used recursively + sdata[threadIdx.x] = 0.0f; + for(int mem_offset = (numel >> 1) - stride; mem_offset >= 0; mem_offset -= stride) + { + const int t = index + mem_offset; + + const half2 over_th = __hsub2(h_seq[t], v_threshold_half2); + + + // start: spikingjelly.clock_driven.surrogate.Sigmoid.cuda_code + + const half2 sg_Sigmoid_alpha = __float2half2_rn(4.0f); + const half2 sg_Sigmoid_sigmoid_ax = __h2div(__float2half2_rn(1.0f), __hadd2(h2exp(__hneg2(__hmul2(sg_Sigmoid_alpha, over_th))), __float2half2_rn(1.0f))); + const half2 grad_s_to_h = __hmul2(__hmul2(__hsub2(__float2half2_rn(1.0f), sg_Sigmoid_sigmoid_ax), sg_Sigmoid_sigmoid_ax), sg_Sigmoid_alpha); + + // end: spikingjelly.clock_driven.surrogate.Sigmoid.cuda_code + + const half2 grad_v_to_h = __hfma2(__hsub2(v_reset_half2, h_seq[t]), grad_s_to_h, __hsub2(__float2half2_rn(1.0f), spike_seq[t])); + + grad_h = __hfma2(__hfma2(grad_h, one_sub_reciprocal_tau_half2, grad_v_seq[t]), grad_v_to_h, __hmul2(grad_spike_seq[t], grad_s_to_h)); + + grad_x_seq[t] = __hmul2(grad_h, reciprocal_tau_half2); + half2 temp_sum = __h2div(__hmul2(grad_h, __hsub2(h_seq[t], v_v_seq[t])), reciprocal_tau_half2); + sdata[threadIdx.x] += __half2float(__hadd(__low2half(temp_sum), __high2half(temp_sum))); + + } + grad_v_last[index] = __hmul2(grad_h, one_sub_reciprocal_tau_half2); + } + else + { + sdata[threadIdx.x] = 0.0f; + } + int threadx = blockDim.x; + #pragma unroll + for (int i = threadx >> 1; i > 0; i = i >> 1) + { + // Synchronize all thread before next loop + __syncthreads(); + if (threadIdx.x < i) + { + sdata[threadIdx.x] += sdata[threadIdx.x + i]; + } + } + __syncthreads(); + if (threadIdx.x == 0) + { + /* + The 32-bit floating-point version of atomicAdd() is only supported by devices of compute capability 2.x and higher. + + The 64-bit floating-point version of atomicAdd() is only supported by devices of compute capability 6.x and higher. + + The 32-bit __half2 floating-point version of atomicAdd() is only supported by devices of compute capability 6.x and higher. The atomicity of the __half2 or __nv_bfloat162 add operation is guaranteed separately for each of the two __half or __nv_bfloat16 elements; the entire __half2 or __nv_bfloat162 is not guaranteed to be atomic as a single 32-bit access. 
+ + The 16-bit __half floating-point version of atomicAdd() is only supported by devices of compute capability 7.x and higher. + + The 16-bit __nv_bfloat16 floating-point version of atomicAdd() is only supported by devices of compute capability 8.x and higher. + */ + + atomicAdd(grad_reciprocal_tau, sdata[0]); + + } + } + +// MultiStepParametricLIFNodePTT fptt Sigmoid, decay_input=False, hard_reset=True, dtype=fp16 + + #include + extern "C" __global__ + void ParametricLIFNode_fptt_decayInputFalse_hardReset_fp16(const half2* x_seq, half2* v_v_seq, half2* h_seq, half2* spike_seq, + const half & reciprocal_tau, + const half & v_threshold, const half & v_reset, + const int & neuron_num, const int & numel) + + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + const int stride = neuron_num >> 1; + if (index < stride) + { + const int numel_2 = numel >> 1; + const half2 reciprocal_tau_half2 = __half2half2(reciprocal_tau); + const half2 v_threshold_half2 = __half2half2(v_threshold); + + const half2 v_reset_half2 = __half2half2(v_reset); + + for(int mem_offset = 0; mem_offset < numel_2; mem_offset += stride) + { + const int t = index + mem_offset; + + // h_seq[t] = v_v_seq[t] - reciprocal_tau * (v_v_seq[t] - v_reset) + x_seq[t]; + // = reciprocal_tau * (v_reset - v_v_seq[t]) + v_v_seq[t] + x_seq[t]; + h_seq[t] = __hadd2(__hfma2(__hsub2(v_reset_half2, v_v_seq[t]), reciprocal_tau_half2, v_v_seq[t]), x_seq[t]); + + spike_seq[t] = __hgeu2(h_seq[t], v_threshold_half2); + v_v_seq[t + stride] = __hadd2(__hmul2(spike_seq[t], v_reset_half2), __hmul2(__hsub2(__float2half2_rn(1.0f), spike_seq[t]), h_seq[t])); + + } + } + } + +// MultiStepParametricLIFNodePTT bptt Sigmoid, decay_input=False, hard_reset=True, dtype=fp16, detach_reset=True + + #include + extern "C" __global__ + void ParametricLIFNode_bptt_decayInputFalse_hardReset_detachReset_fp16( + const half2* grad_spike_seq, const half2* grad_v_seq, const half2* h_seq, const half2* spike_seq, const half2* v_v_seq, + half2* grad_x_seq, half2* grad_v_last, float* grad_reciprocal_tau, + const half & reciprocal_tau, const half & one_sub_reciprocal_tau, + const half & v_threshold, const half & v_reset, + const int & neuron_num, const int & numel)\ + // note that grad_reciprocal_tau is float to avoid overflow + + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + const int stride = neuron_num >> 1; + + __shared__ float sdata[1024]; + if (index < stride) + { + const half2 reciprocal_tau_half2 = __half2half2(reciprocal_tau); + const half2 one_sub_reciprocal_tau_half2 = __half2half2(one_sub_reciprocal_tau); + const half2 v_threshold_half2 = __half2half2(v_threshold); + + const half2 v_reset_half2 = __half2half2(v_reset); + + + half2 grad_h = __float2half2_rn(0.0f); // grad_h will be used recursively + sdata[threadIdx.x] = 0.0f; + for(int mem_offset = (numel >> 1) - stride; mem_offset >= 0; mem_offset -= stride) + { + const int t = index + mem_offset; + + const half2 over_th = __hsub2(h_seq[t], v_threshold_half2); + + + // start: spikingjelly.clock_driven.surrogate.Sigmoid.cuda_code + + const half2 sg_Sigmoid_alpha = __float2half2_rn(4.0f); + const half2 sg_Sigmoid_sigmoid_ax = __h2div(__float2half2_rn(1.0f), __hadd2(h2exp(__hneg2(__hmul2(sg_Sigmoid_alpha, over_th))), __float2half2_rn(1.0f))); + const half2 grad_s_to_h = __hmul2(__hmul2(__hsub2(__float2half2_rn(1.0f), sg_Sigmoid_sigmoid_ax), sg_Sigmoid_sigmoid_ax), sg_Sigmoid_alpha); + + // end: spikingjelly.clock_driven.surrogate.Sigmoid.cuda_code + + const half2 grad_v_to_h = 
__hsub2(__float2half2_rn(1.0f), spike_seq[t]); + + grad_h = __hfma2(__hfma2(grad_h, one_sub_reciprocal_tau_half2, grad_v_seq[t]), grad_v_to_h, __hmul2(grad_spike_seq[t], grad_s_to_h)); + + grad_x_seq[t] = grad_h; + half2 temp_sum = __hmul2(grad_h, __hsub2(v_reset_half2, v_v_seq[t])); + sdata[threadIdx.x] += __half2float(__hadd(__low2half(temp_sum), __high2half(temp_sum))); + + } + grad_v_last[index] = __hmul2(grad_h, one_sub_reciprocal_tau_half2); + } + else + { + sdata[threadIdx.x] = 0.0f; + } + int threadx = blockDim.x; + #pragma unroll + for (int i = threadx >> 1; i > 0; i = i >> 1) + { + // Synchronize all thread before next loop + __syncthreads(); + if (threadIdx.x < i) + { + sdata[threadIdx.x] += sdata[threadIdx.x + i]; + } + } + __syncthreads(); + if (threadIdx.x == 0) + { + /* + The 32-bit floating-point version of atomicAdd() is only supported by devices of compute capability 2.x and higher. + + The 64-bit floating-point version of atomicAdd() is only supported by devices of compute capability 6.x and higher. + + The 32-bit __half2 floating-point version of atomicAdd() is only supported by devices of compute capability 6.x and higher. The atomicity of the __half2 or __nv_bfloat162 add operation is guaranteed separately for each of the two __half or __nv_bfloat16 elements; the entire __half2 or __nv_bfloat162 is not guaranteed to be atomic as a single 32-bit access. + + The 16-bit __half floating-point version of atomicAdd() is only supported by devices of compute capability 7.x and higher. + + The 16-bit __nv_bfloat16 floating-point version of atomicAdd() is only supported by devices of compute capability 8.x and higher. + */ + + atomicAdd(grad_reciprocal_tau, sdata[0]); + + } + } + +// MultiStepParametricLIFNodePTT bptt Sigmoid, decay_input=False, hard_reset=True, dtype=fp16, detach_reset=False + + #include + extern "C" __global__ + void ParametricLIFNode_bptt_decayInputFalse_hardReset__fp16( + const half2* grad_spike_seq, const half2* grad_v_seq, const half2* h_seq, const half2* spike_seq, const half2* v_v_seq, + half2* grad_x_seq, half2* grad_v_last, float* grad_reciprocal_tau, + const half & reciprocal_tau, const half & one_sub_reciprocal_tau, + const half & v_threshold, const half & v_reset, + const int & neuron_num, const int & numel)\ + // note that grad_reciprocal_tau is float to avoid overflow + + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + const int stride = neuron_num >> 1; + + __shared__ float sdata[1024]; + if (index < stride) + { + const half2 reciprocal_tau_half2 = __half2half2(reciprocal_tau); + const half2 one_sub_reciprocal_tau_half2 = __half2half2(one_sub_reciprocal_tau); + const half2 v_threshold_half2 = __half2half2(v_threshold); + + const half2 v_reset_half2 = __half2half2(v_reset); + + + half2 grad_h = __float2half2_rn(0.0f); // grad_h will be used recursively + sdata[threadIdx.x] = 0.0f; + for(int mem_offset = (numel >> 1) - stride; mem_offset >= 0; mem_offset -= stride) + { + const int t = index + mem_offset; + + const half2 over_th = __hsub2(h_seq[t], v_threshold_half2); + + + // start: spikingjelly.clock_driven.surrogate.Sigmoid.cuda_code + + const half2 sg_Sigmoid_alpha = __float2half2_rn(4.0f); + const half2 sg_Sigmoid_sigmoid_ax = __h2div(__float2half2_rn(1.0f), __hadd2(h2exp(__hneg2(__hmul2(sg_Sigmoid_alpha, over_th))), __float2half2_rn(1.0f))); + const half2 grad_s_to_h = __hmul2(__hmul2(__hsub2(__float2half2_rn(1.0f), sg_Sigmoid_sigmoid_ax), sg_Sigmoid_sigmoid_ax), sg_Sigmoid_alpha); + + // end: 
spikingjelly.clock_driven.surrogate.Sigmoid.cuda_code + + const half2 grad_v_to_h = __hfma2(__hsub2(v_reset_half2, h_seq[t]), grad_s_to_h, __hsub2(__float2half2_rn(1.0f), spike_seq[t])); + + grad_h = __hfma2(__hfma2(grad_h, one_sub_reciprocal_tau_half2, grad_v_seq[t]), grad_v_to_h, __hmul2(grad_spike_seq[t], grad_s_to_h)); + + grad_x_seq[t] = grad_h; + half2 temp_sum = __hmul2(grad_h, __hsub2(v_reset_half2, v_v_seq[t])); + sdata[threadIdx.x] += __half2float(__hadd(__low2half(temp_sum), __high2half(temp_sum))); + + } + grad_v_last[index] = __hmul2(grad_h, one_sub_reciprocal_tau_half2); + } + else + { + sdata[threadIdx.x] = 0.0f; + } + int threadx = blockDim.x; + #pragma unroll + for (int i = threadx >> 1; i > 0; i = i >> 1) + { + // Synchronize all thread before next loop + __syncthreads(); + if (threadIdx.x < i) + { + sdata[threadIdx.x] += sdata[threadIdx.x + i]; + } + } + __syncthreads(); + if (threadIdx.x == 0) + { + /* + The 32-bit floating-point version of atomicAdd() is only supported by devices of compute capability 2.x and higher. + + The 64-bit floating-point version of atomicAdd() is only supported by devices of compute capability 6.x and higher. + + The 32-bit __half2 floating-point version of atomicAdd() is only supported by devices of compute capability 6.x and higher. The atomicity of the __half2 or __nv_bfloat162 add operation is guaranteed separately for each of the two __half or __nv_bfloat16 elements; the entire __half2 or __nv_bfloat162 is not guaranteed to be atomic as a single 32-bit access. + + The 16-bit __half floating-point version of atomicAdd() is only supported by devices of compute capability 7.x and higher. + + The 16-bit __nv_bfloat16 floating-point version of atomicAdd() is only supported by devices of compute capability 8.x and higher. 
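+
+ For the decay_input=False variants the charge step is h[t] = v[t] - (v[t] - v_reset) / tau + x[t]
+ (hard reset) or h[t] = v[t] * (1 - 1/tau) + x[t] (soft reset), so dh/dx = 1 and grad_x_seq[t] is
+ simply grad_h, without the 1/tau factor applied by the decay_input=True kernels. Likewise
+ dh/d(1/tau) is v_reset - v[t] (hard reset) or -v[t] (soft reset), which is why the per-thread
+ partial sum here accumulates grad_h * (v_reset - v[t]) rather than grad_h * (h[t] - v[t]) / (1/tau).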
+ */ + + atomicAdd(grad_reciprocal_tau, sdata[0]); + + } + } + +// MultiStepParametricLIFNodePTT fptt Sigmoid, decay_input=True, hard_reset=False, dtype=fp32 + + extern "C" __global__ + void ParametricLIFNode_fptt_decayInputTrue_softReset_fp32(const float* x_seq, float* v_v_seq, float* h_seq, float* spike_seq, + const float & reciprocal_tau, + const float & v_threshold, + const int & neuron_num, const int & numel) + + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index < neuron_num) + { + const int dt = neuron_num; + for(int mem_offset = 0; mem_offset < numel; mem_offset += neuron_num) + { + const int t = index + mem_offset; + + h_seq[t] = v_v_seq[t] + reciprocal_tau * (x_seq[t] - v_v_seq[t]); + + if (h_seq[t] >= v_threshold) + { + spike_seq[t] = 1.0f; + v_v_seq[t + dt] = h_seq[t] - v_threshold; + } + + else + { + spike_seq[t] = 0.0f; + v_v_seq[t + dt] = h_seq[t]; + } + + } + } + } + +// MultiStepParametricLIFNodePTT bptt Sigmoid, decay_input=True, hard_reset=False, dtype=fp32, detach_reset=True + + extern "C" __global__ + void ParametricLIFNode_bptt_decayInputTrue_softReset_detachReset_fp32( + const float* grad_spike_seq, const float* grad_v_seq, const float* h_seq, const float* spike_seq, const float* v_v_seq, + float* grad_x_seq, float* grad_v_last, float* grad_reciprocal_tau, + const float & reciprocal_tau, const float & one_sub_reciprocal_tau, + const float & v_threshold, + const int & neuron_num, const int & numel) + + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + __shared__ float sdata[1024]; + if (index < neuron_num) + { + float grad_h = 0.0f; // grad_h will be used recursively + sdata[threadIdx.x] = 0.0f; + for(int mem_offset = numel - neuron_num; mem_offset >= 0; mem_offset -= neuron_num) + { + const int t = index + mem_offset; + const float over_th = h_seq[t] - v_threshold; + + // start: spikingjelly.clock_driven.surrogate.Sigmoid.cuda_code + + const float sg_Sigmoid_sigmoid_ax = 1.0f / (1.0f + expf(- 4.0f * over_th)); + const float grad_s_to_h = (1.0f - sg_Sigmoid_sigmoid_ax) * sg_Sigmoid_sigmoid_ax * 4.0f; + + // end: spikingjelly.clock_driven.surrogate.Sigmoid.cuda_code + + const float grad_v_to_h = 1.0f; + + grad_h = grad_spike_seq[t] * grad_s_to_h + (grad_v_seq[t] + grad_h * one_sub_reciprocal_tau) * grad_v_to_h; + // grad_h = fmaf(grad_spike_seq[t], grad_s_to_h, fmaf(grad_h, one_sub_reciprocal_tau, grad_v_seq[t]) * grad_v_to_h); + + grad_x_seq[t] = grad_h * reciprocal_tau; + sdata[threadIdx.x] += grad_h * (h_seq[t] - v_v_seq[t]) / reciprocal_tau; + + } + grad_v_last[index] = grad_h * one_sub_reciprocal_tau; + } + else + { + sdata[threadIdx.x] = 0.0f; + } + int threadx = blockDim.x; + #pragma unroll + for (int stride = threadx >> 1; stride > 0; stride = stride >> 1) + { + // Synchronize all thread before next loop + __syncthreads(); + if (threadIdx.x < stride) + { + sdata[threadIdx.x] += sdata[threadIdx.x + stride]; + } + } + __syncthreads(); + if (threadIdx.x == 0) + { + atomicAdd(grad_reciprocal_tau, sdata[0]); + } + } + +// MultiStepParametricLIFNodePTT bptt Sigmoid, decay_input=True, hard_reset=False, dtype=fp32, detach_reset=False + + extern "C" __global__ + void ParametricLIFNode_bptt_decayInputTrue_softReset__fp32( + const float* grad_spike_seq, const float* grad_v_seq, const float* h_seq, const float* spike_seq, const float* v_v_seq, + float* grad_x_seq, float* grad_v_last, float* grad_reciprocal_tau, + const float & reciprocal_tau, const float & one_sub_reciprocal_tau, + const float & v_threshold, + const int & neuron_num, 
const int & numel) + + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + __shared__ float sdata[1024]; + if (index < neuron_num) + { + float grad_h = 0.0f; // grad_h will be used recursively + sdata[threadIdx.x] = 0.0f; + for(int mem_offset = numel - neuron_num; mem_offset >= 0; mem_offset -= neuron_num) + { + const int t = index + mem_offset; + const float over_th = h_seq[t] - v_threshold; + + // start: spikingjelly.clock_driven.surrogate.Sigmoid.cuda_code + + const float sg_Sigmoid_sigmoid_ax = 1.0f / (1.0f + expf(- 4.0f * over_th)); + const float grad_s_to_h = (1.0f - sg_Sigmoid_sigmoid_ax) * sg_Sigmoid_sigmoid_ax * 4.0f; + + // end: spikingjelly.clock_driven.surrogate.Sigmoid.cuda_code + + const float grad_v_to_h = 1.0f - v_threshold * grad_s_to_h; + // const float grad_v_to_h = fmaf(-v_threshold, grad_s_to_h, 1.0f); + + grad_h = grad_spike_seq[t] * grad_s_to_h + (grad_v_seq[t] + grad_h * one_sub_reciprocal_tau) * grad_v_to_h; + // grad_h = fmaf(grad_spike_seq[t], grad_s_to_h, fmaf(grad_h, one_sub_reciprocal_tau, grad_v_seq[t]) * grad_v_to_h); + + grad_x_seq[t] = grad_h * reciprocal_tau; + sdata[threadIdx.x] += grad_h * (h_seq[t] - v_v_seq[t]) / reciprocal_tau; + + } + grad_v_last[index] = grad_h * one_sub_reciprocal_tau; + } + else + { + sdata[threadIdx.x] = 0.0f; + } + int threadx = blockDim.x; + #pragma unroll + for (int stride = threadx >> 1; stride > 0; stride = stride >> 1) + { + // Synchronize all thread before next loop + __syncthreads(); + if (threadIdx.x < stride) + { + sdata[threadIdx.x] += sdata[threadIdx.x + stride]; + } + } + __syncthreads(); + if (threadIdx.x == 0) + { + atomicAdd(grad_reciprocal_tau, sdata[0]); + } + } + +// MultiStepParametricLIFNodePTT fptt Sigmoid, decay_input=False, hard_reset=False, dtype=fp32 + + extern "C" __global__ + void ParametricLIFNode_fptt_decayInputFalse_softReset_fp32(const float* x_seq, float* v_v_seq, float* h_seq, float* spike_seq, + const float & reciprocal_tau, + const float & v_threshold, + const int & neuron_num, const int & numel) + + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index < neuron_num) + { + const int dt = neuron_num; + for(int mem_offset = 0; mem_offset < numel; mem_offset += neuron_num) + { + const int t = index + mem_offset; + + h_seq[t] = v_v_seq[t] * (1.0f - reciprocal_tau) + x_seq[t]; + + if (h_seq[t] >= v_threshold) + { + spike_seq[t] = 1.0f; + v_v_seq[t + dt] = h_seq[t] - v_threshold; + } + + else + { + spike_seq[t] = 0.0f; + v_v_seq[t + dt] = h_seq[t]; + } + + } + } + } + +// MultiStepParametricLIFNodePTT bptt Sigmoid, decay_input=False, hard_reset=False, dtype=fp32, detach_reset=True + + extern "C" __global__ + void ParametricLIFNode_bptt_decayInputFalse_softReset_detachReset_fp32( + const float* grad_spike_seq, const float* grad_v_seq, const float* h_seq, const float* spike_seq, const float* v_v_seq, + float* grad_x_seq, float* grad_v_last, float* grad_reciprocal_tau, + const float & reciprocal_tau, const float & one_sub_reciprocal_tau, + const float & v_threshold, + const int & neuron_num, const int & numel) + + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + __shared__ float sdata[1024]; + if (index < neuron_num) + { + float grad_h = 0.0f; // grad_h will be used recursively + sdata[threadIdx.x] = 0.0f; + for(int mem_offset = numel - neuron_num; mem_offset >= 0; mem_offset -= neuron_num) + { + const int t = index + mem_offset; + const float over_th = h_seq[t] - v_threshold; + + // start: spikingjelly.clock_driven.surrogate.Sigmoid.cuda_code + + const float 
sg_Sigmoid_sigmoid_ax = 1.0f / (1.0f + expf(- 4.0f * over_th)); + const float grad_s_to_h = (1.0f - sg_Sigmoid_sigmoid_ax) * sg_Sigmoid_sigmoid_ax * 4.0f; + + // end: spikingjelly.clock_driven.surrogate.Sigmoid.cuda_code + + const float grad_v_to_h = 1.0f; + + grad_h = grad_spike_seq[t] * grad_s_to_h + (grad_v_seq[t] + grad_h * one_sub_reciprocal_tau) * grad_v_to_h; + // grad_h = fmaf(grad_spike_seq[t], grad_s_to_h, fmaf(grad_h, one_sub_reciprocal_tau, grad_v_seq[t]) * grad_v_to_h); + + grad_x_seq[t] = grad_h; + sdata[threadIdx.x] -= grad_h * v_v_seq[t]; + + } + grad_v_last[index] = grad_h * one_sub_reciprocal_tau; + } + else + { + sdata[threadIdx.x] = 0.0f; + } + int threadx = blockDim.x; + #pragma unroll + for (int stride = threadx >> 1; stride > 0; stride = stride >> 1) + { + // Synchronize all thread before next loop + __syncthreads(); + if (threadIdx.x < stride) + { + sdata[threadIdx.x] += sdata[threadIdx.x + stride]; + } + } + __syncthreads(); + if (threadIdx.x == 0) + { + atomicAdd(grad_reciprocal_tau, sdata[0]); + } + } + +// MultiStepParametricLIFNodePTT bptt Sigmoid, decay_input=False, hard_reset=False, dtype=fp32, detach_reset=False + + extern "C" __global__ + void ParametricLIFNode_bptt_decayInputFalse_softReset__fp32( + const float* grad_spike_seq, const float* grad_v_seq, const float* h_seq, const float* spike_seq, const float* v_v_seq, + float* grad_x_seq, float* grad_v_last, float* grad_reciprocal_tau, + const float & reciprocal_tau, const float & one_sub_reciprocal_tau, + const float & v_threshold, + const int & neuron_num, const int & numel) + + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + __shared__ float sdata[1024]; + if (index < neuron_num) + { + float grad_h = 0.0f; // grad_h will be used recursively + sdata[threadIdx.x] = 0.0f; + for(int mem_offset = numel - neuron_num; mem_offset >= 0; mem_offset -= neuron_num) + { + const int t = index + mem_offset; + const float over_th = h_seq[t] - v_threshold; + + // start: spikingjelly.clock_driven.surrogate.Sigmoid.cuda_code + + const float sg_Sigmoid_sigmoid_ax = 1.0f / (1.0f + expf(- 4.0f * over_th)); + const float grad_s_to_h = (1.0f - sg_Sigmoid_sigmoid_ax) * sg_Sigmoid_sigmoid_ax * 4.0f; + + // end: spikingjelly.clock_driven.surrogate.Sigmoid.cuda_code + + const float grad_v_to_h = 1.0f - v_threshold * grad_s_to_h; + // const float grad_v_to_h = fmaf(-v_threshold, grad_s_to_h, 1.0f); + + grad_h = grad_spike_seq[t] * grad_s_to_h + (grad_v_seq[t] + grad_h * one_sub_reciprocal_tau) * grad_v_to_h; + // grad_h = fmaf(grad_spike_seq[t], grad_s_to_h, fmaf(grad_h, one_sub_reciprocal_tau, grad_v_seq[t]) * grad_v_to_h); + + grad_x_seq[t] = grad_h; + sdata[threadIdx.x] -= grad_h * v_v_seq[t]; + + } + grad_v_last[index] = grad_h * one_sub_reciprocal_tau; + } + else + { + sdata[threadIdx.x] = 0.0f; + } + int threadx = blockDim.x; + #pragma unroll + for (int stride = threadx >> 1; stride > 0; stride = stride >> 1) + { + // Synchronize all thread before next loop + __syncthreads(); + if (threadIdx.x < stride) + { + sdata[threadIdx.x] += sdata[threadIdx.x + stride]; + } + } + __syncthreads(); + if (threadIdx.x == 0) + { + atomicAdd(grad_reciprocal_tau, sdata[0]); + } + } + +// MultiStepParametricLIFNodePTT fptt Sigmoid, decay_input=True, hard_reset=False, dtype=fp16 + + #include + extern "C" __global__ + void ParametricLIFNode_fptt_decayInputTrue_softReset_fp16(const half2* x_seq, half2* v_v_seq, half2* h_seq, half2* spike_seq, + const half & reciprocal_tau, + const half & v_threshold, + const int & neuron_num, const 
int & numel) + + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + const int stride = neuron_num >> 1; + if (index < stride) + { + const int numel_2 = numel >> 1; + const half2 reciprocal_tau_half2 = __half2half2(reciprocal_tau); + const half2 v_threshold_half2 = __half2half2(v_threshold); + + for(int mem_offset = 0; mem_offset < numel_2; mem_offset += stride) + { + const int t = index + mem_offset; + + h_seq[t] = __hfma2(__hsub2(x_seq[t], v_v_seq[t]), reciprocal_tau_half2, v_v_seq[t]); + + spike_seq[t] = __hgeu2(h_seq[t], v_threshold_half2); + v_v_seq[t + stride] = __hadd2(__hmul2(spike_seq[t], __hsub2(h_seq[t], v_threshold_half2)), __hmul2(__hsub2(__float2half2_rn(1.0f), spike_seq[t]), h_seq[t])); + + } + } + } + +// MultiStepParametricLIFNodePTT bptt Sigmoid, decay_input=True, hard_reset=False, dtype=fp16, detach_reset=True + + #include + extern "C" __global__ + void ParametricLIFNode_bptt_decayInputTrue_softReset_detachReset_fp16( + const half2* grad_spike_seq, const half2* grad_v_seq, const half2* h_seq, const half2* spike_seq, const half2* v_v_seq, + half2* grad_x_seq, half2* grad_v_last, float* grad_reciprocal_tau, + const half & reciprocal_tau, const half & one_sub_reciprocal_tau, + const half & v_threshold, + const int & neuron_num, const int & numel)\ + // note that grad_reciprocal_tau is float to avoid overflow + + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + const int stride = neuron_num >> 1; + + __shared__ float sdata[1024]; + if (index < stride) + { + const half2 reciprocal_tau_half2 = __half2half2(reciprocal_tau); + const half2 one_sub_reciprocal_tau_half2 = __half2half2(one_sub_reciprocal_tau); + const half2 v_threshold_half2 = __half2half2(v_threshold); + + + half2 grad_h = __float2half2_rn(0.0f); // grad_h will be used recursively + sdata[threadIdx.x] = 0.0f; + for(int mem_offset = (numel >> 1) - stride; mem_offset >= 0; mem_offset -= stride) + { + const int t = index + mem_offset; + + const half2 over_th = __hsub2(h_seq[t], v_threshold_half2); + + + // start: spikingjelly.clock_driven.surrogate.Sigmoid.cuda_code + + const half2 sg_Sigmoid_alpha = __float2half2_rn(4.0f); + const half2 sg_Sigmoid_sigmoid_ax = __h2div(__float2half2_rn(1.0f), __hadd2(h2exp(__hneg2(__hmul2(sg_Sigmoid_alpha, over_th))), __float2half2_rn(1.0f))); + const half2 grad_s_to_h = __hmul2(__hmul2(__hsub2(__float2half2_rn(1.0f), sg_Sigmoid_sigmoid_ax), sg_Sigmoid_sigmoid_ax), sg_Sigmoid_alpha); + + // end: spikingjelly.clock_driven.surrogate.Sigmoid.cuda_code + + const half2 grad_v_to_h = __float2half2_rn(1.0f); + + grad_h = __hfma2(__hfma2(grad_h, one_sub_reciprocal_tau_half2, grad_v_seq[t]), grad_v_to_h, __hmul2(grad_spike_seq[t], grad_s_to_h)); + + grad_x_seq[t] = __hmul2(grad_h, reciprocal_tau_half2); + half2 temp_sum = __h2div(__hmul2(grad_h, __hsub2(h_seq[t], v_v_seq[t])), reciprocal_tau_half2); + sdata[threadIdx.x] += __half2float(__hadd(__low2half(temp_sum), __high2half(temp_sum))); + + } + grad_v_last[index] = __hmul2(grad_h, one_sub_reciprocal_tau_half2); + } + else + { + sdata[threadIdx.x] = 0.0f; + } + int threadx = blockDim.x; + #pragma unroll + for (int i = threadx >> 1; i > 0; i = i >> 1) + { + // Synchronize all thread before next loop + __syncthreads(); + if (threadIdx.x < i) + { + sdata[threadIdx.x] += sdata[threadIdx.x + i]; + } + } + __syncthreads(); + if (threadIdx.x == 0) + { + /* + The 32-bit floating-point version of atomicAdd() is only supported by devices of compute capability 2.x and higher. 
+ + The 64-bit floating-point version of atomicAdd() is only supported by devices of compute capability 6.x and higher. + + The 32-bit __half2 floating-point version of atomicAdd() is only supported by devices of compute capability 6.x and higher. The atomicity of the __half2 or __nv_bfloat162 add operation is guaranteed separately for each of the two __half or __nv_bfloat16 elements; the entire __half2 or __nv_bfloat162 is not guaranteed to be atomic as a single 32-bit access. + + The 16-bit __half floating-point version of atomicAdd() is only supported by devices of compute capability 7.x and higher. + + The 16-bit __nv_bfloat16 floating-point version of atomicAdd() is only supported by devices of compute capability 8.x and higher. + */ + + atomicAdd(grad_reciprocal_tau, sdata[0]); + + } + } + +// MultiStepParametricLIFNodePTT bptt Sigmoid, decay_input=True, hard_reset=False, dtype=fp16, detach_reset=False + + #include + extern "C" __global__ + void ParametricLIFNode_bptt_decayInputTrue_softReset__fp16( + const half2* grad_spike_seq, const half2* grad_v_seq, const half2* h_seq, const half2* spike_seq, const half2* v_v_seq, + half2* grad_x_seq, half2* grad_v_last, float* grad_reciprocal_tau, + const half & reciprocal_tau, const half & one_sub_reciprocal_tau, + const half & v_threshold, + const int & neuron_num, const int & numel)\ + // note that grad_reciprocal_tau is float to avoid overflow + + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + const int stride = neuron_num >> 1; + + __shared__ float sdata[1024]; + if (index < stride) + { + const half2 reciprocal_tau_half2 = __half2half2(reciprocal_tau); + const half2 one_sub_reciprocal_tau_half2 = __half2half2(one_sub_reciprocal_tau); + const half2 v_threshold_half2 = __half2half2(v_threshold); + + + half2 grad_h = __float2half2_rn(0.0f); // grad_h will be used recursively + sdata[threadIdx.x] = 0.0f; + for(int mem_offset = (numel >> 1) - stride; mem_offset >= 0; mem_offset -= stride) + { + const int t = index + mem_offset; + + const half2 over_th = __hsub2(h_seq[t], v_threshold_half2); + + + // start: spikingjelly.clock_driven.surrogate.Sigmoid.cuda_code + + const half2 sg_Sigmoid_alpha = __float2half2_rn(4.0f); + const half2 sg_Sigmoid_sigmoid_ax = __h2div(__float2half2_rn(1.0f), __hadd2(h2exp(__hneg2(__hmul2(sg_Sigmoid_alpha, over_th))), __float2half2_rn(1.0f))); + const half2 grad_s_to_h = __hmul2(__hmul2(__hsub2(__float2half2_rn(1.0f), sg_Sigmoid_sigmoid_ax), sg_Sigmoid_sigmoid_ax), sg_Sigmoid_alpha); + + // end: spikingjelly.clock_driven.surrogate.Sigmoid.cuda_code + + const half2 grad_v_to_h = __hsub2(__float2half2_rn(1.0f), __hmul2(v_threshold_half2, grad_s_to_h)); + + grad_h = __hfma2(__hfma2(grad_h, one_sub_reciprocal_tau_half2, grad_v_seq[t]), grad_v_to_h, __hmul2(grad_spike_seq[t], grad_s_to_h)); + + grad_x_seq[t] = __hmul2(grad_h, reciprocal_tau_half2); + half2 temp_sum = __h2div(__hmul2(grad_h, __hsub2(h_seq[t], v_v_seq[t])), reciprocal_tau_half2); + sdata[threadIdx.x] += __half2float(__hadd(__low2half(temp_sum), __high2half(temp_sum))); + + } + grad_v_last[index] = __hmul2(grad_h, one_sub_reciprocal_tau_half2); + } + else + { + sdata[threadIdx.x] = 0.0f; + } + int threadx = blockDim.x; + #pragma unroll + for (int i = threadx >> 1; i > 0; i = i >> 1) + { + // Synchronize all thread before next loop + __syncthreads(); + if (threadIdx.x < i) + { + sdata[threadIdx.x] += sdata[threadIdx.x + i]; + } + } + __syncthreads(); + if (threadIdx.x == 0) + { + /* + The 32-bit floating-point version of atomicAdd() is only 
supported by devices of compute capability 2.x and higher. + + The 64-bit floating-point version of atomicAdd() is only supported by devices of compute capability 6.x and higher. + + The 32-bit __half2 floating-point version of atomicAdd() is only supported by devices of compute capability 6.x and higher. The atomicity of the __half2 or __nv_bfloat162 add operation is guaranteed separately for each of the two __half or __nv_bfloat16 elements; the entire __half2 or __nv_bfloat162 is not guaranteed to be atomic as a single 32-bit access. + + The 16-bit __half floating-point version of atomicAdd() is only supported by devices of compute capability 7.x and higher. + + The 16-bit __nv_bfloat16 floating-point version of atomicAdd() is only supported by devices of compute capability 8.x and higher. + */ + + atomicAdd(grad_reciprocal_tau, sdata[0]); + + } + } + +// MultiStepParametricLIFNodePTT fptt Sigmoid, decay_input=False, hard_reset=False, dtype=fp16 + + #include + extern "C" __global__ + void ParametricLIFNode_fptt_decayInputFalse_softReset_fp16(const half2* x_seq, half2* v_v_seq, half2* h_seq, half2* spike_seq, + const half & reciprocal_tau, + const half & v_threshold, + const int & neuron_num, const int & numel) + + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + const int stride = neuron_num >> 1; + if (index < stride) + { + const int numel_2 = numel >> 1; + const half2 reciprocal_tau_half2 = __half2half2(reciprocal_tau); + const half2 v_threshold_half2 = __half2half2(v_threshold); + + for(int mem_offset = 0; mem_offset < numel_2; mem_offset += stride) + { + const int t = index + mem_offset; + + // h_seq[t] = v_v_seq[t] * (1.0f - reciprocal_tau) + x_seq[t]; + h_seq[t] = __hfma2(__hsub2(__float2half2_rn(1.0f), reciprocal_tau_half2), v_v_seq[t], x_seq[t]); + + spike_seq[t] = __hgeu2(h_seq[t], v_threshold_half2); + v_v_seq[t + stride] = __hadd2(__hmul2(spike_seq[t], __hsub2(h_seq[t], v_threshold_half2)), __hmul2(__hsub2(__float2half2_rn(1.0f), spike_seq[t]), h_seq[t])); + + } + } + } + +// MultiStepParametricLIFNodePTT bptt Sigmoid, decay_input=False, hard_reset=False, dtype=fp16, detach_reset=True + + #include + extern "C" __global__ + void ParametricLIFNode_bptt_decayInputFalse_softReset_detachReset_fp16( + const half2* grad_spike_seq, const half2* grad_v_seq, const half2* h_seq, const half2* spike_seq, const half2* v_v_seq, + half2* grad_x_seq, half2* grad_v_last, float* grad_reciprocal_tau, + const half & reciprocal_tau, const half & one_sub_reciprocal_tau, + const half & v_threshold, + const int & neuron_num, const int & numel)\ + // note that grad_reciprocal_tau is float to avoid overflow + + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + const int stride = neuron_num >> 1; + + __shared__ float sdata[1024]; + if (index < stride) + { + const half2 reciprocal_tau_half2 = __half2half2(reciprocal_tau); + const half2 one_sub_reciprocal_tau_half2 = __half2half2(one_sub_reciprocal_tau); + const half2 v_threshold_half2 = __half2half2(v_threshold); + + + half2 grad_h = __float2half2_rn(0.0f); // grad_h will be used recursively + sdata[threadIdx.x] = 0.0f; + for(int mem_offset = (numel >> 1) - stride; mem_offset >= 0; mem_offset -= stride) + { + const int t = index + mem_offset; + + const half2 over_th = __hsub2(h_seq[t], v_threshold_half2); + + + // start: spikingjelly.clock_driven.surrogate.Sigmoid.cuda_code + + const half2 sg_Sigmoid_alpha = __float2half2_rn(4.0f); + const half2 sg_Sigmoid_sigmoid_ax = __h2div(__float2half2_rn(1.0f), 
__hadd2(h2exp(__hneg2(__hmul2(sg_Sigmoid_alpha, over_th))), __float2half2_rn(1.0f))); + const half2 grad_s_to_h = __hmul2(__hmul2(__hsub2(__float2half2_rn(1.0f), sg_Sigmoid_sigmoid_ax), sg_Sigmoid_sigmoid_ax), sg_Sigmoid_alpha); + + // end: spikingjelly.clock_driven.surrogate.Sigmoid.cuda_code + + const half2 grad_v_to_h = __float2half2_rn(1.0f); + + grad_h = __hfma2(__hfma2(grad_h, one_sub_reciprocal_tau_half2, grad_v_seq[t]), grad_v_to_h, __hmul2(grad_spike_seq[t], grad_s_to_h)); + + grad_x_seq[t] = grad_h; + half2 temp_sum = __hmul2(grad_h, __hneg2(v_v_seq[t])); + sdata[threadIdx.x] += __half2float(__hadd(__low2half(temp_sum), __high2half(temp_sum))); + + } + grad_v_last[index] = __hmul2(grad_h, one_sub_reciprocal_tau_half2); + } + else + { + sdata[threadIdx.x] = 0.0f; + } + int threadx = blockDim.x; + #pragma unroll + for (int i = threadx >> 1; i > 0; i = i >> 1) + { + // Synchronize all thread before next loop + __syncthreads(); + if (threadIdx.x < i) + { + sdata[threadIdx.x] += sdata[threadIdx.x + i]; + } + } + __syncthreads(); + if (threadIdx.x == 0) + { + /* + The 32-bit floating-point version of atomicAdd() is only supported by devices of compute capability 2.x and higher. + + The 64-bit floating-point version of atomicAdd() is only supported by devices of compute capability 6.x and higher. + + The 32-bit __half2 floating-point version of atomicAdd() is only supported by devices of compute capability 6.x and higher. The atomicity of the __half2 or __nv_bfloat162 add operation is guaranteed separately for each of the two __half or __nv_bfloat16 elements; the entire __half2 or __nv_bfloat162 is not guaranteed to be atomic as a single 32-bit access. + + The 16-bit __half floating-point version of atomicAdd() is only supported by devices of compute capability 7.x and higher. + + The 16-bit __nv_bfloat16 floating-point version of atomicAdd() is only supported by devices of compute capability 8.x and higher. 
+ */ + + atomicAdd(grad_reciprocal_tau, sdata[0]); + + } + } + +// MultiStepParametricLIFNodePTT bptt Sigmoid, decay_input=False, hard_reset=False, dtype=fp16, detach_reset=False + + #include + extern "C" __global__ + void ParametricLIFNode_bptt_decayInputFalse_softReset__fp16( + const half2* grad_spike_seq, const half2* grad_v_seq, const half2* h_seq, const half2* spike_seq, const half2* v_v_seq, + half2* grad_x_seq, half2* grad_v_last, float* grad_reciprocal_tau, + const half & reciprocal_tau, const half & one_sub_reciprocal_tau, + const half & v_threshold, + const int & neuron_num, const int & numel)\ + // note that grad_reciprocal_tau is float to avoid overflow + + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + const int stride = neuron_num >> 1; + + __shared__ float sdata[1024]; + if (index < stride) + { + const half2 reciprocal_tau_half2 = __half2half2(reciprocal_tau); + const half2 one_sub_reciprocal_tau_half2 = __half2half2(one_sub_reciprocal_tau); + const half2 v_threshold_half2 = __half2half2(v_threshold); + + + half2 grad_h = __float2half2_rn(0.0f); // grad_h will be used recursively + sdata[threadIdx.x] = 0.0f; + for(int mem_offset = (numel >> 1) - stride; mem_offset >= 0; mem_offset -= stride) + { + const int t = index + mem_offset; + + const half2 over_th = __hsub2(h_seq[t], v_threshold_half2); + + + // start: spikingjelly.clock_driven.surrogate.Sigmoid.cuda_code + + const half2 sg_Sigmoid_alpha = __float2half2_rn(4.0f); + const half2 sg_Sigmoid_sigmoid_ax = __h2div(__float2half2_rn(1.0f), __hadd2(h2exp(__hneg2(__hmul2(sg_Sigmoid_alpha, over_th))), __float2half2_rn(1.0f))); + const half2 grad_s_to_h = __hmul2(__hmul2(__hsub2(__float2half2_rn(1.0f), sg_Sigmoid_sigmoid_ax), sg_Sigmoid_sigmoid_ax), sg_Sigmoid_alpha); + + // end: spikingjelly.clock_driven.surrogate.Sigmoid.cuda_code + + const half2 grad_v_to_h = __hsub2(__float2half2_rn(1.0f), __hmul2(v_threshold_half2, grad_s_to_h)); + + grad_h = __hfma2(__hfma2(grad_h, one_sub_reciprocal_tau_half2, grad_v_seq[t]), grad_v_to_h, __hmul2(grad_spike_seq[t], grad_s_to_h)); + + grad_x_seq[t] = grad_h; + half2 temp_sum = __hmul2(grad_h, __hneg2(v_v_seq[t])); + sdata[threadIdx.x] += __half2float(__hadd(__low2half(temp_sum), __high2half(temp_sum))); + + } + grad_v_last[index] = __hmul2(grad_h, one_sub_reciprocal_tau_half2); + } + else + { + sdata[threadIdx.x] = 0.0f; + } + int threadx = blockDim.x; + #pragma unroll + for (int i = threadx >> 1; i > 0; i = i >> 1) + { + // Synchronize all thread before next loop + __syncthreads(); + if (threadIdx.x < i) + { + sdata[threadIdx.x] += sdata[threadIdx.x + i]; + } + } + __syncthreads(); + if (threadIdx.x == 0) + { + /* + The 32-bit floating-point version of atomicAdd() is only supported by devices of compute capability 2.x and higher. + + The 64-bit floating-point version of atomicAdd() is only supported by devices of compute capability 6.x and higher. + + The 32-bit __half2 floating-point version of atomicAdd() is only supported by devices of compute capability 6.x and higher. The atomicity of the __half2 or __nv_bfloat162 add operation is guaranteed separately for each of the two __half or __nv_bfloat16 elements; the entire __half2 or __nv_bfloat162 is not guaranteed to be atomic as a single 32-bit access. + + The 16-bit __half floating-point version of atomicAdd() is only supported by devices of compute capability 7.x and higher. + + The 16-bit __nv_bfloat16 floating-point version of atomicAdd() is only supported by devices of compute capability 8.x and higher. 
+ */ + + atomicAdd(grad_reciprocal_tau, sdata[0]); + + } + } + +// MultiStepParametricLIFNodePTT fptt PiecewiseLeakyReLU, decay_input=True, hard_reset=True, dtype=fp32 + + extern "C" __global__ + void ParametricLIFNode_fptt_decayInputTrue_hardReset_fp32(const float* x_seq, float* v_v_seq, float* h_seq, float* spike_seq, + const float & reciprocal_tau, + const float & v_threshold, const float & v_reset, + const int & neuron_num, const int & numel) + + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index < neuron_num) + { + const int dt = neuron_num; + for(int mem_offset = 0; mem_offset < numel; mem_offset += neuron_num) + { + const int t = index + mem_offset; + + h_seq[t] = v_v_seq[t] + reciprocal_tau * (x_seq[t] - v_v_seq[t] + v_reset); + + if (h_seq[t] >= v_threshold) + { + spike_seq[t] = 1.0f; + v_v_seq[t + dt] = v_reset; + } + + else + { + spike_seq[t] = 0.0f; + v_v_seq[t + dt] = h_seq[t]; + } + + } + } + } + +// MultiStepParametricLIFNodePTT bptt PiecewiseLeakyReLU, decay_input=True, hard_reset=True, dtype=fp32, detach_reset=True + + extern "C" __global__ + void ParametricLIFNode_bptt_decayInputTrue_hardReset_detachReset_fp32( + const float* grad_spike_seq, const float* grad_v_seq, const float* h_seq, const float* spike_seq, const float* v_v_seq, + float* grad_x_seq, float* grad_v_last, float* grad_reciprocal_tau, + const float & reciprocal_tau, const float & one_sub_reciprocal_tau, + const float & v_threshold, const float & v_reset, + const int & neuron_num, const int & numel) + + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + __shared__ float sdata[1024]; + if (index < neuron_num) + { + float grad_h = 0.0f; // grad_h will be used recursively + sdata[threadIdx.x] = 0.0f; + for(int mem_offset = numel - neuron_num; mem_offset >= 0; mem_offset -= neuron_num) + { + const int t = index + mem_offset; + const float over_th = h_seq[t] - v_threshold; + + // start: spikingjelly.clock_driven.surrogate.PiecewiseLeakyReLU.cuda_code + + const float sg_PiecewiseLeakyReLU_x_abs = fabsf(over_th); + float grad_s_to_h; + if (sg_PiecewiseLeakyReLU_x_abs > 1.0f) + { + grad_s_to_h = 0.01f; + } + else + { + grad_s_to_h = 1.0f; + } + + // end: spikingjelly.clock_driven.surrogate.PiecewiseLeakyReLU.cuda_code + + const float grad_v_to_h = 1.0f - spike_seq[t]; + + grad_h = grad_spike_seq[t] * grad_s_to_h + (grad_v_seq[t] + grad_h * one_sub_reciprocal_tau) * grad_v_to_h; + // grad_h = fmaf(grad_spike_seq[t], grad_s_to_h, fmaf(grad_h, one_sub_reciprocal_tau, grad_v_seq[t]) * grad_v_to_h); + + grad_x_seq[t] = grad_h * reciprocal_tau; + sdata[threadIdx.x] += grad_h * (h_seq[t] - v_v_seq[t]) / reciprocal_tau; + + } + grad_v_last[index] = grad_h * one_sub_reciprocal_tau; + } + else + { + sdata[threadIdx.x] = 0.0f; + } + int threadx = blockDim.x; + #pragma unroll + for (int stride = threadx >> 1; stride > 0; stride = stride >> 1) + { + // Synchronize all thread before next loop + __syncthreads(); + if (threadIdx.x < stride) + { + sdata[threadIdx.x] += sdata[threadIdx.x + stride]; + } + } + __syncthreads(); + if (threadIdx.x == 0) + { + atomicAdd(grad_reciprocal_tau, sdata[0]); + } + } + +// MultiStepParametricLIFNodePTT bptt PiecewiseLeakyReLU, decay_input=True, hard_reset=True, dtype=fp32, detach_reset=False + + extern "C" __global__ + void ParametricLIFNode_bptt_decayInputTrue_hardReset__fp32( + const float* grad_spike_seq, const float* grad_v_seq, const float* h_seq, const float* spike_seq, const float* v_v_seq, + float* grad_x_seq, float* grad_v_last, float* grad_reciprocal_tau, 
+ const float & reciprocal_tau, const float & one_sub_reciprocal_tau, + const float & v_threshold, const float & v_reset, + const int & neuron_num, const int & numel) + + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + __shared__ float sdata[1024]; + if (index < neuron_num) + { + float grad_h = 0.0f; // grad_h will be used recursively + sdata[threadIdx.x] = 0.0f; + for(int mem_offset = numel - neuron_num; mem_offset >= 0; mem_offset -= neuron_num) + { + const int t = index + mem_offset; + const float over_th = h_seq[t] - v_threshold; + + // start: spikingjelly.clock_driven.surrogate.PiecewiseLeakyReLU.cuda_code + + const float sg_PiecewiseLeakyReLU_x_abs = fabsf(over_th); + float grad_s_to_h; + if (sg_PiecewiseLeakyReLU_x_abs > 1.0f) + { + grad_s_to_h = 0.01f; + } + else + { + grad_s_to_h = 1.0f; + } + + // end: spikingjelly.clock_driven.surrogate.PiecewiseLeakyReLU.cuda_code + + const float grad_v_to_h = 1.0f - spike_seq[t] + (v_reset - h_seq[t]) * grad_s_to_h; + // const float grad_v_to_h = fmaf(v_reset - h_seq[t], grad_s_to_h, 1.0f - spike_seq[t]); + + grad_h = grad_spike_seq[t] * grad_s_to_h + (grad_v_seq[t] + grad_h * one_sub_reciprocal_tau) * grad_v_to_h; + // grad_h = fmaf(grad_spike_seq[t], grad_s_to_h, fmaf(grad_h, one_sub_reciprocal_tau, grad_v_seq[t]) * grad_v_to_h); + + grad_x_seq[t] = grad_h * reciprocal_tau; + sdata[threadIdx.x] += grad_h * (h_seq[t] - v_v_seq[t]) / reciprocal_tau; + + } + grad_v_last[index] = grad_h * one_sub_reciprocal_tau; + } + else + { + sdata[threadIdx.x] = 0.0f; + } + int threadx = blockDim.x; + #pragma unroll + for (int stride = threadx >> 1; stride > 0; stride = stride >> 1) + { + // Synchronize all thread before next loop + __syncthreads(); + if (threadIdx.x < stride) + { + sdata[threadIdx.x] += sdata[threadIdx.x + stride]; + } + } + __syncthreads(); + if (threadIdx.x == 0) + { + atomicAdd(grad_reciprocal_tau, sdata[0]); + } + } + +// MultiStepParametricLIFNodePTT fptt PiecewiseLeakyReLU, decay_input=False, hard_reset=True, dtype=fp32 + + extern "C" __global__ + void ParametricLIFNode_fptt_decayInputFalse_hardReset_fp32(const float* x_seq, float* v_v_seq, float* h_seq, float* spike_seq, + const float & reciprocal_tau, + const float & v_threshold, const float & v_reset, + const int & neuron_num, const int & numel) + + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index < neuron_num) + { + const int dt = neuron_num; + for(int mem_offset = 0; mem_offset < numel; mem_offset += neuron_num) + { + const int t = index + mem_offset; + + h_seq[t] = v_v_seq[t] - reciprocal_tau * (v_v_seq[t] - v_reset) + x_seq[t]; + + if (h_seq[t] >= v_threshold) + { + spike_seq[t] = 1.0f; + v_v_seq[t + dt] = v_reset; + } + + else + { + spike_seq[t] = 0.0f; + v_v_seq[t + dt] = h_seq[t]; + } + + } + } + } + +// MultiStepParametricLIFNodePTT bptt PiecewiseLeakyReLU, decay_input=False, hard_reset=True, dtype=fp32, detach_reset=True + + extern "C" __global__ + void ParametricLIFNode_bptt_decayInputFalse_hardReset_detachReset_fp32( + const float* grad_spike_seq, const float* grad_v_seq, const float* h_seq, const float* spike_seq, const float* v_v_seq, + float* grad_x_seq, float* grad_v_last, float* grad_reciprocal_tau, + const float & reciprocal_tau, const float & one_sub_reciprocal_tau, + const float & v_threshold, const float & v_reset, + const int & neuron_num, const int & numel) + + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + __shared__ float sdata[1024]; + if (index < neuron_num) + { + float grad_h = 0.0f; // grad_h will be 
used recursively + sdata[threadIdx.x] = 0.0f; + for(int mem_offset = numel - neuron_num; mem_offset >= 0; mem_offset -= neuron_num) + { + const int t = index + mem_offset; + const float over_th = h_seq[t] - v_threshold; + + // start: spikingjelly.clock_driven.surrogate.PiecewiseLeakyReLU.cuda_code + + const float sg_PiecewiseLeakyReLU_x_abs = fabsf(over_th); + float grad_s_to_h; + if (sg_PiecewiseLeakyReLU_x_abs > 1.0f) + { + grad_s_to_h = 0.01f; + } + else + { + grad_s_to_h = 1.0f; + } + + // end: spikingjelly.clock_driven.surrogate.PiecewiseLeakyReLU.cuda_code + + const float grad_v_to_h = 1.0f - spike_seq[t]; + + grad_h = grad_spike_seq[t] * grad_s_to_h + (grad_v_seq[t] + grad_h * one_sub_reciprocal_tau) * grad_v_to_h; + // grad_h = fmaf(grad_spike_seq[t], grad_s_to_h, fmaf(grad_h, one_sub_reciprocal_tau, grad_v_seq[t]) * grad_v_to_h); + + grad_x_seq[t] = grad_h; + sdata[threadIdx.x] += grad_h * (v_reset - v_v_seq[t]); + + } + grad_v_last[index] = grad_h * one_sub_reciprocal_tau; + } + else + { + sdata[threadIdx.x] = 0.0f; + } + int threadx = blockDim.x; + #pragma unroll + for (int stride = threadx >> 1; stride > 0; stride = stride >> 1) + { + // Synchronize all thread before next loop + __syncthreads(); + if (threadIdx.x < stride) + { + sdata[threadIdx.x] += sdata[threadIdx.x + stride]; + } + } + __syncthreads(); + if (threadIdx.x == 0) + { + atomicAdd(grad_reciprocal_tau, sdata[0]); + } + } + +// MultiStepParametricLIFNodePTT bptt PiecewiseLeakyReLU, decay_input=False, hard_reset=True, dtype=fp32, detach_reset=False + + extern "C" __global__ + void ParametricLIFNode_bptt_decayInputFalse_hardReset__fp32( + const float* grad_spike_seq, const float* grad_v_seq, const float* h_seq, const float* spike_seq, const float* v_v_seq, + float* grad_x_seq, float* grad_v_last, float* grad_reciprocal_tau, + const float & reciprocal_tau, const float & one_sub_reciprocal_tau, + const float & v_threshold, const float & v_reset, + const int & neuron_num, const int & numel) + + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + __shared__ float sdata[1024]; + if (index < neuron_num) + { + float grad_h = 0.0f; // grad_h will be used recursively + sdata[threadIdx.x] = 0.0f; + for(int mem_offset = numel - neuron_num; mem_offset >= 0; mem_offset -= neuron_num) + { + const int t = index + mem_offset; + const float over_th = h_seq[t] - v_threshold; + + // start: spikingjelly.clock_driven.surrogate.PiecewiseLeakyReLU.cuda_code + + const float sg_PiecewiseLeakyReLU_x_abs = fabsf(over_th); + float grad_s_to_h; + if (sg_PiecewiseLeakyReLU_x_abs > 1.0f) + { + grad_s_to_h = 0.01f; + } + else + { + grad_s_to_h = 1.0f; + } + + // end: spikingjelly.clock_driven.surrogate.PiecewiseLeakyReLU.cuda_code + + const float grad_v_to_h = 1.0f - spike_seq[t] + (v_reset - h_seq[t]) * grad_s_to_h; + // const float grad_v_to_h = fmaf(v_reset - h_seq[t], grad_s_to_h, 1.0f - spike_seq[t]); + + grad_h = grad_spike_seq[t] * grad_s_to_h + (grad_v_seq[t] + grad_h * one_sub_reciprocal_tau) * grad_v_to_h; + // grad_h = fmaf(grad_spike_seq[t], grad_s_to_h, fmaf(grad_h, one_sub_reciprocal_tau, grad_v_seq[t]) * grad_v_to_h); + + grad_x_seq[t] = grad_h; + sdata[threadIdx.x] += grad_h * (v_reset - v_v_seq[t]); + + } + grad_v_last[index] = grad_h * one_sub_reciprocal_tau; + } + else + { + sdata[threadIdx.x] = 0.0f; + } + int threadx = blockDim.x; + #pragma unroll + for (int stride = threadx >> 1; stride > 0; stride = stride >> 1) + { + // Synchronize all thread before next loop + __syncthreads(); + if (threadIdx.x < stride) + { + 
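+                // fold the partner thread's partial sum into this thread's slot; the stride
+                // halves each pass, so sdata[0] ends up holding the block-wide sum of the
+                // per-thread contributions to grad_reciprocal_tau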
sdata[threadIdx.x] += sdata[threadIdx.x + stride]; + } + } + __syncthreads(); + if (threadIdx.x == 0) + { + atomicAdd(grad_reciprocal_tau, sdata[0]); + } + } + +// MultiStepParametricLIFNodePTT fptt PiecewiseLeakyReLU, decay_input=True, hard_reset=True, dtype=fp16 + + #include + extern "C" __global__ + void ParametricLIFNode_fptt_decayInputTrue_hardReset_fp16(const half2* x_seq, half2* v_v_seq, half2* h_seq, half2* spike_seq, + const half & reciprocal_tau, + const half & v_threshold, const half & v_reset, + const int & neuron_num, const int & numel) + + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + const int stride = neuron_num >> 1; + if (index < stride) + { + const int numel_2 = numel >> 1; + const half2 reciprocal_tau_half2 = __half2half2(reciprocal_tau); + const half2 v_threshold_half2 = __half2half2(v_threshold); + + const half2 v_reset_half2 = __half2half2(v_reset); + + for(int mem_offset = 0; mem_offset < numel_2; mem_offset += stride) + { + const int t = index + mem_offset; + + h_seq[t] = __hfma2(__hadd2(__hsub2(x_seq[t], v_v_seq[t]), v_reset_half2), reciprocal_tau_half2, v_v_seq[t]); + + spike_seq[t] = __hgeu2(h_seq[t], v_threshold_half2); + v_v_seq[t + stride] = __hadd2(__hmul2(spike_seq[t], v_reset_half2), __hmul2(__hsub2(__float2half2_rn(1.0f), spike_seq[t]), h_seq[t])); + + } + } + } + +// MultiStepParametricLIFNodePTT bptt PiecewiseLeakyReLU, decay_input=True, hard_reset=True, dtype=fp16, detach_reset=True + + #include + extern "C" __global__ + void ParametricLIFNode_bptt_decayInputTrue_hardReset_detachReset_fp16( + const half2* grad_spike_seq, const half2* grad_v_seq, const half2* h_seq, const half2* spike_seq, const half2* v_v_seq, + half2* grad_x_seq, half2* grad_v_last, float* grad_reciprocal_tau, + const half & reciprocal_tau, const half & one_sub_reciprocal_tau, + const half & v_threshold, const half & v_reset, + const int & neuron_num, const int & numel)\ + // note that grad_reciprocal_tau is float to avoid overflow + + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + const int stride = neuron_num >> 1; + + __shared__ float sdata[1024]; + if (index < stride) + { + const half2 reciprocal_tau_half2 = __half2half2(reciprocal_tau); + const half2 one_sub_reciprocal_tau_half2 = __half2half2(one_sub_reciprocal_tau); + const half2 v_threshold_half2 = __half2half2(v_threshold); + + const half2 v_reset_half2 = __half2half2(v_reset); + + + half2 grad_h = __float2half2_rn(0.0f); // grad_h will be used recursively + sdata[threadIdx.x] = 0.0f; + for(int mem_offset = (numel >> 1) - stride; mem_offset >= 0; mem_offset -= stride) + { + const int t = index + mem_offset; + + const half2 over_th = __hsub2(h_seq[t], v_threshold_half2); + + + // start: spikingjelly.clock_driven.surrogate.PiecewiseLeakyReLU.cuda_code + + const half2 sg_PiecewiseLeakyReLU_x_abs = __habs2(over_th); + const half2 sg_PiecewiseLeakyReLU_x_abs_ge_w = __hge2(sg_PiecewiseLeakyReLU_x_abs, __float2half2_rn(1.0f)); + half2 grad_s_to_h = __hadd2(__hmul2(__float2half2_rn(0.01f), sg_PiecewiseLeakyReLU_x_abs_ge_w), __hmul2(__hsub2(__float2half2_rn(1.0f), sg_PiecewiseLeakyReLU_x_abs_ge_w), __float2half2_rn(1.0f))); + + // end: spikingjelly.clock_driven.surrogate.PiecewiseLeakyReLU.cuda_code + + const half2 grad_v_to_h = __hsub2(__float2half2_rn(1.0f), spike_seq[t]); + + grad_h = __hfma2(__hfma2(grad_h, one_sub_reciprocal_tau_half2, grad_v_seq[t]), grad_v_to_h, __hmul2(grad_spike_seq[t], grad_s_to_h)); + + grad_x_seq[t] = __hmul2(grad_h, reciprocal_tau_half2); + half2 temp_sum = 
__h2div(__hmul2(grad_h, __hsub2(h_seq[t], v_v_seq[t])), reciprocal_tau_half2); + sdata[threadIdx.x] += __half2float(__hadd(__low2half(temp_sum), __high2half(temp_sum))); + + } + grad_v_last[index] = __hmul2(grad_h, one_sub_reciprocal_tau_half2); + } + else + { + sdata[threadIdx.x] = 0.0f; + } + int threadx = blockDim.x; + #pragma unroll + for (int i = threadx >> 1; i > 0; i = i >> 1) + { + // Synchronize all thread before next loop + __syncthreads(); + if (threadIdx.x < i) + { + sdata[threadIdx.x] += sdata[threadIdx.x + i]; + } + } + __syncthreads(); + if (threadIdx.x == 0) + { + /* + The 32-bit floating-point version of atomicAdd() is only supported by devices of compute capability 2.x and higher. + + The 64-bit floating-point version of atomicAdd() is only supported by devices of compute capability 6.x and higher. + + The 32-bit __half2 floating-point version of atomicAdd() is only supported by devices of compute capability 6.x and higher. The atomicity of the __half2 or __nv_bfloat162 add operation is guaranteed separately for each of the two __half or __nv_bfloat16 elements; the entire __half2 or __nv_bfloat162 is not guaranteed to be atomic as a single 32-bit access. + + The 16-bit __half floating-point version of atomicAdd() is only supported by devices of compute capability 7.x and higher. + + The 16-bit __nv_bfloat16 floating-point version of atomicAdd() is only supported by devices of compute capability 8.x and higher. + */ + + atomicAdd(grad_reciprocal_tau, sdata[0]); + + } + } + +// MultiStepParametricLIFNodePTT bptt PiecewiseLeakyReLU, decay_input=True, hard_reset=True, dtype=fp16, detach_reset=False + + #include + extern "C" __global__ + void ParametricLIFNode_bptt_decayInputTrue_hardReset__fp16( + const half2* grad_spike_seq, const half2* grad_v_seq, const half2* h_seq, const half2* spike_seq, const half2* v_v_seq, + half2* grad_x_seq, half2* grad_v_last, float* grad_reciprocal_tau, + const half & reciprocal_tau, const half & one_sub_reciprocal_tau, + const half & v_threshold, const half & v_reset, + const int & neuron_num, const int & numel)\ + // note that grad_reciprocal_tau is float to avoid overflow + + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + const int stride = neuron_num >> 1; + + __shared__ float sdata[1024]; + if (index < stride) + { + const half2 reciprocal_tau_half2 = __half2half2(reciprocal_tau); + const half2 one_sub_reciprocal_tau_half2 = __half2half2(one_sub_reciprocal_tau); + const half2 v_threshold_half2 = __half2half2(v_threshold); + + const half2 v_reset_half2 = __half2half2(v_reset); + + + half2 grad_h = __float2half2_rn(0.0f); // grad_h will be used recursively + sdata[threadIdx.x] = 0.0f; + for(int mem_offset = (numel >> 1) - stride; mem_offset >= 0; mem_offset -= stride) + { + const int t = index + mem_offset; + + const half2 over_th = __hsub2(h_seq[t], v_threshold_half2); + + + // start: spikingjelly.clock_driven.surrogate.PiecewiseLeakyReLU.cuda_code + + const half2 sg_PiecewiseLeakyReLU_x_abs = __habs2(over_th); + const half2 sg_PiecewiseLeakyReLU_x_abs_ge_w = __hge2(sg_PiecewiseLeakyReLU_x_abs, __float2half2_rn(1.0f)); + half2 grad_s_to_h = __hadd2(__hmul2(__float2half2_rn(0.01f), sg_PiecewiseLeakyReLU_x_abs_ge_w), __hmul2(__hsub2(__float2half2_rn(1.0f), sg_PiecewiseLeakyReLU_x_abs_ge_w), __float2half2_rn(1.0f))); + + // end: spikingjelly.clock_driven.surrogate.PiecewiseLeakyReLU.cuda_code + + const half2 grad_v_to_h = __hfma2(__hsub2(v_reset_half2, h_seq[t]), grad_s_to_h, __hsub2(__float2half2_rn(1.0f), spike_seq[t])); + + 
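+                // grad_h is reused across time steps: entering this iteration it holds dL/dH[t+1],
+                // and the update below computes
+                //   dL/dH[t] = grad_spike_seq[t] * dS/dH + (grad_v_seq[t] + dL/dH[t+1] * (1 - 1/tau)) * dV/dH
+                // where dS/dH is the surrogate gradient above and, since the reset is not detached,
+                //   dV/dH = (1 - spike_seq[t]) + (v_reset - h_seq[t]) * dS/dH.
+                // With decay_input=True, dH/dX = 1/tau and dH/d(1/tau) = (h_seq[t] - v_v_seq[t]) / (1/tau);
+                // that per-thread term is accumulated in the fp32 sdata buffer (kept in float to avoid overflow).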
grad_h = __hfma2(__hfma2(grad_h, one_sub_reciprocal_tau_half2, grad_v_seq[t]), grad_v_to_h, __hmul2(grad_spike_seq[t], grad_s_to_h)); + + grad_x_seq[t] = __hmul2(grad_h, reciprocal_tau_half2); + half2 temp_sum = __h2div(__hmul2(grad_h, __hsub2(h_seq[t], v_v_seq[t])), reciprocal_tau_half2); + sdata[threadIdx.x] += __half2float(__hadd(__low2half(temp_sum), __high2half(temp_sum))); + + } + grad_v_last[index] = __hmul2(grad_h, one_sub_reciprocal_tau_half2); + } + else + { + sdata[threadIdx.x] = 0.0f; + } + int threadx = blockDim.x; + #pragma unroll + for (int i = threadx >> 1; i > 0; i = i >> 1) + { + // Synchronize all thread before next loop + __syncthreads(); + if (threadIdx.x < i) + { + sdata[threadIdx.x] += sdata[threadIdx.x + i]; + } + } + __syncthreads(); + if (threadIdx.x == 0) + { + /* + The 32-bit floating-point version of atomicAdd() is only supported by devices of compute capability 2.x and higher. + + The 64-bit floating-point version of atomicAdd() is only supported by devices of compute capability 6.x and higher. + + The 32-bit __half2 floating-point version of atomicAdd() is only supported by devices of compute capability 6.x and higher. The atomicity of the __half2 or __nv_bfloat162 add operation is guaranteed separately for each of the two __half or __nv_bfloat16 elements; the entire __half2 or __nv_bfloat162 is not guaranteed to be atomic as a single 32-bit access. + + The 16-bit __half floating-point version of atomicAdd() is only supported by devices of compute capability 7.x and higher. + + The 16-bit __nv_bfloat16 floating-point version of atomicAdd() is only supported by devices of compute capability 8.x and higher. + */ + + atomicAdd(grad_reciprocal_tau, sdata[0]); + + } + } + +// MultiStepParametricLIFNodePTT fptt PiecewiseLeakyReLU, decay_input=False, hard_reset=True, dtype=fp16 + + #include + extern "C" __global__ + void ParametricLIFNode_fptt_decayInputFalse_hardReset_fp16(const half2* x_seq, half2* v_v_seq, half2* h_seq, half2* spike_seq, + const half & reciprocal_tau, + const half & v_threshold, const half & v_reset, + const int & neuron_num, const int & numel) + + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + const int stride = neuron_num >> 1; + if (index < stride) + { + const int numel_2 = numel >> 1; + const half2 reciprocal_tau_half2 = __half2half2(reciprocal_tau); + const half2 v_threshold_half2 = __half2half2(v_threshold); + + const half2 v_reset_half2 = __half2half2(v_reset); + + for(int mem_offset = 0; mem_offset < numel_2; mem_offset += stride) + { + const int t = index + mem_offset; + + // h_seq[t] = v_v_seq[t] - reciprocal_tau * (v_v_seq[t] - v_reset) + x_seq[t]; + // = reciprocal_tau * (v_reset - v_v_seq[t]) + v_v_seq[t] + x_seq[t]; + h_seq[t] = __hadd2(__hfma2(__hsub2(v_reset_half2, v_v_seq[t]), reciprocal_tau_half2, v_v_seq[t]), x_seq[t]); + + spike_seq[t] = __hgeu2(h_seq[t], v_threshold_half2); + v_v_seq[t + stride] = __hadd2(__hmul2(spike_seq[t], v_reset_half2), __hmul2(__hsub2(__float2half2_rn(1.0f), spike_seq[t]), h_seq[t])); + + } + } + } + +// MultiStepParametricLIFNodePTT bptt PiecewiseLeakyReLU, decay_input=False, hard_reset=True, dtype=fp16, detach_reset=True + + #include + extern "C" __global__ + void ParametricLIFNode_bptt_decayInputFalse_hardReset_detachReset_fp16( + const half2* grad_spike_seq, const half2* grad_v_seq, const half2* h_seq, const half2* spike_seq, const half2* v_v_seq, + half2* grad_x_seq, half2* grad_v_last, float* grad_reciprocal_tau, + const half & reciprocal_tau, const half & one_sub_reciprocal_tau, 
+ const half & v_threshold, const half & v_reset, + const int & neuron_num, const int & numel)\ + // note that grad_reciprocal_tau is float to avoid overflow + + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + const int stride = neuron_num >> 1; + + __shared__ float sdata[1024]; + if (index < stride) + { + const half2 reciprocal_tau_half2 = __half2half2(reciprocal_tau); + const half2 one_sub_reciprocal_tau_half2 = __half2half2(one_sub_reciprocal_tau); + const half2 v_threshold_half2 = __half2half2(v_threshold); + + const half2 v_reset_half2 = __half2half2(v_reset); + + + half2 grad_h = __float2half2_rn(0.0f); // grad_h will be used recursively + sdata[threadIdx.x] = 0.0f; + for(int mem_offset = (numel >> 1) - stride; mem_offset >= 0; mem_offset -= stride) + { + const int t = index + mem_offset; + + const half2 over_th = __hsub2(h_seq[t], v_threshold_half2); + + + // start: spikingjelly.clock_driven.surrogate.PiecewiseLeakyReLU.cuda_code + + const half2 sg_PiecewiseLeakyReLU_x_abs = __habs2(over_th); + const half2 sg_PiecewiseLeakyReLU_x_abs_ge_w = __hge2(sg_PiecewiseLeakyReLU_x_abs, __float2half2_rn(1.0f)); + half2 grad_s_to_h = __hadd2(__hmul2(__float2half2_rn(0.01f), sg_PiecewiseLeakyReLU_x_abs_ge_w), __hmul2(__hsub2(__float2half2_rn(1.0f), sg_PiecewiseLeakyReLU_x_abs_ge_w), __float2half2_rn(1.0f))); + + // end: spikingjelly.clock_driven.surrogate.PiecewiseLeakyReLU.cuda_code + + const half2 grad_v_to_h = __hsub2(__float2half2_rn(1.0f), spike_seq[t]); + + grad_h = __hfma2(__hfma2(grad_h, one_sub_reciprocal_tau_half2, grad_v_seq[t]), grad_v_to_h, __hmul2(grad_spike_seq[t], grad_s_to_h)); + + grad_x_seq[t] = grad_h; + half2 temp_sum = __hmul2(grad_h, __hsub2(v_reset_half2, v_v_seq[t])); + sdata[threadIdx.x] += __half2float(__hadd(__low2half(temp_sum), __high2half(temp_sum))); + + } + grad_v_last[index] = __hmul2(grad_h, one_sub_reciprocal_tau_half2); + } + else + { + sdata[threadIdx.x] = 0.0f; + } + int threadx = blockDim.x; + #pragma unroll + for (int i = threadx >> 1; i > 0; i = i >> 1) + { + // Synchronize all thread before next loop + __syncthreads(); + if (threadIdx.x < i) + { + sdata[threadIdx.x] += sdata[threadIdx.x + i]; + } + } + __syncthreads(); + if (threadIdx.x == 0) + { + /* + The 32-bit floating-point version of atomicAdd() is only supported by devices of compute capability 2.x and higher. + + The 64-bit floating-point version of atomicAdd() is only supported by devices of compute capability 6.x and higher. + + The 32-bit __half2 floating-point version of atomicAdd() is only supported by devices of compute capability 6.x and higher. The atomicity of the __half2 or __nv_bfloat162 add operation is guaranteed separately for each of the two __half or __nv_bfloat16 elements; the entire __half2 or __nv_bfloat162 is not guaranteed to be atomic as a single 32-bit access. + + The 16-bit __half floating-point version of atomicAdd() is only supported by devices of compute capability 7.x and higher. + + The 16-bit __nv_bfloat16 floating-point version of atomicAdd() is only supported by devices of compute capability 8.x and higher. 
+ */ + + atomicAdd(grad_reciprocal_tau, sdata[0]); + + } + } + +// MultiStepParametricLIFNodePTT bptt PiecewiseLeakyReLU, decay_input=False, hard_reset=True, dtype=fp16, detach_reset=False + + #include + extern "C" __global__ + void ParametricLIFNode_bptt_decayInputFalse_hardReset__fp16( + const half2* grad_spike_seq, const half2* grad_v_seq, const half2* h_seq, const half2* spike_seq, const half2* v_v_seq, + half2* grad_x_seq, half2* grad_v_last, float* grad_reciprocal_tau, + const half & reciprocal_tau, const half & one_sub_reciprocal_tau, + const half & v_threshold, const half & v_reset, + const int & neuron_num, const int & numel)\ + // note that grad_reciprocal_tau is float to avoid overflow + + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + const int stride = neuron_num >> 1; + + __shared__ float sdata[1024]; + if (index < stride) + { + const half2 reciprocal_tau_half2 = __half2half2(reciprocal_tau); + const half2 one_sub_reciprocal_tau_half2 = __half2half2(one_sub_reciprocal_tau); + const half2 v_threshold_half2 = __half2half2(v_threshold); + + const half2 v_reset_half2 = __half2half2(v_reset); + + + half2 grad_h = __float2half2_rn(0.0f); // grad_h will be used recursively + sdata[threadIdx.x] = 0.0f; + for(int mem_offset = (numel >> 1) - stride; mem_offset >= 0; mem_offset -= stride) + { + const int t = index + mem_offset; + + const half2 over_th = __hsub2(h_seq[t], v_threshold_half2); + + + // start: spikingjelly.clock_driven.surrogate.PiecewiseLeakyReLU.cuda_code + + const half2 sg_PiecewiseLeakyReLU_x_abs = __habs2(over_th); + const half2 sg_PiecewiseLeakyReLU_x_abs_ge_w = __hge2(sg_PiecewiseLeakyReLU_x_abs, __float2half2_rn(1.0f)); + half2 grad_s_to_h = __hadd2(__hmul2(__float2half2_rn(0.01f), sg_PiecewiseLeakyReLU_x_abs_ge_w), __hmul2(__hsub2(__float2half2_rn(1.0f), sg_PiecewiseLeakyReLU_x_abs_ge_w), __float2half2_rn(1.0f))); + + // end: spikingjelly.clock_driven.surrogate.PiecewiseLeakyReLU.cuda_code + + const half2 grad_v_to_h = __hfma2(__hsub2(v_reset_half2, h_seq[t]), grad_s_to_h, __hsub2(__float2half2_rn(1.0f), spike_seq[t])); + + grad_h = __hfma2(__hfma2(grad_h, one_sub_reciprocal_tau_half2, grad_v_seq[t]), grad_v_to_h, __hmul2(grad_spike_seq[t], grad_s_to_h)); + + grad_x_seq[t] = grad_h; + half2 temp_sum = __hmul2(grad_h, __hsub2(v_reset_half2, v_v_seq[t])); + sdata[threadIdx.x] += __half2float(__hadd(__low2half(temp_sum), __high2half(temp_sum))); + + } + grad_v_last[index] = __hmul2(grad_h, one_sub_reciprocal_tau_half2); + } + else + { + sdata[threadIdx.x] = 0.0f; + } + int threadx = blockDim.x; + #pragma unroll + for (int i = threadx >> 1; i > 0; i = i >> 1) + { + // Synchronize all thread before next loop + __syncthreads(); + if (threadIdx.x < i) + { + sdata[threadIdx.x] += sdata[threadIdx.x + i]; + } + } + __syncthreads(); + if (threadIdx.x == 0) + { + /* + The 32-bit floating-point version of atomicAdd() is only supported by devices of compute capability 2.x and higher. + + The 64-bit floating-point version of atomicAdd() is only supported by devices of compute capability 6.x and higher. + + The 32-bit __half2 floating-point version of atomicAdd() is only supported by devices of compute capability 6.x and higher. The atomicity of the __half2 or __nv_bfloat162 add operation is guaranteed separately for each of the two __half or __nv_bfloat16 elements; the entire __half2 or __nv_bfloat162 is not guaranteed to be atomic as a single 32-bit access. 
+ + The 16-bit __half floating-point version of atomicAdd() is only supported by devices of compute capability 7.x and higher. + + The 16-bit __nv_bfloat16 floating-point version of atomicAdd() is only supported by devices of compute capability 8.x and higher. + */ + + atomicAdd(grad_reciprocal_tau, sdata[0]); + + } + } + +// MultiStepParametricLIFNodePTT fptt PiecewiseLeakyReLU, decay_input=True, hard_reset=False, dtype=fp32 + + extern "C" __global__ + void ParametricLIFNode_fptt_decayInputTrue_softReset_fp32(const float* x_seq, float* v_v_seq, float* h_seq, float* spike_seq, + const float & reciprocal_tau, + const float & v_threshold, + const int & neuron_num, const int & numel) + + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index < neuron_num) + { + const int dt = neuron_num; + for(int mem_offset = 0; mem_offset < numel; mem_offset += neuron_num) + { + const int t = index + mem_offset; + + h_seq[t] = v_v_seq[t] + reciprocal_tau * (x_seq[t] - v_v_seq[t]); + + if (h_seq[t] >= v_threshold) + { + spike_seq[t] = 1.0f; + v_v_seq[t + dt] = h_seq[t] - v_threshold; + } + + else + { + spike_seq[t] = 0.0f; + v_v_seq[t + dt] = h_seq[t]; + } + + } + } + } + +// MultiStepParametricLIFNodePTT bptt PiecewiseLeakyReLU, decay_input=True, hard_reset=False, dtype=fp32, detach_reset=True + + extern "C" __global__ + void ParametricLIFNode_bptt_decayInputTrue_softReset_detachReset_fp32( + const float* grad_spike_seq, const float* grad_v_seq, const float* h_seq, const float* spike_seq, const float* v_v_seq, + float* grad_x_seq, float* grad_v_last, float* grad_reciprocal_tau, + const float & reciprocal_tau, const float & one_sub_reciprocal_tau, + const float & v_threshold, + const int & neuron_num, const int & numel) + + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + __shared__ float sdata[1024]; + if (index < neuron_num) + { + float grad_h = 0.0f; // grad_h will be used recursively + sdata[threadIdx.x] = 0.0f; + for(int mem_offset = numel - neuron_num; mem_offset >= 0; mem_offset -= neuron_num) + { + const int t = index + mem_offset; + const float over_th = h_seq[t] - v_threshold; + + // start: spikingjelly.clock_driven.surrogate.PiecewiseLeakyReLU.cuda_code + + const float sg_PiecewiseLeakyReLU_x_abs = fabsf(over_th); + float grad_s_to_h; + if (sg_PiecewiseLeakyReLU_x_abs > 1.0f) + { + grad_s_to_h = 0.01f; + } + else + { + grad_s_to_h = 1.0f; + } + + // end: spikingjelly.clock_driven.surrogate.PiecewiseLeakyReLU.cuda_code + + const float grad_v_to_h = 1.0f; + + grad_h = grad_spike_seq[t] * grad_s_to_h + (grad_v_seq[t] + grad_h * one_sub_reciprocal_tau) * grad_v_to_h; + // grad_h = fmaf(grad_spike_seq[t], grad_s_to_h, fmaf(grad_h, one_sub_reciprocal_tau, grad_v_seq[t]) * grad_v_to_h); + + grad_x_seq[t] = grad_h * reciprocal_tau; + sdata[threadIdx.x] += grad_h * (h_seq[t] - v_v_seq[t]) / reciprocal_tau; + + } + grad_v_last[index] = grad_h * one_sub_reciprocal_tau; + } + else + { + sdata[threadIdx.x] = 0.0f; + } + int threadx = blockDim.x; + #pragma unroll + for (int stride = threadx >> 1; stride > 0; stride = stride >> 1) + { + // Synchronize all thread before next loop + __syncthreads(); + if (threadIdx.x < stride) + { + sdata[threadIdx.x] += sdata[threadIdx.x + stride]; + } + } + __syncthreads(); + if (threadIdx.x == 0) + { + atomicAdd(grad_reciprocal_tau, sdata[0]); + } + } + +// MultiStepParametricLIFNodePTT bptt PiecewiseLeakyReLU, decay_input=True, hard_reset=False, dtype=fp32, detach_reset=False + + extern "C" __global__ + void 
ParametricLIFNode_bptt_decayInputTrue_softReset__fp32( + const float* grad_spike_seq, const float* grad_v_seq, const float* h_seq, const float* spike_seq, const float* v_v_seq, + float* grad_x_seq, float* grad_v_last, float* grad_reciprocal_tau, + const float & reciprocal_tau, const float & one_sub_reciprocal_tau, + const float & v_threshold, + const int & neuron_num, const int & numel) + + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + __shared__ float sdata[1024]; + if (index < neuron_num) + { + float grad_h = 0.0f; // grad_h will be used recursively + sdata[threadIdx.x] = 0.0f; + for(int mem_offset = numel - neuron_num; mem_offset >= 0; mem_offset -= neuron_num) + { + const int t = index + mem_offset; + const float over_th = h_seq[t] - v_threshold; + + // start: spikingjelly.clock_driven.surrogate.PiecewiseLeakyReLU.cuda_code + + const float sg_PiecewiseLeakyReLU_x_abs = fabsf(over_th); + float grad_s_to_h; + if (sg_PiecewiseLeakyReLU_x_abs > 1.0f) + { + grad_s_to_h = 0.01f; + } + else + { + grad_s_to_h = 1.0f; + } + + // end: spikingjelly.clock_driven.surrogate.PiecewiseLeakyReLU.cuda_code + + const float grad_v_to_h = 1.0f - v_threshold * grad_s_to_h; + // const float grad_v_to_h = fmaf(-v_threshold, grad_s_to_h, 1.0f); + + grad_h = grad_spike_seq[t] * grad_s_to_h + (grad_v_seq[t] + grad_h * one_sub_reciprocal_tau) * grad_v_to_h; + // grad_h = fmaf(grad_spike_seq[t], grad_s_to_h, fmaf(grad_h, one_sub_reciprocal_tau, grad_v_seq[t]) * grad_v_to_h); + + grad_x_seq[t] = grad_h * reciprocal_tau; + sdata[threadIdx.x] += grad_h * (h_seq[t] - v_v_seq[t]) / reciprocal_tau; + + } + grad_v_last[index] = grad_h * one_sub_reciprocal_tau; + } + else + { + sdata[threadIdx.x] = 0.0f; + } + int threadx = blockDim.x; + #pragma unroll + for (int stride = threadx >> 1; stride > 0; stride = stride >> 1) + { + // Synchronize all thread before next loop + __syncthreads(); + if (threadIdx.x < stride) + { + sdata[threadIdx.x] += sdata[threadIdx.x + stride]; + } + } + __syncthreads(); + if (threadIdx.x == 0) + { + atomicAdd(grad_reciprocal_tau, sdata[0]); + } + } + +// MultiStepParametricLIFNodePTT fptt PiecewiseLeakyReLU, decay_input=False, hard_reset=False, dtype=fp32 + + extern "C" __global__ + void ParametricLIFNode_fptt_decayInputFalse_softReset_fp32(const float* x_seq, float* v_v_seq, float* h_seq, float* spike_seq, + const float & reciprocal_tau, + const float & v_threshold, + const int & neuron_num, const int & numel) + + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index < neuron_num) + { + const int dt = neuron_num; + for(int mem_offset = 0; mem_offset < numel; mem_offset += neuron_num) + { + const int t = index + mem_offset; + + h_seq[t] = v_v_seq[t] * (1.0f - reciprocal_tau) + x_seq[t]; + + if (h_seq[t] >= v_threshold) + { + spike_seq[t] = 1.0f; + v_v_seq[t + dt] = h_seq[t] - v_threshold; + } + + else + { + spike_seq[t] = 0.0f; + v_v_seq[t + dt] = h_seq[t]; + } + + } + } + } + +// MultiStepParametricLIFNodePTT bptt PiecewiseLeakyReLU, decay_input=False, hard_reset=False, dtype=fp32, detach_reset=True + + extern "C" __global__ + void ParametricLIFNode_bptt_decayInputFalse_softReset_detachReset_fp32( + const float* grad_spike_seq, const float* grad_v_seq, const float* h_seq, const float* spike_seq, const float* v_v_seq, + float* grad_x_seq, float* grad_v_last, float* grad_reciprocal_tau, + const float & reciprocal_tau, const float & one_sub_reciprocal_tau, + const float & v_threshold, + const int & neuron_num, const int & numel) + + { + const int index = 
blockIdx.x * blockDim.x + threadIdx.x; + __shared__ float sdata[1024]; + if (index < neuron_num) + { + float grad_h = 0.0f; // grad_h will be used recursively + sdata[threadIdx.x] = 0.0f; + for(int mem_offset = numel - neuron_num; mem_offset >= 0; mem_offset -= neuron_num) + { + const int t = index + mem_offset; + const float over_th = h_seq[t] - v_threshold; + + // start: spikingjelly.clock_driven.surrogate.PiecewiseLeakyReLU.cuda_code + + const float sg_PiecewiseLeakyReLU_x_abs = fabsf(over_th); + float grad_s_to_h; + if (sg_PiecewiseLeakyReLU_x_abs > 1.0f) + { + grad_s_to_h = 0.01f; + } + else + { + grad_s_to_h = 1.0f; + } + + // end: spikingjelly.clock_driven.surrogate.PiecewiseLeakyReLU.cuda_code + + const float grad_v_to_h = 1.0f; + + grad_h = grad_spike_seq[t] * grad_s_to_h + (grad_v_seq[t] + grad_h * one_sub_reciprocal_tau) * grad_v_to_h; + // grad_h = fmaf(grad_spike_seq[t], grad_s_to_h, fmaf(grad_h, one_sub_reciprocal_tau, grad_v_seq[t]) * grad_v_to_h); + + grad_x_seq[t] = grad_h; + sdata[threadIdx.x] -= grad_h * v_v_seq[t]; + + } + grad_v_last[index] = grad_h * one_sub_reciprocal_tau; + } + else + { + sdata[threadIdx.x] = 0.0f; + } + int threadx = blockDim.x; + #pragma unroll + for (int stride = threadx >> 1; stride > 0; stride = stride >> 1) + { + // Synchronize all thread before next loop + __syncthreads(); + if (threadIdx.x < stride) + { + sdata[threadIdx.x] += sdata[threadIdx.x + stride]; + } + } + __syncthreads(); + if (threadIdx.x == 0) + { + atomicAdd(grad_reciprocal_tau, sdata[0]); + } + } + +// MultiStepParametricLIFNodePTT bptt PiecewiseLeakyReLU, decay_input=False, hard_reset=False, dtype=fp32, detach_reset=False + + extern "C" __global__ + void ParametricLIFNode_bptt_decayInputFalse_softReset__fp32( + const float* grad_spike_seq, const float* grad_v_seq, const float* h_seq, const float* spike_seq, const float* v_v_seq, + float* grad_x_seq, float* grad_v_last, float* grad_reciprocal_tau, + const float & reciprocal_tau, const float & one_sub_reciprocal_tau, + const float & v_threshold, + const int & neuron_num, const int & numel) + + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + __shared__ float sdata[1024]; + if (index < neuron_num) + { + float grad_h = 0.0f; // grad_h will be used recursively + sdata[threadIdx.x] = 0.0f; + for(int mem_offset = numel - neuron_num; mem_offset >= 0; mem_offset -= neuron_num) + { + const int t = index + mem_offset; + const float over_th = h_seq[t] - v_threshold; + + // start: spikingjelly.clock_driven.surrogate.PiecewiseLeakyReLU.cuda_code + + const float sg_PiecewiseLeakyReLU_x_abs = fabsf(over_th); + float grad_s_to_h; + if (sg_PiecewiseLeakyReLU_x_abs > 1.0f) + { + grad_s_to_h = 0.01f; + } + else + { + grad_s_to_h = 1.0f; + } + + // end: spikingjelly.clock_driven.surrogate.PiecewiseLeakyReLU.cuda_code + + const float grad_v_to_h = 1.0f - v_threshold * grad_s_to_h; + // const float grad_v_to_h = fmaf(-v_threshold, grad_s_to_h, 1.0f); + + grad_h = grad_spike_seq[t] * grad_s_to_h + (grad_v_seq[t] + grad_h * one_sub_reciprocal_tau) * grad_v_to_h; + // grad_h = fmaf(grad_spike_seq[t], grad_s_to_h, fmaf(grad_h, one_sub_reciprocal_tau, grad_v_seq[t]) * grad_v_to_h); + + grad_x_seq[t] = grad_h; + sdata[threadIdx.x] -= grad_h * v_v_seq[t]; + + } + grad_v_last[index] = grad_h * one_sub_reciprocal_tau; + } + else + { + sdata[threadIdx.x] = 0.0f; + } + int threadx = blockDim.x; + #pragma unroll + for (int stride = threadx >> 1; stride > 0; stride = stride >> 1) + { + // Synchronize all thread before next loop + __syncthreads(); + 
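+            // after the barrier, only threads with threadIdx.x < stride combine pairs of
+            // partial sums; the single atomicAdd by thread 0 then merges sdata[0] into
+            // grad_reciprocal_tau across blocks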
if (threadIdx.x < stride) + { + sdata[threadIdx.x] += sdata[threadIdx.x + stride]; + } + } + __syncthreads(); + if (threadIdx.x == 0) + { + atomicAdd(grad_reciprocal_tau, sdata[0]); + } + } + +// MultiStepParametricLIFNodePTT fptt PiecewiseLeakyReLU, decay_input=True, hard_reset=False, dtype=fp16 + + #include + extern "C" __global__ + void ParametricLIFNode_fptt_decayInputTrue_softReset_fp16(const half2* x_seq, half2* v_v_seq, half2* h_seq, half2* spike_seq, + const half & reciprocal_tau, + const half & v_threshold, + const int & neuron_num, const int & numel) + + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + const int stride = neuron_num >> 1; + if (index < stride) + { + const int numel_2 = numel >> 1; + const half2 reciprocal_tau_half2 = __half2half2(reciprocal_tau); + const half2 v_threshold_half2 = __half2half2(v_threshold); + + for(int mem_offset = 0; mem_offset < numel_2; mem_offset += stride) + { + const int t = index + mem_offset; + + h_seq[t] = __hfma2(__hsub2(x_seq[t], v_v_seq[t]), reciprocal_tau_half2, v_v_seq[t]); + + spike_seq[t] = __hgeu2(h_seq[t], v_threshold_half2); + v_v_seq[t + stride] = __hadd2(__hmul2(spike_seq[t], __hsub2(h_seq[t], v_threshold_half2)), __hmul2(__hsub2(__float2half2_rn(1.0f), spike_seq[t]), h_seq[t])); + + } + } + } + +// MultiStepParametricLIFNodePTT bptt PiecewiseLeakyReLU, decay_input=True, hard_reset=False, dtype=fp16, detach_reset=True + + #include + extern "C" __global__ + void ParametricLIFNode_bptt_decayInputTrue_softReset_detachReset_fp16( + const half2* grad_spike_seq, const half2* grad_v_seq, const half2* h_seq, const half2* spike_seq, const half2* v_v_seq, + half2* grad_x_seq, half2* grad_v_last, float* grad_reciprocal_tau, + const half & reciprocal_tau, const half & one_sub_reciprocal_tau, + const half & v_threshold, + const int & neuron_num, const int & numel)\ + // note that grad_reciprocal_tau is float to avoid overflow + + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + const int stride = neuron_num >> 1; + + __shared__ float sdata[1024]; + if (index < stride) + { + const half2 reciprocal_tau_half2 = __half2half2(reciprocal_tau); + const half2 one_sub_reciprocal_tau_half2 = __half2half2(one_sub_reciprocal_tau); + const half2 v_threshold_half2 = __half2half2(v_threshold); + + + half2 grad_h = __float2half2_rn(0.0f); // grad_h will be used recursively + sdata[threadIdx.x] = 0.0f; + for(int mem_offset = (numel >> 1) - stride; mem_offset >= 0; mem_offset -= stride) + { + const int t = index + mem_offset; + + const half2 over_th = __hsub2(h_seq[t], v_threshold_half2); + + + // start: spikingjelly.clock_driven.surrogate.PiecewiseLeakyReLU.cuda_code + + const half2 sg_PiecewiseLeakyReLU_x_abs = __habs2(over_th); + const half2 sg_PiecewiseLeakyReLU_x_abs_ge_w = __hge2(sg_PiecewiseLeakyReLU_x_abs, __float2half2_rn(1.0f)); + half2 grad_s_to_h = __hadd2(__hmul2(__float2half2_rn(0.01f), sg_PiecewiseLeakyReLU_x_abs_ge_w), __hmul2(__hsub2(__float2half2_rn(1.0f), sg_PiecewiseLeakyReLU_x_abs_ge_w), __float2half2_rn(1.0f))); + + // end: spikingjelly.clock_driven.surrogate.PiecewiseLeakyReLU.cuda_code + + const half2 grad_v_to_h = __float2half2_rn(1.0f); + + grad_h = __hfma2(__hfma2(grad_h, one_sub_reciprocal_tau_half2, grad_v_seq[t]), grad_v_to_h, __hmul2(grad_spike_seq[t], grad_s_to_h)); + + grad_x_seq[t] = __hmul2(grad_h, reciprocal_tau_half2); + half2 temp_sum = __h2div(__hmul2(grad_h, __hsub2(h_seq[t], v_v_seq[t])), reciprocal_tau_half2); + sdata[threadIdx.x] += __half2float(__hadd(__low2half(temp_sum), 
__high2half(temp_sum))); + + } + grad_v_last[index] = __hmul2(grad_h, one_sub_reciprocal_tau_half2); + } + else + { + sdata[threadIdx.x] = 0.0f; + } + int threadx = blockDim.x; + #pragma unroll + for (int i = threadx >> 1; i > 0; i = i >> 1) + { + // Synchronize all thread before next loop + __syncthreads(); + if (threadIdx.x < i) + { + sdata[threadIdx.x] += sdata[threadIdx.x + i]; + } + } + __syncthreads(); + if (threadIdx.x == 0) + { + /* + The 32-bit floating-point version of atomicAdd() is only supported by devices of compute capability 2.x and higher. + + The 64-bit floating-point version of atomicAdd() is only supported by devices of compute capability 6.x and higher. + + The 32-bit __half2 floating-point version of atomicAdd() is only supported by devices of compute capability 6.x and higher. The atomicity of the __half2 or __nv_bfloat162 add operation is guaranteed separately for each of the two __half or __nv_bfloat16 elements; the entire __half2 or __nv_bfloat162 is not guaranteed to be atomic as a single 32-bit access. + + The 16-bit __half floating-point version of atomicAdd() is only supported by devices of compute capability 7.x and higher. + + The 16-bit __nv_bfloat16 floating-point version of atomicAdd() is only supported by devices of compute capability 8.x and higher. + */ + + atomicAdd(grad_reciprocal_tau, sdata[0]); + + } + } + +// MultiStepParametricLIFNodePTT bptt PiecewiseLeakyReLU, decay_input=True, hard_reset=False, dtype=fp16, detach_reset=False + + #include + extern "C" __global__ + void ParametricLIFNode_bptt_decayInputTrue_softReset__fp16( + const half2* grad_spike_seq, const half2* grad_v_seq, const half2* h_seq, const half2* spike_seq, const half2* v_v_seq, + half2* grad_x_seq, half2* grad_v_last, float* grad_reciprocal_tau, + const half & reciprocal_tau, const half & one_sub_reciprocal_tau, + const half & v_threshold, + const int & neuron_num, const int & numel)\ + // note that grad_reciprocal_tau is float to avoid overflow + + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + const int stride = neuron_num >> 1; + + __shared__ float sdata[1024]; + if (index < stride) + { + const half2 reciprocal_tau_half2 = __half2half2(reciprocal_tau); + const half2 one_sub_reciprocal_tau_half2 = __half2half2(one_sub_reciprocal_tau); + const half2 v_threshold_half2 = __half2half2(v_threshold); + + + half2 grad_h = __float2half2_rn(0.0f); // grad_h will be used recursively + sdata[threadIdx.x] = 0.0f; + for(int mem_offset = (numel >> 1) - stride; mem_offset >= 0; mem_offset -= stride) + { + const int t = index + mem_offset; + + const half2 over_th = __hsub2(h_seq[t], v_threshold_half2); + + + // start: spikingjelly.clock_driven.surrogate.PiecewiseLeakyReLU.cuda_code + + const half2 sg_PiecewiseLeakyReLU_x_abs = __habs2(over_th); + const half2 sg_PiecewiseLeakyReLU_x_abs_ge_w = __hge2(sg_PiecewiseLeakyReLU_x_abs, __float2half2_rn(1.0f)); + half2 grad_s_to_h = __hadd2(__hmul2(__float2half2_rn(0.01f), sg_PiecewiseLeakyReLU_x_abs_ge_w), __hmul2(__hsub2(__float2half2_rn(1.0f), sg_PiecewiseLeakyReLU_x_abs_ge_w), __float2half2_rn(1.0f))); + + // end: spikingjelly.clock_driven.surrogate.PiecewiseLeakyReLU.cuda_code + + const half2 grad_v_to_h = __hsub2(__float2half2_rn(1.0f), __hmul2(v_threshold_half2, grad_s_to_h)); + + grad_h = __hfma2(__hfma2(grad_h, one_sub_reciprocal_tau_half2, grad_v_seq[t]), grad_v_to_h, __hmul2(grad_spike_seq[t], grad_s_to_h)); + + grad_x_seq[t] = __hmul2(grad_h, reciprocal_tau_half2); + half2 temp_sum = __h2div(__hmul2(grad_h, __hsub2(h_seq[t], 
v_v_seq[t])), reciprocal_tau_half2); + sdata[threadIdx.x] += __half2float(__hadd(__low2half(temp_sum), __high2half(temp_sum))); + + } + grad_v_last[index] = __hmul2(grad_h, one_sub_reciprocal_tau_half2); + } + else + { + sdata[threadIdx.x] = 0.0f; + } + int threadx = blockDim.x; + #pragma unroll + for (int i = threadx >> 1; i > 0; i = i >> 1) + { + // Synchronize all thread before next loop + __syncthreads(); + if (threadIdx.x < i) + { + sdata[threadIdx.x] += sdata[threadIdx.x + i]; + } + } + __syncthreads(); + if (threadIdx.x == 0) + { + /* + The 32-bit floating-point version of atomicAdd() is only supported by devices of compute capability 2.x and higher. + + The 64-bit floating-point version of atomicAdd() is only supported by devices of compute capability 6.x and higher. + + The 32-bit __half2 floating-point version of atomicAdd() is only supported by devices of compute capability 6.x and higher. The atomicity of the __half2 or __nv_bfloat162 add operation is guaranteed separately for each of the two __half or __nv_bfloat16 elements; the entire __half2 or __nv_bfloat162 is not guaranteed to be atomic as a single 32-bit access. + + The 16-bit __half floating-point version of atomicAdd() is only supported by devices of compute capability 7.x and higher. + + The 16-bit __nv_bfloat16 floating-point version of atomicAdd() is only supported by devices of compute capability 8.x and higher. + */ + + atomicAdd(grad_reciprocal_tau, sdata[0]); + + } + } + +// MultiStepParametricLIFNodePTT fptt PiecewiseLeakyReLU, decay_input=False, hard_reset=False, dtype=fp16 + + #include + extern "C" __global__ + void ParametricLIFNode_fptt_decayInputFalse_softReset_fp16(const half2* x_seq, half2* v_v_seq, half2* h_seq, half2* spike_seq, + const half & reciprocal_tau, + const half & v_threshold, + const int & neuron_num, const int & numel) + + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + const int stride = neuron_num >> 1; + if (index < stride) + { + const int numel_2 = numel >> 1; + const half2 reciprocal_tau_half2 = __half2half2(reciprocal_tau); + const half2 v_threshold_half2 = __half2half2(v_threshold); + + for(int mem_offset = 0; mem_offset < numel_2; mem_offset += stride) + { + const int t = index + mem_offset; + + // h_seq[t] = v_v_seq[t] * (1.0f - reciprocal_tau) + x_seq[t]; + h_seq[t] = __hfma2(__hsub2(__float2half2_rn(1.0f), reciprocal_tau_half2), v_v_seq[t], x_seq[t]); + + spike_seq[t] = __hgeu2(h_seq[t], v_threshold_half2); + v_v_seq[t + stride] = __hadd2(__hmul2(spike_seq[t], __hsub2(h_seq[t], v_threshold_half2)), __hmul2(__hsub2(__float2half2_rn(1.0f), spike_seq[t]), h_seq[t])); + + } + } + } + +// MultiStepParametricLIFNodePTT bptt PiecewiseLeakyReLU, decay_input=False, hard_reset=False, dtype=fp16, detach_reset=True + + #include + extern "C" __global__ + void ParametricLIFNode_bptt_decayInputFalse_softReset_detachReset_fp16( + const half2* grad_spike_seq, const half2* grad_v_seq, const half2* h_seq, const half2* spike_seq, const half2* v_v_seq, + half2* grad_x_seq, half2* grad_v_last, float* grad_reciprocal_tau, + const half & reciprocal_tau, const half & one_sub_reciprocal_tau, + const half & v_threshold, + const int & neuron_num, const int & numel)\ + // note that grad_reciprocal_tau is float to avoid overflow + + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + const int stride = neuron_num >> 1; + + __shared__ float sdata[1024]; + if (index < stride) + { + const half2 reciprocal_tau_half2 = __half2half2(reciprocal_tau); + const half2 
one_sub_reciprocal_tau_half2 = __half2half2(one_sub_reciprocal_tau); + const half2 v_threshold_half2 = __half2half2(v_threshold); + + + half2 grad_h = __float2half2_rn(0.0f); // grad_h will be used recursively + sdata[threadIdx.x] = 0.0f; + for(int mem_offset = (numel >> 1) - stride; mem_offset >= 0; mem_offset -= stride) + { + const int t = index + mem_offset; + + const half2 over_th = __hsub2(h_seq[t], v_threshold_half2); + + + // start: spikingjelly.clock_driven.surrogate.PiecewiseLeakyReLU.cuda_code + + const half2 sg_PiecewiseLeakyReLU_x_abs = __habs2(over_th); + const half2 sg_PiecewiseLeakyReLU_x_abs_ge_w = __hge2(sg_PiecewiseLeakyReLU_x_abs, __float2half2_rn(1.0f)); + half2 grad_s_to_h = __hadd2(__hmul2(__float2half2_rn(0.01f), sg_PiecewiseLeakyReLU_x_abs_ge_w), __hmul2(__hsub2(__float2half2_rn(1.0f), sg_PiecewiseLeakyReLU_x_abs_ge_w), __float2half2_rn(1.0f))); + + // end: spikingjelly.clock_driven.surrogate.PiecewiseLeakyReLU.cuda_code + + const half2 grad_v_to_h = __float2half2_rn(1.0f); + + grad_h = __hfma2(__hfma2(grad_h, one_sub_reciprocal_tau_half2, grad_v_seq[t]), grad_v_to_h, __hmul2(grad_spike_seq[t], grad_s_to_h)); + + grad_x_seq[t] = grad_h; + half2 temp_sum = __hmul2(grad_h, __hneg2(v_v_seq[t])); + sdata[threadIdx.x] += __half2float(__hadd(__low2half(temp_sum), __high2half(temp_sum))); + + } + grad_v_last[index] = __hmul2(grad_h, one_sub_reciprocal_tau_half2); + } + else + { + sdata[threadIdx.x] = 0.0f; + } + int threadx = blockDim.x; + #pragma unroll + for (int i = threadx >> 1; i > 0; i = i >> 1) + { + // Synchronize all thread before next loop + __syncthreads(); + if (threadIdx.x < i) + { + sdata[threadIdx.x] += sdata[threadIdx.x + i]; + } + } + __syncthreads(); + if (threadIdx.x == 0) + { + /* + The 32-bit floating-point version of atomicAdd() is only supported by devices of compute capability 2.x and higher. + + The 64-bit floating-point version of atomicAdd() is only supported by devices of compute capability 6.x and higher. + + The 32-bit __half2 floating-point version of atomicAdd() is only supported by devices of compute capability 6.x and higher. The atomicity of the __half2 or __nv_bfloat162 add operation is guaranteed separately for each of the two __half or __nv_bfloat16 elements; the entire __half2 or __nv_bfloat162 is not guaranteed to be atomic as a single 32-bit access. + + The 16-bit __half floating-point version of atomicAdd() is only supported by devices of compute capability 7.x and higher. + + The 16-bit __nv_bfloat16 floating-point version of atomicAdd() is only supported by devices of compute capability 8.x and higher. 
+ */ + + atomicAdd(grad_reciprocal_tau, sdata[0]); + + } + } + +// MultiStepParametricLIFNodePTT bptt PiecewiseLeakyReLU, decay_input=False, hard_reset=False, dtype=fp16, detach_reset=False + + #include + extern "C" __global__ + void ParametricLIFNode_bptt_decayInputFalse_softReset__fp16( + const half2* grad_spike_seq, const half2* grad_v_seq, const half2* h_seq, const half2* spike_seq, const half2* v_v_seq, + half2* grad_x_seq, half2* grad_v_last, float* grad_reciprocal_tau, + const half & reciprocal_tau, const half & one_sub_reciprocal_tau, + const half & v_threshold, + const int & neuron_num, const int & numel)\ + // note that grad_reciprocal_tau is float to avoid overflow + + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + const int stride = neuron_num >> 1; + + __shared__ float sdata[1024]; + if (index < stride) + { + const half2 reciprocal_tau_half2 = __half2half2(reciprocal_tau); + const half2 one_sub_reciprocal_tau_half2 = __half2half2(one_sub_reciprocal_tau); + const half2 v_threshold_half2 = __half2half2(v_threshold); + + + half2 grad_h = __float2half2_rn(0.0f); // grad_h will be used recursively + sdata[threadIdx.x] = 0.0f; + for(int mem_offset = (numel >> 1) - stride; mem_offset >= 0; mem_offset -= stride) + { + const int t = index + mem_offset; + + const half2 over_th = __hsub2(h_seq[t], v_threshold_half2); + + + // start: spikingjelly.clock_driven.surrogate.PiecewiseLeakyReLU.cuda_code + + const half2 sg_PiecewiseLeakyReLU_x_abs = __habs2(over_th); + const half2 sg_PiecewiseLeakyReLU_x_abs_ge_w = __hge2(sg_PiecewiseLeakyReLU_x_abs, __float2half2_rn(1.0f)); + half2 grad_s_to_h = __hadd2(__hmul2(__float2half2_rn(0.01f), sg_PiecewiseLeakyReLU_x_abs_ge_w), __hmul2(__hsub2(__float2half2_rn(1.0f), sg_PiecewiseLeakyReLU_x_abs_ge_w), __float2half2_rn(1.0f))); + + // end: spikingjelly.clock_driven.surrogate.PiecewiseLeakyReLU.cuda_code + + const half2 grad_v_to_h = __hsub2(__float2half2_rn(1.0f), __hmul2(v_threshold_half2, grad_s_to_h)); + + grad_h = __hfma2(__hfma2(grad_h, one_sub_reciprocal_tau_half2, grad_v_seq[t]), grad_v_to_h, __hmul2(grad_spike_seq[t], grad_s_to_h)); + + grad_x_seq[t] = grad_h; + half2 temp_sum = __hmul2(grad_h, __hneg2(v_v_seq[t])); + sdata[threadIdx.x] += __half2float(__hadd(__low2half(temp_sum), __high2half(temp_sum))); + + } + grad_v_last[index] = __hmul2(grad_h, one_sub_reciprocal_tau_half2); + } + else + { + sdata[threadIdx.x] = 0.0f; + } + int threadx = blockDim.x; + #pragma unroll + for (int i = threadx >> 1; i > 0; i = i >> 1) + { + // Synchronize all thread before next loop + __syncthreads(); + if (threadIdx.x < i) + { + sdata[threadIdx.x] += sdata[threadIdx.x + i]; + } + } + __syncthreads(); + if (threadIdx.x == 0) + { + /* + The 32-bit floating-point version of atomicAdd() is only supported by devices of compute capability 2.x and higher. + + The 64-bit floating-point version of atomicAdd() is only supported by devices of compute capability 6.x and higher. + + The 32-bit __half2 floating-point version of atomicAdd() is only supported by devices of compute capability 6.x and higher. The atomicity of the __half2 or __nv_bfloat162 add operation is guaranteed separately for each of the two __half or __nv_bfloat16 elements; the entire __half2 or __nv_bfloat162 is not guaranteed to be atomic as a single 32-bit access. + + The 16-bit __half floating-point version of atomicAdd() is only supported by devices of compute capability 7.x and higher. 
+ + The 16-bit __nv_bfloat16 floating-point version of atomicAdd() is only supported by devices of compute capability 8.x and higher. + */ + + atomicAdd(grad_reciprocal_tau, sdata[0]); + + } + } + +// MultiStepParametricLIFNodePTT fptt S2NN, decay_input=True, hard_reset=True, dtype=fp32 + + extern "C" __global__ + void ParametricLIFNode_fptt_decayInputTrue_hardReset_fp32(const float* x_seq, float* v_v_seq, float* h_seq, float* spike_seq, + const float & reciprocal_tau, + const float & v_threshold, const float & v_reset, + const int & neuron_num, const int & numel) + + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index < neuron_num) + { + const int dt = neuron_num; + for(int mem_offset = 0; mem_offset < numel; mem_offset += neuron_num) + { + const int t = index + mem_offset; + + h_seq[t] = v_v_seq[t] + reciprocal_tau * (x_seq[t] - v_v_seq[t] + v_reset); + + if (h_seq[t] >= v_threshold) + { + spike_seq[t] = 1.0f; + v_v_seq[t + dt] = v_reset; + } + + else + { + spike_seq[t] = 0.0f; + v_v_seq[t + dt] = h_seq[t]; + } + + } + } + } + +// MultiStepParametricLIFNodePTT bptt S2NN, decay_input=True, hard_reset=True, dtype=fp32, detach_reset=True + + extern "C" __global__ + void ParametricLIFNode_bptt_decayInputTrue_hardReset_detachReset_fp32( + const float* grad_spike_seq, const float* grad_v_seq, const float* h_seq, const float* spike_seq, const float* v_v_seq, + float* grad_x_seq, float* grad_v_last, float* grad_reciprocal_tau, + const float & reciprocal_tau, const float & one_sub_reciprocal_tau, + const float & v_threshold, const float & v_reset, + const int & neuron_num, const int & numel) + + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + __shared__ float sdata[1024]; + if (index < neuron_num) + { + float grad_h = 0.0f; // grad_h will be used recursively + sdata[threadIdx.x] = 0.0f; + for(int mem_offset = numel - neuron_num; mem_offset >= 0; mem_offset -= neuron_num) + { + const int t = index + mem_offset; + const float over_th = h_seq[t] - v_threshold; + + // start: spikingjelly.clock_driven.surrogate.S2NN.cuda_code + + const float sg_S2NN_sigmoid_ax = 1.0f / (1.0f + expf(- 4.0f * over_th)); + const float sg_S2NN_mask_l = (float)(over_th < 0.0f); + const float grad_s_to_h = (1.0f - sg_S2NN_sigmoid_ax) * sg_S2NN_sigmoid_ax * 4.0f * sg_S2NN_mask_l + 1.0f / (over_th + 1.0f) * (1.0f - sg_S2NN_mask_l); + + // end: spikingjelly.clock_driven.surrogate.S2NN.cuda_code + + const float grad_v_to_h = 1.0f - spike_seq[t]; + + grad_h = grad_spike_seq[t] * grad_s_to_h + (grad_v_seq[t] + grad_h * one_sub_reciprocal_tau) * grad_v_to_h; + // grad_h = fmaf(grad_spike_seq[t], grad_s_to_h, fmaf(grad_h, one_sub_reciprocal_tau, grad_v_seq[t]) * grad_v_to_h); + + grad_x_seq[t] = grad_h * reciprocal_tau; + sdata[threadIdx.x] += grad_h * (h_seq[t] - v_v_seq[t]) / reciprocal_tau; + + } + grad_v_last[index] = grad_h * one_sub_reciprocal_tau; + } + else + { + sdata[threadIdx.x] = 0.0f; + } + int threadx = blockDim.x; + #pragma unroll + for (int stride = threadx >> 1; stride > 0; stride = stride >> 1) + { + // Synchronize all thread before next loop + __syncthreads(); + if (threadIdx.x < stride) + { + sdata[threadIdx.x] += sdata[threadIdx.x + stride]; + } + } + __syncthreads(); + if (threadIdx.x == 0) + { + atomicAdd(grad_reciprocal_tau, sdata[0]); + } + } + +// MultiStepParametricLIFNodePTT bptt S2NN, decay_input=True, hard_reset=True, dtype=fp32, detach_reset=False + + extern "C" __global__ + void ParametricLIFNode_bptt_decayInputTrue_hardReset__fp32( + const float* grad_spike_seq, 
const float* grad_v_seq, const float* h_seq, const float* spike_seq, const float* v_v_seq, + float* grad_x_seq, float* grad_v_last, float* grad_reciprocal_tau, + const float & reciprocal_tau, const float & one_sub_reciprocal_tau, + const float & v_threshold, const float & v_reset, + const int & neuron_num, const int & numel) + + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + __shared__ float sdata[1024]; + if (index < neuron_num) + { + float grad_h = 0.0f; // grad_h will be used recursively + sdata[threadIdx.x] = 0.0f; + for(int mem_offset = numel - neuron_num; mem_offset >= 0; mem_offset -= neuron_num) + { + const int t = index + mem_offset; + const float over_th = h_seq[t] - v_threshold; + + // start: spikingjelly.clock_driven.surrogate.S2NN.cuda_code + + const float sg_S2NN_sigmoid_ax = 1.0f / (1.0f + expf(- 4.0f * over_th)); + const float sg_S2NN_mask_l = (float)(over_th < 0.0f); + const float grad_s_to_h = (1.0f - sg_S2NN_sigmoid_ax) * sg_S2NN_sigmoid_ax * 4.0f * sg_S2NN_mask_l + 1.0f / (over_th + 1.0f) * (1.0f - sg_S2NN_mask_l); + + // end: spikingjelly.clock_driven.surrogate.S2NN.cuda_code + + const float grad_v_to_h = 1.0f - spike_seq[t] + (v_reset - h_seq[t]) * grad_s_to_h; + // const float grad_v_to_h = fmaf(v_reset - h_seq[t], grad_s_to_h, 1.0f - spike_seq[t]); + + grad_h = grad_spike_seq[t] * grad_s_to_h + (grad_v_seq[t] + grad_h * one_sub_reciprocal_tau) * grad_v_to_h; + // grad_h = fmaf(grad_spike_seq[t], grad_s_to_h, fmaf(grad_h, one_sub_reciprocal_tau, grad_v_seq[t]) * grad_v_to_h); + + grad_x_seq[t] = grad_h * reciprocal_tau; + sdata[threadIdx.x] += grad_h * (h_seq[t] - v_v_seq[t]) / reciprocal_tau; + + } + grad_v_last[index] = grad_h * one_sub_reciprocal_tau; + } + else + { + sdata[threadIdx.x] = 0.0f; + } + int threadx = blockDim.x; + #pragma unroll + for (int stride = threadx >> 1; stride > 0; stride = stride >> 1) + { + // Synchronize all thread before next loop + __syncthreads(); + if (threadIdx.x < stride) + { + sdata[threadIdx.x] += sdata[threadIdx.x + stride]; + } + } + __syncthreads(); + if (threadIdx.x == 0) + { + atomicAdd(grad_reciprocal_tau, sdata[0]); + } + } + +// MultiStepParametricLIFNodePTT fptt S2NN, decay_input=False, hard_reset=True, dtype=fp32 + + extern "C" __global__ + void ParametricLIFNode_fptt_decayInputFalse_hardReset_fp32(const float* x_seq, float* v_v_seq, float* h_seq, float* spike_seq, + const float & reciprocal_tau, + const float & v_threshold, const float & v_reset, + const int & neuron_num, const int & numel) + + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index < neuron_num) + { + const int dt = neuron_num; + for(int mem_offset = 0; mem_offset < numel; mem_offset += neuron_num) + { + const int t = index + mem_offset; + + h_seq[t] = v_v_seq[t] - reciprocal_tau * (v_v_seq[t] - v_reset) + x_seq[t]; + + if (h_seq[t] >= v_threshold) + { + spike_seq[t] = 1.0f; + v_v_seq[t + dt] = v_reset; + } + + else + { + spike_seq[t] = 0.0f; + v_v_seq[t + dt] = h_seq[t]; + } + + } + } + } + +// MultiStepParametricLIFNodePTT bptt S2NN, decay_input=False, hard_reset=True, dtype=fp32, detach_reset=True + + extern "C" __global__ + void ParametricLIFNode_bptt_decayInputFalse_hardReset_detachReset_fp32( + const float* grad_spike_seq, const float* grad_v_seq, const float* h_seq, const float* spike_seq, const float* v_v_seq, + float* grad_x_seq, float* grad_v_last, float* grad_reciprocal_tau, + const float & reciprocal_tau, const float & one_sub_reciprocal_tau, + const float & v_threshold, const float & v_reset, + const int & 
neuron_num, const int & numel) + + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + __shared__ float sdata[1024]; + if (index < neuron_num) + { + float grad_h = 0.0f; // grad_h will be used recursively + sdata[threadIdx.x] = 0.0f; + for(int mem_offset = numel - neuron_num; mem_offset >= 0; mem_offset -= neuron_num) + { + const int t = index + mem_offset; + const float over_th = h_seq[t] - v_threshold; + + // start: spikingjelly.clock_driven.surrogate.S2NN.cuda_code + + const float sg_S2NN_sigmoid_ax = 1.0f / (1.0f + expf(- 4.0f * over_th)); + const float sg_S2NN_mask_l = (float)(over_th < 0.0f); + const float grad_s_to_h = (1.0f - sg_S2NN_sigmoid_ax) * sg_S2NN_sigmoid_ax * 4.0f * sg_S2NN_mask_l + 1.0f / (over_th + 1.0f) * (1.0f - sg_S2NN_mask_l); + + // end: spikingjelly.clock_driven.surrogate.S2NN.cuda_code + + const float grad_v_to_h = 1.0f - spike_seq[t]; + + grad_h = grad_spike_seq[t] * grad_s_to_h + (grad_v_seq[t] + grad_h * one_sub_reciprocal_tau) * grad_v_to_h; + // grad_h = fmaf(grad_spike_seq[t], grad_s_to_h, fmaf(grad_h, one_sub_reciprocal_tau, grad_v_seq[t]) * grad_v_to_h); + + grad_x_seq[t] = grad_h; + sdata[threadIdx.x] += grad_h * (v_reset - v_v_seq[t]); + + } + grad_v_last[index] = grad_h * one_sub_reciprocal_tau; + } + else + { + sdata[threadIdx.x] = 0.0f; + } + int threadx = blockDim.x; + #pragma unroll + for (int stride = threadx >> 1; stride > 0; stride = stride >> 1) + { + // Synchronize all thread before next loop + __syncthreads(); + if (threadIdx.x < stride) + { + sdata[threadIdx.x] += sdata[threadIdx.x + stride]; + } + } + __syncthreads(); + if (threadIdx.x == 0) + { + atomicAdd(grad_reciprocal_tau, sdata[0]); + } + } + +// MultiStepParametricLIFNodePTT bptt S2NN, decay_input=False, hard_reset=True, dtype=fp32, detach_reset=False + + extern "C" __global__ + void ParametricLIFNode_bptt_decayInputFalse_hardReset__fp32( + const float* grad_spike_seq, const float* grad_v_seq, const float* h_seq, const float* spike_seq, const float* v_v_seq, + float* grad_x_seq, float* grad_v_last, float* grad_reciprocal_tau, + const float & reciprocal_tau, const float & one_sub_reciprocal_tau, + const float & v_threshold, const float & v_reset, + const int & neuron_num, const int & numel) + + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + __shared__ float sdata[1024]; + if (index < neuron_num) + { + float grad_h = 0.0f; // grad_h will be used recursively + sdata[threadIdx.x] = 0.0f; + for(int mem_offset = numel - neuron_num; mem_offset >= 0; mem_offset -= neuron_num) + { + const int t = index + mem_offset; + const float over_th = h_seq[t] - v_threshold; + + // start: spikingjelly.clock_driven.surrogate.S2NN.cuda_code + + const float sg_S2NN_sigmoid_ax = 1.0f / (1.0f + expf(- 4.0f * over_th)); + const float sg_S2NN_mask_l = (float)(over_th < 0.0f); + const float grad_s_to_h = (1.0f - sg_S2NN_sigmoid_ax) * sg_S2NN_sigmoid_ax * 4.0f * sg_S2NN_mask_l + 1.0f / (over_th + 1.0f) * (1.0f - sg_S2NN_mask_l); + + // end: spikingjelly.clock_driven.surrogate.S2NN.cuda_code + + const float grad_v_to_h = 1.0f - spike_seq[t] + (v_reset - h_seq[t]) * grad_s_to_h; + // const float grad_v_to_h = fmaf(v_reset - h_seq[t], grad_s_to_h, 1.0f - spike_seq[t]); + + grad_h = grad_spike_seq[t] * grad_s_to_h + (grad_v_seq[t] + grad_h * one_sub_reciprocal_tau) * grad_v_to_h; + // grad_h = fmaf(grad_spike_seq[t], grad_s_to_h, fmaf(grad_h, one_sub_reciprocal_tau, grad_v_seq[t]) * grad_v_to_h); + + grad_x_seq[t] = grad_h; + sdata[threadIdx.x] += grad_h * (v_reset - v_v_seq[t]); + + } + 
grad_v_last[index] = grad_h * one_sub_reciprocal_tau; + } + else + { + sdata[threadIdx.x] = 0.0f; + } + int threadx = blockDim.x; + #pragma unroll + for (int stride = threadx >> 1; stride > 0; stride = stride >> 1) + { + // Synchronize all threads before next loop + __syncthreads(); + if (threadIdx.x < stride) + { + sdata[threadIdx.x] += sdata[threadIdx.x + stride]; + } + } + __syncthreads(); + if (threadIdx.x == 0) + { + atomicAdd(grad_reciprocal_tau, sdata[0]); + } + } + +// MultiStepParametricLIFNodePTT fptt S2NN, decay_input=True, hard_reset=True, dtype=fp16 + + #include <cuda_fp16.h> + extern "C" __global__ + void ParametricLIFNode_fptt_decayInputTrue_hardReset_fp16(const half2* x_seq, half2* v_v_seq, half2* h_seq, half2* spike_seq, + const half & reciprocal_tau, + const half & v_threshold, const half & v_reset, + const int & neuron_num, const int & numel) + + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + const int stride = neuron_num >> 1; + if (index < stride) + { + const int numel_2 = numel >> 1; + const half2 reciprocal_tau_half2 = __half2half2(reciprocal_tau); + const half2 v_threshold_half2 = __half2half2(v_threshold); + + const half2 v_reset_half2 = __half2half2(v_reset); + + for(int mem_offset = 0; mem_offset < numel_2; mem_offset += stride) + { + const int t = index + mem_offset; + + h_seq[t] = __hfma2(__hadd2(__hsub2(x_seq[t], v_v_seq[t]), v_reset_half2), reciprocal_tau_half2, v_v_seq[t]); + + spike_seq[t] = __hgeu2(h_seq[t], v_threshold_half2); + v_v_seq[t + stride] = __hadd2(__hmul2(spike_seq[t], v_reset_half2), __hmul2(__hsub2(__float2half2_rn(1.0f), spike_seq[t]), h_seq[t])); + + } + } + } + +// MultiStepParametricLIFNodePTT bptt S2NN, decay_input=True, hard_reset=True, dtype=fp16, detach_reset=True + + #include <cuda_fp16.h> + extern "C" __global__ + void ParametricLIFNode_bptt_decayInputTrue_hardReset_detachReset_fp16( + const half2* grad_spike_seq, const half2* grad_v_seq, const half2* h_seq, const half2* spike_seq, const half2* v_v_seq, + half2* grad_x_seq, half2* grad_v_last, float* grad_reciprocal_tau, + const half & reciprocal_tau, const half & one_sub_reciprocal_tau, + const half & v_threshold, const half & v_reset, + const int & neuron_num, const int & numel)\ + // note that grad_reciprocal_tau is float to avoid overflow + + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + const int stride = neuron_num >> 1; + + __shared__ float sdata[1024]; + if (index < stride) + { + const half2 reciprocal_tau_half2 = __half2half2(reciprocal_tau); + const half2 one_sub_reciprocal_tau_half2 = __half2half2(one_sub_reciprocal_tau); + const half2 v_threshold_half2 = __half2half2(v_threshold); + + const half2 v_reset_half2 = __half2half2(v_reset); + + + half2 grad_h = __float2half2_rn(0.0f); // grad_h will be used recursively + sdata[threadIdx.x] = 0.0f; + for(int mem_offset = (numel >> 1) - stride; mem_offset >= 0; mem_offset -= stride) + { + const int t = index + mem_offset; + + const half2 over_th = __hsub2(h_seq[t], v_threshold_half2); + + + // start: spikingjelly.clock_driven.surrogate.S2NN.cuda_code + + const half2 sg_S2NN_alpha = __float2half2_rn(4.0f); + const half2 sg_S2NN_sigmoid_ax = __h2div(__float2half2_rn(1.0f), __hadd2(h2exp(__hneg2(__hmul2(sg_S2NN_alpha, over_th))), __float2half2_rn(1.0f))); + const half2 sg_S2NN_mask_l = __hlt2(over_th, __float2half2_rn(0.0f)); + const half2 grad_s_to_h = __hadd2(__hmul2(__hmul2(__hmul2(__hsub2(__float2half2_rn(1.0f), sg_S2NN_sigmoid_ax), sg_S2NN_sigmoid_ax), sg_S2NN_alpha), sg_S2NN_mask_l), 
__hmul2(__h2div(__float2half2_rn(1.0f), __hadd2(over_th, __float2half2_rn(1.0f))), __hsub2(__float2half2_rn(1.0f), sg_S2NN_mask_l))); + + // end: spikingjelly.clock_driven.surrogate.S2NN.cuda_code + + const half2 grad_v_to_h = __hsub2(__float2half2_rn(1.0f), spike_seq[t]); + + grad_h = __hfma2(__hfma2(grad_h, one_sub_reciprocal_tau_half2, grad_v_seq[t]), grad_v_to_h, __hmul2(grad_spike_seq[t], grad_s_to_h)); + + grad_x_seq[t] = __hmul2(grad_h, reciprocal_tau_half2); + half2 temp_sum = __h2div(__hmul2(grad_h, __hsub2(h_seq[t], v_v_seq[t])), reciprocal_tau_half2); + sdata[threadIdx.x] += __half2float(__hadd(__low2half(temp_sum), __high2half(temp_sum))); + + } + grad_v_last[index] = __hmul2(grad_h, one_sub_reciprocal_tau_half2); + } + else + { + sdata[threadIdx.x] = 0.0f; + } + int threadx = blockDim.x; + #pragma unroll + for (int i = threadx >> 1; i > 0; i = i >> 1) + { + // Synchronize all thread before next loop + __syncthreads(); + if (threadIdx.x < i) + { + sdata[threadIdx.x] += sdata[threadIdx.x + i]; + } + } + __syncthreads(); + if (threadIdx.x == 0) + { + /* + The 32-bit floating-point version of atomicAdd() is only supported by devices of compute capability 2.x and higher. + + The 64-bit floating-point version of atomicAdd() is only supported by devices of compute capability 6.x and higher. + + The 32-bit __half2 floating-point version of atomicAdd() is only supported by devices of compute capability 6.x and higher. The atomicity of the __half2 or __nv_bfloat162 add operation is guaranteed separately for each of the two __half or __nv_bfloat16 elements; the entire __half2 or __nv_bfloat162 is not guaranteed to be atomic as a single 32-bit access. + + The 16-bit __half floating-point version of atomicAdd() is only supported by devices of compute capability 7.x and higher. + + The 16-bit __nv_bfloat16 floating-point version of atomicAdd() is only supported by devices of compute capability 8.x and higher. 
+ */ + + atomicAdd(grad_reciprocal_tau, sdata[0]); + + } + } + +// MultiStepParametricLIFNodePTT bptt S2NN, decay_input=True, hard_reset=True, dtype=fp16, detach_reset=False + + #include + extern "C" __global__ + void ParametricLIFNode_bptt_decayInputTrue_hardReset__fp16( + const half2* grad_spike_seq, const half2* grad_v_seq, const half2* h_seq, const half2* spike_seq, const half2* v_v_seq, + half2* grad_x_seq, half2* grad_v_last, float* grad_reciprocal_tau, + const half & reciprocal_tau, const half & one_sub_reciprocal_tau, + const half & v_threshold, const half & v_reset, + const int & neuron_num, const int & numel)\ + // note that grad_reciprocal_tau is float to avoid overflow + + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + const int stride = neuron_num >> 1; + + __shared__ float sdata[1024]; + if (index < stride) + { + const half2 reciprocal_tau_half2 = __half2half2(reciprocal_tau); + const half2 one_sub_reciprocal_tau_half2 = __half2half2(one_sub_reciprocal_tau); + const half2 v_threshold_half2 = __half2half2(v_threshold); + + const half2 v_reset_half2 = __half2half2(v_reset); + + + half2 grad_h = __float2half2_rn(0.0f); // grad_h will be used recursively + sdata[threadIdx.x] = 0.0f; + for(int mem_offset = (numel >> 1) - stride; mem_offset >= 0; mem_offset -= stride) + { + const int t = index + mem_offset; + + const half2 over_th = __hsub2(h_seq[t], v_threshold_half2); + + + // start: spikingjelly.clock_driven.surrogate.S2NN.cuda_code + + const half2 sg_S2NN_alpha = __float2half2_rn(4.0f); + const half2 sg_S2NN_sigmoid_ax = __h2div(__float2half2_rn(1.0f), __hadd2(h2exp(__hneg2(__hmul2(sg_S2NN_alpha, over_th))), __float2half2_rn(1.0f))); + const half2 sg_S2NN_mask_l = __hlt2(over_th, __float2half2_rn(0.0f)); + const half2 grad_s_to_h = __hadd2(__hmul2(__hmul2(__hmul2(__hsub2(__float2half2_rn(1.0f), sg_S2NN_sigmoid_ax), sg_S2NN_sigmoid_ax), sg_S2NN_alpha), sg_S2NN_mask_l), __hmul2(__h2div(__float2half2_rn(1.0f), __hadd2(over_th, __float2half2_rn(1.0f))), __hsub2(__float2half2_rn(1.0f), sg_S2NN_mask_l))); + + // end: spikingjelly.clock_driven.surrogate.S2NN.cuda_code + + const half2 grad_v_to_h = __hfma2(__hsub2(v_reset_half2, h_seq[t]), grad_s_to_h, __hsub2(__float2half2_rn(1.0f), spike_seq[t])); + + grad_h = __hfma2(__hfma2(grad_h, one_sub_reciprocal_tau_half2, grad_v_seq[t]), grad_v_to_h, __hmul2(grad_spike_seq[t], grad_s_to_h)); + + grad_x_seq[t] = __hmul2(grad_h, reciprocal_tau_half2); + half2 temp_sum = __h2div(__hmul2(grad_h, __hsub2(h_seq[t], v_v_seq[t])), reciprocal_tau_half2); + sdata[threadIdx.x] += __half2float(__hadd(__low2half(temp_sum), __high2half(temp_sum))); + + } + grad_v_last[index] = __hmul2(grad_h, one_sub_reciprocal_tau_half2); + } + else + { + sdata[threadIdx.x] = 0.0f; + } + int threadx = blockDim.x; + #pragma unroll + for (int i = threadx >> 1; i > 0; i = i >> 1) + { + // Synchronize all thread before next loop + __syncthreads(); + if (threadIdx.x < i) + { + sdata[threadIdx.x] += sdata[threadIdx.x + i]; + } + } + __syncthreads(); + if (threadIdx.x == 0) + { + /* + The 32-bit floating-point version of atomicAdd() is only supported by devices of compute capability 2.x and higher. + + The 64-bit floating-point version of atomicAdd() is only supported by devices of compute capability 6.x and higher. + + The 32-bit __half2 floating-point version of atomicAdd() is only supported by devices of compute capability 6.x and higher. 
The atomicity of the __half2 or __nv_bfloat162 add operation is guaranteed separately for each of the two __half or __nv_bfloat16 elements; the entire __half2 or __nv_bfloat162 is not guaranteed to be atomic as a single 32-bit access. + + The 16-bit __half floating-point version of atomicAdd() is only supported by devices of compute capability 7.x and higher. + + The 16-bit __nv_bfloat16 floating-point version of atomicAdd() is only supported by devices of compute capability 8.x and higher. + */ + + atomicAdd(grad_reciprocal_tau, sdata[0]); + + } + } + +// MultiStepParametricLIFNodePTT fptt S2NN, decay_input=False, hard_reset=True, dtype=fp16 + + #include + extern "C" __global__ + void ParametricLIFNode_fptt_decayInputFalse_hardReset_fp16(const half2* x_seq, half2* v_v_seq, half2* h_seq, half2* spike_seq, + const half & reciprocal_tau, + const half & v_threshold, const half & v_reset, + const int & neuron_num, const int & numel) + + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + const int stride = neuron_num >> 1; + if (index < stride) + { + const int numel_2 = numel >> 1; + const half2 reciprocal_tau_half2 = __half2half2(reciprocal_tau); + const half2 v_threshold_half2 = __half2half2(v_threshold); + + const half2 v_reset_half2 = __half2half2(v_reset); + + for(int mem_offset = 0; mem_offset < numel_2; mem_offset += stride) + { + const int t = index + mem_offset; + + // h_seq[t] = v_v_seq[t] - reciprocal_tau * (v_v_seq[t] - v_reset) + x_seq[t]; + // = reciprocal_tau * (v_reset - v_v_seq[t]) + v_v_seq[t] + x_seq[t]; + h_seq[t] = __hadd2(__hfma2(__hsub2(v_reset_half2, v_v_seq[t]), reciprocal_tau_half2, v_v_seq[t]), x_seq[t]); + + spike_seq[t] = __hgeu2(h_seq[t], v_threshold_half2); + v_v_seq[t + stride] = __hadd2(__hmul2(spike_seq[t], v_reset_half2), __hmul2(__hsub2(__float2half2_rn(1.0f), spike_seq[t]), h_seq[t])); + + } + } + } + +// MultiStepParametricLIFNodePTT bptt S2NN, decay_input=False, hard_reset=True, dtype=fp16, detach_reset=True + + #include + extern "C" __global__ + void ParametricLIFNode_bptt_decayInputFalse_hardReset_detachReset_fp16( + const half2* grad_spike_seq, const half2* grad_v_seq, const half2* h_seq, const half2* spike_seq, const half2* v_v_seq, + half2* grad_x_seq, half2* grad_v_last, float* grad_reciprocal_tau, + const half & reciprocal_tau, const half & one_sub_reciprocal_tau, + const half & v_threshold, const half & v_reset, + const int & neuron_num, const int & numel)\ + // note that grad_reciprocal_tau is float to avoid overflow + + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + const int stride = neuron_num >> 1; + + __shared__ float sdata[1024]; + if (index < stride) + { + const half2 reciprocal_tau_half2 = __half2half2(reciprocal_tau); + const half2 one_sub_reciprocal_tau_half2 = __half2half2(one_sub_reciprocal_tau); + const half2 v_threshold_half2 = __half2half2(v_threshold); + + const half2 v_reset_half2 = __half2half2(v_reset); + + + half2 grad_h = __float2half2_rn(0.0f); // grad_h will be used recursively + sdata[threadIdx.x] = 0.0f; + for(int mem_offset = (numel >> 1) - stride; mem_offset >= 0; mem_offset -= stride) + { + const int t = index + mem_offset; + + const half2 over_th = __hsub2(h_seq[t], v_threshold_half2); + + + // start: spikingjelly.clock_driven.surrogate.S2NN.cuda_code + + const half2 sg_S2NN_alpha = __float2half2_rn(4.0f); + const half2 sg_S2NN_sigmoid_ax = __h2div(__float2half2_rn(1.0f), __hadd2(h2exp(__hneg2(__hmul2(sg_S2NN_alpha, over_th))), __float2half2_rn(1.0f))); + const half2 sg_S2NN_mask_l = 
__hlt2(over_th, __float2half2_rn(0.0f)); + const half2 grad_s_to_h = __hadd2(__hmul2(__hmul2(__hmul2(__hsub2(__float2half2_rn(1.0f), sg_S2NN_sigmoid_ax), sg_S2NN_sigmoid_ax), sg_S2NN_alpha), sg_S2NN_mask_l), __hmul2(__h2div(__float2half2_rn(1.0f), __hadd2(over_th, __float2half2_rn(1.0f))), __hsub2(__float2half2_rn(1.0f), sg_S2NN_mask_l))); + + // end: spikingjelly.clock_driven.surrogate.S2NN.cuda_code + + const half2 grad_v_to_h = __hsub2(__float2half2_rn(1.0f), spike_seq[t]); + + grad_h = __hfma2(__hfma2(grad_h, one_sub_reciprocal_tau_half2, grad_v_seq[t]), grad_v_to_h, __hmul2(grad_spike_seq[t], grad_s_to_h)); + + grad_x_seq[t] = grad_h; + half2 temp_sum = __hmul2(grad_h, __hsub2(v_reset_half2, v_v_seq[t])); + sdata[threadIdx.x] += __half2float(__hadd(__low2half(temp_sum), __high2half(temp_sum))); + + } + grad_v_last[index] = __hmul2(grad_h, one_sub_reciprocal_tau_half2); + } + else + { + sdata[threadIdx.x] = 0.0f; + } + int threadx = blockDim.x; + #pragma unroll + for (int i = threadx >> 1; i > 0; i = i >> 1) + { + // Synchronize all thread before next loop + __syncthreads(); + if (threadIdx.x < i) + { + sdata[threadIdx.x] += sdata[threadIdx.x + i]; + } + } + __syncthreads(); + if (threadIdx.x == 0) + { + /* + The 32-bit floating-point version of atomicAdd() is only supported by devices of compute capability 2.x and higher. + + The 64-bit floating-point version of atomicAdd() is only supported by devices of compute capability 6.x and higher. + + The 32-bit __half2 floating-point version of atomicAdd() is only supported by devices of compute capability 6.x and higher. The atomicity of the __half2 or __nv_bfloat162 add operation is guaranteed separately for each of the two __half or __nv_bfloat16 elements; the entire __half2 or __nv_bfloat162 is not guaranteed to be atomic as a single 32-bit access. + + The 16-bit __half floating-point version of atomicAdd() is only supported by devices of compute capability 7.x and higher. + + The 16-bit __nv_bfloat16 floating-point version of atomicAdd() is only supported by devices of compute capability 8.x and higher. 
+ */ + + atomicAdd(grad_reciprocal_tau, sdata[0]); + + } + } + +// MultiStepParametricLIFNodePTT bptt S2NN, decay_input=False, hard_reset=True, dtype=fp16, detach_reset=False + + #include + extern "C" __global__ + void ParametricLIFNode_bptt_decayInputFalse_hardReset__fp16( + const half2* grad_spike_seq, const half2* grad_v_seq, const half2* h_seq, const half2* spike_seq, const half2* v_v_seq, + half2* grad_x_seq, half2* grad_v_last, float* grad_reciprocal_tau, + const half & reciprocal_tau, const half & one_sub_reciprocal_tau, + const half & v_threshold, const half & v_reset, + const int & neuron_num, const int & numel)\ + // note that grad_reciprocal_tau is float to avoid overflow + + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + const int stride = neuron_num >> 1; + + __shared__ float sdata[1024]; + if (index < stride) + { + const half2 reciprocal_tau_half2 = __half2half2(reciprocal_tau); + const half2 one_sub_reciprocal_tau_half2 = __half2half2(one_sub_reciprocal_tau); + const half2 v_threshold_half2 = __half2half2(v_threshold); + + const half2 v_reset_half2 = __half2half2(v_reset); + + + half2 grad_h = __float2half2_rn(0.0f); // grad_h will be used recursively + sdata[threadIdx.x] = 0.0f; + for(int mem_offset = (numel >> 1) - stride; mem_offset >= 0; mem_offset -= stride) + { + const int t = index + mem_offset; + + const half2 over_th = __hsub2(h_seq[t], v_threshold_half2); + + + // start: spikingjelly.clock_driven.surrogate.S2NN.cuda_code + + const half2 sg_S2NN_alpha = __float2half2_rn(4.0f); + const half2 sg_S2NN_sigmoid_ax = __h2div(__float2half2_rn(1.0f), __hadd2(h2exp(__hneg2(__hmul2(sg_S2NN_alpha, over_th))), __float2half2_rn(1.0f))); + const half2 sg_S2NN_mask_l = __hlt2(over_th, __float2half2_rn(0.0f)); + const half2 grad_s_to_h = __hadd2(__hmul2(__hmul2(__hmul2(__hsub2(__float2half2_rn(1.0f), sg_S2NN_sigmoid_ax), sg_S2NN_sigmoid_ax), sg_S2NN_alpha), sg_S2NN_mask_l), __hmul2(__h2div(__float2half2_rn(1.0f), __hadd2(over_th, __float2half2_rn(1.0f))), __hsub2(__float2half2_rn(1.0f), sg_S2NN_mask_l))); + + // end: spikingjelly.clock_driven.surrogate.S2NN.cuda_code + + const half2 grad_v_to_h = __hfma2(__hsub2(v_reset_half2, h_seq[t]), grad_s_to_h, __hsub2(__float2half2_rn(1.0f), spike_seq[t])); + + grad_h = __hfma2(__hfma2(grad_h, one_sub_reciprocal_tau_half2, grad_v_seq[t]), grad_v_to_h, __hmul2(grad_spike_seq[t], grad_s_to_h)); + + grad_x_seq[t] = grad_h; + half2 temp_sum = __hmul2(grad_h, __hsub2(v_reset_half2, v_v_seq[t])); + sdata[threadIdx.x] += __half2float(__hadd(__low2half(temp_sum), __high2half(temp_sum))); + + } + grad_v_last[index] = __hmul2(grad_h, one_sub_reciprocal_tau_half2); + } + else + { + sdata[threadIdx.x] = 0.0f; + } + int threadx = blockDim.x; + #pragma unroll + for (int i = threadx >> 1; i > 0; i = i >> 1) + { + // Synchronize all thread before next loop + __syncthreads(); + if (threadIdx.x < i) + { + sdata[threadIdx.x] += sdata[threadIdx.x + i]; + } + } + __syncthreads(); + if (threadIdx.x == 0) + { + /* + The 32-bit floating-point version of atomicAdd() is only supported by devices of compute capability 2.x and higher. + + The 64-bit floating-point version of atomicAdd() is only supported by devices of compute capability 6.x and higher. + + The 32-bit __half2 floating-point version of atomicAdd() is only supported by devices of compute capability 6.x and higher. 
The atomicity of the __half2 or __nv_bfloat162 add operation is guaranteed separately for each of the two __half or __nv_bfloat16 elements; the entire __half2 or __nv_bfloat162 is not guaranteed to be atomic as a single 32-bit access. + + The 16-bit __half floating-point version of atomicAdd() is only supported by devices of compute capability 7.x and higher. + + The 16-bit __nv_bfloat16 floating-point version of atomicAdd() is only supported by devices of compute capability 8.x and higher. + */ + + atomicAdd(grad_reciprocal_tau, sdata[0]); + + } + } + +// MultiStepParametricLIFNodePTT fptt S2NN, decay_input=True, hard_reset=False, dtype=fp32 + + extern "C" __global__ + void ParametricLIFNode_fptt_decayInputTrue_softReset_fp32(const float* x_seq, float* v_v_seq, float* h_seq, float* spike_seq, + const float & reciprocal_tau, + const float & v_threshold, + const int & neuron_num, const int & numel) + + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index < neuron_num) + { + const int dt = neuron_num; + for(int mem_offset = 0; mem_offset < numel; mem_offset += neuron_num) + { + const int t = index + mem_offset; + + h_seq[t] = v_v_seq[t] + reciprocal_tau * (x_seq[t] - v_v_seq[t]); + + if (h_seq[t] >= v_threshold) + { + spike_seq[t] = 1.0f; + v_v_seq[t + dt] = h_seq[t] - v_threshold; + } + + else + { + spike_seq[t] = 0.0f; + v_v_seq[t + dt] = h_seq[t]; + } + + } + } + } + +// MultiStepParametricLIFNodePTT bptt S2NN, decay_input=True, hard_reset=False, dtype=fp32, detach_reset=True + + extern "C" __global__ + void ParametricLIFNode_bptt_decayInputTrue_softReset_detachReset_fp32( + const float* grad_spike_seq, const float* grad_v_seq, const float* h_seq, const float* spike_seq, const float* v_v_seq, + float* grad_x_seq, float* grad_v_last, float* grad_reciprocal_tau, + const float & reciprocal_tau, const float & one_sub_reciprocal_tau, + const float & v_threshold, + const int & neuron_num, const int & numel) + + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + __shared__ float sdata[1024]; + if (index < neuron_num) + { + float grad_h = 0.0f; // grad_h will be used recursively + sdata[threadIdx.x] = 0.0f; + for(int mem_offset = numel - neuron_num; mem_offset >= 0; mem_offset -= neuron_num) + { + const int t = index + mem_offset; + const float over_th = h_seq[t] - v_threshold; + + // start: spikingjelly.clock_driven.surrogate.S2NN.cuda_code + + const float sg_S2NN_sigmoid_ax = 1.0f / (1.0f + expf(- 4.0f * over_th)); + const float sg_S2NN_mask_l = (float)(over_th < 0.0f); + const float grad_s_to_h = (1.0f - sg_S2NN_sigmoid_ax) * sg_S2NN_sigmoid_ax * 4.0f * sg_S2NN_mask_l + 1.0f / (over_th + 1.0f) * (1.0f - sg_S2NN_mask_l); + + // end: spikingjelly.clock_driven.surrogate.S2NN.cuda_code + + const float grad_v_to_h = 1.0f; + + grad_h = grad_spike_seq[t] * grad_s_to_h + (grad_v_seq[t] + grad_h * one_sub_reciprocal_tau) * grad_v_to_h; + // grad_h = fmaf(grad_spike_seq[t], grad_s_to_h, fmaf(grad_h, one_sub_reciprocal_tau, grad_v_seq[t]) * grad_v_to_h); + + grad_x_seq[t] = grad_h * reciprocal_tau; + sdata[threadIdx.x] += grad_h * (h_seq[t] - v_v_seq[t]) / reciprocal_tau; + + } + grad_v_last[index] = grad_h * one_sub_reciprocal_tau; + } + else + { + sdata[threadIdx.x] = 0.0f; + } + int threadx = blockDim.x; + #pragma unroll + for (int stride = threadx >> 1; stride > 0; stride = stride >> 1) + { + // Synchronize all thread before next loop + __syncthreads(); + if (threadIdx.x < stride) + { + sdata[threadIdx.x] += sdata[threadIdx.x + stride]; + } + } + __syncthreads(); + if 
(threadIdx.x == 0) + { + atomicAdd(grad_reciprocal_tau, sdata[0]); + } + } + +// MultiStepParametricLIFNodePTT bptt S2NN, decay_input=True, hard_reset=False, dtype=fp32, detach_reset=False + + extern "C" __global__ + void ParametricLIFNode_bptt_decayInputTrue_softReset__fp32( + const float* grad_spike_seq, const float* grad_v_seq, const float* h_seq, const float* spike_seq, const float* v_v_seq, + float* grad_x_seq, float* grad_v_last, float* grad_reciprocal_tau, + const float & reciprocal_tau, const float & one_sub_reciprocal_tau, + const float & v_threshold, + const int & neuron_num, const int & numel) + + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + __shared__ float sdata[1024]; + if (index < neuron_num) + { + float grad_h = 0.0f; // grad_h will be used recursively + sdata[threadIdx.x] = 0.0f; + for(int mem_offset = numel - neuron_num; mem_offset >= 0; mem_offset -= neuron_num) + { + const int t = index + mem_offset; + const float over_th = h_seq[t] - v_threshold; + + // start: spikingjelly.clock_driven.surrogate.S2NN.cuda_code + + const float sg_S2NN_sigmoid_ax = 1.0f / (1.0f + expf(- 4.0f * over_th)); + const float sg_S2NN_mask_l = (float)(over_th < 0.0f); + const float grad_s_to_h = (1.0f - sg_S2NN_sigmoid_ax) * sg_S2NN_sigmoid_ax * 4.0f * sg_S2NN_mask_l + 1.0f / (over_th + 1.0f) * (1.0f - sg_S2NN_mask_l); + + // end: spikingjelly.clock_driven.surrogate.S2NN.cuda_code + + const float grad_v_to_h = 1.0f - v_threshold * grad_s_to_h; + // const float grad_v_to_h = fmaf(-v_threshold, grad_s_to_h, 1.0f); + + grad_h = grad_spike_seq[t] * grad_s_to_h + (grad_v_seq[t] + grad_h * one_sub_reciprocal_tau) * grad_v_to_h; + // grad_h = fmaf(grad_spike_seq[t], grad_s_to_h, fmaf(grad_h, one_sub_reciprocal_tau, grad_v_seq[t]) * grad_v_to_h); + + grad_x_seq[t] = grad_h * reciprocal_tau; + sdata[threadIdx.x] += grad_h * (h_seq[t] - v_v_seq[t]) / reciprocal_tau; + + } + grad_v_last[index] = grad_h * one_sub_reciprocal_tau; + } + else + { + sdata[threadIdx.x] = 0.0f; + } + int threadx = blockDim.x; + #pragma unroll + for (int stride = threadx >> 1; stride > 0; stride = stride >> 1) + { + // Synchronize all thread before next loop + __syncthreads(); + if (threadIdx.x < stride) + { + sdata[threadIdx.x] += sdata[threadIdx.x + stride]; + } + } + __syncthreads(); + if (threadIdx.x == 0) + { + atomicAdd(grad_reciprocal_tau, sdata[0]); + } + } + +// MultiStepParametricLIFNodePTT fptt S2NN, decay_input=False, hard_reset=False, dtype=fp32 + + extern "C" __global__ + void ParametricLIFNode_fptt_decayInputFalse_softReset_fp32(const float* x_seq, float* v_v_seq, float* h_seq, float* spike_seq, + const float & reciprocal_tau, + const float & v_threshold, + const int & neuron_num, const int & numel) + + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index < neuron_num) + { + const int dt = neuron_num; + for(int mem_offset = 0; mem_offset < numel; mem_offset += neuron_num) + { + const int t = index + mem_offset; + + h_seq[t] = v_v_seq[t] * (1.0f - reciprocal_tau) + x_seq[t]; + + if (h_seq[t] >= v_threshold) + { + spike_seq[t] = 1.0f; + v_v_seq[t + dt] = h_seq[t] - v_threshold; + } + + else + { + spike_seq[t] = 0.0f; + v_v_seq[t + dt] = h_seq[t]; + } + + } + } + } + +// MultiStepParametricLIFNodePTT bptt S2NN, decay_input=False, hard_reset=False, dtype=fp32, detach_reset=True + + extern "C" __global__ + void ParametricLIFNode_bptt_decayInputFalse_softReset_detachReset_fp32( + const float* grad_spike_seq, const float* grad_v_seq, const float* h_seq, const float* spike_seq, const 
float* v_v_seq, + float* grad_x_seq, float* grad_v_last, float* grad_reciprocal_tau, + const float & reciprocal_tau, const float & one_sub_reciprocal_tau, + const float & v_threshold, + const int & neuron_num, const int & numel) + + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + __shared__ float sdata[1024]; + if (index < neuron_num) + { + float grad_h = 0.0f; // grad_h will be used recursively + sdata[threadIdx.x] = 0.0f; + for(int mem_offset = numel - neuron_num; mem_offset >= 0; mem_offset -= neuron_num) + { + const int t = index + mem_offset; + const float over_th = h_seq[t] - v_threshold; + + // start: spikingjelly.clock_driven.surrogate.S2NN.cuda_code + + const float sg_S2NN_sigmoid_ax = 1.0f / (1.0f + expf(- 4.0f * over_th)); + const float sg_S2NN_mask_l = (float)(over_th < 0.0f); + const float grad_s_to_h = (1.0f - sg_S2NN_sigmoid_ax) * sg_S2NN_sigmoid_ax * 4.0f * sg_S2NN_mask_l + 1.0f / (over_th + 1.0f) * (1.0f - sg_S2NN_mask_l); + + // end: spikingjelly.clock_driven.surrogate.S2NN.cuda_code + + const float grad_v_to_h = 1.0f; + + grad_h = grad_spike_seq[t] * grad_s_to_h + (grad_v_seq[t] + grad_h * one_sub_reciprocal_tau) * grad_v_to_h; + // grad_h = fmaf(grad_spike_seq[t], grad_s_to_h, fmaf(grad_h, one_sub_reciprocal_tau, grad_v_seq[t]) * grad_v_to_h); + + grad_x_seq[t] = grad_h; + sdata[threadIdx.x] -= grad_h * v_v_seq[t]; + + } + grad_v_last[index] = grad_h * one_sub_reciprocal_tau; + } + else + { + sdata[threadIdx.x] = 0.0f; + } + int threadx = blockDim.x; + #pragma unroll + for (int stride = threadx >> 1; stride > 0; stride = stride >> 1) + { + // Synchronize all thread before next loop + __syncthreads(); + if (threadIdx.x < stride) + { + sdata[threadIdx.x] += sdata[threadIdx.x + stride]; + } + } + __syncthreads(); + if (threadIdx.x == 0) + { + atomicAdd(grad_reciprocal_tau, sdata[0]); + } + } + +// MultiStepParametricLIFNodePTT bptt S2NN, decay_input=False, hard_reset=False, dtype=fp32, detach_reset=False + + extern "C" __global__ + void ParametricLIFNode_bptt_decayInputFalse_softReset__fp32( + const float* grad_spike_seq, const float* grad_v_seq, const float* h_seq, const float* spike_seq, const float* v_v_seq, + float* grad_x_seq, float* grad_v_last, float* grad_reciprocal_tau, + const float & reciprocal_tau, const float & one_sub_reciprocal_tau, + const float & v_threshold, + const int & neuron_num, const int & numel) + + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + __shared__ float sdata[1024]; + if (index < neuron_num) + { + float grad_h = 0.0f; // grad_h will be used recursively + sdata[threadIdx.x] = 0.0f; + for(int mem_offset = numel - neuron_num; mem_offset >= 0; mem_offset -= neuron_num) + { + const int t = index + mem_offset; + const float over_th = h_seq[t] - v_threshold; + + // start: spikingjelly.clock_driven.surrogate.S2NN.cuda_code + + const float sg_S2NN_sigmoid_ax = 1.0f / (1.0f + expf(- 4.0f * over_th)); + const float sg_S2NN_mask_l = (float)(over_th < 0.0f); + const float grad_s_to_h = (1.0f - sg_S2NN_sigmoid_ax) * sg_S2NN_sigmoid_ax * 4.0f * sg_S2NN_mask_l + 1.0f / (over_th + 1.0f) * (1.0f - sg_S2NN_mask_l); + + // end: spikingjelly.clock_driven.surrogate.S2NN.cuda_code + + const float grad_v_to_h = 1.0f - v_threshold * grad_s_to_h; + // const float grad_v_to_h = fmaf(-v_threshold, grad_s_to_h, 1.0f); + + grad_h = grad_spike_seq[t] * grad_s_to_h + (grad_v_seq[t] + grad_h * one_sub_reciprocal_tau) * grad_v_to_h; + // grad_h = fmaf(grad_spike_seq[t], grad_s_to_h, fmaf(grad_h, one_sub_reciprocal_tau, grad_v_seq[t]) * 
grad_v_to_h); + + grad_x_seq[t] = grad_h; + sdata[threadIdx.x] -= grad_h * v_v_seq[t]; + + } + grad_v_last[index] = grad_h * one_sub_reciprocal_tau; + } + else + { + sdata[threadIdx.x] = 0.0f; + } + int threadx = blockDim.x; + #pragma unroll + for (int stride = threadx >> 1; stride > 0; stride = stride >> 1) + { + // Synchronize all thread before next loop + __syncthreads(); + if (threadIdx.x < stride) + { + sdata[threadIdx.x] += sdata[threadIdx.x + stride]; + } + } + __syncthreads(); + if (threadIdx.x == 0) + { + atomicAdd(grad_reciprocal_tau, sdata[0]); + } + } + +// MultiStepParametricLIFNodePTT fptt S2NN, decay_input=True, hard_reset=False, dtype=fp16 + + #include + extern "C" __global__ + void ParametricLIFNode_fptt_decayInputTrue_softReset_fp16(const half2* x_seq, half2* v_v_seq, half2* h_seq, half2* spike_seq, + const half & reciprocal_tau, + const half & v_threshold, + const int & neuron_num, const int & numel) + + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + const int stride = neuron_num >> 1; + if (index < stride) + { + const int numel_2 = numel >> 1; + const half2 reciprocal_tau_half2 = __half2half2(reciprocal_tau); + const half2 v_threshold_half2 = __half2half2(v_threshold); + + for(int mem_offset = 0; mem_offset < numel_2; mem_offset += stride) + { + const int t = index + mem_offset; + + h_seq[t] = __hfma2(__hsub2(x_seq[t], v_v_seq[t]), reciprocal_tau_half2, v_v_seq[t]); + + spike_seq[t] = __hgeu2(h_seq[t], v_threshold_half2); + v_v_seq[t + stride] = __hadd2(__hmul2(spike_seq[t], __hsub2(h_seq[t], v_threshold_half2)), __hmul2(__hsub2(__float2half2_rn(1.0f), spike_seq[t]), h_seq[t])); + + } + } + } + +// MultiStepParametricLIFNodePTT bptt S2NN, decay_input=True, hard_reset=False, dtype=fp16, detach_reset=True + + #include + extern "C" __global__ + void ParametricLIFNode_bptt_decayInputTrue_softReset_detachReset_fp16( + const half2* grad_spike_seq, const half2* grad_v_seq, const half2* h_seq, const half2* spike_seq, const half2* v_v_seq, + half2* grad_x_seq, half2* grad_v_last, float* grad_reciprocal_tau, + const half & reciprocal_tau, const half & one_sub_reciprocal_tau, + const half & v_threshold, + const int & neuron_num, const int & numel)\ + // note that grad_reciprocal_tau is float to avoid overflow + + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + const int stride = neuron_num >> 1; + + __shared__ float sdata[1024]; + if (index < stride) + { + const half2 reciprocal_tau_half2 = __half2half2(reciprocal_tau); + const half2 one_sub_reciprocal_tau_half2 = __half2half2(one_sub_reciprocal_tau); + const half2 v_threshold_half2 = __half2half2(v_threshold); + + + half2 grad_h = __float2half2_rn(0.0f); // grad_h will be used recursively + sdata[threadIdx.x] = 0.0f; + for(int mem_offset = (numel >> 1) - stride; mem_offset >= 0; mem_offset -= stride) + { + const int t = index + mem_offset; + + const half2 over_th = __hsub2(h_seq[t], v_threshold_half2); + + + // start: spikingjelly.clock_driven.surrogate.S2NN.cuda_code + + const half2 sg_S2NN_alpha = __float2half2_rn(4.0f); + const half2 sg_S2NN_sigmoid_ax = __h2div(__float2half2_rn(1.0f), __hadd2(h2exp(__hneg2(__hmul2(sg_S2NN_alpha, over_th))), __float2half2_rn(1.0f))); + const half2 sg_S2NN_mask_l = __hlt2(over_th, __float2half2_rn(0.0f)); + const half2 grad_s_to_h = __hadd2(__hmul2(__hmul2(__hmul2(__hsub2(__float2half2_rn(1.0f), sg_S2NN_sigmoid_ax), sg_S2NN_sigmoid_ax), sg_S2NN_alpha), sg_S2NN_mask_l), __hmul2(__h2div(__float2half2_rn(1.0f), __hadd2(over_th, __float2half2_rn(1.0f))), 
__hsub2(__float2half2_rn(1.0f), sg_S2NN_mask_l))); + + // end: spikingjelly.clock_driven.surrogate.S2NN.cuda_code + + const half2 grad_v_to_h = __float2half2_rn(1.0f); + + grad_h = __hfma2(__hfma2(grad_h, one_sub_reciprocal_tau_half2, grad_v_seq[t]), grad_v_to_h, __hmul2(grad_spike_seq[t], grad_s_to_h)); + + grad_x_seq[t] = __hmul2(grad_h, reciprocal_tau_half2); + half2 temp_sum = __h2div(__hmul2(grad_h, __hsub2(h_seq[t], v_v_seq[t])), reciprocal_tau_half2); + sdata[threadIdx.x] += __half2float(__hadd(__low2half(temp_sum), __high2half(temp_sum))); + + } + grad_v_last[index] = __hmul2(grad_h, one_sub_reciprocal_tau_half2); + } + else + { + sdata[threadIdx.x] = 0.0f; + } + int threadx = blockDim.x; + #pragma unroll + for (int i = threadx >> 1; i > 0; i = i >> 1) + { + // Synchronize all thread before next loop + __syncthreads(); + if (threadIdx.x < i) + { + sdata[threadIdx.x] += sdata[threadIdx.x + i]; + } + } + __syncthreads(); + if (threadIdx.x == 0) + { + /* + The 32-bit floating-point version of atomicAdd() is only supported by devices of compute capability 2.x and higher. + + The 64-bit floating-point version of atomicAdd() is only supported by devices of compute capability 6.x and higher. + + The 32-bit __half2 floating-point version of atomicAdd() is only supported by devices of compute capability 6.x and higher. The atomicity of the __half2 or __nv_bfloat162 add operation is guaranteed separately for each of the two __half or __nv_bfloat16 elements; the entire __half2 or __nv_bfloat162 is not guaranteed to be atomic as a single 32-bit access. + + The 16-bit __half floating-point version of atomicAdd() is only supported by devices of compute capability 7.x and higher. + + The 16-bit __nv_bfloat16 floating-point version of atomicAdd() is only supported by devices of compute capability 8.x and higher. 
+ */ + + atomicAdd(grad_reciprocal_tau, sdata[0]); + + } + } + +// MultiStepParametricLIFNodePTT bptt S2NN, decay_input=True, hard_reset=False, dtype=fp16, detach_reset=False + + #include + extern "C" __global__ + void ParametricLIFNode_bptt_decayInputTrue_softReset__fp16( + const half2* grad_spike_seq, const half2* grad_v_seq, const half2* h_seq, const half2* spike_seq, const half2* v_v_seq, + half2* grad_x_seq, half2* grad_v_last, float* grad_reciprocal_tau, + const half & reciprocal_tau, const half & one_sub_reciprocal_tau, + const half & v_threshold, + const int & neuron_num, const int & numel)\ + // note that grad_reciprocal_tau is float to avoid overflow + + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + const int stride = neuron_num >> 1; + + __shared__ float sdata[1024]; + if (index < stride) + { + const half2 reciprocal_tau_half2 = __half2half2(reciprocal_tau); + const half2 one_sub_reciprocal_tau_half2 = __half2half2(one_sub_reciprocal_tau); + const half2 v_threshold_half2 = __half2half2(v_threshold); + + + half2 grad_h = __float2half2_rn(0.0f); // grad_h will be used recursively + sdata[threadIdx.x] = 0.0f; + for(int mem_offset = (numel >> 1) - stride; mem_offset >= 0; mem_offset -= stride) + { + const int t = index + mem_offset; + + const half2 over_th = __hsub2(h_seq[t], v_threshold_half2); + + + // start: spikingjelly.clock_driven.surrogate.S2NN.cuda_code + + const half2 sg_S2NN_alpha = __float2half2_rn(4.0f); + const half2 sg_S2NN_sigmoid_ax = __h2div(__float2half2_rn(1.0f), __hadd2(h2exp(__hneg2(__hmul2(sg_S2NN_alpha, over_th))), __float2half2_rn(1.0f))); + const half2 sg_S2NN_mask_l = __hlt2(over_th, __float2half2_rn(0.0f)); + const half2 grad_s_to_h = __hadd2(__hmul2(__hmul2(__hmul2(__hsub2(__float2half2_rn(1.0f), sg_S2NN_sigmoid_ax), sg_S2NN_sigmoid_ax), sg_S2NN_alpha), sg_S2NN_mask_l), __hmul2(__h2div(__float2half2_rn(1.0f), __hadd2(over_th, __float2half2_rn(1.0f))), __hsub2(__float2half2_rn(1.0f), sg_S2NN_mask_l))); + + // end: spikingjelly.clock_driven.surrogate.S2NN.cuda_code + + const half2 grad_v_to_h = __hsub2(__float2half2_rn(1.0f), __hmul2(v_threshold_half2, grad_s_to_h)); + + grad_h = __hfma2(__hfma2(grad_h, one_sub_reciprocal_tau_half2, grad_v_seq[t]), grad_v_to_h, __hmul2(grad_spike_seq[t], grad_s_to_h)); + + grad_x_seq[t] = __hmul2(grad_h, reciprocal_tau_half2); + half2 temp_sum = __h2div(__hmul2(grad_h, __hsub2(h_seq[t], v_v_seq[t])), reciprocal_tau_half2); + sdata[threadIdx.x] += __half2float(__hadd(__low2half(temp_sum), __high2half(temp_sum))); + + } + grad_v_last[index] = __hmul2(grad_h, one_sub_reciprocal_tau_half2); + } + else + { + sdata[threadIdx.x] = 0.0f; + } + int threadx = blockDim.x; + #pragma unroll + for (int i = threadx >> 1; i > 0; i = i >> 1) + { + // Synchronize all thread before next loop + __syncthreads(); + if (threadIdx.x < i) + { + sdata[threadIdx.x] += sdata[threadIdx.x + i]; + } + } + __syncthreads(); + if (threadIdx.x == 0) + { + /* + The 32-bit floating-point version of atomicAdd() is only supported by devices of compute capability 2.x and higher. + + The 64-bit floating-point version of atomicAdd() is only supported by devices of compute capability 6.x and higher. + + The 32-bit __half2 floating-point version of atomicAdd() is only supported by devices of compute capability 6.x and higher. 
The atomicity of the __half2 or __nv_bfloat162 add operation is guaranteed separately for each of the two __half or __nv_bfloat16 elements; the entire __half2 or __nv_bfloat162 is not guaranteed to be atomic as a single 32-bit access. + + The 16-bit __half floating-point version of atomicAdd() is only supported by devices of compute capability 7.x and higher. + + The 16-bit __nv_bfloat16 floating-point version of atomicAdd() is only supported by devices of compute capability 8.x and higher. + */ + + atomicAdd(grad_reciprocal_tau, sdata[0]); + + } + } + +// MultiStepParametricLIFNodePTT fptt S2NN, decay_input=False, hard_reset=False, dtype=fp16 + + #include + extern "C" __global__ + void ParametricLIFNode_fptt_decayInputFalse_softReset_fp16(const half2* x_seq, half2* v_v_seq, half2* h_seq, half2* spike_seq, + const half & reciprocal_tau, + const half & v_threshold, + const int & neuron_num, const int & numel) + + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + const int stride = neuron_num >> 1; + if (index < stride) + { + const int numel_2 = numel >> 1; + const half2 reciprocal_tau_half2 = __half2half2(reciprocal_tau); + const half2 v_threshold_half2 = __half2half2(v_threshold); + + for(int mem_offset = 0; mem_offset < numel_2; mem_offset += stride) + { + const int t = index + mem_offset; + + // h_seq[t] = v_v_seq[t] * (1.0f - reciprocal_tau) + x_seq[t]; + h_seq[t] = __hfma2(__hsub2(__float2half2_rn(1.0f), reciprocal_tau_half2), v_v_seq[t], x_seq[t]); + + spike_seq[t] = __hgeu2(h_seq[t], v_threshold_half2); + v_v_seq[t + stride] = __hadd2(__hmul2(spike_seq[t], __hsub2(h_seq[t], v_threshold_half2)), __hmul2(__hsub2(__float2half2_rn(1.0f), spike_seq[t]), h_seq[t])); + + } + } + } + +// MultiStepParametricLIFNodePTT bptt S2NN, decay_input=False, hard_reset=False, dtype=fp16, detach_reset=True + + #include + extern "C" __global__ + void ParametricLIFNode_bptt_decayInputFalse_softReset_detachReset_fp16( + const half2* grad_spike_seq, const half2* grad_v_seq, const half2* h_seq, const half2* spike_seq, const half2* v_v_seq, + half2* grad_x_seq, half2* grad_v_last, float* grad_reciprocal_tau, + const half & reciprocal_tau, const half & one_sub_reciprocal_tau, + const half & v_threshold, + const int & neuron_num, const int & numel)\ + // note that grad_reciprocal_tau is float to avoid overflow + + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + const int stride = neuron_num >> 1; + + __shared__ float sdata[1024]; + if (index < stride) + { + const half2 reciprocal_tau_half2 = __half2half2(reciprocal_tau); + const half2 one_sub_reciprocal_tau_half2 = __half2half2(one_sub_reciprocal_tau); + const half2 v_threshold_half2 = __half2half2(v_threshold); + + + half2 grad_h = __float2half2_rn(0.0f); // grad_h will be used recursively + sdata[threadIdx.x] = 0.0f; + for(int mem_offset = (numel >> 1) - stride; mem_offset >= 0; mem_offset -= stride) + { + const int t = index + mem_offset; + + const half2 over_th = __hsub2(h_seq[t], v_threshold_half2); + + + // start: spikingjelly.clock_driven.surrogate.S2NN.cuda_code + + const half2 sg_S2NN_alpha = __float2half2_rn(4.0f); + const half2 sg_S2NN_sigmoid_ax = __h2div(__float2half2_rn(1.0f), __hadd2(h2exp(__hneg2(__hmul2(sg_S2NN_alpha, over_th))), __float2half2_rn(1.0f))); + const half2 sg_S2NN_mask_l = __hlt2(over_th, __float2half2_rn(0.0f)); + const half2 grad_s_to_h = __hadd2(__hmul2(__hmul2(__hmul2(__hsub2(__float2half2_rn(1.0f), sg_S2NN_sigmoid_ax), sg_S2NN_sigmoid_ax), sg_S2NN_alpha), sg_S2NN_mask_l), 
__hmul2(__h2div(__float2half2_rn(1.0f), __hadd2(over_th, __float2half2_rn(1.0f))), __hsub2(__float2half2_rn(1.0f), sg_S2NN_mask_l))); + + // end: spikingjelly.clock_driven.surrogate.S2NN.cuda_code + + const half2 grad_v_to_h = __float2half2_rn(1.0f); + + grad_h = __hfma2(__hfma2(grad_h, one_sub_reciprocal_tau_half2, grad_v_seq[t]), grad_v_to_h, __hmul2(grad_spike_seq[t], grad_s_to_h)); + + grad_x_seq[t] = grad_h; + half2 temp_sum = __hmul2(grad_h, __hneg2(v_v_seq[t])); + sdata[threadIdx.x] += __half2float(__hadd(__low2half(temp_sum), __high2half(temp_sum))); + + } + grad_v_last[index] = __hmul2(grad_h, one_sub_reciprocal_tau_half2); + } + else + { + sdata[threadIdx.x] = 0.0f; + } + int threadx = blockDim.x; + #pragma unroll + for (int i = threadx >> 1; i > 0; i = i >> 1) + { + // Synchronize all thread before next loop + __syncthreads(); + if (threadIdx.x < i) + { + sdata[threadIdx.x] += sdata[threadIdx.x + i]; + } + } + __syncthreads(); + if (threadIdx.x == 0) + { + /* + The 32-bit floating-point version of atomicAdd() is only supported by devices of compute capability 2.x and higher. + + The 64-bit floating-point version of atomicAdd() is only supported by devices of compute capability 6.x and higher. + + The 32-bit __half2 floating-point version of atomicAdd() is only supported by devices of compute capability 6.x and higher. The atomicity of the __half2 or __nv_bfloat162 add operation is guaranteed separately for each of the two __half or __nv_bfloat16 elements; the entire __half2 or __nv_bfloat162 is not guaranteed to be atomic as a single 32-bit access. + + The 16-bit __half floating-point version of atomicAdd() is only supported by devices of compute capability 7.x and higher. + + The 16-bit __nv_bfloat16 floating-point version of atomicAdd() is only supported by devices of compute capability 8.x and higher. 
+ */ + + atomicAdd(grad_reciprocal_tau, sdata[0]); + + } + } + +// MultiStepParametricLIFNodePTT bptt S2NN, decay_input=False, hard_reset=False, dtype=fp16, detach_reset=False + + #include + extern "C" __global__ + void ParametricLIFNode_bptt_decayInputFalse_softReset__fp16( + const half2* grad_spike_seq, const half2* grad_v_seq, const half2* h_seq, const half2* spike_seq, const half2* v_v_seq, + half2* grad_x_seq, half2* grad_v_last, float* grad_reciprocal_tau, + const half & reciprocal_tau, const half & one_sub_reciprocal_tau, + const half & v_threshold, + const int & neuron_num, const int & numel)\ + // note that grad_reciprocal_tau is float to avoid overflow + + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + const int stride = neuron_num >> 1; + + __shared__ float sdata[1024]; + if (index < stride) + { + const half2 reciprocal_tau_half2 = __half2half2(reciprocal_tau); + const half2 one_sub_reciprocal_tau_half2 = __half2half2(one_sub_reciprocal_tau); + const half2 v_threshold_half2 = __half2half2(v_threshold); + + + half2 grad_h = __float2half2_rn(0.0f); // grad_h will be used recursively + sdata[threadIdx.x] = 0.0f; + for(int mem_offset = (numel >> 1) - stride; mem_offset >= 0; mem_offset -= stride) + { + const int t = index + mem_offset; + + const half2 over_th = __hsub2(h_seq[t], v_threshold_half2); + + + // start: spikingjelly.clock_driven.surrogate.S2NN.cuda_code + + const half2 sg_S2NN_alpha = __float2half2_rn(4.0f); + const half2 sg_S2NN_sigmoid_ax = __h2div(__float2half2_rn(1.0f), __hadd2(h2exp(__hneg2(__hmul2(sg_S2NN_alpha, over_th))), __float2half2_rn(1.0f))); + const half2 sg_S2NN_mask_l = __hlt2(over_th, __float2half2_rn(0.0f)); + const half2 grad_s_to_h = __hadd2(__hmul2(__hmul2(__hmul2(__hsub2(__float2half2_rn(1.0f), sg_S2NN_sigmoid_ax), sg_S2NN_sigmoid_ax), sg_S2NN_alpha), sg_S2NN_mask_l), __hmul2(__h2div(__float2half2_rn(1.0f), __hadd2(over_th, __float2half2_rn(1.0f))), __hsub2(__float2half2_rn(1.0f), sg_S2NN_mask_l))); + + // end: spikingjelly.clock_driven.surrogate.S2NN.cuda_code + + const half2 grad_v_to_h = __hsub2(__float2half2_rn(1.0f), __hmul2(v_threshold_half2, grad_s_to_h)); + + grad_h = __hfma2(__hfma2(grad_h, one_sub_reciprocal_tau_half2, grad_v_seq[t]), grad_v_to_h, __hmul2(grad_spike_seq[t], grad_s_to_h)); + + grad_x_seq[t] = grad_h; + half2 temp_sum = __hmul2(grad_h, __hneg2(v_v_seq[t])); + sdata[threadIdx.x] += __half2float(__hadd(__low2half(temp_sum), __high2half(temp_sum))); + + } + grad_v_last[index] = __hmul2(grad_h, one_sub_reciprocal_tau_half2); + } + else + { + sdata[threadIdx.x] = 0.0f; + } + int threadx = blockDim.x; + #pragma unroll + for (int i = threadx >> 1; i > 0; i = i >> 1) + { + // Synchronize all thread before next loop + __syncthreads(); + if (threadIdx.x < i) + { + sdata[threadIdx.x] += sdata[threadIdx.x + i]; + } + } + __syncthreads(); + if (threadIdx.x == 0) + { + /* + The 32-bit floating-point version of atomicAdd() is only supported by devices of compute capability 2.x and higher. + + The 64-bit floating-point version of atomicAdd() is only supported by devices of compute capability 6.x and higher. + + The 32-bit __half2 floating-point version of atomicAdd() is only supported by devices of compute capability 6.x and higher. The atomicity of the __half2 or __nv_bfloat162 add operation is guaranteed separately for each of the two __half or __nv_bfloat16 elements; the entire __half2 or __nv_bfloat162 is not guaranteed to be atomic as a single 32-bit access. 
+ + The 16-bit __half floating-point version of atomicAdd() is only supported by devices of compute capability 7.x and higher. + + The 16-bit __nv_bfloat16 floating-point version of atomicAdd() is only supported by devices of compute capability 8.x and higher. + */ + + atomicAdd(grad_reciprocal_tau, sdata[0]); + + } + } + +// MultiStepParametricLIFNodePTT fptt QPseudoSpike, decay_input=True, hard_reset=True, dtype=fp32 + + extern "C" __global__ + void ParametricLIFNode_fptt_decayInputTrue_hardReset_fp32(const float* x_seq, float* v_v_seq, float* h_seq, float* spike_seq, + const float & reciprocal_tau, + const float & v_threshold, const float & v_reset, + const int & neuron_num, const int & numel) + + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index < neuron_num) + { + const int dt = neuron_num; + for(int mem_offset = 0; mem_offset < numel; mem_offset += neuron_num) + { + const int t = index + mem_offset; + + h_seq[t] = v_v_seq[t] + reciprocal_tau * (x_seq[t] - v_v_seq[t] + v_reset); + + if (h_seq[t] >= v_threshold) + { + spike_seq[t] = 1.0f; + v_v_seq[t + dt] = v_reset; + } + + else + { + spike_seq[t] = 0.0f; + v_v_seq[t + dt] = h_seq[t]; + } + + } + } + } + +// MultiStepParametricLIFNodePTT bptt QPseudoSpike, decay_input=True, hard_reset=True, dtype=fp32, detach_reset=True + + extern "C" __global__ + void ParametricLIFNode_bptt_decayInputTrue_hardReset_detachReset_fp32( + const float* grad_spike_seq, const float* grad_v_seq, const float* h_seq, const float* spike_seq, const float* v_v_seq, + float* grad_x_seq, float* grad_v_last, float* grad_reciprocal_tau, + const float & reciprocal_tau, const float & one_sub_reciprocal_tau, + const float & v_threshold, const float & v_reset, + const int & neuron_num, const int & numel) + + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + __shared__ float sdata[1024]; + if (index < neuron_num) + { + float grad_h = 0.0f; // grad_h will be used recursively + sdata[threadIdx.x] = 0.0f; + for(int mem_offset = numel - neuron_num; mem_offset >= 0; mem_offset -= neuron_num) + { + const int t = index + mem_offset; + const float over_th = h_seq[t] - v_threshold; + + // start: spikingjelly.clock_driven.surrogate.QPseudoSpike.cuda_code + + const float sg_QPseudoSpike_base = 1.0f + 2.0f / (2.0f - 1.0f) * fabsf(over_th); + const float grad_s_to_h = powf(sg_QPseudoSpike_base, -2.0f); + + // end: spikingjelly.clock_driven.surrogate.QPseudoSpike.cuda_code + + const float grad_v_to_h = 1.0f - spike_seq[t]; + + grad_h = grad_spike_seq[t] * grad_s_to_h + (grad_v_seq[t] + grad_h * one_sub_reciprocal_tau) * grad_v_to_h; + // grad_h = fmaf(grad_spike_seq[t], grad_s_to_h, fmaf(grad_h, one_sub_reciprocal_tau, grad_v_seq[t]) * grad_v_to_h); + + grad_x_seq[t] = grad_h * reciprocal_tau; + sdata[threadIdx.x] += grad_h * (h_seq[t] - v_v_seq[t]) / reciprocal_tau; + + } + grad_v_last[index] = grad_h * one_sub_reciprocal_tau; + } + else + { + sdata[threadIdx.x] = 0.0f; + } + int threadx = blockDim.x; + #pragma unroll + for (int stride = threadx >> 1; stride > 0; stride = stride >> 1) + { + // Synchronize all thread before next loop + __syncthreads(); + if (threadIdx.x < stride) + { + sdata[threadIdx.x] += sdata[threadIdx.x + stride]; + } + } + __syncthreads(); + if (threadIdx.x == 0) + { + atomicAdd(grad_reciprocal_tau, sdata[0]); + } + } + +// MultiStepParametricLIFNodePTT bptt QPseudoSpike, decay_input=True, hard_reset=True, dtype=fp32, detach_reset=False + + extern "C" __global__ + void ParametricLIFNode_bptt_decayInputTrue_hardReset__fp32( + 
const float* grad_spike_seq, const float* grad_v_seq, const float* h_seq, const float* spike_seq, const float* v_v_seq, + float* grad_x_seq, float* grad_v_last, float* grad_reciprocal_tau, + const float & reciprocal_tau, const float & one_sub_reciprocal_tau, + const float & v_threshold, const float & v_reset, + const int & neuron_num, const int & numel) + + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + __shared__ float sdata[1024]; + if (index < neuron_num) + { + float grad_h = 0.0f; // grad_h will be used recursively + sdata[threadIdx.x] = 0.0f; + for(int mem_offset = numel - neuron_num; mem_offset >= 0; mem_offset -= neuron_num) + { + const int t = index + mem_offset; + const float over_th = h_seq[t] - v_threshold; + + // start: spikingjelly.clock_driven.surrogate.QPseudoSpike.cuda_code + + const float sg_QPseudoSpike_base = 1.0f + 2.0f / (2.0f - 1.0f) * fabsf(over_th); + const float grad_s_to_h = powf(sg_QPseudoSpike_base, -2.0f); + + // end: spikingjelly.clock_driven.surrogate.QPseudoSpike.cuda_code + + const float grad_v_to_h = 1.0f - spike_seq[t] + (v_reset - h_seq[t]) * grad_s_to_h; + // const float grad_v_to_h = fmaf(v_reset - h_seq[t], grad_s_to_h, 1.0f - spike_seq[t]); + + grad_h = grad_spike_seq[t] * grad_s_to_h + (grad_v_seq[t] + grad_h * one_sub_reciprocal_tau) * grad_v_to_h; + // grad_h = fmaf(grad_spike_seq[t], grad_s_to_h, fmaf(grad_h, one_sub_reciprocal_tau, grad_v_seq[t]) * grad_v_to_h); + + grad_x_seq[t] = grad_h * reciprocal_tau; + sdata[threadIdx.x] += grad_h * (h_seq[t] - v_v_seq[t]) / reciprocal_tau; + + } + grad_v_last[index] = grad_h * one_sub_reciprocal_tau; + } + else + { + sdata[threadIdx.x] = 0.0f; + } + int threadx = blockDim.x; + #pragma unroll + for (int stride = threadx >> 1; stride > 0; stride = stride >> 1) + { + // Synchronize all thread before next loop + __syncthreads(); + if (threadIdx.x < stride) + { + sdata[threadIdx.x] += sdata[threadIdx.x + stride]; + } + } + __syncthreads(); + if (threadIdx.x == 0) + { + atomicAdd(grad_reciprocal_tau, sdata[0]); + } + } + +// MultiStepParametricLIFNodePTT fptt QPseudoSpike, decay_input=False, hard_reset=True, dtype=fp32 + + extern "C" __global__ + void ParametricLIFNode_fptt_decayInputFalse_hardReset_fp32(const float* x_seq, float* v_v_seq, float* h_seq, float* spike_seq, + const float & reciprocal_tau, + const float & v_threshold, const float & v_reset, + const int & neuron_num, const int & numel) + + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index < neuron_num) + { + const int dt = neuron_num; + for(int mem_offset = 0; mem_offset < numel; mem_offset += neuron_num) + { + const int t = index + mem_offset; + + h_seq[t] = v_v_seq[t] - reciprocal_tau * (v_v_seq[t] - v_reset) + x_seq[t]; + + if (h_seq[t] >= v_threshold) + { + spike_seq[t] = 1.0f; + v_v_seq[t + dt] = v_reset; + } + + else + { + spike_seq[t] = 0.0f; + v_v_seq[t + dt] = h_seq[t]; + } + + } + } + } + +// MultiStepParametricLIFNodePTT bptt QPseudoSpike, decay_input=False, hard_reset=True, dtype=fp32, detach_reset=True + + extern "C" __global__ + void ParametricLIFNode_bptt_decayInputFalse_hardReset_detachReset_fp32( + const float* grad_spike_seq, const float* grad_v_seq, const float* h_seq, const float* spike_seq, const float* v_v_seq, + float* grad_x_seq, float* grad_v_last, float* grad_reciprocal_tau, + const float & reciprocal_tau, const float & one_sub_reciprocal_tau, + const float & v_threshold, const float & v_reset, + const int & neuron_num, const int & numel) + + { + const int index = blockIdx.x * 
blockDim.x + threadIdx.x; + __shared__ float sdata[1024]; + if (index < neuron_num) + { + float grad_h = 0.0f; // grad_h will be used recursively + sdata[threadIdx.x] = 0.0f; + for(int mem_offset = numel - neuron_num; mem_offset >= 0; mem_offset -= neuron_num) + { + const int t = index + mem_offset; + const float over_th = h_seq[t] - v_threshold; + + // start: spikingjelly.clock_driven.surrogate.QPseudoSpike.cuda_code + + const float sg_QPseudoSpike_base = 1.0f + 2.0f / (2.0f - 1.0f) * fabsf(over_th); + const float grad_s_to_h = powf(sg_QPseudoSpike_base, -2.0f); + + // end: spikingjelly.clock_driven.surrogate.QPseudoSpike.cuda_code + + const float grad_v_to_h = 1.0f - spike_seq[t]; + + grad_h = grad_spike_seq[t] * grad_s_to_h + (grad_v_seq[t] + grad_h * one_sub_reciprocal_tau) * grad_v_to_h; + // grad_h = fmaf(grad_spike_seq[t], grad_s_to_h, fmaf(grad_h, one_sub_reciprocal_tau, grad_v_seq[t]) * grad_v_to_h); + + grad_x_seq[t] = grad_h; + sdata[threadIdx.x] += grad_h * (v_reset - v_v_seq[t]); + + } + grad_v_last[index] = grad_h * one_sub_reciprocal_tau; + } + else + { + sdata[threadIdx.x] = 0.0f; + } + int threadx = blockDim.x; + #pragma unroll + for (int stride = threadx >> 1; stride > 0; stride = stride >> 1) + { + // Synchronize all thread before next loop + __syncthreads(); + if (threadIdx.x < stride) + { + sdata[threadIdx.x] += sdata[threadIdx.x + stride]; + } + } + __syncthreads(); + if (threadIdx.x == 0) + { + atomicAdd(grad_reciprocal_tau, sdata[0]); + } + } + +// MultiStepParametricLIFNodePTT bptt QPseudoSpike, decay_input=False, hard_reset=True, dtype=fp32, detach_reset=False + + extern "C" __global__ + void ParametricLIFNode_bptt_decayInputFalse_hardReset__fp32( + const float* grad_spike_seq, const float* grad_v_seq, const float* h_seq, const float* spike_seq, const float* v_v_seq, + float* grad_x_seq, float* grad_v_last, float* grad_reciprocal_tau, + const float & reciprocal_tau, const float & one_sub_reciprocal_tau, + const float & v_threshold, const float & v_reset, + const int & neuron_num, const int & numel) + + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + __shared__ float sdata[1024]; + if (index < neuron_num) + { + float grad_h = 0.0f; // grad_h will be used recursively + sdata[threadIdx.x] = 0.0f; + for(int mem_offset = numel - neuron_num; mem_offset >= 0; mem_offset -= neuron_num) + { + const int t = index + mem_offset; + const float over_th = h_seq[t] - v_threshold; + + // start: spikingjelly.clock_driven.surrogate.QPseudoSpike.cuda_code + + const float sg_QPseudoSpike_base = 1.0f + 2.0f / (2.0f - 1.0f) * fabsf(over_th); + const float grad_s_to_h = powf(sg_QPseudoSpike_base, -2.0f); + + // end: spikingjelly.clock_driven.surrogate.QPseudoSpike.cuda_code + + const float grad_v_to_h = 1.0f - spike_seq[t] + (v_reset - h_seq[t]) * grad_s_to_h; + // const float grad_v_to_h = fmaf(v_reset - h_seq[t], grad_s_to_h, 1.0f - spike_seq[t]); + + grad_h = grad_spike_seq[t] * grad_s_to_h + (grad_v_seq[t] + grad_h * one_sub_reciprocal_tau) * grad_v_to_h; + // grad_h = fmaf(grad_spike_seq[t], grad_s_to_h, fmaf(grad_h, one_sub_reciprocal_tau, grad_v_seq[t]) * grad_v_to_h); + + grad_x_seq[t] = grad_h; + sdata[threadIdx.x] += grad_h * (v_reset - v_v_seq[t]); + + } + grad_v_last[index] = grad_h * one_sub_reciprocal_tau; + } + else + { + sdata[threadIdx.x] = 0.0f; + } + int threadx = blockDim.x; + #pragma unroll + for (int stride = threadx >> 1; stride > 0; stride = stride >> 1) + { + // Synchronize all thread before next loop + __syncthreads(); + if (threadIdx.x < stride) 
+ { + sdata[threadIdx.x] += sdata[threadIdx.x + stride]; + } + } + __syncthreads(); + if (threadIdx.x == 0) + { + atomicAdd(grad_reciprocal_tau, sdata[0]); + } + } + +// MultiStepParametricLIFNodePTT fptt QPseudoSpike, decay_input=True, hard_reset=True, dtype=fp16 + + #include + extern "C" __global__ + void ParametricLIFNode_fptt_decayInputTrue_hardReset_fp16(const half2* x_seq, half2* v_v_seq, half2* h_seq, half2* spike_seq, + const half & reciprocal_tau, + const half & v_threshold, const half & v_reset, + const int & neuron_num, const int & numel) + + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + const int stride = neuron_num >> 1; + if (index < stride) + { + const int numel_2 = numel >> 1; + const half2 reciprocal_tau_half2 = __half2half2(reciprocal_tau); + const half2 v_threshold_half2 = __half2half2(v_threshold); + + const half2 v_reset_half2 = __half2half2(v_reset); + + for(int mem_offset = 0; mem_offset < numel_2; mem_offset += stride) + { + const int t = index + mem_offset; + + h_seq[t] = __hfma2(__hadd2(__hsub2(x_seq[t], v_v_seq[t]), v_reset_half2), reciprocal_tau_half2, v_v_seq[t]); + + spike_seq[t] = __hgeu2(h_seq[t], v_threshold_half2); + v_v_seq[t + stride] = __hadd2(__hmul2(spike_seq[t], v_reset_half2), __hmul2(__hsub2(__float2half2_rn(1.0f), spike_seq[t]), h_seq[t])); + + } + } + } + +// MultiStepParametricLIFNodePTT bptt QPseudoSpike, decay_input=True, hard_reset=True, dtype=fp16, detach_reset=True + + #include + extern "C" __global__ + void ParametricLIFNode_bptt_decayInputTrue_hardReset_detachReset_fp16( + const half2* grad_spike_seq, const half2* grad_v_seq, const half2* h_seq, const half2* spike_seq, const half2* v_v_seq, + half2* grad_x_seq, half2* grad_v_last, float* grad_reciprocal_tau, + const half & reciprocal_tau, const half & one_sub_reciprocal_tau, + const half & v_threshold, const half & v_reset, + const int & neuron_num, const int & numel)\ + // note that grad_reciprocal_tau is float to avoid overflow + + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + const int stride = neuron_num >> 1; + + __shared__ float sdata[1024]; + if (index < stride) + { + const half2 reciprocal_tau_half2 = __half2half2(reciprocal_tau); + const half2 one_sub_reciprocal_tau_half2 = __half2half2(one_sub_reciprocal_tau); + const half2 v_threshold_half2 = __half2half2(v_threshold); + + const half2 v_reset_half2 = __half2half2(v_reset); + + + half2 grad_h = __float2half2_rn(0.0f); // grad_h will be used recursively + sdata[threadIdx.x] = 0.0f; + for(int mem_offset = (numel >> 1) - stride; mem_offset >= 0; mem_offset -= stride) + { + const int t = index + mem_offset; + + const half2 over_th = __hsub2(h_seq[t], v_threshold_half2); + + + // start: spikingjelly.clock_driven.surrogate.QPseudoSpike.cuda_code + + const half2 sg_QPseudoSpike_alpha = __float2half2_rn(2.0f); + const half2 sg_QPseudoSpike_base = __hadd2(__float2half2_rn(1.0f), __h2div(__hmul2(__float2half2_rn(2.0f), __habs2(over_th)), __hsub2(sg_QPseudoSpike_alpha, __float2half2_rn(1.0f)))); + const half2 grad_s_to_h = h2exp2(__hmul2(h2log2(sg_QPseudoSpike_base), __hneg2(sg_QPseudoSpike_alpha))); // Replace power with combination of log and exp, since CUDA has no power function for FP16. 
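+            // The two intrinsics above compute base^(-alpha) as exp2(log2(base) * (-alpha)).
+            // This is safe here because sg_QPseudoSpike_base = 1 + 2*|over_th|/(alpha - 1) >= 1,
+            // so h2log2 never receives a non-positive argument.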
+ + // end: spikingjelly.clock_driven.surrogate.QPseudoSpike.cuda_code + + const half2 grad_v_to_h = __hsub2(__float2half2_rn(1.0f), spike_seq[t]); + + grad_h = __hfma2(__hfma2(grad_h, one_sub_reciprocal_tau_half2, grad_v_seq[t]), grad_v_to_h, __hmul2(grad_spike_seq[t], grad_s_to_h)); + + grad_x_seq[t] = __hmul2(grad_h, reciprocal_tau_half2); + half2 temp_sum = __h2div(__hmul2(grad_h, __hsub2(h_seq[t], v_v_seq[t])), reciprocal_tau_half2); + sdata[threadIdx.x] += __half2float(__hadd(__low2half(temp_sum), __high2half(temp_sum))); + + } + grad_v_last[index] = __hmul2(grad_h, one_sub_reciprocal_tau_half2); + } + else + { + sdata[threadIdx.x] = 0.0f; + } + int threadx = blockDim.x; + #pragma unroll + for (int i = threadx >> 1; i > 0; i = i >> 1) + { + // Synchronize all thread before next loop + __syncthreads(); + if (threadIdx.x < i) + { + sdata[threadIdx.x] += sdata[threadIdx.x + i]; + } + } + __syncthreads(); + if (threadIdx.x == 0) + { + /* + The 32-bit floating-point version of atomicAdd() is only supported by devices of compute capability 2.x and higher. + + The 64-bit floating-point version of atomicAdd() is only supported by devices of compute capability 6.x and higher. + + The 32-bit __half2 floating-point version of atomicAdd() is only supported by devices of compute capability 6.x and higher. The atomicity of the __half2 or __nv_bfloat162 add operation is guaranteed separately for each of the two __half or __nv_bfloat16 elements; the entire __half2 or __nv_bfloat162 is not guaranteed to be atomic as a single 32-bit access. + + The 16-bit __half floating-point version of atomicAdd() is only supported by devices of compute capability 7.x and higher. + + The 16-bit __nv_bfloat16 floating-point version of atomicAdd() is only supported by devices of compute capability 8.x and higher. 
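+                For that reason grad_reciprocal_tau stays a 32-bit float buffer even in these fp16 kernels: each half2 partial sum is converted with __half2float before it is added to sdata, so the atomicAdd below always operates on 32-bit floats.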
+ */ + + atomicAdd(grad_reciprocal_tau, sdata[0]); + + } + } + +// MultiStepParametricLIFNodePTT bptt QPseudoSpike, decay_input=True, hard_reset=True, dtype=fp16, detach_reset=False + + #include + extern "C" __global__ + void ParametricLIFNode_bptt_decayInputTrue_hardReset__fp16( + const half2* grad_spike_seq, const half2* grad_v_seq, const half2* h_seq, const half2* spike_seq, const half2* v_v_seq, + half2* grad_x_seq, half2* grad_v_last, float* grad_reciprocal_tau, + const half & reciprocal_tau, const half & one_sub_reciprocal_tau, + const half & v_threshold, const half & v_reset, + const int & neuron_num, const int & numel)\ + // note that grad_reciprocal_tau is float to avoid overflow + + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + const int stride = neuron_num >> 1; + + __shared__ float sdata[1024]; + if (index < stride) + { + const half2 reciprocal_tau_half2 = __half2half2(reciprocal_tau); + const half2 one_sub_reciprocal_tau_half2 = __half2half2(one_sub_reciprocal_tau); + const half2 v_threshold_half2 = __half2half2(v_threshold); + + const half2 v_reset_half2 = __half2half2(v_reset); + + + half2 grad_h = __float2half2_rn(0.0f); // grad_h will be used recursively + sdata[threadIdx.x] = 0.0f; + for(int mem_offset = (numel >> 1) - stride; mem_offset >= 0; mem_offset -= stride) + { + const int t = index + mem_offset; + + const half2 over_th = __hsub2(h_seq[t], v_threshold_half2); + + + // start: spikingjelly.clock_driven.surrogate.QPseudoSpike.cuda_code + + const half2 sg_QPseudoSpike_alpha = __float2half2_rn(2.0f); + const half2 sg_QPseudoSpike_base = __hadd2(__float2half2_rn(1.0f), __h2div(__hmul2(__float2half2_rn(2.0f), __habs2(over_th)), __hsub2(sg_QPseudoSpike_alpha, __float2half2_rn(1.0f)))); + const half2 grad_s_to_h = h2exp2(__hmul2(h2log2(sg_QPseudoSpike_base), __hneg2(sg_QPseudoSpike_alpha))); // Replace power with combination of log and exp, since CUDA has no power function for FP16. + + // end: spikingjelly.clock_driven.surrogate.QPseudoSpike.cuda_code + + const half2 grad_v_to_h = __hfma2(__hsub2(v_reset_half2, h_seq[t]), grad_s_to_h, __hsub2(__float2half2_rn(1.0f), spike_seq[t])); + + grad_h = __hfma2(__hfma2(grad_h, one_sub_reciprocal_tau_half2, grad_v_seq[t]), grad_v_to_h, __hmul2(grad_spike_seq[t], grad_s_to_h)); + + grad_x_seq[t] = __hmul2(grad_h, reciprocal_tau_half2); + half2 temp_sum = __h2div(__hmul2(grad_h, __hsub2(h_seq[t], v_v_seq[t])), reciprocal_tau_half2); + sdata[threadIdx.x] += __half2float(__hadd(__low2half(temp_sum), __high2half(temp_sum))); + + } + grad_v_last[index] = __hmul2(grad_h, one_sub_reciprocal_tau_half2); + } + else + { + sdata[threadIdx.x] = 0.0f; + } + int threadx = blockDim.x; + #pragma unroll + for (int i = threadx >> 1; i > 0; i = i >> 1) + { + // Synchronize all thread before next loop + __syncthreads(); + if (threadIdx.x < i) + { + sdata[threadIdx.x] += sdata[threadIdx.x + i]; + } + } + __syncthreads(); + if (threadIdx.x == 0) + { + /* + The 32-bit floating-point version of atomicAdd() is only supported by devices of compute capability 2.x and higher. + + The 64-bit floating-point version of atomicAdd() is only supported by devices of compute capability 6.x and higher. + + The 32-bit __half2 floating-point version of atomicAdd() is only supported by devices of compute capability 6.x and higher. 
The atomicity of the __half2 or __nv_bfloat162 add operation is guaranteed separately for each of the two __half or __nv_bfloat16 elements; the entire __half2 or __nv_bfloat162 is not guaranteed to be atomic as a single 32-bit access. + + The 16-bit __half floating-point version of atomicAdd() is only supported by devices of compute capability 7.x and higher. + + The 16-bit __nv_bfloat16 floating-point version of atomicAdd() is only supported by devices of compute capability 8.x and higher. + */ + + atomicAdd(grad_reciprocal_tau, sdata[0]); + + } + } + +// MultiStepParametricLIFNodePTT fptt QPseudoSpike, decay_input=False, hard_reset=True, dtype=fp16 + + #include + extern "C" __global__ + void ParametricLIFNode_fptt_decayInputFalse_hardReset_fp16(const half2* x_seq, half2* v_v_seq, half2* h_seq, half2* spike_seq, + const half & reciprocal_tau, + const half & v_threshold, const half & v_reset, + const int & neuron_num, const int & numel) + + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + const int stride = neuron_num >> 1; + if (index < stride) + { + const int numel_2 = numel >> 1; + const half2 reciprocal_tau_half2 = __half2half2(reciprocal_tau); + const half2 v_threshold_half2 = __half2half2(v_threshold); + + const half2 v_reset_half2 = __half2half2(v_reset); + + for(int mem_offset = 0; mem_offset < numel_2; mem_offset += stride) + { + const int t = index + mem_offset; + + // h_seq[t] = v_v_seq[t] - reciprocal_tau * (v_v_seq[t] - v_reset) + x_seq[t]; + // = reciprocal_tau * (v_reset - v_v_seq[t]) + v_v_seq[t] + x_seq[t]; + h_seq[t] = __hadd2(__hfma2(__hsub2(v_reset_half2, v_v_seq[t]), reciprocal_tau_half2, v_v_seq[t]), x_seq[t]); + + spike_seq[t] = __hgeu2(h_seq[t], v_threshold_half2); + v_v_seq[t + stride] = __hadd2(__hmul2(spike_seq[t], v_reset_half2), __hmul2(__hsub2(__float2half2_rn(1.0f), spike_seq[t]), h_seq[t])); + + } + } + } + +// MultiStepParametricLIFNodePTT bptt QPseudoSpike, decay_input=False, hard_reset=True, dtype=fp16, detach_reset=True + + #include + extern "C" __global__ + void ParametricLIFNode_bptt_decayInputFalse_hardReset_detachReset_fp16( + const half2* grad_spike_seq, const half2* grad_v_seq, const half2* h_seq, const half2* spike_seq, const half2* v_v_seq, + half2* grad_x_seq, half2* grad_v_last, float* grad_reciprocal_tau, + const half & reciprocal_tau, const half & one_sub_reciprocal_tau, + const half & v_threshold, const half & v_reset, + const int & neuron_num, const int & numel)\ + // note that grad_reciprocal_tau is float to avoid overflow + + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + const int stride = neuron_num >> 1; + + __shared__ float sdata[1024]; + if (index < stride) + { + const half2 reciprocal_tau_half2 = __half2half2(reciprocal_tau); + const half2 one_sub_reciprocal_tau_half2 = __half2half2(one_sub_reciprocal_tau); + const half2 v_threshold_half2 = __half2half2(v_threshold); + + const half2 v_reset_half2 = __half2half2(v_reset); + + + half2 grad_h = __float2half2_rn(0.0f); // grad_h will be used recursively + sdata[threadIdx.x] = 0.0f; + for(int mem_offset = (numel >> 1) - stride; mem_offset >= 0; mem_offset -= stride) + { + const int t = index + mem_offset; + + const half2 over_th = __hsub2(h_seq[t], v_threshold_half2); + + + // start: spikingjelly.clock_driven.surrogate.QPseudoSpike.cuda_code + + const half2 sg_QPseudoSpike_alpha = __float2half2_rn(2.0f); + const half2 sg_QPseudoSpike_base = __hadd2(__float2half2_rn(1.0f), __h2div(__hmul2(__float2half2_rn(2.0f), __habs2(over_th)), 
__hsub2(sg_QPseudoSpike_alpha, __float2half2_rn(1.0f)))); + const half2 grad_s_to_h = h2exp2(__hmul2(h2log2(sg_QPseudoSpike_base), __hneg2(sg_QPseudoSpike_alpha))); // Replace power with combination of log and exp, since CUDA has no power function for FP16. + + // end: spikingjelly.clock_driven.surrogate.QPseudoSpike.cuda_code + + const half2 grad_v_to_h = __hsub2(__float2half2_rn(1.0f), spike_seq[t]); + + grad_h = __hfma2(__hfma2(grad_h, one_sub_reciprocal_tau_half2, grad_v_seq[t]), grad_v_to_h, __hmul2(grad_spike_seq[t], grad_s_to_h)); + + grad_x_seq[t] = grad_h; + half2 temp_sum = __hmul2(grad_h, __hsub2(v_reset_half2, v_v_seq[t])); + sdata[threadIdx.x] += __half2float(__hadd(__low2half(temp_sum), __high2half(temp_sum))); + + } + grad_v_last[index] = __hmul2(grad_h, one_sub_reciprocal_tau_half2); + } + else + { + sdata[threadIdx.x] = 0.0f; + } + int threadx = blockDim.x; + #pragma unroll + for (int i = threadx >> 1; i > 0; i = i >> 1) + { + // Synchronize all thread before next loop + __syncthreads(); + if (threadIdx.x < i) + { + sdata[threadIdx.x] += sdata[threadIdx.x + i]; + } + } + __syncthreads(); + if (threadIdx.x == 0) + { + /* + The 32-bit floating-point version of atomicAdd() is only supported by devices of compute capability 2.x and higher. + + The 64-bit floating-point version of atomicAdd() is only supported by devices of compute capability 6.x and higher. + + The 32-bit __half2 floating-point version of atomicAdd() is only supported by devices of compute capability 6.x and higher. The atomicity of the __half2 or __nv_bfloat162 add operation is guaranteed separately for each of the two __half or __nv_bfloat16 elements; the entire __half2 or __nv_bfloat162 is not guaranteed to be atomic as a single 32-bit access. + + The 16-bit __half floating-point version of atomicAdd() is only supported by devices of compute capability 7.x and higher. + + The 16-bit __nv_bfloat16 floating-point version of atomicAdd() is only supported by devices of compute capability 8.x and higher. 
+ */ + + atomicAdd(grad_reciprocal_tau, sdata[0]); + + } + } + +// MultiStepParametricLIFNodePTT bptt QPseudoSpike, decay_input=False, hard_reset=True, dtype=fp16, detach_reset=False + + #include + extern "C" __global__ + void ParametricLIFNode_bptt_decayInputFalse_hardReset__fp16( + const half2* grad_spike_seq, const half2* grad_v_seq, const half2* h_seq, const half2* spike_seq, const half2* v_v_seq, + half2* grad_x_seq, half2* grad_v_last, float* grad_reciprocal_tau, + const half & reciprocal_tau, const half & one_sub_reciprocal_tau, + const half & v_threshold, const half & v_reset, + const int & neuron_num, const int & numel)\ + // note that grad_reciprocal_tau is float to avoid overflow + + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + const int stride = neuron_num >> 1; + + __shared__ float sdata[1024]; + if (index < stride) + { + const half2 reciprocal_tau_half2 = __half2half2(reciprocal_tau); + const half2 one_sub_reciprocal_tau_half2 = __half2half2(one_sub_reciprocal_tau); + const half2 v_threshold_half2 = __half2half2(v_threshold); + + const half2 v_reset_half2 = __half2half2(v_reset); + + + half2 grad_h = __float2half2_rn(0.0f); // grad_h will be used recursively + sdata[threadIdx.x] = 0.0f; + for(int mem_offset = (numel >> 1) - stride; mem_offset >= 0; mem_offset -= stride) + { + const int t = index + mem_offset; + + const half2 over_th = __hsub2(h_seq[t], v_threshold_half2); + + + // start: spikingjelly.clock_driven.surrogate.QPseudoSpike.cuda_code + + const half2 sg_QPseudoSpike_alpha = __float2half2_rn(2.0f); + const half2 sg_QPseudoSpike_base = __hadd2(__float2half2_rn(1.0f), __h2div(__hmul2(__float2half2_rn(2.0f), __habs2(over_th)), __hsub2(sg_QPseudoSpike_alpha, __float2half2_rn(1.0f)))); + const half2 grad_s_to_h = h2exp2(__hmul2(h2log2(sg_QPseudoSpike_base), __hneg2(sg_QPseudoSpike_alpha))); // Replace power with combination of log and exp, since CUDA has no power function for FP16. + + // end: spikingjelly.clock_driven.surrogate.QPseudoSpike.cuda_code + + const half2 grad_v_to_h = __hfma2(__hsub2(v_reset_half2, h_seq[t]), grad_s_to_h, __hsub2(__float2half2_rn(1.0f), spike_seq[t])); + + grad_h = __hfma2(__hfma2(grad_h, one_sub_reciprocal_tau_half2, grad_v_seq[t]), grad_v_to_h, __hmul2(grad_spike_seq[t], grad_s_to_h)); + + grad_x_seq[t] = grad_h; + half2 temp_sum = __hmul2(grad_h, __hsub2(v_reset_half2, v_v_seq[t])); + sdata[threadIdx.x] += __half2float(__hadd(__low2half(temp_sum), __high2half(temp_sum))); + + } + grad_v_last[index] = __hmul2(grad_h, one_sub_reciprocal_tau_half2); + } + else + { + sdata[threadIdx.x] = 0.0f; + } + int threadx = blockDim.x; + #pragma unroll + for (int i = threadx >> 1; i > 0; i = i >> 1) + { + // Synchronize all thread before next loop + __syncthreads(); + if (threadIdx.x < i) + { + sdata[threadIdx.x] += sdata[threadIdx.x + i]; + } + } + __syncthreads(); + if (threadIdx.x == 0) + { + /* + The 32-bit floating-point version of atomicAdd() is only supported by devices of compute capability 2.x and higher. + + The 64-bit floating-point version of atomicAdd() is only supported by devices of compute capability 6.x and higher. + + The 32-bit __half2 floating-point version of atomicAdd() is only supported by devices of compute capability 6.x and higher. The atomicity of the __half2 or __nv_bfloat162 add operation is guaranteed separately for each of the two __half or __nv_bfloat16 elements; the entire __half2 or __nv_bfloat162 is not guaranteed to be atomic as a single 32-bit access. 
+ + The 16-bit __half floating-point version of atomicAdd() is only supported by devices of compute capability 7.x and higher. + + The 16-bit __nv_bfloat16 floating-point version of atomicAdd() is only supported by devices of compute capability 8.x and higher. + */ + + atomicAdd(grad_reciprocal_tau, sdata[0]); + + } + } + +// MultiStepParametricLIFNodePTT fptt QPseudoSpike, decay_input=True, hard_reset=False, dtype=fp32 + + extern "C" __global__ + void ParametricLIFNode_fptt_decayInputTrue_softReset_fp32(const float* x_seq, float* v_v_seq, float* h_seq, float* spike_seq, + const float & reciprocal_tau, + const float & v_threshold, + const int & neuron_num, const int & numel) + + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index < neuron_num) + { + const int dt = neuron_num; + for(int mem_offset = 0; mem_offset < numel; mem_offset += neuron_num) + { + const int t = index + mem_offset; + + h_seq[t] = v_v_seq[t] + reciprocal_tau * (x_seq[t] - v_v_seq[t]); + + if (h_seq[t] >= v_threshold) + { + spike_seq[t] = 1.0f; + v_v_seq[t + dt] = h_seq[t] - v_threshold; + } + + else + { + spike_seq[t] = 0.0f; + v_v_seq[t + dt] = h_seq[t]; + } + + } + } + } + +// MultiStepParametricLIFNodePTT bptt QPseudoSpike, decay_input=True, hard_reset=False, dtype=fp32, detach_reset=True + + extern "C" __global__ + void ParametricLIFNode_bptt_decayInputTrue_softReset_detachReset_fp32( + const float* grad_spike_seq, const float* grad_v_seq, const float* h_seq, const float* spike_seq, const float* v_v_seq, + float* grad_x_seq, float* grad_v_last, float* grad_reciprocal_tau, + const float & reciprocal_tau, const float & one_sub_reciprocal_tau, + const float & v_threshold, + const int & neuron_num, const int & numel) + + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + __shared__ float sdata[1024]; + if (index < neuron_num) + { + float grad_h = 0.0f; // grad_h will be used recursively + sdata[threadIdx.x] = 0.0f; + for(int mem_offset = numel - neuron_num; mem_offset >= 0; mem_offset -= neuron_num) + { + const int t = index + mem_offset; + const float over_th = h_seq[t] - v_threshold; + + // start: spikingjelly.clock_driven.surrogate.QPseudoSpike.cuda_code + + const float sg_QPseudoSpike_base = 1.0f + 2.0f / (2.0f - 1.0f) * fabsf(over_th); + const float grad_s_to_h = powf(sg_QPseudoSpike_base, -2.0f); + + // end: spikingjelly.clock_driven.surrogate.QPseudoSpike.cuda_code + + const float grad_v_to_h = 1.0f; + + grad_h = grad_spike_seq[t] * grad_s_to_h + (grad_v_seq[t] + grad_h * one_sub_reciprocal_tau) * grad_v_to_h; + // grad_h = fmaf(grad_spike_seq[t], grad_s_to_h, fmaf(grad_h, one_sub_reciprocal_tau, grad_v_seq[t]) * grad_v_to_h); + + grad_x_seq[t] = grad_h * reciprocal_tau; + sdata[threadIdx.x] += grad_h * (h_seq[t] - v_v_seq[t]) / reciprocal_tau; + + } + grad_v_last[index] = grad_h * one_sub_reciprocal_tau; + } + else + { + sdata[threadIdx.x] = 0.0f; + } + int threadx = blockDim.x; + #pragma unroll + for (int stride = threadx >> 1; stride > 0; stride = stride >> 1) + { + // Synchronize all thread before next loop + __syncthreads(); + if (threadIdx.x < stride) + { + sdata[threadIdx.x] += sdata[threadIdx.x + stride]; + } + } + __syncthreads(); + if (threadIdx.x == 0) + { + atomicAdd(grad_reciprocal_tau, sdata[0]); + } + } + +// MultiStepParametricLIFNodePTT bptt QPseudoSpike, decay_input=True, hard_reset=False, dtype=fp32, detach_reset=False + + extern "C" __global__ + void ParametricLIFNode_bptt_decayInputTrue_softReset__fp32( + const float* grad_spike_seq, const float* grad_v_seq, 
const float* h_seq, const float* spike_seq, const float* v_v_seq, + float* grad_x_seq, float* grad_v_last, float* grad_reciprocal_tau, + const float & reciprocal_tau, const float & one_sub_reciprocal_tau, + const float & v_threshold, + const int & neuron_num, const int & numel) + + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + __shared__ float sdata[1024]; + if (index < neuron_num) + { + float grad_h = 0.0f; // grad_h will be used recursively + sdata[threadIdx.x] = 0.0f; + for(int mem_offset = numel - neuron_num; mem_offset >= 0; mem_offset -= neuron_num) + { + const int t = index + mem_offset; + const float over_th = h_seq[t] - v_threshold; + + // start: spikingjelly.clock_driven.surrogate.QPseudoSpike.cuda_code + + const float sg_QPseudoSpike_base = 1.0f + 2.0f / (2.0f - 1.0f) * fabsf(over_th); + const float grad_s_to_h = powf(sg_QPseudoSpike_base, -2.0f); + + // end: spikingjelly.clock_driven.surrogate.QPseudoSpike.cuda_code + + const float grad_v_to_h = 1.0f - v_threshold * grad_s_to_h; + // const float grad_v_to_h = fmaf(-v_threshold, grad_s_to_h, 1.0f); + + grad_h = grad_spike_seq[t] * grad_s_to_h + (grad_v_seq[t] + grad_h * one_sub_reciprocal_tau) * grad_v_to_h; + // grad_h = fmaf(grad_spike_seq[t], grad_s_to_h, fmaf(grad_h, one_sub_reciprocal_tau, grad_v_seq[t]) * grad_v_to_h); + + grad_x_seq[t] = grad_h * reciprocal_tau; + sdata[threadIdx.x] += grad_h * (h_seq[t] - v_v_seq[t]) / reciprocal_tau; + + } + grad_v_last[index] = grad_h * one_sub_reciprocal_tau; + } + else + { + sdata[threadIdx.x] = 0.0f; + } + int threadx = blockDim.x; + #pragma unroll + for (int stride = threadx >> 1; stride > 0; stride = stride >> 1) + { + // Synchronize all thread before next loop + __syncthreads(); + if (threadIdx.x < stride) + { + sdata[threadIdx.x] += sdata[threadIdx.x + stride]; + } + } + __syncthreads(); + if (threadIdx.x == 0) + { + atomicAdd(grad_reciprocal_tau, sdata[0]); + } + } + +// MultiStepParametricLIFNodePTT fptt QPseudoSpike, decay_input=False, hard_reset=False, dtype=fp32 + + extern "C" __global__ + void ParametricLIFNode_fptt_decayInputFalse_softReset_fp32(const float* x_seq, float* v_v_seq, float* h_seq, float* spike_seq, + const float & reciprocal_tau, + const float & v_threshold, + const int & neuron_num, const int & numel) + + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index < neuron_num) + { + const int dt = neuron_num; + for(int mem_offset = 0; mem_offset < numel; mem_offset += neuron_num) + { + const int t = index + mem_offset; + + h_seq[t] = v_v_seq[t] * (1.0f - reciprocal_tau) + x_seq[t]; + + if (h_seq[t] >= v_threshold) + { + spike_seq[t] = 1.0f; + v_v_seq[t + dt] = h_seq[t] - v_threshold; + } + + else + { + spike_seq[t] = 0.0f; + v_v_seq[t + dt] = h_seq[t]; + } + + } + } + } + +// MultiStepParametricLIFNodePTT bptt QPseudoSpike, decay_input=False, hard_reset=False, dtype=fp32, detach_reset=True + + extern "C" __global__ + void ParametricLIFNode_bptt_decayInputFalse_softReset_detachReset_fp32( + const float* grad_spike_seq, const float* grad_v_seq, const float* h_seq, const float* spike_seq, const float* v_v_seq, + float* grad_x_seq, float* grad_v_last, float* grad_reciprocal_tau, + const float & reciprocal_tau, const float & one_sub_reciprocal_tau, + const float & v_threshold, + const int & neuron_num, const int & numel) + + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + __shared__ float sdata[1024]; + if (index < neuron_num) + { + float grad_h = 0.0f; // grad_h will be used recursively + sdata[threadIdx.x] = 0.0f; 
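+            // The loop below iterates over time in reverse (mem_offset runs from the last step down to 0),
+            // carrying grad_h across steps through one_sub_reciprocal_tau; each iteration also adds this
+            // thread's contribution to the gradient of reciprocal_tau into sdata, which is reduced across
+            // the block once the loop finishes.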
+ for(int mem_offset = numel - neuron_num; mem_offset >= 0; mem_offset -= neuron_num) + { + const int t = index + mem_offset; + const float over_th = h_seq[t] - v_threshold; + + // start: spikingjelly.clock_driven.surrogate.QPseudoSpike.cuda_code + + const float sg_QPseudoSpike_base = 1.0f + 2.0f / (2.0f - 1.0f) * fabsf(over_th); + const float grad_s_to_h = powf(sg_QPseudoSpike_base, -2.0f); + + // end: spikingjelly.clock_driven.surrogate.QPseudoSpike.cuda_code + + const float grad_v_to_h = 1.0f; + + grad_h = grad_spike_seq[t] * grad_s_to_h + (grad_v_seq[t] + grad_h * one_sub_reciprocal_tau) * grad_v_to_h; + // grad_h = fmaf(grad_spike_seq[t], grad_s_to_h, fmaf(grad_h, one_sub_reciprocal_tau, grad_v_seq[t]) * grad_v_to_h); + + grad_x_seq[t] = grad_h; + sdata[threadIdx.x] -= grad_h * v_v_seq[t]; + + } + grad_v_last[index] = grad_h * one_sub_reciprocal_tau; + } + else + { + sdata[threadIdx.x] = 0.0f; + } + int threadx = blockDim.x; + #pragma unroll + for (int stride = threadx >> 1; stride > 0; stride = stride >> 1) + { + // Synchronize all thread before next loop + __syncthreads(); + if (threadIdx.x < stride) + { + sdata[threadIdx.x] += sdata[threadIdx.x + stride]; + } + } + __syncthreads(); + if (threadIdx.x == 0) + { + atomicAdd(grad_reciprocal_tau, sdata[0]); + } + } + +// MultiStepParametricLIFNodePTT bptt QPseudoSpike, decay_input=False, hard_reset=False, dtype=fp32, detach_reset=False + + extern "C" __global__ + void ParametricLIFNode_bptt_decayInputFalse_softReset__fp32( + const float* grad_spike_seq, const float* grad_v_seq, const float* h_seq, const float* spike_seq, const float* v_v_seq, + float* grad_x_seq, float* grad_v_last, float* grad_reciprocal_tau, + const float & reciprocal_tau, const float & one_sub_reciprocal_tau, + const float & v_threshold, + const int & neuron_num, const int & numel) + + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + __shared__ float sdata[1024]; + if (index < neuron_num) + { + float grad_h = 0.0f; // grad_h will be used recursively + sdata[threadIdx.x] = 0.0f; + for(int mem_offset = numel - neuron_num; mem_offset >= 0; mem_offset -= neuron_num) + { + const int t = index + mem_offset; + const float over_th = h_seq[t] - v_threshold; + + // start: spikingjelly.clock_driven.surrogate.QPseudoSpike.cuda_code + + const float sg_QPseudoSpike_base = 1.0f + 2.0f / (2.0f - 1.0f) * fabsf(over_th); + const float grad_s_to_h = powf(sg_QPseudoSpike_base, -2.0f); + + // end: spikingjelly.clock_driven.surrogate.QPseudoSpike.cuda_code + + const float grad_v_to_h = 1.0f - v_threshold * grad_s_to_h; + // const float grad_v_to_h = fmaf(-v_threshold, grad_s_to_h, 1.0f); + + grad_h = grad_spike_seq[t] * grad_s_to_h + (grad_v_seq[t] + grad_h * one_sub_reciprocal_tau) * grad_v_to_h; + // grad_h = fmaf(grad_spike_seq[t], grad_s_to_h, fmaf(grad_h, one_sub_reciprocal_tau, grad_v_seq[t]) * grad_v_to_h); + + grad_x_seq[t] = grad_h; + sdata[threadIdx.x] -= grad_h * v_v_seq[t]; + + } + grad_v_last[index] = grad_h * one_sub_reciprocal_tau; + } + else + { + sdata[threadIdx.x] = 0.0f; + } + int threadx = blockDim.x; + #pragma unroll + for (int stride = threadx >> 1; stride > 0; stride = stride >> 1) + { + // Synchronize all thread before next loop + __syncthreads(); + if (threadIdx.x < stride) + { + sdata[threadIdx.x] += sdata[threadIdx.x + stride]; + } + } + __syncthreads(); + if (threadIdx.x == 0) + { + atomicAdd(grad_reciprocal_tau, sdata[0]); + } + } + +// MultiStepParametricLIFNodePTT fptt QPseudoSpike, decay_input=True, hard_reset=False, dtype=fp16 + + #include + 
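+        // Charge/fire/reset for decay_input=True with soft reset, vectorized over half2:
+        // H[t] = V[t-1] + (1/tau) * (X[t] - V[t-1]); a spike fires when H[t] >= v_threshold,
+        // after which v_threshold is subtracted from the potential (soft reset) instead of
+        // resetting to a fixed v_reset.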
extern "C" __global__ + void ParametricLIFNode_fptt_decayInputTrue_softReset_fp16(const half2* x_seq, half2* v_v_seq, half2* h_seq, half2* spike_seq, + const half & reciprocal_tau, + const half & v_threshold, + const int & neuron_num, const int & numel) + + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + const int stride = neuron_num >> 1; + if (index < stride) + { + const int numel_2 = numel >> 1; + const half2 reciprocal_tau_half2 = __half2half2(reciprocal_tau); + const half2 v_threshold_half2 = __half2half2(v_threshold); + + for(int mem_offset = 0; mem_offset < numel_2; mem_offset += stride) + { + const int t = index + mem_offset; + + h_seq[t] = __hfma2(__hsub2(x_seq[t], v_v_seq[t]), reciprocal_tau_half2, v_v_seq[t]); + + spike_seq[t] = __hgeu2(h_seq[t], v_threshold_half2); + v_v_seq[t + stride] = __hadd2(__hmul2(spike_seq[t], __hsub2(h_seq[t], v_threshold_half2)), __hmul2(__hsub2(__float2half2_rn(1.0f), spike_seq[t]), h_seq[t])); + + } + } + } + +// MultiStepParametricLIFNodePTT bptt QPseudoSpike, decay_input=True, hard_reset=False, dtype=fp16, detach_reset=True + + #include + extern "C" __global__ + void ParametricLIFNode_bptt_decayInputTrue_softReset_detachReset_fp16( + const half2* grad_spike_seq, const half2* grad_v_seq, const half2* h_seq, const half2* spike_seq, const half2* v_v_seq, + half2* grad_x_seq, half2* grad_v_last, float* grad_reciprocal_tau, + const half & reciprocal_tau, const half & one_sub_reciprocal_tau, + const half & v_threshold, + const int & neuron_num, const int & numel)\ + // note that grad_reciprocal_tau is float to avoid overflow + + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + const int stride = neuron_num >> 1; + + __shared__ float sdata[1024]; + if (index < stride) + { + const half2 reciprocal_tau_half2 = __half2half2(reciprocal_tau); + const half2 one_sub_reciprocal_tau_half2 = __half2half2(one_sub_reciprocal_tau); + const half2 v_threshold_half2 = __half2half2(v_threshold); + + + half2 grad_h = __float2half2_rn(0.0f); // grad_h will be used recursively + sdata[threadIdx.x] = 0.0f; + for(int mem_offset = (numel >> 1) - stride; mem_offset >= 0; mem_offset -= stride) + { + const int t = index + mem_offset; + + const half2 over_th = __hsub2(h_seq[t], v_threshold_half2); + + + // start: spikingjelly.clock_driven.surrogate.QPseudoSpike.cuda_code + + const half2 sg_QPseudoSpike_alpha = __float2half2_rn(2.0f); + const half2 sg_QPseudoSpike_base = __hadd2(__float2half2_rn(1.0f), __h2div(__hmul2(__float2half2_rn(2.0f), __habs2(over_th)), __hsub2(sg_QPseudoSpike_alpha, __float2half2_rn(1.0f)))); + const half2 grad_s_to_h = h2exp2(__hmul2(h2log2(sg_QPseudoSpike_base), __hneg2(sg_QPseudoSpike_alpha))); // Replace power with combination of log and exp, since CUDA has no power function for FP16. 
+ + // end: spikingjelly.clock_driven.surrogate.QPseudoSpike.cuda_code + + const half2 grad_v_to_h = __float2half2_rn(1.0f); + + grad_h = __hfma2(__hfma2(grad_h, one_sub_reciprocal_tau_half2, grad_v_seq[t]), grad_v_to_h, __hmul2(grad_spike_seq[t], grad_s_to_h)); + + grad_x_seq[t] = __hmul2(grad_h, reciprocal_tau_half2); + half2 temp_sum = __h2div(__hmul2(grad_h, __hsub2(h_seq[t], v_v_seq[t])), reciprocal_tau_half2); + sdata[threadIdx.x] += __half2float(__hadd(__low2half(temp_sum), __high2half(temp_sum))); + + } + grad_v_last[index] = __hmul2(grad_h, one_sub_reciprocal_tau_half2); + } + else + { + sdata[threadIdx.x] = 0.0f; + } + int threadx = blockDim.x; + #pragma unroll + for (int i = threadx >> 1; i > 0; i = i >> 1) + { + // Synchronize all thread before next loop + __syncthreads(); + if (threadIdx.x < i) + { + sdata[threadIdx.x] += sdata[threadIdx.x + i]; + } + } + __syncthreads(); + if (threadIdx.x == 0) + { + /* + The 32-bit floating-point version of atomicAdd() is only supported by devices of compute capability 2.x and higher. + + The 64-bit floating-point version of atomicAdd() is only supported by devices of compute capability 6.x and higher. + + The 32-bit __half2 floating-point version of atomicAdd() is only supported by devices of compute capability 6.x and higher. The atomicity of the __half2 or __nv_bfloat162 add operation is guaranteed separately for each of the two __half or __nv_bfloat16 elements; the entire __half2 or __nv_bfloat162 is not guaranteed to be atomic as a single 32-bit access. + + The 16-bit __half floating-point version of atomicAdd() is only supported by devices of compute capability 7.x and higher. + + The 16-bit __nv_bfloat16 floating-point version of atomicAdd() is only supported by devices of compute capability 8.x and higher. 
+ */ + + atomicAdd(grad_reciprocal_tau, sdata[0]); + + } + } + +// MultiStepParametricLIFNodePTT bptt QPseudoSpike, decay_input=True, hard_reset=False, dtype=fp16, detach_reset=False + + #include + extern "C" __global__ + void ParametricLIFNode_bptt_decayInputTrue_softReset__fp16( + const half2* grad_spike_seq, const half2* grad_v_seq, const half2* h_seq, const half2* spike_seq, const half2* v_v_seq, + half2* grad_x_seq, half2* grad_v_last, float* grad_reciprocal_tau, + const half & reciprocal_tau, const half & one_sub_reciprocal_tau, + const half & v_threshold, + const int & neuron_num, const int & numel)\ + // note that grad_reciprocal_tau is float to avoid overflow + + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + const int stride = neuron_num >> 1; + + __shared__ float sdata[1024]; + if (index < stride) + { + const half2 reciprocal_tau_half2 = __half2half2(reciprocal_tau); + const half2 one_sub_reciprocal_tau_half2 = __half2half2(one_sub_reciprocal_tau); + const half2 v_threshold_half2 = __half2half2(v_threshold); + + + half2 grad_h = __float2half2_rn(0.0f); // grad_h will be used recursively + sdata[threadIdx.x] = 0.0f; + for(int mem_offset = (numel >> 1) - stride; mem_offset >= 0; mem_offset -= stride) + { + const int t = index + mem_offset; + + const half2 over_th = __hsub2(h_seq[t], v_threshold_half2); + + + // start: spikingjelly.clock_driven.surrogate.QPseudoSpike.cuda_code + + const half2 sg_QPseudoSpike_alpha = __float2half2_rn(2.0f); + const half2 sg_QPseudoSpike_base = __hadd2(__float2half2_rn(1.0f), __h2div(__hmul2(__float2half2_rn(2.0f), __habs2(over_th)), __hsub2(sg_QPseudoSpike_alpha, __float2half2_rn(1.0f)))); + const half2 grad_s_to_h = h2exp2(__hmul2(h2log2(sg_QPseudoSpike_base), __hneg2(sg_QPseudoSpike_alpha))); // Replace power with combination of log and exp, since CUDA has no power function for FP16. + + // end: spikingjelly.clock_driven.surrogate.QPseudoSpike.cuda_code + + const half2 grad_v_to_h = __hsub2(__float2half2_rn(1.0f), __hmul2(v_threshold_half2, grad_s_to_h)); + + grad_h = __hfma2(__hfma2(grad_h, one_sub_reciprocal_tau_half2, grad_v_seq[t]), grad_v_to_h, __hmul2(grad_spike_seq[t], grad_s_to_h)); + + grad_x_seq[t] = __hmul2(grad_h, reciprocal_tau_half2); + half2 temp_sum = __h2div(__hmul2(grad_h, __hsub2(h_seq[t], v_v_seq[t])), reciprocal_tau_half2); + sdata[threadIdx.x] += __half2float(__hadd(__low2half(temp_sum), __high2half(temp_sum))); + + } + grad_v_last[index] = __hmul2(grad_h, one_sub_reciprocal_tau_half2); + } + else + { + sdata[threadIdx.x] = 0.0f; + } + int threadx = blockDim.x; + #pragma unroll + for (int i = threadx >> 1; i > 0; i = i >> 1) + { + // Synchronize all thread before next loop + __syncthreads(); + if (threadIdx.x < i) + { + sdata[threadIdx.x] += sdata[threadIdx.x + i]; + } + } + __syncthreads(); + if (threadIdx.x == 0) + { + /* + The 32-bit floating-point version of atomicAdd() is only supported by devices of compute capability 2.x and higher. + + The 64-bit floating-point version of atomicAdd() is only supported by devices of compute capability 6.x and higher. + + The 32-bit __half2 floating-point version of atomicAdd() is only supported by devices of compute capability 6.x and higher. The atomicity of the __half2 or __nv_bfloat162 add operation is guaranteed separately for each of the two __half or __nv_bfloat16 elements; the entire __half2 or __nv_bfloat162 is not guaranteed to be atomic as a single 32-bit access. 
+ + The 16-bit __half floating-point version of atomicAdd() is only supported by devices of compute capability 7.x and higher. + + The 16-bit __nv_bfloat16 floating-point version of atomicAdd() is only supported by devices of compute capability 8.x and higher. + */ + + atomicAdd(grad_reciprocal_tau, sdata[0]); + + } + } + +// MultiStepParametricLIFNodePTT fptt QPseudoSpike, decay_input=False, hard_reset=False, dtype=fp16 + + #include + extern "C" __global__ + void ParametricLIFNode_fptt_decayInputFalse_softReset_fp16(const half2* x_seq, half2* v_v_seq, half2* h_seq, half2* spike_seq, + const half & reciprocal_tau, + const half & v_threshold, + const int & neuron_num, const int & numel) + + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + const int stride = neuron_num >> 1; + if (index < stride) + { + const int numel_2 = numel >> 1; + const half2 reciprocal_tau_half2 = __half2half2(reciprocal_tau); + const half2 v_threshold_half2 = __half2half2(v_threshold); + + for(int mem_offset = 0; mem_offset < numel_2; mem_offset += stride) + { + const int t = index + mem_offset; + + // h_seq[t] = v_v_seq[t] * (1.0f - reciprocal_tau) + x_seq[t]; + h_seq[t] = __hfma2(__hsub2(__float2half2_rn(1.0f), reciprocal_tau_half2), v_v_seq[t], x_seq[t]); + + spike_seq[t] = __hgeu2(h_seq[t], v_threshold_half2); + v_v_seq[t + stride] = __hadd2(__hmul2(spike_seq[t], __hsub2(h_seq[t], v_threshold_half2)), __hmul2(__hsub2(__float2half2_rn(1.0f), spike_seq[t]), h_seq[t])); + + } + } + } + +// MultiStepParametricLIFNodePTT bptt QPseudoSpike, decay_input=False, hard_reset=False, dtype=fp16, detach_reset=True + + #include + extern "C" __global__ + void ParametricLIFNode_bptt_decayInputFalse_softReset_detachReset_fp16( + const half2* grad_spike_seq, const half2* grad_v_seq, const half2* h_seq, const half2* spike_seq, const half2* v_v_seq, + half2* grad_x_seq, half2* grad_v_last, float* grad_reciprocal_tau, + const half & reciprocal_tau, const half & one_sub_reciprocal_tau, + const half & v_threshold, + const int & neuron_num, const int & numel)\ + // note that grad_reciprocal_tau is float to avoid overflow + + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + const int stride = neuron_num >> 1; + + __shared__ float sdata[1024]; + if (index < stride) + { + const half2 reciprocal_tau_half2 = __half2half2(reciprocal_tau); + const half2 one_sub_reciprocal_tau_half2 = __half2half2(one_sub_reciprocal_tau); + const half2 v_threshold_half2 = __half2half2(v_threshold); + + + half2 grad_h = __float2half2_rn(0.0f); // grad_h will be used recursively + sdata[threadIdx.x] = 0.0f; + for(int mem_offset = (numel >> 1) - stride; mem_offset >= 0; mem_offset -= stride) + { + const int t = index + mem_offset; + + const half2 over_th = __hsub2(h_seq[t], v_threshold_half2); + + + // start: spikingjelly.clock_driven.surrogate.QPseudoSpike.cuda_code + + const half2 sg_QPseudoSpike_alpha = __float2half2_rn(2.0f); + const half2 sg_QPseudoSpike_base = __hadd2(__float2half2_rn(1.0f), __h2div(__hmul2(__float2half2_rn(2.0f), __habs2(over_th)), __hsub2(sg_QPseudoSpike_alpha, __float2half2_rn(1.0f)))); + const half2 grad_s_to_h = h2exp2(__hmul2(h2log2(sg_QPseudoSpike_base), __hneg2(sg_QPseudoSpike_alpha))); // Replace power with combination of log and exp, since CUDA has no power function for FP16. 
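+            // Each thread handles two neurons packed into one half2, so the spatial stride is
+            // neuron_num >> 1 and this time loop advances through numel >> 1 packed elements;
+            // temp_sum below is split with __low2half / __high2half so both lanes contribute
+            // to the float accumulator in sdata.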
+ + // end: spikingjelly.clock_driven.surrogate.QPseudoSpike.cuda_code + + const half2 grad_v_to_h = __float2half2_rn(1.0f); + + grad_h = __hfma2(__hfma2(grad_h, one_sub_reciprocal_tau_half2, grad_v_seq[t]), grad_v_to_h, __hmul2(grad_spike_seq[t], grad_s_to_h)); + + grad_x_seq[t] = grad_h; + half2 temp_sum = __hmul2(grad_h, __hneg2(v_v_seq[t])); + sdata[threadIdx.x] += __half2float(__hadd(__low2half(temp_sum), __high2half(temp_sum))); + + } + grad_v_last[index] = __hmul2(grad_h, one_sub_reciprocal_tau_half2); + } + else + { + sdata[threadIdx.x] = 0.0f; + } + int threadx = blockDim.x; + #pragma unroll + for (int i = threadx >> 1; i > 0; i = i >> 1) + { + // Synchronize all thread before next loop + __syncthreads(); + if (threadIdx.x < i) + { + sdata[threadIdx.x] += sdata[threadIdx.x + i]; + } + } + __syncthreads(); + if (threadIdx.x == 0) + { + /* + The 32-bit floating-point version of atomicAdd() is only supported by devices of compute capability 2.x and higher. + + The 64-bit floating-point version of atomicAdd() is only supported by devices of compute capability 6.x and higher. + + The 32-bit __half2 floating-point version of atomicAdd() is only supported by devices of compute capability 6.x and higher. The atomicity of the __half2 or __nv_bfloat162 add operation is guaranteed separately for each of the two __half or __nv_bfloat16 elements; the entire __half2 or __nv_bfloat162 is not guaranteed to be atomic as a single 32-bit access. + + The 16-bit __half floating-point version of atomicAdd() is only supported by devices of compute capability 7.x and higher. + + The 16-bit __nv_bfloat16 floating-point version of atomicAdd() is only supported by devices of compute capability 8.x and higher. + */ + + atomicAdd(grad_reciprocal_tau, sdata[0]); + + } + } + +// MultiStepParametricLIFNodePTT bptt QPseudoSpike, decay_input=False, hard_reset=False, dtype=fp16, detach_reset=False + + #include + extern "C" __global__ + void ParametricLIFNode_bptt_decayInputFalse_softReset__fp16( + const half2* grad_spike_seq, const half2* grad_v_seq, const half2* h_seq, const half2* spike_seq, const half2* v_v_seq, + half2* grad_x_seq, half2* grad_v_last, float* grad_reciprocal_tau, + const half & reciprocal_tau, const half & one_sub_reciprocal_tau, + const half & v_threshold, + const int & neuron_num, const int & numel)\ + // note that grad_reciprocal_tau is float to avoid overflow + + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + const int stride = neuron_num >> 1; + + __shared__ float sdata[1024]; + if (index < stride) + { + const half2 reciprocal_tau_half2 = __half2half2(reciprocal_tau); + const half2 one_sub_reciprocal_tau_half2 = __half2half2(one_sub_reciprocal_tau); + const half2 v_threshold_half2 = __half2half2(v_threshold); + + + half2 grad_h = __float2half2_rn(0.0f); // grad_h will be used recursively + sdata[threadIdx.x] = 0.0f; + for(int mem_offset = (numel >> 1) - stride; mem_offset >= 0; mem_offset -= stride) + { + const int t = index + mem_offset; + + const half2 over_th = __hsub2(h_seq[t], v_threshold_half2); + + + // start: spikingjelly.clock_driven.surrogate.QPseudoSpike.cuda_code + + const half2 sg_QPseudoSpike_alpha = __float2half2_rn(2.0f); + const half2 sg_QPseudoSpike_base = __hadd2(__float2half2_rn(1.0f), __h2div(__hmul2(__float2half2_rn(2.0f), __habs2(over_th)), __hsub2(sg_QPseudoSpike_alpha, __float2half2_rn(1.0f)))); + const half2 grad_s_to_h = h2exp2(__hmul2(h2log2(sg_QPseudoSpike_base), __hneg2(sg_QPseudoSpike_alpha))); // Replace power with combination of log and exp, 
since CUDA has no power function for FP16. + + // end: spikingjelly.clock_driven.surrogate.QPseudoSpike.cuda_code + + const half2 grad_v_to_h = __hsub2(__float2half2_rn(1.0f), __hmul2(v_threshold_half2, grad_s_to_h)); + + grad_h = __hfma2(__hfma2(grad_h, one_sub_reciprocal_tau_half2, grad_v_seq[t]), grad_v_to_h, __hmul2(grad_spike_seq[t], grad_s_to_h)); + + grad_x_seq[t] = grad_h; + half2 temp_sum = __hmul2(grad_h, __hneg2(v_v_seq[t])); + sdata[threadIdx.x] += __half2float(__hadd(__low2half(temp_sum), __high2half(temp_sum))); + + } + grad_v_last[index] = __hmul2(grad_h, one_sub_reciprocal_tau_half2); + } + else + { + sdata[threadIdx.x] = 0.0f; + } + int threadx = blockDim.x; + #pragma unroll + for (int i = threadx >> 1; i > 0; i = i >> 1) + { + // Synchronize all thread before next loop + __syncthreads(); + if (threadIdx.x < i) + { + sdata[threadIdx.x] += sdata[threadIdx.x + i]; + } + } + __syncthreads(); + if (threadIdx.x == 0) + { + /* + The 32-bit floating-point version of atomicAdd() is only supported by devices of compute capability 2.x and higher. + + The 64-bit floating-point version of atomicAdd() is only supported by devices of compute capability 6.x and higher. + + The 32-bit __half2 floating-point version of atomicAdd() is only supported by devices of compute capability 6.x and higher. The atomicity of the __half2 or __nv_bfloat162 add operation is guaranteed separately for each of the two __half or __nv_bfloat16 elements; the entire __half2 or __nv_bfloat162 is not guaranteed to be atomic as a single 32-bit access. + + The 16-bit __half floating-point version of atomicAdd() is only supported by devices of compute capability 7.x and higher. + + The 16-bit __nv_bfloat16 floating-point version of atomicAdd() is only supported by devices of compute capability 8.x and higher. 
+ */ + + atomicAdd(grad_reciprocal_tau, sdata[0]); + + } + } + +// MultiStepEIFNodePTT + +// MultiStepEIFNodePTT fptt ATan, hard_reset=True, dtype=fp32 + + extern "C" __global__ + void EIFNode_fptt_hardReset_fp32(const float* x_seq, float* v_v_seq, float* h_seq, float* spike_seq, + const float & reciprocal_tau, + const float & delta_T, + const float & theta_rh, + const float & v_threshold, + const float & v_rest, const float & v_reset, + const int & neuron_num, const int & numel) + + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index < neuron_num) + { + const int dt = neuron_num; + for(int mem_offset = 0; mem_offset < numel; mem_offset += neuron_num) + { + const int t = index + mem_offset; + h_seq[t] = v_v_seq[t] + reciprocal_tau * (x_seq[t] - v_v_seq[t] + v_rest + delta_T * expf((v_v_seq[t] - theta_rh) / delta_T)); + if (h_seq[t] >= v_threshold) + { + spike_seq[t] = 1.0f; + + v_v_seq[t + dt] = v_reset; + + } + else + { + spike_seq[t] = 0.0f; + v_v_seq[t + dt] = h_seq[t]; + } + + } + } + } + +// MultiStepEIFNodePTT bptt ATan, hard_reset=True, dtype=fp32, detach_reset=True + + extern "C" __global__ + void EIFNode_bptt_hardReset_detachReset_fp32( + const float* grad_spike_seq, const float* grad_v_seq, const float* h_seq, const float* spike_seq, const float* v_v_seq, + float* grad_x_seq, float* grad_v_last, + const float & theta_rh, const float & reciprocal_delta_T, + const float & reciprocal_tau, const float & one_sub_reciprocal_tau, + const float & v_threshold, const float & v_reset, + const int & neuron_num, const int & numel) + + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index < neuron_num) + { + float grad_h = 0.0f; // grad_h will be used recursively + for(int mem_offset = numel - neuron_num; mem_offset >= 0; mem_offset -= neuron_num) + { + const int t = index + mem_offset; + const float over_th = h_seq[t] - v_threshold; + + // start: spikingjelly.clock_driven.surrogate.ATan.cuda_code + + const float sg_ATan_M_PI_2__alpha__x = ((float) 1.57079632679489661923) * 2.0f * over_th; + const float grad_s_to_h = 2.0f / 2.0f / (1.0f + sg_ATan_M_PI_2__alpha__x * sg_ATan_M_PI_2__alpha__x); + + // end: spikingjelly.clock_driven.surrogate.ATan.cuda_code + + const float grad_v_to_h = 1.0f - spike_seq[t]; + + grad_h = grad_spike_seq[t] * grad_s_to_h + (grad_v_seq[t] + grad_h * (one_sub_reciprocal_tau + reciprocal_tau * expf((v_v_seq[t + neuron_num] - theta_rh) * reciprocal_delta_T))) * grad_v_to_h; + grad_x_seq[t] = grad_h * reciprocal_tau; + } + grad_v_last[index] = grad_x_seq[index] * (one_sub_reciprocal_tau + reciprocal_tau * expf((v_v_seq[index] - theta_rh) * reciprocal_delta_T)); + } + } + +// MultiStepEIFNodePTT bptt ATan, hard_reset=True, dtype=fp32, detach_reset=False + + extern "C" __global__ + void EIFNode_bptt_hardReset__fp32( + const float* grad_spike_seq, const float* grad_v_seq, const float* h_seq, const float* spike_seq, const float* v_v_seq, + float* grad_x_seq, float* grad_v_last, + const float & theta_rh, const float & reciprocal_delta_T, + const float & reciprocal_tau, const float & one_sub_reciprocal_tau, + const float & v_threshold, const float & v_reset, + const int & neuron_num, const int & numel) + + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index < neuron_num) + { + float grad_h = 0.0f; // grad_h will be used recursively + for(int mem_offset = numel - neuron_num; mem_offset >= 0; mem_offset -= neuron_num) + { + const int t = index + mem_offset; + const float over_th = h_seq[t] - v_threshold; + + // start: 
spikingjelly.clock_driven.surrogate.ATan.cuda_code + + const float sg_ATan_M_PI_2__alpha__x = ((float) 1.57079632679489661923) * 2.0f * over_th; + const float grad_s_to_h = 2.0f / 2.0f / (1.0f + sg_ATan_M_PI_2__alpha__x * sg_ATan_M_PI_2__alpha__x); + + // end: spikingjelly.clock_driven.surrogate.ATan.cuda_code + + const float grad_v_to_h = 1.0f - spike_seq[t] + (v_reset - h_seq[t]) * grad_s_to_h; + + grad_h = grad_spike_seq[t] * grad_s_to_h + (grad_v_seq[t] + grad_h * (one_sub_reciprocal_tau + reciprocal_tau * expf((v_v_seq[t + neuron_num] - theta_rh) * reciprocal_delta_T))) * grad_v_to_h; + grad_x_seq[t] = grad_h * reciprocal_tau; + } + grad_v_last[index] = grad_x_seq[index] * (one_sub_reciprocal_tau + reciprocal_tau * expf((v_v_seq[index] - theta_rh) * reciprocal_delta_T)); + } + } + +// MultiStepEIFNodePTT fptt ATan, hard_reset=True, dtype=fp16 + + #include + extern "C" __global__ + void EIFNode_fptt_hardReset_fp16(const half2* x_seq, half2* v_v_seq, half2* h_seq, half2* spike_seq, + const half & reciprocal_tau, + const half & delta_T, + const half & theta_rh, + const half & v_threshold, + const half & v_rest, const half & v_reset, + const int & neuron_num, const int & numel) + + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + const int stride = neuron_num >> 1; + if (index < stride) + { + const int numel_2 = numel >> 1; + const half2 reciprocal_tau_half2 = __half2half2(reciprocal_tau); + const half2 delta_T_half2 = __half2half2(delta_T); + const half2 theta_rh_half2 = __half2half2(theta_rh); + const half2 v_threshold_half2 = __half2half2(v_threshold); + const half2 v_rest_half2 = __half2half2(v_rest); + + const half2 v_reset_half2 = __half2half2(v_reset); + + for(int mem_offset = 0; mem_offset < numel_2; mem_offset += stride) + { + const int t = index + mem_offset; + h_seq[t] = __hfma2(__hfma2(h2exp(__h2div(__hsub2(v_v_seq[t], theta_rh_half2), delta_T_half2)), delta_T_half2, __hadd2(__hsub2(x_seq[t], v_v_seq[t]), v_rest_half2)), reciprocal_tau_half2, v_v_seq[t]); + + spike_seq[t] = __hgeu2(h_seq[t], v_threshold_half2); + + v_v_seq[t + stride] = __hadd2(__hmul2(spike_seq[t], v_reset_half2), __hmul2(__hsub2(__float2half2_rn(1.0f), spike_seq[t]), h_seq[t])); + + } + } + } + +// MultiStepEIFNodePTT bptt ATan, hard_reset=True, dtype=fp16, detach_reset=True + + #include + extern "C" __global__ + void EIFNode_bptt_hardReset_detachReset_fp16( + const half2* grad_spike_seq, const half2* grad_v_seq, const half2* h_seq, const half2* spike_seq, const half2* v_v_seq, + half2* grad_x_seq, half2* grad_v_last, + const half & theta_rh, const half & reciprocal_delta_T, + const half & reciprocal_tau, const half & one_sub_reciprocal_tau, + const half & v_threshold, const half & v_reset, + const int & neuron_num, const int & numel) + + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + const int stride = neuron_num >> 1; + if (index < stride) + { + const half2 reciprocal_tau_half2 = __half2half2(reciprocal_tau); + const half2 one_sub_reciprocal_tau_half2 = __half2half2(one_sub_reciprocal_tau); + const half2 reciprocal_delta_T_half2 = __half2half2(reciprocal_delta_T); + const half2 theta_rh_half2 = __half2half2(theta_rh); + const half2 v_threshold_half2 = __half2half2(v_threshold); + + const half2 v_reset_half2 = __half2half2(v_reset); + + half2 grad_h = __float2half2_rn(0.0f); // grad_h will be used recursively + for(int mem_offset = (numel >> 1) - stride; mem_offset >= 0; mem_offset -= stride) + { + const int t = index + mem_offset; + + const half2 over_th = __hsub2(h_seq[t], 
v_threshold_half2); + + // start: spikingjelly.clock_driven.surrogate.ATan.cuda_code + + const half2 sg_ATan_alpha = __float2half2_rn(2.0f); + const half2 sg_ATan_M_PI_2__alpha__x = __hmul2(__hmul2(__float2half2_rn((float) 1.57079632679489661923), sg_ATan_alpha), over_th); + const half2 grad_s_to_h = __h2div(__h2div(sg_ATan_alpha, __float2half2_rn(2.0f)), __hfma2(sg_ATan_M_PI_2__alpha__x, sg_ATan_M_PI_2__alpha__x, __float2half2_rn(1.0f))); + + // end: spikingjelly.clock_driven.surrogate.ATan.cuda_code + + const half2 grad_v_to_h = __hsub2(__float2half2_rn(1.0f), spike_seq[t]); + + grad_h = __hfma2(__hfma2(__hfma2(h2exp(__hmul2(__hsub2(v_v_seq[t + stride], theta_rh_half2), reciprocal_delta_T_half2)), reciprocal_tau_half2, one_sub_reciprocal_tau_half2), grad_h, grad_v_seq[t]), grad_v_to_h, __hmul2(grad_spike_seq[t], grad_s_to_h)); + grad_x_seq[t] = __hmul2(grad_h, reciprocal_tau_half2); + } + grad_v_last[index] = __hmul2(__hfma2(h2exp(__hmul2(__hsub2(v_v_seq[index], theta_rh_half2), reciprocal_delta_T_half2)), reciprocal_tau_half2, one_sub_reciprocal_tau_half2), grad_x_seq[index]); + } + } + +// MultiStepEIFNodePTT bptt ATan, hard_reset=True, dtype=fp16, detach_reset=False + + #include + extern "C" __global__ + void EIFNode_bptt_hardReset__fp16( + const half2* grad_spike_seq, const half2* grad_v_seq, const half2* h_seq, const half2* spike_seq, const half2* v_v_seq, + half2* grad_x_seq, half2* grad_v_last, + const half & theta_rh, const half & reciprocal_delta_T, + const half & reciprocal_tau, const half & one_sub_reciprocal_tau, + const half & v_threshold, const half & v_reset, + const int & neuron_num, const int & numel) + + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + const int stride = neuron_num >> 1; + if (index < stride) + { + const half2 reciprocal_tau_half2 = __half2half2(reciprocal_tau); + const half2 one_sub_reciprocal_tau_half2 = __half2half2(one_sub_reciprocal_tau); + const half2 reciprocal_delta_T_half2 = __half2half2(reciprocal_delta_T); + const half2 theta_rh_half2 = __half2half2(theta_rh); + const half2 v_threshold_half2 = __half2half2(v_threshold); + + const half2 v_reset_half2 = __half2half2(v_reset); + + half2 grad_h = __float2half2_rn(0.0f); // grad_h will be used recursively + for(int mem_offset = (numel >> 1) - stride; mem_offset >= 0; mem_offset -= stride) + { + const int t = index + mem_offset; + + const half2 over_th = __hsub2(h_seq[t], v_threshold_half2); + + // start: spikingjelly.clock_driven.surrogate.ATan.cuda_code + + const half2 sg_ATan_alpha = __float2half2_rn(2.0f); + const half2 sg_ATan_M_PI_2__alpha__x = __hmul2(__hmul2(__float2half2_rn((float) 1.57079632679489661923), sg_ATan_alpha), over_th); + const half2 grad_s_to_h = __h2div(__h2div(sg_ATan_alpha, __float2half2_rn(2.0f)), __hfma2(sg_ATan_M_PI_2__alpha__x, sg_ATan_M_PI_2__alpha__x, __float2half2_rn(1.0f))); + + // end: spikingjelly.clock_driven.surrogate.ATan.cuda_code + + const half2 grad_v_to_h = __hfma2(__hsub2(v_reset_half2, h_seq[t]), grad_s_to_h, __hsub2(__float2half2_rn(1.0f), spike_seq[t])); + + grad_h = __hfma2(__hfma2(__hfma2(h2exp(__hmul2(__hsub2(v_v_seq[t + stride], theta_rh_half2), reciprocal_delta_T_half2)), reciprocal_tau_half2, one_sub_reciprocal_tau_half2), grad_h, grad_v_seq[t]), grad_v_to_h, __hmul2(grad_spike_seq[t], grad_s_to_h)); + grad_x_seq[t] = __hmul2(grad_h, reciprocal_tau_half2); + } + grad_v_last[index] = __hmul2(__hfma2(h2exp(__hmul2(__hsub2(v_v_seq[index], theta_rh_half2), reciprocal_delta_T_half2)), reciprocal_tau_half2, one_sub_reciprocal_tau_half2), 
grad_x_seq[index]); + } + } + +// MultiStepEIFNodePTT fptt ATan, hard_reset=False, dtype=fp32 + + extern "C" __global__ + void EIFNode_fptt_softReset_fp32(const float* x_seq, float* v_v_seq, float* h_seq, float* spike_seq, + const float & reciprocal_tau, + const float & delta_T, + const float & theta_rh, + const float & v_threshold, + const float & v_rest, + const int & neuron_num, const int & numel) + + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index < neuron_num) + { + const int dt = neuron_num; + for(int mem_offset = 0; mem_offset < numel; mem_offset += neuron_num) + { + const int t = index + mem_offset; + h_seq[t] = v_v_seq[t] + reciprocal_tau * (x_seq[t] - v_v_seq[t] + v_rest + delta_T * expf((v_v_seq[t] - theta_rh) / delta_T)); + if (h_seq[t] >= v_threshold) + { + spike_seq[t] = 1.0f; + + v_v_seq[t + dt] = h_seq[t] - v_threshold; + + } + else + { + spike_seq[t] = 0.0f; + v_v_seq[t + dt] = h_seq[t]; + } + + } + } + } + +// MultiStepEIFNodePTT bptt ATan, hard_reset=False, dtype=fp32, detach_reset=True + + extern "C" __global__ + void EIFNode_bptt_softReset_detachReset_fp32( + const float* grad_spike_seq, const float* grad_v_seq, const float* h_seq, const float* spike_seq, const float* v_v_seq, + float* grad_x_seq, float* grad_v_last, + const float & theta_rh, const float & reciprocal_delta_T, + const float & reciprocal_tau, const float & one_sub_reciprocal_tau, + const float & v_threshold, + const int & neuron_num, const int & numel) + + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index < neuron_num) + { + float grad_h = 0.0f; // grad_h will be used recursively + for(int mem_offset = numel - neuron_num; mem_offset >= 0; mem_offset -= neuron_num) + { + const int t = index + mem_offset; + const float over_th = h_seq[t] - v_threshold; + + // start: spikingjelly.clock_driven.surrogate.ATan.cuda_code + + const float sg_ATan_M_PI_2__alpha__x = ((float) 1.57079632679489661923) * 2.0f * over_th; + const float grad_s_to_h = 2.0f / 2.0f / (1.0f + sg_ATan_M_PI_2__alpha__x * sg_ATan_M_PI_2__alpha__x); + + // end: spikingjelly.clock_driven.surrogate.ATan.cuda_code + + const float grad_v_to_h = 1.0f; + + grad_h = grad_spike_seq[t] * grad_s_to_h + (grad_v_seq[t] + grad_h * (one_sub_reciprocal_tau + reciprocal_tau * expf((v_v_seq[t + neuron_num] - theta_rh) * reciprocal_delta_T))) * grad_v_to_h; + grad_x_seq[t] = grad_h * reciprocal_tau; + } + grad_v_last[index] = grad_x_seq[index] * (one_sub_reciprocal_tau + reciprocal_tau * expf((v_v_seq[index] - theta_rh) * reciprocal_delta_T)); + } + } + +// MultiStepEIFNodePTT bptt ATan, hard_reset=False, dtype=fp32, detach_reset=False + + extern "C" __global__ + void EIFNode_bptt_softReset__fp32( + const float* grad_spike_seq, const float* grad_v_seq, const float* h_seq, const float* spike_seq, const float* v_v_seq, + float* grad_x_seq, float* grad_v_last, + const float & theta_rh, const float & reciprocal_delta_T, + const float & reciprocal_tau, const float & one_sub_reciprocal_tau, + const float & v_threshold, + const int & neuron_num, const int & numel) + + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index < neuron_num) + { + float grad_h = 0.0f; // grad_h will be used recursively + for(int mem_offset = numel - neuron_num; mem_offset >= 0; mem_offset -= neuron_num) + { + const int t = index + mem_offset; + const float over_th = h_seq[t] - v_threshold; + + // start: spikingjelly.clock_driven.surrogate.ATan.cuda_code + + const float sg_ATan_M_PI_2__alpha__x = ((float) 1.57079632679489661923) 
* 2.0f * over_th; + const float grad_s_to_h = 2.0f / 2.0f / (1.0f + sg_ATan_M_PI_2__alpha__x * sg_ATan_M_PI_2__alpha__x); + + // end: spikingjelly.clock_driven.surrogate.ATan.cuda_code + + const float grad_v_to_h = 1.0f - v_threshold * grad_s_to_h; + + grad_h = grad_spike_seq[t] * grad_s_to_h + (grad_v_seq[t] + grad_h * (one_sub_reciprocal_tau + reciprocal_tau * expf((v_v_seq[t + neuron_num] - theta_rh) * reciprocal_delta_T))) * grad_v_to_h; + grad_x_seq[t] = grad_h * reciprocal_tau; + } + grad_v_last[index] = grad_x_seq[index] * (one_sub_reciprocal_tau + reciprocal_tau * expf((v_v_seq[index] - theta_rh) * reciprocal_delta_T)); + } + } + +// MultiStepEIFNodePTT fptt ATan, hard_reset=False, dtype=fp16 + + #include + extern "C" __global__ + void EIFNode_fptt_softReset_fp16(const half2* x_seq, half2* v_v_seq, half2* h_seq, half2* spike_seq, + const half & reciprocal_tau, + const half & delta_T, + const half & theta_rh, + const half & v_threshold, + const half & v_rest, + const int & neuron_num, const int & numel) + + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + const int stride = neuron_num >> 1; + if (index < stride) + { + const int numel_2 = numel >> 1; + const half2 reciprocal_tau_half2 = __half2half2(reciprocal_tau); + const half2 delta_T_half2 = __half2half2(delta_T); + const half2 theta_rh_half2 = __half2half2(theta_rh); + const half2 v_threshold_half2 = __half2half2(v_threshold); + const half2 v_rest_half2 = __half2half2(v_rest); + + for(int mem_offset = 0; mem_offset < numel_2; mem_offset += stride) + { + const int t = index + mem_offset; + h_seq[t] = __hfma2(__hfma2(h2exp(__h2div(__hsub2(v_v_seq[t], theta_rh_half2), delta_T_half2)), delta_T_half2, __hadd2(__hsub2(x_seq[t], v_v_seq[t]), v_rest_half2)), reciprocal_tau_half2, v_v_seq[t]); + + spike_seq[t] = __hgeu2(h_seq[t], v_threshold_half2); + + v_v_seq[t + stride] = __hadd2(__hmul2(spike_seq[t], __hsub2(h_seq[t], v_threshold_half2)), __hmul2(__hsub2(__float2half2_rn(1.0f), spike_seq[t]), h_seq[t])); + + } + } + } + +// MultiStepEIFNodePTT bptt ATan, hard_reset=False, dtype=fp16, detach_reset=True + + #include + extern "C" __global__ + void EIFNode_bptt_softReset_detachReset_fp16( + const half2* grad_spike_seq, const half2* grad_v_seq, const half2* h_seq, const half2* spike_seq, const half2* v_v_seq, + half2* grad_x_seq, half2* grad_v_last, + const half & theta_rh, const half & reciprocal_delta_T, + const half & reciprocal_tau, const half & one_sub_reciprocal_tau, + const half & v_threshold, + const int & neuron_num, const int & numel) + + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + const int stride = neuron_num >> 1; + if (index < stride) + { + const half2 reciprocal_tau_half2 = __half2half2(reciprocal_tau); + const half2 one_sub_reciprocal_tau_half2 = __half2half2(one_sub_reciprocal_tau); + const half2 reciprocal_delta_T_half2 = __half2half2(reciprocal_delta_T); + const half2 theta_rh_half2 = __half2half2(theta_rh); + const half2 v_threshold_half2 = __half2half2(v_threshold); + + half2 grad_h = __float2half2_rn(0.0f); // grad_h will be used recursively + for(int mem_offset = (numel >> 1) - stride; mem_offset >= 0; mem_offset -= stride) + { + const int t = index + mem_offset; + + const half2 over_th = __hsub2(h_seq[t], v_threshold_half2); + + // start: spikingjelly.clock_driven.surrogate.ATan.cuda_code + + const half2 sg_ATan_alpha = __float2half2_rn(2.0f); + const half2 sg_ATan_M_PI_2__alpha__x = __hmul2(__hmul2(__float2half2_rn((float) 1.57079632679489661923), sg_ATan_alpha), over_th); + const 
half2 grad_s_to_h = __h2div(__h2div(sg_ATan_alpha, __float2half2_rn(2.0f)), __hfma2(sg_ATan_M_PI_2__alpha__x, sg_ATan_M_PI_2__alpha__x, __float2half2_rn(1.0f))); + + // end: spikingjelly.clock_driven.surrogate.ATan.cuda_code + + const half2 grad_v_to_h = __float2half2_rn(1.0f); + + grad_h = __hfma2(__hfma2(__hfma2(h2exp(__hmul2(__hsub2(v_v_seq[t + stride], theta_rh_half2), reciprocal_delta_T_half2)), reciprocal_tau_half2, one_sub_reciprocal_tau_half2), grad_h, grad_v_seq[t]), grad_v_to_h, __hmul2(grad_spike_seq[t], grad_s_to_h)); + grad_x_seq[t] = __hmul2(grad_h, reciprocal_tau_half2); + } + grad_v_last[index] = __hmul2(__hfma2(h2exp(__hmul2(__hsub2(v_v_seq[index], theta_rh_half2), reciprocal_delta_T_half2)), reciprocal_tau_half2, one_sub_reciprocal_tau_half2), grad_x_seq[index]); + } + } + +// MultiStepEIFNodePTT bptt ATan, hard_reset=False, dtype=fp16, detach_reset=False + + #include + extern "C" __global__ + void EIFNode_bptt_softReset__fp16( + const half2* grad_spike_seq, const half2* grad_v_seq, const half2* h_seq, const half2* spike_seq, const half2* v_v_seq, + half2* grad_x_seq, half2* grad_v_last, + const half & theta_rh, const half & reciprocal_delta_T, + const half & reciprocal_tau, const half & one_sub_reciprocal_tau, + const half & v_threshold, + const int & neuron_num, const int & numel) + + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + const int stride = neuron_num >> 1; + if (index < stride) + { + const half2 reciprocal_tau_half2 = __half2half2(reciprocal_tau); + const half2 one_sub_reciprocal_tau_half2 = __half2half2(one_sub_reciprocal_tau); + const half2 reciprocal_delta_T_half2 = __half2half2(reciprocal_delta_T); + const half2 theta_rh_half2 = __half2half2(theta_rh); + const half2 v_threshold_half2 = __half2half2(v_threshold); + + half2 grad_h = __float2half2_rn(0.0f); // grad_h will be used recursively + for(int mem_offset = (numel >> 1) - stride; mem_offset >= 0; mem_offset -= stride) + { + const int t = index + mem_offset; + + const half2 over_th = __hsub2(h_seq[t], v_threshold_half2); + + // start: spikingjelly.clock_driven.surrogate.ATan.cuda_code + + const half2 sg_ATan_alpha = __float2half2_rn(2.0f); + const half2 sg_ATan_M_PI_2__alpha__x = __hmul2(__hmul2(__float2half2_rn((float) 1.57079632679489661923), sg_ATan_alpha), over_th); + const half2 grad_s_to_h = __h2div(__h2div(sg_ATan_alpha, __float2half2_rn(2.0f)), __hfma2(sg_ATan_M_PI_2__alpha__x, sg_ATan_M_PI_2__alpha__x, __float2half2_rn(1.0f))); + + // end: spikingjelly.clock_driven.surrogate.ATan.cuda_code + + const half2 grad_v_to_h = __hsub2(__float2half2_rn(1.0f), __hmul2(v_threshold_half2, grad_s_to_h)); + + grad_h = __hfma2(__hfma2(__hfma2(h2exp(__hmul2(__hsub2(v_v_seq[t + stride], theta_rh_half2), reciprocal_delta_T_half2)), reciprocal_tau_half2, one_sub_reciprocal_tau_half2), grad_h, grad_v_seq[t]), grad_v_to_h, __hmul2(grad_spike_seq[t], grad_s_to_h)); + grad_x_seq[t] = __hmul2(grad_h, reciprocal_tau_half2); + } + grad_v_last[index] = __hmul2(__hfma2(h2exp(__hmul2(__hsub2(v_v_seq[index], theta_rh_half2), reciprocal_delta_T_half2)), reciprocal_tau_half2, one_sub_reciprocal_tau_half2), grad_x_seq[index]); + } + } + +// MultiStepEIFNodePTT fptt Sigmoid, hard_reset=True, dtype=fp32 + + extern "C" __global__ + void EIFNode_fptt_hardReset_fp32(const float* x_seq, float* v_v_seq, float* h_seq, float* spike_seq, + const float & reciprocal_tau, + const float & delta_T, + const float & theta_rh, + const float & v_threshold, + const float & v_rest, const float & v_reset, + const int & neuron_num, 
const int & numel) + + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index < neuron_num) + { + const int dt = neuron_num; + for(int mem_offset = 0; mem_offset < numel; mem_offset += neuron_num) + { + const int t = index + mem_offset; + h_seq[t] = v_v_seq[t] + reciprocal_tau * (x_seq[t] - v_v_seq[t] + v_rest + delta_T * expf((v_v_seq[t] - theta_rh) / delta_T)); + if (h_seq[t] >= v_threshold) + { + spike_seq[t] = 1.0f; + + v_v_seq[t + dt] = v_reset; + + } + else + { + spike_seq[t] = 0.0f; + v_v_seq[t + dt] = h_seq[t]; + } + + } + } + } + +// MultiStepEIFNodePTT bptt Sigmoid, hard_reset=True, dtype=fp32, detach_reset=True + + extern "C" __global__ + void EIFNode_bptt_hardReset_detachReset_fp32( + const float* grad_spike_seq, const float* grad_v_seq, const float* h_seq, const float* spike_seq, const float* v_v_seq, + float* grad_x_seq, float* grad_v_last, + const float & theta_rh, const float & reciprocal_delta_T, + const float & reciprocal_tau, const float & one_sub_reciprocal_tau, + const float & v_threshold, const float & v_reset, + const int & neuron_num, const int & numel) + + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index < neuron_num) + { + float grad_h = 0.0f; // grad_h will be used recursively + for(int mem_offset = numel - neuron_num; mem_offset >= 0; mem_offset -= neuron_num) + { + const int t = index + mem_offset; + const float over_th = h_seq[t] - v_threshold; + + // start: spikingjelly.clock_driven.surrogate.Sigmoid.cuda_code + + const float sg_Sigmoid_sigmoid_ax = 1.0f / (1.0f + expf(- 4.0f * over_th)); + const float grad_s_to_h = (1.0f - sg_Sigmoid_sigmoid_ax) * sg_Sigmoid_sigmoid_ax * 4.0f; + + // end: spikingjelly.clock_driven.surrogate.Sigmoid.cuda_code + + const float grad_v_to_h = 1.0f - spike_seq[t]; + + grad_h = grad_spike_seq[t] * grad_s_to_h + (grad_v_seq[t] + grad_h * (one_sub_reciprocal_tau + reciprocal_tau * expf((v_v_seq[t + neuron_num] - theta_rh) * reciprocal_delta_T))) * grad_v_to_h; + grad_x_seq[t] = grad_h * reciprocal_tau; + } + grad_v_last[index] = grad_x_seq[index] * (one_sub_reciprocal_tau + reciprocal_tau * expf((v_v_seq[index] - theta_rh) * reciprocal_delta_T)); + } + } + +// MultiStepEIFNodePTT bptt Sigmoid, hard_reset=True, dtype=fp32, detach_reset=False + + extern "C" __global__ + void EIFNode_bptt_hardReset__fp32( + const float* grad_spike_seq, const float* grad_v_seq, const float* h_seq, const float* spike_seq, const float* v_v_seq, + float* grad_x_seq, float* grad_v_last, + const float & theta_rh, const float & reciprocal_delta_T, + const float & reciprocal_tau, const float & one_sub_reciprocal_tau, + const float & v_threshold, const float & v_reset, + const int & neuron_num, const int & numel) + + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index < neuron_num) + { + float grad_h = 0.0f; // grad_h will be used recursively + for(int mem_offset = numel - neuron_num; mem_offset >= 0; mem_offset -= neuron_num) + { + const int t = index + mem_offset; + const float over_th = h_seq[t] - v_threshold; + + // start: spikingjelly.clock_driven.surrogate.Sigmoid.cuda_code + + const float sg_Sigmoid_sigmoid_ax = 1.0f / (1.0f + expf(- 4.0f * over_th)); + const float grad_s_to_h = (1.0f - sg_Sigmoid_sigmoid_ax) * sg_Sigmoid_sigmoid_ax * 4.0f; + + // end: spikingjelly.clock_driven.surrogate.Sigmoid.cuda_code + + const float grad_v_to_h = 1.0f - spike_seq[t] + (v_reset - h_seq[t]) * grad_s_to_h; + + grad_h = grad_spike_seq[t] * grad_s_to_h + (grad_v_seq[t] + grad_h * (one_sub_reciprocal_tau + 
reciprocal_tau * expf((v_v_seq[t + neuron_num] - theta_rh) * reciprocal_delta_T))) * grad_v_to_h; + grad_x_seq[t] = grad_h * reciprocal_tau; + } + grad_v_last[index] = grad_x_seq[index] * (one_sub_reciprocal_tau + reciprocal_tau * expf((v_v_seq[index] - theta_rh) * reciprocal_delta_T)); + } + } + +// MultiStepEIFNodePTT fptt Sigmoid, hard_reset=True, dtype=fp16 + + #include + extern "C" __global__ + void EIFNode_fptt_hardReset_fp16(const half2* x_seq, half2* v_v_seq, half2* h_seq, half2* spike_seq, + const half & reciprocal_tau, + const half & delta_T, + const half & theta_rh, + const half & v_threshold, + const half & v_rest, const half & v_reset, + const int & neuron_num, const int & numel) + + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + const int stride = neuron_num >> 1; + if (index < stride) + { + const int numel_2 = numel >> 1; + const half2 reciprocal_tau_half2 = __half2half2(reciprocal_tau); + const half2 delta_T_half2 = __half2half2(delta_T); + const half2 theta_rh_half2 = __half2half2(theta_rh); + const half2 v_threshold_half2 = __half2half2(v_threshold); + const half2 v_rest_half2 = __half2half2(v_rest); + + const half2 v_reset_half2 = __half2half2(v_reset); + + for(int mem_offset = 0; mem_offset < numel_2; mem_offset += stride) + { + const int t = index + mem_offset; + h_seq[t] = __hfma2(__hfma2(h2exp(__h2div(__hsub2(v_v_seq[t], theta_rh_half2), delta_T_half2)), delta_T_half2, __hadd2(__hsub2(x_seq[t], v_v_seq[t]), v_rest_half2)), reciprocal_tau_half2, v_v_seq[t]); + + spike_seq[t] = __hgeu2(h_seq[t], v_threshold_half2); + + v_v_seq[t + stride] = __hadd2(__hmul2(spike_seq[t], v_reset_half2), __hmul2(__hsub2(__float2half2_rn(1.0f), spike_seq[t]), h_seq[t])); + + } + } + } + +// MultiStepEIFNodePTT bptt Sigmoid, hard_reset=True, dtype=fp16, detach_reset=True + + #include + extern "C" __global__ + void EIFNode_bptt_hardReset_detachReset_fp16( + const half2* grad_spike_seq, const half2* grad_v_seq, const half2* h_seq, const half2* spike_seq, const half2* v_v_seq, + half2* grad_x_seq, half2* grad_v_last, + const half & theta_rh, const half & reciprocal_delta_T, + const half & reciprocal_tau, const half & one_sub_reciprocal_tau, + const half & v_threshold, const half & v_reset, + const int & neuron_num, const int & numel) + + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + const int stride = neuron_num >> 1; + if (index < stride) + { + const half2 reciprocal_tau_half2 = __half2half2(reciprocal_tau); + const half2 one_sub_reciprocal_tau_half2 = __half2half2(one_sub_reciprocal_tau); + const half2 reciprocal_delta_T_half2 = __half2half2(reciprocal_delta_T); + const half2 theta_rh_half2 = __half2half2(theta_rh); + const half2 v_threshold_half2 = __half2half2(v_threshold); + + const half2 v_reset_half2 = __half2half2(v_reset); + + half2 grad_h = __float2half2_rn(0.0f); // grad_h will be used recursively + for(int mem_offset = (numel >> 1) - stride; mem_offset >= 0; mem_offset -= stride) + { + const int t = index + mem_offset; + + const half2 over_th = __hsub2(h_seq[t], v_threshold_half2); + + // start: spikingjelly.clock_driven.surrogate.Sigmoid.cuda_code + + const half2 sg_Sigmoid_alpha = __float2half2_rn(4.0f); + const half2 sg_Sigmoid_sigmoid_ax = __h2div(__float2half2_rn(1.0f), __hadd2(h2exp(__hneg2(__hmul2(sg_Sigmoid_alpha, over_th))), __float2half2_rn(1.0f))); + const half2 grad_s_to_h = __hmul2(__hmul2(__hsub2(__float2half2_rn(1.0f), sg_Sigmoid_sigmoid_ax), sg_Sigmoid_sigmoid_ax), sg_Sigmoid_alpha); + + // end: 
spikingjelly.clock_driven.surrogate.Sigmoid.cuda_code + + const half2 grad_v_to_h = __hsub2(__float2half2_rn(1.0f), spike_seq[t]); + + grad_h = __hfma2(__hfma2(__hfma2(h2exp(__hmul2(__hsub2(v_v_seq[t + stride], theta_rh_half2), reciprocal_delta_T_half2)), reciprocal_tau_half2, one_sub_reciprocal_tau_half2), grad_h, grad_v_seq[t]), grad_v_to_h, __hmul2(grad_spike_seq[t], grad_s_to_h)); + grad_x_seq[t] = __hmul2(grad_h, reciprocal_tau_half2); + } + grad_v_last[index] = __hmul2(__hfma2(h2exp(__hmul2(__hsub2(v_v_seq[index], theta_rh_half2), reciprocal_delta_T_half2)), reciprocal_tau_half2, one_sub_reciprocal_tau_half2), grad_x_seq[index]); + } + } + +// MultiStepEIFNodePTT bptt Sigmoid, hard_reset=True, dtype=fp16, detach_reset=False + + #include + extern "C" __global__ + void EIFNode_bptt_hardReset__fp16( + const half2* grad_spike_seq, const half2* grad_v_seq, const half2* h_seq, const half2* spike_seq, const half2* v_v_seq, + half2* grad_x_seq, half2* grad_v_last, + const half & theta_rh, const half & reciprocal_delta_T, + const half & reciprocal_tau, const half & one_sub_reciprocal_tau, + const half & v_threshold, const half & v_reset, + const int & neuron_num, const int & numel) + + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + const int stride = neuron_num >> 1; + if (index < stride) + { + const half2 reciprocal_tau_half2 = __half2half2(reciprocal_tau); + const half2 one_sub_reciprocal_tau_half2 = __half2half2(one_sub_reciprocal_tau); + const half2 reciprocal_delta_T_half2 = __half2half2(reciprocal_delta_T); + const half2 theta_rh_half2 = __half2half2(theta_rh); + const half2 v_threshold_half2 = __half2half2(v_threshold); + + const half2 v_reset_half2 = __half2half2(v_reset); + + half2 grad_h = __float2half2_rn(0.0f); // grad_h will be used recursively + for(int mem_offset = (numel >> 1) - stride; mem_offset >= 0; mem_offset -= stride) + { + const int t = index + mem_offset; + + const half2 over_th = __hsub2(h_seq[t], v_threshold_half2); + + // start: spikingjelly.clock_driven.surrogate.Sigmoid.cuda_code + + const half2 sg_Sigmoid_alpha = __float2half2_rn(4.0f); + const half2 sg_Sigmoid_sigmoid_ax = __h2div(__float2half2_rn(1.0f), __hadd2(h2exp(__hneg2(__hmul2(sg_Sigmoid_alpha, over_th))), __float2half2_rn(1.0f))); + const half2 grad_s_to_h = __hmul2(__hmul2(__hsub2(__float2half2_rn(1.0f), sg_Sigmoid_sigmoid_ax), sg_Sigmoid_sigmoid_ax), sg_Sigmoid_alpha); + + // end: spikingjelly.clock_driven.surrogate.Sigmoid.cuda_code + + const half2 grad_v_to_h = __hfma2(__hsub2(v_reset_half2, h_seq[t]), grad_s_to_h, __hsub2(__float2half2_rn(1.0f), spike_seq[t])); + + grad_h = __hfma2(__hfma2(__hfma2(h2exp(__hmul2(__hsub2(v_v_seq[t + stride], theta_rh_half2), reciprocal_delta_T_half2)), reciprocal_tau_half2, one_sub_reciprocal_tau_half2), grad_h, grad_v_seq[t]), grad_v_to_h, __hmul2(grad_spike_seq[t], grad_s_to_h)); + grad_x_seq[t] = __hmul2(grad_h, reciprocal_tau_half2); + } + grad_v_last[index] = __hmul2(__hfma2(h2exp(__hmul2(__hsub2(v_v_seq[index], theta_rh_half2), reciprocal_delta_T_half2)), reciprocal_tau_half2, one_sub_reciprocal_tau_half2), grad_x_seq[index]); + } + } + +// MultiStepEIFNodePTT fptt Sigmoid, hard_reset=False, dtype=fp32 + + extern "C" __global__ + void EIFNode_fptt_softReset_fp32(const float* x_seq, float* v_v_seq, float* h_seq, float* spike_seq, + const float & reciprocal_tau, + const float & delta_T, + const float & theta_rh, + const float & v_threshold, + const float & v_rest, + const int & neuron_num, const int & numel) + + { + const int index = blockIdx.x * 
blockDim.x + threadIdx.x; + if (index < neuron_num) + { + const int dt = neuron_num; + for(int mem_offset = 0; mem_offset < numel; mem_offset += neuron_num) + { + const int t = index + mem_offset; + h_seq[t] = v_v_seq[t] + reciprocal_tau * (x_seq[t] - v_v_seq[t] + v_rest + delta_T * expf((v_v_seq[t] - theta_rh) / delta_T)); + if (h_seq[t] >= v_threshold) + { + spike_seq[t] = 1.0f; + + v_v_seq[t + dt] = h_seq[t] - v_threshold; + + } + else + { + spike_seq[t] = 0.0f; + v_v_seq[t + dt] = h_seq[t]; + } + + } + } + } + +// MultiStepEIFNodePTT bptt Sigmoid, hard_reset=False, dtype=fp32, detach_reset=True + + extern "C" __global__ + void EIFNode_bptt_softReset_detachReset_fp32( + const float* grad_spike_seq, const float* grad_v_seq, const float* h_seq, const float* spike_seq, const float* v_v_seq, + float* grad_x_seq, float* grad_v_last, + const float & theta_rh, const float & reciprocal_delta_T, + const float & reciprocal_tau, const float & one_sub_reciprocal_tau, + const float & v_threshold, + const int & neuron_num, const int & numel) + + { const int index = blockIdx.x * blockDim.x + threadIdx.x; if (index < neuron_num) { @@ -2561,37 +15365,29 @@ const int t = index + mem_offset; const float over_th = h_seq[t] - v_threshold; - // start: spikingjelly.clock_driven.surrogate.PiecewiseLeakyReLU.cuda_code + // start: spikingjelly.clock_driven.surrogate.Sigmoid.cuda_code - const float sg_PiecewiseLeakyReLU_x_abs = fabsf(over_th); - float grad_s_to_h; - if (sg_PiecewiseLeakyReLU_x_abs > 1.0f) - { - grad_s_to_h = 0.01f; - } - else - { - grad_s_to_h = 1.0f; - } + const float sg_Sigmoid_sigmoid_ax = 1.0f / (1.0f + expf(- 4.0f * over_th)); + const float grad_s_to_h = (1.0f - sg_Sigmoid_sigmoid_ax) * sg_Sigmoid_sigmoid_ax * 4.0f; - // end: spikingjelly.clock_driven.surrogate.PiecewiseLeakyReLU.cuda_code + // end: spikingjelly.clock_driven.surrogate.Sigmoid.cuda_code const float grad_v_to_h = 1.0f; - grad_h = grad_spike_seq[t] * grad_s_to_h + (grad_v_seq[t] + grad_h * one_sub_reciprocal_tau) * grad_v_to_h; - // grad_h = fmaf(grad_spike_seq[t], grad_s_to_h, fmaf(grad_h, one_sub_reciprocal_tau, grad_v_seq[t]) * grad_v_to_h); + grad_h = grad_spike_seq[t] * grad_s_to_h + (grad_v_seq[t] + grad_h * (one_sub_reciprocal_tau + reciprocal_tau * expf((v_v_seq[t + neuron_num] - theta_rh) * reciprocal_delta_T))) * grad_v_to_h; grad_x_seq[t] = grad_h * reciprocal_tau; } - grad_v_last[index] = grad_x_seq[index] * one_sub_reciprocal_tau; + grad_v_last[index] = grad_x_seq[index] * (one_sub_reciprocal_tau + reciprocal_tau * expf((v_v_seq[index] - theta_rh) * reciprocal_delta_T)); } } -// MultiStepLIFNodePTT bptt PiecewiseLeakyReLU, hard_reset=False, dtype=fp32, detach_reset=False +// MultiStepEIFNodePTT bptt Sigmoid, hard_reset=False, dtype=fp32, detach_reset=False extern "C" __global__ - void LIFNode_bptt_softReset__fp32( - const float* grad_spike_seq, const float* grad_v_seq, const float* h_seq, const float* spike_seq, + void EIFNode_bptt_softReset__fp32( + const float* grad_spike_seq, const float* grad_v_seq, const float* h_seq, const float* spike_seq, const float* v_v_seq, float* grad_x_seq, float* grad_v_last, + const float & theta_rh, const float & reciprocal_delta_T, const float & reciprocal_tau, const float & one_sub_reciprocal_tau, const float & v_threshold, const int & neuron_num, const int & numel) @@ -2606,39 +15402,32 @@ const int t = index + mem_offset; const float over_th = h_seq[t] - v_threshold; - // start: spikingjelly.clock_driven.surrogate.PiecewiseLeakyReLU.cuda_code + // start: 
spikingjelly.clock_driven.surrogate.Sigmoid.cuda_code - const float sg_PiecewiseLeakyReLU_x_abs = fabsf(over_th); - float grad_s_to_h; - if (sg_PiecewiseLeakyReLU_x_abs > 1.0f) - { - grad_s_to_h = 0.01f; - } - else - { - grad_s_to_h = 1.0f; - } + const float sg_Sigmoid_sigmoid_ax = 1.0f / (1.0f + expf(- 4.0f * over_th)); + const float grad_s_to_h = (1.0f - sg_Sigmoid_sigmoid_ax) * sg_Sigmoid_sigmoid_ax * 4.0f; - // end: spikingjelly.clock_driven.surrogate.PiecewiseLeakyReLU.cuda_code + // end: spikingjelly.clock_driven.surrogate.Sigmoid.cuda_code const float grad_v_to_h = 1.0f - v_threshold * grad_s_to_h; - // const float grad_v_to_h = fmaf(-grad_s_to_h, v_threshold, 1.0f); - grad_h = grad_spike_seq[t] * grad_s_to_h + (grad_v_seq[t] + grad_h * one_sub_reciprocal_tau) * grad_v_to_h; - // grad_h = fmaf(grad_spike_seq[t], grad_s_to_h, fmaf(grad_h, one_sub_reciprocal_tau, grad_v_seq[t]) * grad_v_to_h); + grad_h = grad_spike_seq[t] * grad_s_to_h + (grad_v_seq[t] + grad_h * (one_sub_reciprocal_tau + reciprocal_tau * expf((v_v_seq[t + neuron_num] - theta_rh) * reciprocal_delta_T))) * grad_v_to_h; grad_x_seq[t] = grad_h * reciprocal_tau; } - grad_v_last[index] = grad_x_seq[index] * one_sub_reciprocal_tau; + grad_v_last[index] = grad_x_seq[index] * (one_sub_reciprocal_tau + reciprocal_tau * expf((v_v_seq[index] - theta_rh) * reciprocal_delta_T)); } } -// MultiStepLIFNodePTT fptt PiecewiseLeakyReLU, hard_reset=False, dtype=fp16 +// MultiStepEIFNodePTT fptt Sigmoid, hard_reset=False, dtype=fp16 #include extern "C" __global__ - void LIFNode_fptt_softReset_fp16(const half2* x_seq, half2* v_v_seq, half2* h_seq, half2* spike_seq, + void EIFNode_fptt_softReset_fp16(const half2* x_seq, half2* v_v_seq, half2* h_seq, half2* spike_seq, const half & reciprocal_tau, - const half & v_threshold, + const half & delta_T, + const half & theta_rh, + const half & v_threshold, + const half & v_rest, const int & neuron_num, const int & numel) { @@ -2648,27 +15437,32 @@ { const int numel_2 = numel >> 1; const half2 reciprocal_tau_half2 = __half2half2(reciprocal_tau); + const half2 delta_T_half2 = __half2half2(delta_T); + const half2 theta_rh_half2 = __half2half2(theta_rh); const half2 v_threshold_half2 = __half2half2(v_threshold); + const half2 v_rest_half2 = __half2half2(v_rest); for(int mem_offset = 0; mem_offset < numel_2; mem_offset += stride) { const int t = index + mem_offset; - - h_seq[t] = __hfma2(__hsub2(x_seq[t], v_v_seq[t]), reciprocal_tau_half2, v_v_seq[t]); + h_seq[t] = __hfma2(__hfma2(h2exp(__h2div(__hsub2(v_v_seq[t], theta_rh_half2), delta_T_half2)), delta_T_half2, __hadd2(__hsub2(x_seq[t], v_v_seq[t]), v_rest_half2)), reciprocal_tau_half2, v_v_seq[t]); + spike_seq[t] = __hgeu2(h_seq[t], v_threshold_half2); + v_v_seq[t + stride] = __hadd2(__hmul2(spike_seq[t], __hsub2(h_seq[t], v_threshold_half2)), __hmul2(__hsub2(__float2half2_rn(1.0f), spike_seq[t]), h_seq[t])); } } } -// MultiStepLIFNodePTT bptt PiecewiseLeakyReLU, hard_reset=False, dtype=fp16, detach_reset=True +// MultiStepEIFNodePTT bptt Sigmoid, hard_reset=False, dtype=fp16, detach_reset=True #include extern "C" __global__ - void LIFNode_bptt_softReset_detachReset_fp16( - const half2* grad_spike_seq, const half2* grad_v_seq, const half2* h_seq, const half2* spike_seq, + void EIFNode_bptt_softReset_detachReset_fp16( + const half2* grad_spike_seq, const half2* grad_v_seq, const half2* h_seq, const half2* spike_seq, const half2* v_v_seq, half2* grad_x_seq, half2* grad_v_last, + const half & theta_rh, const half & reciprocal_delta_T, const half & 
reciprocal_tau, const half & one_sub_reciprocal_tau, const half & v_threshold, const int & neuron_num, const int & numel) @@ -2680,6 +15474,8 @@ { const half2 reciprocal_tau_half2 = __half2half2(reciprocal_tau); const half2 one_sub_reciprocal_tau_half2 = __half2half2(one_sub_reciprocal_tau); + const half2 reciprocal_delta_T_half2 = __half2half2(reciprocal_delta_T); + const half2 theta_rh_half2 = __half2half2(theta_rh); const half2 v_threshold_half2 = __half2half2(v_threshold); half2 grad_h = __float2half2_rn(0.0f); // grad_h will be used recursively @@ -2689,30 +15485,31 @@ const half2 over_th = __hsub2(h_seq[t], v_threshold_half2); - // start: spikingjelly.clock_driven.surrogate.PiecewiseLeakyReLU.cuda_code + // start: spikingjelly.clock_driven.surrogate.Sigmoid.cuda_code - const half2 sg_PiecewiseLeakyReLU_x_abs = __habs2(over_th); - const half2 sg_PiecewiseLeakyReLU_x_abs_ge_w = __hge2(sg_PiecewiseLeakyReLU_x_abs, __float2half2_rn(1.0f)); - half2 grad_s_to_h = __hadd2(__hmul2(__float2half2_rn(0.01f), sg_PiecewiseLeakyReLU_x_abs_ge_w), __hmul2(__hsub2(__float2half2_rn(1.0f), sg_PiecewiseLeakyReLU_x_abs_ge_w), __float2half2_rn(1.0f))); + const half2 sg_Sigmoid_alpha = __float2half2_rn(4.0f); + const half2 sg_Sigmoid_sigmoid_ax = __h2div(__float2half2_rn(1.0f), __hadd2(h2exp(__hneg2(__hmul2(sg_Sigmoid_alpha, over_th))), __float2half2_rn(1.0f))); + const half2 grad_s_to_h = __hmul2(__hmul2(__hsub2(__float2half2_rn(1.0f), sg_Sigmoid_sigmoid_ax), sg_Sigmoid_sigmoid_ax), sg_Sigmoid_alpha); - // end: spikingjelly.clock_driven.surrogate.PiecewiseLeakyReLU.cuda_code + // end: spikingjelly.clock_driven.surrogate.Sigmoid.cuda_code const half2 grad_v_to_h = __float2half2_rn(1.0f); - - grad_h = __hfma2(__hfma2(grad_h, one_sub_reciprocal_tau_half2, grad_v_seq[t]), grad_v_to_h, __hmul2(grad_spike_seq[t], grad_s_to_h)); + + grad_h = __hfma2(__hfma2(__hfma2(h2exp(__hmul2(__hsub2(v_v_seq[t + stride], theta_rh_half2), reciprocal_delta_T_half2)), reciprocal_tau_half2, one_sub_reciprocal_tau_half2), grad_h, grad_v_seq[t]), grad_v_to_h, __hmul2(grad_spike_seq[t], grad_s_to_h)); grad_x_seq[t] = __hmul2(grad_h, reciprocal_tau_half2); } - grad_v_last[index] = __hmul2(grad_x_seq[index], one_sub_reciprocal_tau_half2); + grad_v_last[index] = __hmul2(__hfma2(h2exp(__hmul2(__hsub2(v_v_seq[index], theta_rh_half2), reciprocal_delta_T_half2)), reciprocal_tau_half2, one_sub_reciprocal_tau_half2), grad_x_seq[index]); } } -// MultiStepLIFNodePTT bptt PiecewiseLeakyReLU, hard_reset=False, dtype=fp16, detach_reset=False +// MultiStepEIFNodePTT bptt Sigmoid, hard_reset=False, dtype=fp16, detach_reset=False #include extern "C" __global__ - void LIFNode_bptt_softReset__fp16( - const half2* grad_spike_seq, const half2* grad_v_seq, const half2* h_seq, const half2* spike_seq, + void EIFNode_bptt_softReset__fp16( + const half2* grad_spike_seq, const half2* grad_v_seq, const half2* h_seq, const half2* spike_seq, const half2* v_v_seq, half2* grad_x_seq, half2* grad_v_last, + const half & theta_rh, const half & reciprocal_delta_T, const half & reciprocal_tau, const half & one_sub_reciprocal_tau, const half & v_threshold, const int & neuron_num, const int & numel) @@ -2724,6 +15521,8 @@ { const half2 reciprocal_tau_half2 = __half2half2(reciprocal_tau); const half2 one_sub_reciprocal_tau_half2 = __half2half2(one_sub_reciprocal_tau); + const half2 reciprocal_delta_T_half2 = __half2half2(reciprocal_delta_T); + const half2 theta_rh_half2 = __half2half2(theta_rh); const half2 v_threshold_half2 = __half2half2(v_threshold); half2 grad_h = 
__float2half2_rn(0.0f); // grad_h will be used recursively @@ -2733,31 +15532,32 @@ const half2 over_th = __hsub2(h_seq[t], v_threshold_half2); - // start: spikingjelly.clock_driven.surrogate.PiecewiseLeakyReLU.cuda_code + // start: spikingjelly.clock_driven.surrogate.Sigmoid.cuda_code - const half2 sg_PiecewiseLeakyReLU_x_abs = __habs2(over_th); - const half2 sg_PiecewiseLeakyReLU_x_abs_ge_w = __hge2(sg_PiecewiseLeakyReLU_x_abs, __float2half2_rn(1.0f)); - half2 grad_s_to_h = __hadd2(__hmul2(__float2half2_rn(0.01f), sg_PiecewiseLeakyReLU_x_abs_ge_w), __hmul2(__hsub2(__float2half2_rn(1.0f), sg_PiecewiseLeakyReLU_x_abs_ge_w), __float2half2_rn(1.0f))); + const half2 sg_Sigmoid_alpha = __float2half2_rn(4.0f); + const half2 sg_Sigmoid_sigmoid_ax = __h2div(__float2half2_rn(1.0f), __hadd2(h2exp(__hneg2(__hmul2(sg_Sigmoid_alpha, over_th))), __float2half2_rn(1.0f))); + const half2 grad_s_to_h = __hmul2(__hmul2(__hsub2(__float2half2_rn(1.0f), sg_Sigmoid_sigmoid_ax), sg_Sigmoid_sigmoid_ax), sg_Sigmoid_alpha); - // end: spikingjelly.clock_driven.surrogate.PiecewiseLeakyReLU.cuda_code + // end: spikingjelly.clock_driven.surrogate.Sigmoid.cuda_code const half2 grad_v_to_h = __hsub2(__float2half2_rn(1.0f), __hmul2(v_threshold_half2, grad_s_to_h)); - - grad_h = __hfma2(__hfma2(grad_h, one_sub_reciprocal_tau_half2, grad_v_seq[t]), grad_v_to_h, __hmul2(grad_spike_seq[t], grad_s_to_h)); + + grad_h = __hfma2(__hfma2(__hfma2(h2exp(__hmul2(__hsub2(v_v_seq[t + stride], theta_rh_half2), reciprocal_delta_T_half2)), reciprocal_tau_half2, one_sub_reciprocal_tau_half2), grad_h, grad_v_seq[t]), grad_v_to_h, __hmul2(grad_spike_seq[t], grad_s_to_h)); grad_x_seq[t] = __hmul2(grad_h, reciprocal_tau_half2); } - grad_v_last[index] = __hmul2(grad_x_seq[index], one_sub_reciprocal_tau_half2); + grad_v_last[index] = __hmul2(__hfma2(h2exp(__hmul2(__hsub2(v_v_seq[index], theta_rh_half2), reciprocal_delta_T_half2)), reciprocal_tau_half2, one_sub_reciprocal_tau_half2), grad_x_seq[index]); } } -// MultiStepParametricLIFNodePTT - -// MultiStepParametricLIFNodePTT fptt ATan, hard_reset=True, dtype=fp32 +// MultiStepEIFNodePTT fptt PiecewiseLeakyReLU, hard_reset=True, dtype=fp32 extern "C" __global__ - void ParametricLIFNode_fptt_hardReset_fp32(const float* x_seq, float* v_v_seq, float* h_seq, float* spike_seq, + void EIFNode_fptt_hardReset_fp32(const float* x_seq, float* v_v_seq, float* h_seq, float* spike_seq, const float & reciprocal_tau, - const float & v_threshold, const float & v_reset, + const float & delta_T, + const float & theta_rh, + const float & v_threshold, + const float & v_rest, const float & v_reset, const int & neuron_num, const int & numel) { @@ -2768,14 +15568,14 @@ for(int mem_offset = 0; mem_offset < numel; mem_offset += neuron_num) { const int t = index + mem_offset; - - h_seq[t] = v_v_seq[t] + reciprocal_tau * (x_seq[t] - v_v_seq[t] + v_reset); + h_seq[t] = v_v_seq[t] + reciprocal_tau * (x_seq[t] - v_v_seq[t] + v_rest + delta_T * expf((v_v_seq[t] - theta_rh) / delta_T)); if (h_seq[t] >= v_threshold) { spike_seq[t] = 1.0f; + v_v_seq[t + dt] = v_reset; - } + } else { spike_seq[t] = 0.0f; @@ -2786,134 +15586,106 @@ } } -// MultiStepParametricLIFNodePTT bptt ATan, hard_reset=True, dtype=fp32, detach_reset=True +// MultiStepEIFNodePTT bptt PiecewiseLeakyReLU, hard_reset=True, dtype=fp32, detach_reset=True extern "C" __global__ - void ParametricLIFNode_bptt_hardReset_detachReset_fp32( + void EIFNode_bptt_hardReset_detachReset_fp32( const float* grad_spike_seq, const float* grad_v_seq, const float* h_seq, const float* 
spike_seq, const float* v_v_seq, - float* grad_x_seq, float* grad_v_last, float* grad_reciprocal_tau, + float* grad_x_seq, float* grad_v_last, + const float & theta_rh, const float & reciprocal_delta_T, const float & reciprocal_tau, const float & one_sub_reciprocal_tau, const float & v_threshold, const float & v_reset, const int & neuron_num, const int & numel) { const int index = blockIdx.x * blockDim.x + threadIdx.x; - __shared__ float sdata[1024]; if (index < neuron_num) { float grad_h = 0.0f; // grad_h will be used recursively - sdata[threadIdx.x] = 0.0f; for(int mem_offset = numel - neuron_num; mem_offset >= 0; mem_offset -= neuron_num) { const int t = index + mem_offset; const float over_th = h_seq[t] - v_threshold; - // start: spikingjelly.clock_driven.surrogate.ATan.cuda_code + // start: spikingjelly.clock_driven.surrogate.PiecewiseLeakyReLU.cuda_code - const float sg_ATan_M_PI_2__alpha__x = ((float) 1.57079632679489661923) * 2.0f * over_th; - const float grad_s_to_h = 2.0f / 2.0f / (1.0f + sg_ATan_M_PI_2__alpha__x * sg_ATan_M_PI_2__alpha__x); + const float sg_PiecewiseLeakyReLU_x_abs = fabsf(over_th); + float grad_s_to_h; + if (sg_PiecewiseLeakyReLU_x_abs > 1.0f) + { + grad_s_to_h = 0.01f; + } + else + { + grad_s_to_h = 1.0f; + } - // end: spikingjelly.clock_driven.surrogate.ATan.cuda_code + // end: spikingjelly.clock_driven.surrogate.PiecewiseLeakyReLU.cuda_code const float grad_v_to_h = 1.0f - spike_seq[t]; - grad_h = grad_spike_seq[t] * grad_s_to_h + (grad_v_seq[t] + grad_h * one_sub_reciprocal_tau) * grad_v_to_h; - // grad_h = fmaf(grad_spike_seq[t], grad_s_to_h, fmaf(grad_h, one_sub_reciprocal_tau, grad_v_seq[t]) * grad_v_to_h); + grad_h = grad_spike_seq[t] * grad_s_to_h + (grad_v_seq[t] + grad_h * (one_sub_reciprocal_tau + reciprocal_tau * expf((v_v_seq[t + neuron_num] - theta_rh) * reciprocal_delta_T))) * grad_v_to_h; grad_x_seq[t] = grad_h * reciprocal_tau; - sdata[threadIdx.x] += grad_h * (h_seq[t] - v_v_seq[t]) / reciprocal_tau; } - grad_v_last[index] = grad_x_seq[index] * one_sub_reciprocal_tau; - } - else - { - sdata[threadIdx.x] = 0.0f; - } - int threadx = blockDim.x; - #pragma unroll - for (int stride = threadx >> 1; stride > 0; stride = stride >> 1) - { - // Synchronize all thread before next loop - __syncthreads(); - if (threadIdx.x < stride) - { - sdata[threadIdx.x] += sdata[threadIdx.x + stride]; - } - } - __syncthreads(); - if (threadIdx.x == 0) - { - atomicAdd(grad_reciprocal_tau, sdata[0]); + grad_v_last[index] = grad_x_seq[index] * (one_sub_reciprocal_tau + reciprocal_tau * expf((v_v_seq[index] - theta_rh) * reciprocal_delta_T)); } } -// MultiStepParametricLIFNodePTT bptt ATan, hard_reset=True, dtype=fp32, detach_reset=False +// MultiStepEIFNodePTT bptt PiecewiseLeakyReLU, hard_reset=True, dtype=fp32, detach_reset=False extern "C" __global__ - void ParametricLIFNode_bptt_hardReset__fp32( + void EIFNode_bptt_hardReset__fp32( const float* grad_spike_seq, const float* grad_v_seq, const float* h_seq, const float* spike_seq, const float* v_v_seq, - float* grad_x_seq, float* grad_v_last, float* grad_reciprocal_tau, + float* grad_x_seq, float* grad_v_last, + const float & theta_rh, const float & reciprocal_delta_T, const float & reciprocal_tau, const float & one_sub_reciprocal_tau, const float & v_threshold, const float & v_reset, const int & neuron_num, const int & numel) { const int index = blockIdx.x * blockDim.x + threadIdx.x; - __shared__ float sdata[1024]; if (index < neuron_num) { float grad_h = 0.0f; // grad_h will be used recursively - sdata[threadIdx.x] = 
0.0f; for(int mem_offset = numel - neuron_num; mem_offset >= 0; mem_offset -= neuron_num) { const int t = index + mem_offset; const float over_th = h_seq[t] - v_threshold; - // start: spikingjelly.clock_driven.surrogate.ATan.cuda_code + // start: spikingjelly.clock_driven.surrogate.PiecewiseLeakyReLU.cuda_code - const float sg_ATan_M_PI_2__alpha__x = ((float) 1.57079632679489661923) * 2.0f * over_th; - const float grad_s_to_h = 2.0f / 2.0f / (1.0f + sg_ATan_M_PI_2__alpha__x * sg_ATan_M_PI_2__alpha__x); + const float sg_PiecewiseLeakyReLU_x_abs = fabsf(over_th); + float grad_s_to_h; + if (sg_PiecewiseLeakyReLU_x_abs > 1.0f) + { + grad_s_to_h = 0.01f; + } + else + { + grad_s_to_h = 1.0f; + } - // end: spikingjelly.clock_driven.surrogate.ATan.cuda_code + // end: spikingjelly.clock_driven.surrogate.PiecewiseLeakyReLU.cuda_code const float grad_v_to_h = 1.0f - spike_seq[t] + (v_reset - h_seq[t]) * grad_s_to_h; - // const float grad_v_to_h = fmaf(v_reset - h_seq[t], grad_s_to_h, 1.0f - spike_seq[t]); - grad_h = grad_spike_seq[t] * grad_s_to_h + (grad_v_seq[t] + grad_h * one_sub_reciprocal_tau) * grad_v_to_h; - // grad_h = fmaf(grad_spike_seq[t], grad_s_to_h, fmaf(grad_h, one_sub_reciprocal_tau, grad_v_seq[t]) * grad_v_to_h); - grad_x_seq[t] = grad_h * reciprocal_tau; - sdata[threadIdx.x] += grad_h * (h_seq[t] - v_v_seq[t]) / reciprocal_tau; - } - grad_v_last[index] = grad_x_seq[index] * one_sub_reciprocal_tau; - } - else - { - sdata[threadIdx.x] = 0.0f; - } - int threadx = blockDim.x; - #pragma unroll - for (int stride = threadx >> 1; stride > 0; stride = stride >> 1) - { - // Synchronize all thread before next loop - __syncthreads(); - if (threadIdx.x < stride) - { - sdata[threadIdx.x] += sdata[threadIdx.x + stride]; - } - } - __syncthreads(); - if (threadIdx.x == 0) - { - atomicAdd(grad_reciprocal_tau, sdata[0]); + grad_h = grad_spike_seq[t] * grad_s_to_h + (grad_v_seq[t] + grad_h * (one_sub_reciprocal_tau + reciprocal_tau * expf((v_v_seq[t + neuron_num] - theta_rh) * reciprocal_delta_T))) * grad_v_to_h; + grad_x_seq[t] = grad_h * reciprocal_tau; + } + grad_v_last[index] = grad_x_seq[index] * (one_sub_reciprocal_tau + reciprocal_tau * expf((v_v_seq[index] - theta_rh) * reciprocal_delta_T)); } } -// MultiStepParametricLIFNodePTT fptt ATan, hard_reset=True, dtype=fp16 +// MultiStepEIFNodePTT fptt PiecewiseLeakyReLU, hard_reset=True, dtype=fp16 #include extern "C" __global__ - void ParametricLIFNode_fptt_hardReset_fp16(const half2* x_seq, half2* v_v_seq, half2* h_seq, half2* spike_seq, + void EIFNode_fptt_hardReset_fp16(const half2* x_seq, half2* v_v_seq, half2* h_seq, half2* spike_seq, const half & reciprocal_tau, - const half & v_threshold, const half & v_reset, + const half & delta_T, + const half & theta_rh, + const half & v_threshold, + const half & v_rest, const half & v_reset, const int & neuron_num, const int & numel) { @@ -2923,29 +15695,34 @@ { const int numel_2 = numel >> 1; const half2 reciprocal_tau_half2 = __half2half2(reciprocal_tau); + const half2 delta_T_half2 = __half2half2(delta_T); + const half2 theta_rh_half2 = __half2half2(theta_rh); const half2 v_threshold_half2 = __half2half2(v_threshold); + const half2 v_rest_half2 = __half2half2(v_rest); const half2 v_reset_half2 = __half2half2(v_reset); for(int mem_offset = 0; mem_offset < numel_2; mem_offset += stride) { const int t = index + mem_offset; - - h_seq[t] = __hfma2(__hadd2(__hsub2(x_seq[t], v_v_seq[t]), v_reset_half2), reciprocal_tau_half2, v_v_seq[t]); + h_seq[t] = __hfma2(__hfma2(h2exp(__h2div(__hsub2(v_v_seq[t], 
theta_rh_half2), delta_T_half2)), delta_T_half2, __hadd2(__hsub2(x_seq[t], v_v_seq[t]), v_rest_half2)), reciprocal_tau_half2, v_v_seq[t]); + spike_seq[t] = __hgeu2(h_seq[t], v_threshold_half2); + v_v_seq[t + stride] = __hadd2(__hmul2(spike_seq[t], v_reset_half2), __hmul2(__hsub2(__float2half2_rn(1.0f), spike_seq[t]), h_seq[t])); } } } -// MultiStepParametricLIFNodePTT bptt ATan, hard_reset=True, dtype=fp16, detach_reset=True +// MultiStepEIFNodePTT bptt PiecewiseLeakyReLU, hard_reset=True, dtype=fp16, detach_reset=True #include extern "C" __global__ - void ParametricLIFNode_bptt_hardReset_detachReset_fp16( + void EIFNode_bptt_hardReset_detachReset_fp16( const half2* grad_spike_seq, const half2* grad_v_seq, const half2* h_seq, const half2* spike_seq, const half2* v_v_seq, - half2* grad_x_seq, half2* grad_v_last, half* grad_reciprocal_tau, + half2* grad_x_seq, half2* grad_v_last, + const half & theta_rh, const half & reciprocal_delta_T, const half & reciprocal_tau, const half & one_sub_reciprocal_tau, const half & v_threshold, const half & v_reset, const int & neuron_num, const int & numel) @@ -2953,87 +15730,48 @@ { const int index = blockIdx.x * blockDim.x + threadIdx.x; const int stride = neuron_num >> 1; - - __shared__ half2 sdata[1024]; if (index < stride) { const half2 reciprocal_tau_half2 = __half2half2(reciprocal_tau); const half2 one_sub_reciprocal_tau_half2 = __half2half2(one_sub_reciprocal_tau); + const half2 reciprocal_delta_T_half2 = __half2half2(reciprocal_delta_T); + const half2 theta_rh_half2 = __half2half2(theta_rh); const half2 v_threshold_half2 = __half2half2(v_threshold); const half2 v_reset_half2 = __half2half2(v_reset); - half2 grad_h = __float2half2_rn(0.0f); // grad_h will be used recursively - sdata[threadIdx.x] = __float2half2_rn(0.0f); for(int mem_offset = (numel >> 1) - stride; mem_offset >= 0; mem_offset -= stride) { const int t = index + mem_offset; const half2 over_th = __hsub2(h_seq[t], v_threshold_half2); - - // start: spikingjelly.clock_driven.surrogate.ATan.cuda_code + // start: spikingjelly.clock_driven.surrogate.PiecewiseLeakyReLU.cuda_code - const half2 sg_ATan_alpha = __float2half2_rn(2.0f); - const half2 sg_ATan_M_PI_2__alpha__x = __hmul2(__hmul2(__float2half2_rn((float) 1.57079632679489661923), sg_ATan_alpha), over_th); - const half2 grad_s_to_h = __h2div(__h2div(sg_ATan_alpha, __float2half2_rn(2.0f)), __hfma2(sg_ATan_M_PI_2__alpha__x, sg_ATan_M_PI_2__alpha__x, __float2half2_rn(1.0f))); + const half2 sg_PiecewiseLeakyReLU_x_abs = __habs2(over_th); + const half2 sg_PiecewiseLeakyReLU_x_abs_ge_w = __hge2(sg_PiecewiseLeakyReLU_x_abs, __float2half2_rn(1.0f)); + half2 grad_s_to_h = __hadd2(__hmul2(__float2half2_rn(0.01f), sg_PiecewiseLeakyReLU_x_abs_ge_w), __hmul2(__hsub2(__float2half2_rn(1.0f), sg_PiecewiseLeakyReLU_x_abs_ge_w), __float2half2_rn(1.0f))); - // end: spikingjelly.clock_driven.surrogate.ATan.cuda_code + // end: spikingjelly.clock_driven.surrogate.PiecewiseLeakyReLU.cuda_code const half2 grad_v_to_h = __hsub2(__float2half2_rn(1.0f), spike_seq[t]); - - grad_h = __hfma2(__hfma2(grad_h, one_sub_reciprocal_tau_half2, grad_v_seq[t]), grad_v_to_h, __hmul2(grad_spike_seq[t], grad_s_to_h)); + + grad_h = __hfma2(__hfma2(__hfma2(h2exp(__hmul2(__hsub2(v_v_seq[t + stride], theta_rh_half2), reciprocal_delta_T_half2)), reciprocal_tau_half2, one_sub_reciprocal_tau_half2), grad_h, grad_v_seq[t]), grad_v_to_h, __hmul2(grad_spike_seq[t], grad_s_to_h)); grad_x_seq[t] = __hmul2(grad_h, reciprocal_tau_half2); - sdata[threadIdx.x] = __hadd2(__h2div(__hmul2(grad_h, 
__hsub2(h_seq[t], v_v_seq[t])), reciprocal_tau_half2), sdata[threadIdx.x]); } - - grad_v_last[index] = __hmul2(grad_x_seq[index], one_sub_reciprocal_tau_half2); - } - else - { - sdata[threadIdx.x] = __float2half2_rn(0.0f); - } - int threadx = blockDim.x; - #pragma unroll - for (int stride = threadx >> 1; stride > 0; stride = stride >> 1) - { - // Synchronize all thread before next loop - __syncthreads(); - if (threadIdx.x < stride) - { - sdata[threadIdx.x] = __hadd2(sdata[threadIdx.x], sdata[threadIdx.x + stride]); - } - } - __syncthreads(); - if (threadIdx.x == 0) - { - //grad_reciprocal_tau[0] = __hadd(__low2half(sdata[0]), __high2half(sdata[0])); - - /* - The 32-bit floating-point version of atomicAdd() is only supported by devices of compute capability 2.x and higher. - - The 64-bit floating-point version of atomicAdd() is only supported by devices of compute capability 6.x and higher. - - The 32-bit __half2 floating-point version of atomicAdd() is only supported by devices of compute capability 6.x and higher. The atomicity of the __half2 or __nv_bfloat162 add operation is guaranteed separately for each of the two __half or __nv_bfloat16 elements; the entire __half2 or __nv_bfloat162 is not guaranteed to be atomic as a single 32-bit access. - - The 16-bit __half floating-point version of atomicAdd() is only supported by devices of compute capability 7.x and higher. - - The 16-bit __nv_bfloat16 floating-point version of atomicAdd() is only supported by devices of compute capability 8.x and higher. - */ - - atomicAdd(grad_reciprocal_tau, __hadd(__low2half(sdata[0]), __high2half(sdata[0]))); - + grad_v_last[index] = __hmul2(__hfma2(h2exp(__hmul2(__hsub2(v_v_seq[index], theta_rh_half2), reciprocal_delta_T_half2)), reciprocal_tau_half2, one_sub_reciprocal_tau_half2), grad_x_seq[index]); } } -// MultiStepParametricLIFNodePTT bptt ATan, hard_reset=True, dtype=fp16, detach_reset=False +// MultiStepEIFNodePTT bptt PiecewiseLeakyReLU, hard_reset=True, dtype=fp16, detach_reset=False #include extern "C" __global__ - void ParametricLIFNode_bptt_hardReset__fp16( + void EIFNode_bptt_hardReset__fp16( const half2* grad_spike_seq, const half2* grad_v_seq, const half2* h_seq, const half2* spike_seq, const half2* v_v_seq, - half2* grad_x_seq, half2* grad_v_last, half* grad_reciprocal_tau, + half2* grad_x_seq, half2* grad_v_last, + const half & theta_rh, const half & reciprocal_delta_T, const half & reciprocal_tau, const half & one_sub_reciprocal_tau, const half & v_threshold, const half & v_reset, const int & neuron_num, const int & numel) @@ -3041,86 +15779,49 @@ { const int index = blockIdx.x * blockDim.x + threadIdx.x; const int stride = neuron_num >> 1; - - __shared__ half2 sdata[1024]; if (index < stride) { const half2 reciprocal_tau_half2 = __half2half2(reciprocal_tau); const half2 one_sub_reciprocal_tau_half2 = __half2half2(one_sub_reciprocal_tau); + const half2 reciprocal_delta_T_half2 = __half2half2(reciprocal_delta_T); + const half2 theta_rh_half2 = __half2half2(theta_rh); const half2 v_threshold_half2 = __half2half2(v_threshold); const half2 v_reset_half2 = __half2half2(v_reset); - half2 grad_h = __float2half2_rn(0.0f); // grad_h will be used recursively - sdata[threadIdx.x] = __float2half2_rn(0.0f); for(int mem_offset = (numel >> 1) - stride; mem_offset >= 0; mem_offset -= stride) { const int t = index + mem_offset; const half2 over_th = __hsub2(h_seq[t], v_threshold_half2); - - // start: spikingjelly.clock_driven.surrogate.ATan.cuda_code + // start: 
spikingjelly.clock_driven.surrogate.PiecewiseLeakyReLU.cuda_code - const half2 sg_ATan_alpha = __float2half2_rn(2.0f); - const half2 sg_ATan_M_PI_2__alpha__x = __hmul2(__hmul2(__float2half2_rn((float) 1.57079632679489661923), sg_ATan_alpha), over_th); - const half2 grad_s_to_h = __h2div(__h2div(sg_ATan_alpha, __float2half2_rn(2.0f)), __hfma2(sg_ATan_M_PI_2__alpha__x, sg_ATan_M_PI_2__alpha__x, __float2half2_rn(1.0f))); + const half2 sg_PiecewiseLeakyReLU_x_abs = __habs2(over_th); + const half2 sg_PiecewiseLeakyReLU_x_abs_ge_w = __hge2(sg_PiecewiseLeakyReLU_x_abs, __float2half2_rn(1.0f)); + half2 grad_s_to_h = __hadd2(__hmul2(__float2half2_rn(0.01f), sg_PiecewiseLeakyReLU_x_abs_ge_w), __hmul2(__hsub2(__float2half2_rn(1.0f), sg_PiecewiseLeakyReLU_x_abs_ge_w), __float2half2_rn(1.0f))); - // end: spikingjelly.clock_driven.surrogate.ATan.cuda_code + // end: spikingjelly.clock_driven.surrogate.PiecewiseLeakyReLU.cuda_code const half2 grad_v_to_h = __hfma2(__hsub2(v_reset_half2, h_seq[t]), grad_s_to_h, __hsub2(__float2half2_rn(1.0f), spike_seq[t])); - - grad_h = __hfma2(__hfma2(grad_h, one_sub_reciprocal_tau_half2, grad_v_seq[t]), grad_v_to_h, __hmul2(grad_spike_seq[t], grad_s_to_h)); + + grad_h = __hfma2(__hfma2(__hfma2(h2exp(__hmul2(__hsub2(v_v_seq[t + stride], theta_rh_half2), reciprocal_delta_T_half2)), reciprocal_tau_half2, one_sub_reciprocal_tau_half2), grad_h, grad_v_seq[t]), grad_v_to_h, __hmul2(grad_spike_seq[t], grad_s_to_h)); grad_x_seq[t] = __hmul2(grad_h, reciprocal_tau_half2); - sdata[threadIdx.x] = __hadd2(__h2div(__hmul2(grad_h, __hsub2(h_seq[t], v_v_seq[t])), reciprocal_tau_half2), sdata[threadIdx.x]); } - - grad_v_last[index] = __hmul2(grad_x_seq[index], one_sub_reciprocal_tau_half2); - } - else - { - sdata[threadIdx.x] = __float2half2_rn(0.0f); - } - int threadx = blockDim.x; - #pragma unroll - for (int stride = threadx >> 1; stride > 0; stride = stride >> 1) - { - // Synchronize all thread before next loop - __syncthreads(); - if (threadIdx.x < stride) - { - sdata[threadIdx.x] = __hadd2(sdata[threadIdx.x], sdata[threadIdx.x + stride]); - } - } - __syncthreads(); - if (threadIdx.x == 0) - { - //grad_reciprocal_tau[0] = __hadd(__low2half(sdata[0]), __high2half(sdata[0])); - - /* - The 32-bit floating-point version of atomicAdd() is only supported by devices of compute capability 2.x and higher. - - The 64-bit floating-point version of atomicAdd() is only supported by devices of compute capability 6.x and higher. - - The 32-bit __half2 floating-point version of atomicAdd() is only supported by devices of compute capability 6.x and higher. The atomicity of the __half2 or __nv_bfloat162 add operation is guaranteed separately for each of the two __half or __nv_bfloat16 elements; the entire __half2 or __nv_bfloat162 is not guaranteed to be atomic as a single 32-bit access. - - The 16-bit __half floating-point version of atomicAdd() is only supported by devices of compute capability 7.x and higher. - - The 16-bit __nv_bfloat16 floating-point version of atomicAdd() is only supported by devices of compute capability 8.x and higher. 
- */ - - atomicAdd(grad_reciprocal_tau, __hadd(__low2half(sdata[0]), __high2half(sdata[0]))); - + grad_v_last[index] = __hmul2(__hfma2(h2exp(__hmul2(__hsub2(v_v_seq[index], theta_rh_half2), reciprocal_delta_T_half2)), reciprocal_tau_half2, one_sub_reciprocal_tau_half2), grad_x_seq[index]); } } -// MultiStepParametricLIFNodePTT fptt ATan, hard_reset=False, dtype=fp32 +// MultiStepEIFNodePTT fptt PiecewiseLeakyReLU, hard_reset=False, dtype=fp32 extern "C" __global__ - void ParametricLIFNode_fptt_softReset_fp32(const float* x_seq, float* v_v_seq, float* h_seq, float* spike_seq, + void EIFNode_fptt_softReset_fp32(const float* x_seq, float* v_v_seq, float* h_seq, float* spike_seq, const float & reciprocal_tau, - const float & v_threshold, + const float & delta_T, + const float & theta_rh, + const float & v_threshold, + const float & v_rest, const int & neuron_num, const int & numel) { @@ -3131,14 +15832,14 @@ for(int mem_offset = 0; mem_offset < numel; mem_offset += neuron_num) { const int t = index + mem_offset; - - h_seq[t] = v_v_seq[t] + reciprocal_tau * (x_seq[t] - v_v_seq[t]); + h_seq[t] = v_v_seq[t] + reciprocal_tau * (x_seq[t] - v_v_seq[t] + v_rest + delta_T * expf((v_v_seq[t] - theta_rh) / delta_T)); if (h_seq[t] >= v_threshold) { spike_seq[t] = 1.0f; + v_v_seq[t + dt] = h_seq[t] - v_threshold; - } + } else { spike_seq[t] = 0.0f; @@ -3149,134 +15850,106 @@ } } -// MultiStepParametricLIFNodePTT bptt ATan, hard_reset=False, dtype=fp32, detach_reset=True +// MultiStepEIFNodePTT bptt PiecewiseLeakyReLU, hard_reset=False, dtype=fp32, detach_reset=True extern "C" __global__ - void ParametricLIFNode_bptt_softReset_detachReset_fp32( + void EIFNode_bptt_softReset_detachReset_fp32( const float* grad_spike_seq, const float* grad_v_seq, const float* h_seq, const float* spike_seq, const float* v_v_seq, - float* grad_x_seq, float* grad_v_last, float* grad_reciprocal_tau, + float* grad_x_seq, float* grad_v_last, + const float & theta_rh, const float & reciprocal_delta_T, const float & reciprocal_tau, const float & one_sub_reciprocal_tau, const float & v_threshold, const int & neuron_num, const int & numel) { const int index = blockIdx.x * blockDim.x + threadIdx.x; - __shared__ float sdata[1024]; if (index < neuron_num) { float grad_h = 0.0f; // grad_h will be used recursively - sdata[threadIdx.x] = 0.0f; for(int mem_offset = numel - neuron_num; mem_offset >= 0; mem_offset -= neuron_num) { const int t = index + mem_offset; const float over_th = h_seq[t] - v_threshold; - // start: spikingjelly.clock_driven.surrogate.ATan.cuda_code + // start: spikingjelly.clock_driven.surrogate.PiecewiseLeakyReLU.cuda_code - const float sg_ATan_M_PI_2__alpha__x = ((float) 1.57079632679489661923) * 2.0f * over_th; - const float grad_s_to_h = 2.0f / 2.0f / (1.0f + sg_ATan_M_PI_2__alpha__x * sg_ATan_M_PI_2__alpha__x); + const float sg_PiecewiseLeakyReLU_x_abs = fabsf(over_th); + float grad_s_to_h; + if (sg_PiecewiseLeakyReLU_x_abs > 1.0f) + { + grad_s_to_h = 0.01f; + } + else + { + grad_s_to_h = 1.0f; + } - // end: spikingjelly.clock_driven.surrogate.ATan.cuda_code + // end: spikingjelly.clock_driven.surrogate.PiecewiseLeakyReLU.cuda_code const float grad_v_to_h = 1.0f; - grad_h = grad_spike_seq[t] * grad_s_to_h + (grad_v_seq[t] + grad_h * one_sub_reciprocal_tau) * grad_v_to_h; - // grad_h = fmaf(grad_spike_seq[t], grad_s_to_h, fmaf(grad_h, one_sub_reciprocal_tau, grad_v_seq[t]) * grad_v_to_h); + grad_h = grad_spike_seq[t] * grad_s_to_h + (grad_v_seq[t] + grad_h * (one_sub_reciprocal_tau + reciprocal_tau * expf((v_v_seq[t 
+ neuron_num] - theta_rh) * reciprocal_delta_T))) * grad_v_to_h; grad_x_seq[t] = grad_h * reciprocal_tau; - sdata[threadIdx.x] += grad_h * (h_seq[t] - v_v_seq[t]) / reciprocal_tau; } - grad_v_last[index] = grad_x_seq[index] * one_sub_reciprocal_tau; - } - else - { - sdata[threadIdx.x] = 0.0f; - } - int threadx = blockDim.x; - #pragma unroll - for (int stride = threadx >> 1; stride > 0; stride = stride >> 1) - { - // Synchronize all thread before next loop - __syncthreads(); - if (threadIdx.x < stride) - { - sdata[threadIdx.x] += sdata[threadIdx.x + stride]; - } - } - __syncthreads(); - if (threadIdx.x == 0) - { - atomicAdd(grad_reciprocal_tau, sdata[0]); + grad_v_last[index] = grad_x_seq[index] * (one_sub_reciprocal_tau + reciprocal_tau * expf((v_v_seq[index] - theta_rh) * reciprocal_delta_T)); } } -// MultiStepParametricLIFNodePTT bptt ATan, hard_reset=False, dtype=fp32, detach_reset=False +// MultiStepEIFNodePTT bptt PiecewiseLeakyReLU, hard_reset=False, dtype=fp32, detach_reset=False extern "C" __global__ - void ParametricLIFNode_bptt_softReset__fp32( + void EIFNode_bptt_softReset__fp32( const float* grad_spike_seq, const float* grad_v_seq, const float* h_seq, const float* spike_seq, const float* v_v_seq, - float* grad_x_seq, float* grad_v_last, float* grad_reciprocal_tau, + float* grad_x_seq, float* grad_v_last, + const float & theta_rh, const float & reciprocal_delta_T, const float & reciprocal_tau, const float & one_sub_reciprocal_tau, const float & v_threshold, const int & neuron_num, const int & numel) { const int index = blockIdx.x * blockDim.x + threadIdx.x; - __shared__ float sdata[1024]; if (index < neuron_num) { float grad_h = 0.0f; // grad_h will be used recursively - sdata[threadIdx.x] = 0.0f; for(int mem_offset = numel - neuron_num; mem_offset >= 0; mem_offset -= neuron_num) { const int t = index + mem_offset; const float over_th = h_seq[t] - v_threshold; - // start: spikingjelly.clock_driven.surrogate.ATan.cuda_code + // start: spikingjelly.clock_driven.surrogate.PiecewiseLeakyReLU.cuda_code - const float sg_ATan_M_PI_2__alpha__x = ((float) 1.57079632679489661923) * 2.0f * over_th; - const float grad_s_to_h = 2.0f / 2.0f / (1.0f + sg_ATan_M_PI_2__alpha__x * sg_ATan_M_PI_2__alpha__x); + const float sg_PiecewiseLeakyReLU_x_abs = fabsf(over_th); + float grad_s_to_h; + if (sg_PiecewiseLeakyReLU_x_abs > 1.0f) + { + grad_s_to_h = 0.01f; + } + else + { + grad_s_to_h = 1.0f; + } - // end: spikingjelly.clock_driven.surrogate.ATan.cuda_code + // end: spikingjelly.clock_driven.surrogate.PiecewiseLeakyReLU.cuda_code const float grad_v_to_h = 1.0f - v_threshold * grad_s_to_h; - // const float grad_v_to_h = fmaf(-v_threshold, grad_s_to_h, 1.0f); - grad_h = grad_spike_seq[t] * grad_s_to_h + (grad_v_seq[t] + grad_h * one_sub_reciprocal_tau) * grad_v_to_h; - // grad_h = fmaf(grad_spike_seq[t], grad_s_to_h, fmaf(grad_h, one_sub_reciprocal_tau, grad_v_seq[t]) * grad_v_to_h); + grad_h = grad_spike_seq[t] * grad_s_to_h + (grad_v_seq[t] + grad_h * (one_sub_reciprocal_tau + reciprocal_tau * expf((v_v_seq[t + neuron_num] - theta_rh) * reciprocal_delta_T))) * grad_v_to_h; grad_x_seq[t] = grad_h * reciprocal_tau; - sdata[threadIdx.x] += grad_h * (h_seq[t] - v_v_seq[t]) / reciprocal_tau; } - grad_v_last[index] = grad_x_seq[index] * one_sub_reciprocal_tau; - } - else - { - sdata[threadIdx.x] = 0.0f; - } - int threadx = blockDim.x; - #pragma unroll - for (int stride = threadx >> 1; stride > 0; stride = stride >> 1) - { - // Synchronize all thread before next loop - __syncthreads(); - if (threadIdx.x < 
stride) - { - sdata[threadIdx.x] += sdata[threadIdx.x + stride]; - } - } - __syncthreads(); - if (threadIdx.x == 0) - { - atomicAdd(grad_reciprocal_tau, sdata[0]); + grad_v_last[index] = grad_x_seq[index] * (one_sub_reciprocal_tau + reciprocal_tau * expf((v_v_seq[index] - theta_rh) * reciprocal_delta_T)); } } -// MultiStepParametricLIFNodePTT fptt ATan, hard_reset=False, dtype=fp16 +// MultiStepEIFNodePTT fptt PiecewiseLeakyReLU, hard_reset=False, dtype=fp16 #include extern "C" __global__ - void ParametricLIFNode_fptt_softReset_fp16(const half2* x_seq, half2* v_v_seq, half2* h_seq, half2* spike_seq, + void EIFNode_fptt_softReset_fp16(const half2* x_seq, half2* v_v_seq, half2* h_seq, half2* spike_seq, const half & reciprocal_tau, - const half & v_threshold, + const half & delta_T, + const half & theta_rh, + const half & v_threshold, + const half & v_rest, const int & neuron_num, const int & numel) { @@ -3286,27 +15959,32 @@ { const int numel_2 = numel >> 1; const half2 reciprocal_tau_half2 = __half2half2(reciprocal_tau); + const half2 delta_T_half2 = __half2half2(delta_T); + const half2 theta_rh_half2 = __half2half2(theta_rh); const half2 v_threshold_half2 = __half2half2(v_threshold); + const half2 v_rest_half2 = __half2half2(v_rest); for(int mem_offset = 0; mem_offset < numel_2; mem_offset += stride) { const int t = index + mem_offset; - - h_seq[t] = __hfma2(__hsub2(x_seq[t], v_v_seq[t]), reciprocal_tau_half2, v_v_seq[t]); + h_seq[t] = __hfma2(__hfma2(h2exp(__h2div(__hsub2(v_v_seq[t], theta_rh_half2), delta_T_half2)), delta_T_half2, __hadd2(__hsub2(x_seq[t], v_v_seq[t]), v_rest_half2)), reciprocal_tau_half2, v_v_seq[t]); + spike_seq[t] = __hgeu2(h_seq[t], v_threshold_half2); + v_v_seq[t + stride] = __hadd2(__hmul2(spike_seq[t], __hsub2(h_seq[t], v_threshold_half2)), __hmul2(__hsub2(__float2half2_rn(1.0f), spike_seq[t]), h_seq[t])); } } } -// MultiStepParametricLIFNodePTT bptt ATan, hard_reset=False, dtype=fp16, detach_reset=True +// MultiStepEIFNodePTT bptt PiecewiseLeakyReLU, hard_reset=False, dtype=fp16, detach_reset=True #include extern "C" __global__ - void ParametricLIFNode_bptt_softReset_detachReset_fp16( + void EIFNode_bptt_softReset_detachReset_fp16( const half2* grad_spike_seq, const half2* grad_v_seq, const half2* h_seq, const half2* spike_seq, const half2* v_v_seq, - half2* grad_x_seq, half2* grad_v_last, half* grad_reciprocal_tau, + half2* grad_x_seq, half2* grad_v_last, + const half & theta_rh, const half & reciprocal_delta_T, const half & reciprocal_tau, const half & one_sub_reciprocal_tau, const half & v_threshold, const int & neuron_num, const int & numel) @@ -3314,85 +15992,46 @@ { const int index = blockIdx.x * blockDim.x + threadIdx.x; const int stride = neuron_num >> 1; - - __shared__ half2 sdata[1024]; if (index < stride) { const half2 reciprocal_tau_half2 = __half2half2(reciprocal_tau); const half2 one_sub_reciprocal_tau_half2 = __half2half2(one_sub_reciprocal_tau); + const half2 reciprocal_delta_T_half2 = __half2half2(reciprocal_delta_T); + const half2 theta_rh_half2 = __half2half2(theta_rh); const half2 v_threshold_half2 = __half2half2(v_threshold); - half2 grad_h = __float2half2_rn(0.0f); // grad_h will be used recursively - sdata[threadIdx.x] = __float2half2_rn(0.0f); for(int mem_offset = (numel >> 1) - stride; mem_offset >= 0; mem_offset -= stride) { const int t = index + mem_offset; const half2 over_th = __hsub2(h_seq[t], v_threshold_half2); - - // start: spikingjelly.clock_driven.surrogate.ATan.cuda_code + // start: 
spikingjelly.clock_driven.surrogate.PiecewiseLeakyReLU.cuda_code - const half2 sg_ATan_alpha = __float2half2_rn(2.0f); - const half2 sg_ATan_M_PI_2__alpha__x = __hmul2(__hmul2(__float2half2_rn((float) 1.57079632679489661923), sg_ATan_alpha), over_th); - const half2 grad_s_to_h = __h2div(__h2div(sg_ATan_alpha, __float2half2_rn(2.0f)), __hfma2(sg_ATan_M_PI_2__alpha__x, sg_ATan_M_PI_2__alpha__x, __float2half2_rn(1.0f))); + const half2 sg_PiecewiseLeakyReLU_x_abs = __habs2(over_th); + const half2 sg_PiecewiseLeakyReLU_x_abs_ge_w = __hge2(sg_PiecewiseLeakyReLU_x_abs, __float2half2_rn(1.0f)); + half2 grad_s_to_h = __hadd2(__hmul2(__float2half2_rn(0.01f), sg_PiecewiseLeakyReLU_x_abs_ge_w), __hmul2(__hsub2(__float2half2_rn(1.0f), sg_PiecewiseLeakyReLU_x_abs_ge_w), __float2half2_rn(1.0f))); - // end: spikingjelly.clock_driven.surrogate.ATan.cuda_code + // end: spikingjelly.clock_driven.surrogate.PiecewiseLeakyReLU.cuda_code const half2 grad_v_to_h = __float2half2_rn(1.0f); - - grad_h = __hfma2(__hfma2(grad_h, one_sub_reciprocal_tau_half2, grad_v_seq[t]), grad_v_to_h, __hmul2(grad_spike_seq[t], grad_s_to_h)); + + grad_h = __hfma2(__hfma2(__hfma2(h2exp(__hmul2(__hsub2(v_v_seq[t + stride], theta_rh_half2), reciprocal_delta_T_half2)), reciprocal_tau_half2, one_sub_reciprocal_tau_half2), grad_h, grad_v_seq[t]), grad_v_to_h, __hmul2(grad_spike_seq[t], grad_s_to_h)); grad_x_seq[t] = __hmul2(grad_h, reciprocal_tau_half2); - sdata[threadIdx.x] = __hadd2(__h2div(__hmul2(grad_h, __hsub2(h_seq[t], v_v_seq[t])), reciprocal_tau_half2), sdata[threadIdx.x]); } - - grad_v_last[index] = __hmul2(grad_x_seq[index], one_sub_reciprocal_tau_half2); - } - else - { - sdata[threadIdx.x] = __float2half2_rn(0.0f); - } - int threadx = blockDim.x; - #pragma unroll - for (int stride = threadx >> 1; stride > 0; stride = stride >> 1) - { - // Synchronize all thread before next loop - __syncthreads(); - if (threadIdx.x < stride) - { - sdata[threadIdx.x] = __hadd2(sdata[threadIdx.x], sdata[threadIdx.x + stride]); - } - } - __syncthreads(); - if (threadIdx.x == 0) - { - //grad_reciprocal_tau[0] = __hadd(__low2half(sdata[0]), __high2half(sdata[0])); - - /* - The 32-bit floating-point version of atomicAdd() is only supported by devices of compute capability 2.x and higher. - - The 64-bit floating-point version of atomicAdd() is only supported by devices of compute capability 6.x and higher. - - The 32-bit __half2 floating-point version of atomicAdd() is only supported by devices of compute capability 6.x and higher. The atomicity of the __half2 or __nv_bfloat162 add operation is guaranteed separately for each of the two __half or __nv_bfloat16 elements; the entire __half2 or __nv_bfloat162 is not guaranteed to be atomic as a single 32-bit access. - - The 16-bit __half floating-point version of atomicAdd() is only supported by devices of compute capability 7.x and higher. - - The 16-bit __nv_bfloat16 floating-point version of atomicAdd() is only supported by devices of compute capability 8.x and higher. 
- */ - - atomicAdd(grad_reciprocal_tau, __hadd(__low2half(sdata[0]), __high2half(sdata[0]))); - + grad_v_last[index] = __hmul2(__hfma2(h2exp(__hmul2(__hsub2(v_v_seq[index], theta_rh_half2), reciprocal_delta_T_half2)), reciprocal_tau_half2, one_sub_reciprocal_tau_half2), grad_x_seq[index]); } } -// MultiStepParametricLIFNodePTT bptt ATan, hard_reset=False, dtype=fp16, detach_reset=False +// MultiStepEIFNodePTT bptt PiecewiseLeakyReLU, hard_reset=False, dtype=fp16, detach_reset=False #include extern "C" __global__ - void ParametricLIFNode_bptt_softReset__fp16( + void EIFNode_bptt_softReset__fp16( const half2* grad_spike_seq, const half2* grad_v_seq, const half2* h_seq, const half2* spike_seq, const half2* v_v_seq, - half2* grad_x_seq, half2* grad_v_last, half* grad_reciprocal_tau, + half2* grad_x_seq, half2* grad_v_last, + const half & theta_rh, const half & reciprocal_delta_T, const half & reciprocal_tau, const half & one_sub_reciprocal_tau, const half & v_threshold, const int & neuron_num, const int & numel) @@ -3400,84 +16039,47 @@ { const int index = blockIdx.x * blockDim.x + threadIdx.x; const int stride = neuron_num >> 1; - - __shared__ half2 sdata[1024]; if (index < stride) { const half2 reciprocal_tau_half2 = __half2half2(reciprocal_tau); const half2 one_sub_reciprocal_tau_half2 = __half2half2(one_sub_reciprocal_tau); + const half2 reciprocal_delta_T_half2 = __half2half2(reciprocal_delta_T); + const half2 theta_rh_half2 = __half2half2(theta_rh); const half2 v_threshold_half2 = __half2half2(v_threshold); - half2 grad_h = __float2half2_rn(0.0f); // grad_h will be used recursively - sdata[threadIdx.x] = __float2half2_rn(0.0f); for(int mem_offset = (numel >> 1) - stride; mem_offset >= 0; mem_offset -= stride) { const int t = index + mem_offset; const half2 over_th = __hsub2(h_seq[t], v_threshold_half2); - - // start: spikingjelly.clock_driven.surrogate.ATan.cuda_code + // start: spikingjelly.clock_driven.surrogate.PiecewiseLeakyReLU.cuda_code - const half2 sg_ATan_alpha = __float2half2_rn(2.0f); - const half2 sg_ATan_M_PI_2__alpha__x = __hmul2(__hmul2(__float2half2_rn((float) 1.57079632679489661923), sg_ATan_alpha), over_th); - const half2 grad_s_to_h = __h2div(__h2div(sg_ATan_alpha, __float2half2_rn(2.0f)), __hfma2(sg_ATan_M_PI_2__alpha__x, sg_ATan_M_PI_2__alpha__x, __float2half2_rn(1.0f))); + const half2 sg_PiecewiseLeakyReLU_x_abs = __habs2(over_th); + const half2 sg_PiecewiseLeakyReLU_x_abs_ge_w = __hge2(sg_PiecewiseLeakyReLU_x_abs, __float2half2_rn(1.0f)); + half2 grad_s_to_h = __hadd2(__hmul2(__float2half2_rn(0.01f), sg_PiecewiseLeakyReLU_x_abs_ge_w), __hmul2(__hsub2(__float2half2_rn(1.0f), sg_PiecewiseLeakyReLU_x_abs_ge_w), __float2half2_rn(1.0f))); - // end: spikingjelly.clock_driven.surrogate.ATan.cuda_code + // end: spikingjelly.clock_driven.surrogate.PiecewiseLeakyReLU.cuda_code const half2 grad_v_to_h = __hsub2(__float2half2_rn(1.0f), __hmul2(v_threshold_half2, grad_s_to_h)); - - grad_h = __hfma2(__hfma2(grad_h, one_sub_reciprocal_tau_half2, grad_v_seq[t]), grad_v_to_h, __hmul2(grad_spike_seq[t], grad_s_to_h)); + + grad_h = __hfma2(__hfma2(__hfma2(h2exp(__hmul2(__hsub2(v_v_seq[t + stride], theta_rh_half2), reciprocal_delta_T_half2)), reciprocal_tau_half2, one_sub_reciprocal_tau_half2), grad_h, grad_v_seq[t]), grad_v_to_h, __hmul2(grad_spike_seq[t], grad_s_to_h)); grad_x_seq[t] = __hmul2(grad_h, reciprocal_tau_half2); - sdata[threadIdx.x] = __hadd2(__h2div(__hmul2(grad_h, __hsub2(h_seq[t], v_v_seq[t])), reciprocal_tau_half2), sdata[threadIdx.x]); } - - grad_v_last[index] = 
__hmul2(grad_x_seq[index], one_sub_reciprocal_tau_half2); - } - else - { - sdata[threadIdx.x] = __float2half2_rn(0.0f); - } - int threadx = blockDim.x; - #pragma unroll - for (int stride = threadx >> 1; stride > 0; stride = stride >> 1) - { - // Synchronize all thread before next loop - __syncthreads(); - if (threadIdx.x < stride) - { - sdata[threadIdx.x] = __hadd2(sdata[threadIdx.x], sdata[threadIdx.x + stride]); - } - } - __syncthreads(); - if (threadIdx.x == 0) - { - //grad_reciprocal_tau[0] = __hadd(__low2half(sdata[0]), __high2half(sdata[0])); - - /* - The 32-bit floating-point version of atomicAdd() is only supported by devices of compute capability 2.x and higher. - - The 64-bit floating-point version of atomicAdd() is only supported by devices of compute capability 6.x and higher. - - The 32-bit __half2 floating-point version of atomicAdd() is only supported by devices of compute capability 6.x and higher. The atomicity of the __half2 or __nv_bfloat162 add operation is guaranteed separately for each of the two __half or __nv_bfloat16 elements; the entire __half2 or __nv_bfloat162 is not guaranteed to be atomic as a single 32-bit access. - - The 16-bit __half floating-point version of atomicAdd() is only supported by devices of compute capability 7.x and higher. - - The 16-bit __nv_bfloat16 floating-point version of atomicAdd() is only supported by devices of compute capability 8.x and higher. - */ - - atomicAdd(grad_reciprocal_tau, __hadd(__low2half(sdata[0]), __high2half(sdata[0]))); - + grad_v_last[index] = __hmul2(__hfma2(h2exp(__hmul2(__hsub2(v_v_seq[index], theta_rh_half2), reciprocal_delta_T_half2)), reciprocal_tau_half2, one_sub_reciprocal_tau_half2), grad_x_seq[index]); } } -// MultiStepParametricLIFNodePTT fptt Sigmoid, hard_reset=True, dtype=fp32 +// MultiStepEIFNodePTT fptt S2NN, hard_reset=True, dtype=fp32 extern "C" __global__ - void ParametricLIFNode_fptt_hardReset_fp32(const float* x_seq, float* v_v_seq, float* h_seq, float* spike_seq, + void EIFNode_fptt_hardReset_fp32(const float* x_seq, float* v_v_seq, float* h_seq, float* spike_seq, const float & reciprocal_tau, - const float & v_threshold, const float & v_reset, + const float & delta_T, + const float & theta_rh, + const float & v_threshold, + const float & v_rest, const float & v_reset, const int & neuron_num, const int & numel) { @@ -3488,14 +16090,14 @@ for(int mem_offset = 0; mem_offset < numel; mem_offset += neuron_num) { const int t = index + mem_offset; - - h_seq[t] = v_v_seq[t] + reciprocal_tau * (x_seq[t] - v_v_seq[t] + v_reset); + h_seq[t] = v_v_seq[t] + reciprocal_tau * (x_seq[t] - v_v_seq[t] + v_rest + delta_T * expf((v_v_seq[t] - theta_rh) / delta_T)); if (h_seq[t] >= v_threshold) { spike_seq[t] = 1.0f; + v_v_seq[t + dt] = v_reset; - } + } else { spike_seq[t] = 0.0f; @@ -3506,134 +16108,92 @@ } } -// MultiStepParametricLIFNodePTT bptt Sigmoid, hard_reset=True, dtype=fp32, detach_reset=True +// MultiStepEIFNodePTT bptt S2NN, hard_reset=True, dtype=fp32, detach_reset=True extern "C" __global__ - void ParametricLIFNode_bptt_hardReset_detachReset_fp32( + void EIFNode_bptt_hardReset_detachReset_fp32( const float* grad_spike_seq, const float* grad_v_seq, const float* h_seq, const float* spike_seq, const float* v_v_seq, - float* grad_x_seq, float* grad_v_last, float* grad_reciprocal_tau, + float* grad_x_seq, float* grad_v_last, + const float & theta_rh, const float & reciprocal_delta_T, const float & reciprocal_tau, const float & one_sub_reciprocal_tau, const float & v_threshold, const float & v_reset, 
const int & neuron_num, const int & numel) { const int index = blockIdx.x * blockDim.x + threadIdx.x; - __shared__ float sdata[1024]; if (index < neuron_num) { float grad_h = 0.0f; // grad_h will be used recursively - sdata[threadIdx.x] = 0.0f; for(int mem_offset = numel - neuron_num; mem_offset >= 0; mem_offset -= neuron_num) { const int t = index + mem_offset; const float over_th = h_seq[t] - v_threshold; - // start: spikingjelly.clock_driven.surrogate.Sigmoid.cuda_code + // start: spikingjelly.clock_driven.surrogate.S2NN.cuda_code - const float sg_Sigmoid_sigmoid_ax = 1.0f / (1.0f + expf(- 1.0f * over_th)); - const float grad_s_to_h = (1.0f - sg_Sigmoid_sigmoid_ax) * sg_Sigmoid_sigmoid_ax * 1.0f; + const float sg_S2NN_sigmoid_ax = 1.0f / (1.0f + expf(- 4.0f * over_th)); + const float sg_S2NN_mask_l = (float)(over_th < 0.0f); + const float grad_s_to_h = (1.0f - sg_S2NN_sigmoid_ax) * sg_S2NN_sigmoid_ax * 4.0f * sg_S2NN_mask_l + 1.0f / (over_th + 1.0f) * (1.0f - sg_S2NN_mask_l); - // end: spikingjelly.clock_driven.surrogate.Sigmoid.cuda_code + // end: spikingjelly.clock_driven.surrogate.S2NN.cuda_code const float grad_v_to_h = 1.0f - spike_seq[t]; - grad_h = grad_spike_seq[t] * grad_s_to_h + (grad_v_seq[t] + grad_h * one_sub_reciprocal_tau) * grad_v_to_h; - // grad_h = fmaf(grad_spike_seq[t], grad_s_to_h, fmaf(grad_h, one_sub_reciprocal_tau, grad_v_seq[t]) * grad_v_to_h); + grad_h = grad_spike_seq[t] * grad_s_to_h + (grad_v_seq[t] + grad_h * (one_sub_reciprocal_tau + reciprocal_tau * expf((v_v_seq[t + neuron_num] - theta_rh) * reciprocal_delta_T))) * grad_v_to_h; grad_x_seq[t] = grad_h * reciprocal_tau; - sdata[threadIdx.x] += grad_h * (h_seq[t] - v_v_seq[t]) / reciprocal_tau; } - grad_v_last[index] = grad_x_seq[index] * one_sub_reciprocal_tau; - } - else - { - sdata[threadIdx.x] = 0.0f; - } - int threadx = blockDim.x; - #pragma unroll - for (int stride = threadx >> 1; stride > 0; stride = stride >> 1) - { - // Synchronize all thread before next loop - __syncthreads(); - if (threadIdx.x < stride) - { - sdata[threadIdx.x] += sdata[threadIdx.x + stride]; - } - } - __syncthreads(); - if (threadIdx.x == 0) - { - atomicAdd(grad_reciprocal_tau, sdata[0]); + grad_v_last[index] = grad_x_seq[index] * (one_sub_reciprocal_tau + reciprocal_tau * expf((v_v_seq[index] - theta_rh) * reciprocal_delta_T)); } } -// MultiStepParametricLIFNodePTT bptt Sigmoid, hard_reset=True, dtype=fp32, detach_reset=False +// MultiStepEIFNodePTT bptt S2NN, hard_reset=True, dtype=fp32, detach_reset=False extern "C" __global__ - void ParametricLIFNode_bptt_hardReset__fp32( + void EIFNode_bptt_hardReset__fp32( const float* grad_spike_seq, const float* grad_v_seq, const float* h_seq, const float* spike_seq, const float* v_v_seq, - float* grad_x_seq, float* grad_v_last, float* grad_reciprocal_tau, + float* grad_x_seq, float* grad_v_last, + const float & theta_rh, const float & reciprocal_delta_T, const float & reciprocal_tau, const float & one_sub_reciprocal_tau, const float & v_threshold, const float & v_reset, const int & neuron_num, const int & numel) { const int index = blockIdx.x * blockDim.x + threadIdx.x; - __shared__ float sdata[1024]; if (index < neuron_num) { float grad_h = 0.0f; // grad_h will be used recursively - sdata[threadIdx.x] = 0.0f; for(int mem_offset = numel - neuron_num; mem_offset >= 0; mem_offset -= neuron_num) { const int t = index + mem_offset; const float over_th = h_seq[t] - v_threshold; - // start: spikingjelly.clock_driven.surrogate.Sigmoid.cuda_code + // start: 
spikingjelly.clock_driven.surrogate.S2NN.cuda_code - const float sg_Sigmoid_sigmoid_ax = 1.0f / (1.0f + expf(- 1.0f * over_th)); - const float grad_s_to_h = (1.0f - sg_Sigmoid_sigmoid_ax) * sg_Sigmoid_sigmoid_ax * 1.0f; + const float sg_S2NN_sigmoid_ax = 1.0f / (1.0f + expf(- 4.0f * over_th)); + const float sg_S2NN_mask_l = (float)(over_th < 0.0f); + const float grad_s_to_h = (1.0f - sg_S2NN_sigmoid_ax) * sg_S2NN_sigmoid_ax * 4.0f * sg_S2NN_mask_l + 1.0f / (over_th + 1.0f) * (1.0f - sg_S2NN_mask_l); - // end: spikingjelly.clock_driven.surrogate.Sigmoid.cuda_code + // end: spikingjelly.clock_driven.surrogate.S2NN.cuda_code const float grad_v_to_h = 1.0f - spike_seq[t] + (v_reset - h_seq[t]) * grad_s_to_h; - // const float grad_v_to_h = fmaf(v_reset - h_seq[t], grad_s_to_h, 1.0f - spike_seq[t]); - grad_h = grad_spike_seq[t] * grad_s_to_h + (grad_v_seq[t] + grad_h * one_sub_reciprocal_tau) * grad_v_to_h; - // grad_h = fmaf(grad_spike_seq[t], grad_s_to_h, fmaf(grad_h, one_sub_reciprocal_tau, grad_v_seq[t]) * grad_v_to_h); + grad_h = grad_spike_seq[t] * grad_s_to_h + (grad_v_seq[t] + grad_h * (one_sub_reciprocal_tau + reciprocal_tau * expf((v_v_seq[t + neuron_num] - theta_rh) * reciprocal_delta_T))) * grad_v_to_h; grad_x_seq[t] = grad_h * reciprocal_tau; - sdata[threadIdx.x] += grad_h * (h_seq[t] - v_v_seq[t]) / reciprocal_tau; } - grad_v_last[index] = grad_x_seq[index] * one_sub_reciprocal_tau; - } - else - { - sdata[threadIdx.x] = 0.0f; - } - int threadx = blockDim.x; - #pragma unroll - for (int stride = threadx >> 1; stride > 0; stride = stride >> 1) - { - // Synchronize all thread before next loop - __syncthreads(); - if (threadIdx.x < stride) - { - sdata[threadIdx.x] += sdata[threadIdx.x + stride]; - } - } - __syncthreads(); - if (threadIdx.x == 0) - { - atomicAdd(grad_reciprocal_tau, sdata[0]); + grad_v_last[index] = grad_x_seq[index] * (one_sub_reciprocal_tau + reciprocal_tau * expf((v_v_seq[index] - theta_rh) * reciprocal_delta_T)); } } -// MultiStepParametricLIFNodePTT fptt Sigmoid, hard_reset=True, dtype=fp16 +// MultiStepEIFNodePTT fptt S2NN, hard_reset=True, dtype=fp16 #include extern "C" __global__ - void ParametricLIFNode_fptt_hardReset_fp16(const half2* x_seq, half2* v_v_seq, half2* h_seq, half2* spike_seq, + void EIFNode_fptt_hardReset_fp16(const half2* x_seq, half2* v_v_seq, half2* h_seq, half2* spike_seq, const half & reciprocal_tau, - const half & v_threshold, const half & v_reset, + const half & delta_T, + const half & theta_rh, + const half & v_threshold, + const half & v_rest, const half & v_reset, const int & neuron_num, const int & numel) { @@ -3643,29 +16203,34 @@ { const int numel_2 = numel >> 1; const half2 reciprocal_tau_half2 = __half2half2(reciprocal_tau); + const half2 delta_T_half2 = __half2half2(delta_T); + const half2 theta_rh_half2 = __half2half2(theta_rh); const half2 v_threshold_half2 = __half2half2(v_threshold); + const half2 v_rest_half2 = __half2half2(v_rest); const half2 v_reset_half2 = __half2half2(v_reset); for(int mem_offset = 0; mem_offset < numel_2; mem_offset += stride) { const int t = index + mem_offset; - - h_seq[t] = __hfma2(__hadd2(__hsub2(x_seq[t], v_v_seq[t]), v_reset_half2), reciprocal_tau_half2, v_v_seq[t]); + h_seq[t] = __hfma2(__hfma2(h2exp(__h2div(__hsub2(v_v_seq[t], theta_rh_half2), delta_T_half2)), delta_T_half2, __hadd2(__hsub2(x_seq[t], v_v_seq[t]), v_rest_half2)), reciprocal_tau_half2, v_v_seq[t]); + spike_seq[t] = __hgeu2(h_seq[t], v_threshold_half2); + v_v_seq[t + stride] = __hadd2(__hmul2(spike_seq[t], v_reset_half2), 
__hmul2(__hsub2(__float2half2_rn(1.0f), spike_seq[t]), h_seq[t])); } } } -// MultiStepParametricLIFNodePTT bptt Sigmoid, hard_reset=True, dtype=fp16, detach_reset=True +// MultiStepEIFNodePTT bptt S2NN, hard_reset=True, dtype=fp16, detach_reset=True #include extern "C" __global__ - void ParametricLIFNode_bptt_hardReset_detachReset_fp16( + void EIFNode_bptt_hardReset_detachReset_fp16( const half2* grad_spike_seq, const half2* grad_v_seq, const half2* h_seq, const half2* spike_seq, const half2* v_v_seq, - half2* grad_x_seq, half2* grad_v_last, half* grad_reciprocal_tau, + half2* grad_x_seq, half2* grad_v_last, + const half & theta_rh, const half & reciprocal_delta_T, const half & reciprocal_tau, const half & one_sub_reciprocal_tau, const half & v_threshold, const half & v_reset, const int & neuron_num, const int & numel) @@ -3673,87 +16238,49 @@ { const int index = blockIdx.x * blockDim.x + threadIdx.x; const int stride = neuron_num >> 1; - - __shared__ half2 sdata[1024]; if (index < stride) { const half2 reciprocal_tau_half2 = __half2half2(reciprocal_tau); const half2 one_sub_reciprocal_tau_half2 = __half2half2(one_sub_reciprocal_tau); + const half2 reciprocal_delta_T_half2 = __half2half2(reciprocal_delta_T); + const half2 theta_rh_half2 = __half2half2(theta_rh); const half2 v_threshold_half2 = __half2half2(v_threshold); const half2 v_reset_half2 = __half2half2(v_reset); - half2 grad_h = __float2half2_rn(0.0f); // grad_h will be used recursively - sdata[threadIdx.x] = __float2half2_rn(0.0f); for(int mem_offset = (numel >> 1) - stride; mem_offset >= 0; mem_offset -= stride) { const int t = index + mem_offset; const half2 over_th = __hsub2(h_seq[t], v_threshold_half2); - - // start: spikingjelly.clock_driven.surrogate.Sigmoid.cuda_code + // start: spikingjelly.clock_driven.surrogate.S2NN.cuda_code - const half2 sg_Sigmoid_alpha = __float2half2_rn(1.0f); - const half2 sg_Sigmoid_sigmoid_ax = __h2div(__float2half2_rn(1.0f), __hadd2(h2exp(__hneg2(__hmul2(sg_Sigmoid_alpha, over_th))), __float2half2_rn(1.0f))); - const half2 grad_s_to_h = __hmul2(__hmul2(__hsub2(__float2half2_rn(1.0f), sg_Sigmoid_sigmoid_ax), sg_Sigmoid_sigmoid_ax), sg_Sigmoid_alpha); + const half2 sg_S2NN_alpha = __float2half2_rn(4.0f); + const half2 sg_S2NN_sigmoid_ax = __h2div(__float2half2_rn(1.0f), __hadd2(h2exp(__hneg2(__hmul2(sg_S2NN_alpha, over_th))), __float2half2_rn(1.0f))); + const half2 sg_S2NN_mask_l = __hlt2(over_th, __float2half2_rn(0.0f)); + const half2 grad_s_to_h = __hadd2(__hmul2(__hmul2(__hmul2(__hsub2(__float2half2_rn(1.0f), sg_S2NN_sigmoid_ax), sg_S2NN_sigmoid_ax), sg_S2NN_alpha), sg_S2NN_mask_l), __hmul2(__h2div(__float2half2_rn(1.0f), __hadd2(over_th, __float2half2_rn(1.0f))), __hsub2(__float2half2_rn(1.0f), sg_S2NN_mask_l))); - // end: spikingjelly.clock_driven.surrogate.Sigmoid.cuda_code + // end: spikingjelly.clock_driven.surrogate.S2NN.cuda_code const half2 grad_v_to_h = __hsub2(__float2half2_rn(1.0f), spike_seq[t]); - - grad_h = __hfma2(__hfma2(grad_h, one_sub_reciprocal_tau_half2, grad_v_seq[t]), grad_v_to_h, __hmul2(grad_spike_seq[t], grad_s_to_h)); + + grad_h = __hfma2(__hfma2(__hfma2(h2exp(__hmul2(__hsub2(v_v_seq[t + stride], theta_rh_half2), reciprocal_delta_T_half2)), reciprocal_tau_half2, one_sub_reciprocal_tau_half2), grad_h, grad_v_seq[t]), grad_v_to_h, __hmul2(grad_spike_seq[t], grad_s_to_h)); grad_x_seq[t] = __hmul2(grad_h, reciprocal_tau_half2); - sdata[threadIdx.x] = __hadd2(__h2div(__hmul2(grad_h, __hsub2(h_seq[t], v_v_seq[t])), reciprocal_tau_half2), sdata[threadIdx.x]); } - - 
grad_v_last[index] = __hmul2(grad_x_seq[index], one_sub_reciprocal_tau_half2); - } - else - { - sdata[threadIdx.x] = __float2half2_rn(0.0f); - } - int threadx = blockDim.x; - #pragma unroll - for (int stride = threadx >> 1; stride > 0; stride = stride >> 1) - { - // Synchronize all thread before next loop - __syncthreads(); - if (threadIdx.x < stride) - { - sdata[threadIdx.x] = __hadd2(sdata[threadIdx.x], sdata[threadIdx.x + stride]); - } - } - __syncthreads(); - if (threadIdx.x == 0) - { - //grad_reciprocal_tau[0] = __hadd(__low2half(sdata[0]), __high2half(sdata[0])); - - /* - The 32-bit floating-point version of atomicAdd() is only supported by devices of compute capability 2.x and higher. - - The 64-bit floating-point version of atomicAdd() is only supported by devices of compute capability 6.x and higher. - - The 32-bit __half2 floating-point version of atomicAdd() is only supported by devices of compute capability 6.x and higher. The atomicity of the __half2 or __nv_bfloat162 add operation is guaranteed separately for each of the two __half or __nv_bfloat16 elements; the entire __half2 or __nv_bfloat162 is not guaranteed to be atomic as a single 32-bit access. - - The 16-bit __half floating-point version of atomicAdd() is only supported by devices of compute capability 7.x and higher. - - The 16-bit __nv_bfloat16 floating-point version of atomicAdd() is only supported by devices of compute capability 8.x and higher. - */ - - atomicAdd(grad_reciprocal_tau, __hadd(__low2half(sdata[0]), __high2half(sdata[0]))); - + grad_v_last[index] = __hmul2(__hfma2(h2exp(__hmul2(__hsub2(v_v_seq[index], theta_rh_half2), reciprocal_delta_T_half2)), reciprocal_tau_half2, one_sub_reciprocal_tau_half2), grad_x_seq[index]); } } -// MultiStepParametricLIFNodePTT bptt Sigmoid, hard_reset=True, dtype=fp16, detach_reset=False +// MultiStepEIFNodePTT bptt S2NN, hard_reset=True, dtype=fp16, detach_reset=False #include extern "C" __global__ - void ParametricLIFNode_bptt_hardReset__fp16( + void EIFNode_bptt_hardReset__fp16( const half2* grad_spike_seq, const half2* grad_v_seq, const half2* h_seq, const half2* spike_seq, const half2* v_v_seq, - half2* grad_x_seq, half2* grad_v_last, half* grad_reciprocal_tau, + half2* grad_x_seq, half2* grad_v_last, + const half & theta_rh, const half & reciprocal_delta_T, const half & reciprocal_tau, const half & one_sub_reciprocal_tau, const half & v_threshold, const half & v_reset, const int & neuron_num, const int & numel) @@ -3761,86 +16288,50 @@ { const int index = blockIdx.x * blockDim.x + threadIdx.x; const int stride = neuron_num >> 1; - - __shared__ half2 sdata[1024]; if (index < stride) { const half2 reciprocal_tau_half2 = __half2half2(reciprocal_tau); const half2 one_sub_reciprocal_tau_half2 = __half2half2(one_sub_reciprocal_tau); + const half2 reciprocal_delta_T_half2 = __half2half2(reciprocal_delta_T); + const half2 theta_rh_half2 = __half2half2(theta_rh); const half2 v_threshold_half2 = __half2half2(v_threshold); const half2 v_reset_half2 = __half2half2(v_reset); - half2 grad_h = __float2half2_rn(0.0f); // grad_h will be used recursively - sdata[threadIdx.x] = __float2half2_rn(0.0f); for(int mem_offset = (numel >> 1) - stride; mem_offset >= 0; mem_offset -= stride) { const int t = index + mem_offset; const half2 over_th = __hsub2(h_seq[t], v_threshold_half2); - - // start: spikingjelly.clock_driven.surrogate.Sigmoid.cuda_code + // start: spikingjelly.clock_driven.surrogate.S2NN.cuda_code - const half2 sg_Sigmoid_alpha = __float2half2_rn(1.0f); - const half2 
sg_Sigmoid_sigmoid_ax = __h2div(__float2half2_rn(1.0f), __hadd2(h2exp(__hneg2(__hmul2(sg_Sigmoid_alpha, over_th))), __float2half2_rn(1.0f))); - const half2 grad_s_to_h = __hmul2(__hmul2(__hsub2(__float2half2_rn(1.0f), sg_Sigmoid_sigmoid_ax), sg_Sigmoid_sigmoid_ax), sg_Sigmoid_alpha); + const half2 sg_S2NN_alpha = __float2half2_rn(4.0f); + const half2 sg_S2NN_sigmoid_ax = __h2div(__float2half2_rn(1.0f), __hadd2(h2exp(__hneg2(__hmul2(sg_S2NN_alpha, over_th))), __float2half2_rn(1.0f))); + const half2 sg_S2NN_mask_l = __hlt2(over_th, __float2half2_rn(0.0f)); + const half2 grad_s_to_h = __hadd2(__hmul2(__hmul2(__hmul2(__hsub2(__float2half2_rn(1.0f), sg_S2NN_sigmoid_ax), sg_S2NN_sigmoid_ax), sg_S2NN_alpha), sg_S2NN_mask_l), __hmul2(__h2div(__float2half2_rn(1.0f), __hadd2(over_th, __float2half2_rn(1.0f))), __hsub2(__float2half2_rn(1.0f), sg_S2NN_mask_l))); - // end: spikingjelly.clock_driven.surrogate.Sigmoid.cuda_code + // end: spikingjelly.clock_driven.surrogate.S2NN.cuda_code const half2 grad_v_to_h = __hfma2(__hsub2(v_reset_half2, h_seq[t]), grad_s_to_h, __hsub2(__float2half2_rn(1.0f), spike_seq[t])); - - grad_h = __hfma2(__hfma2(grad_h, one_sub_reciprocal_tau_half2, grad_v_seq[t]), grad_v_to_h, __hmul2(grad_spike_seq[t], grad_s_to_h)); + + grad_h = __hfma2(__hfma2(__hfma2(h2exp(__hmul2(__hsub2(v_v_seq[t + stride], theta_rh_half2), reciprocal_delta_T_half2)), reciprocal_tau_half2, one_sub_reciprocal_tau_half2), grad_h, grad_v_seq[t]), grad_v_to_h, __hmul2(grad_spike_seq[t], grad_s_to_h)); grad_x_seq[t] = __hmul2(grad_h, reciprocal_tau_half2); - sdata[threadIdx.x] = __hadd2(__h2div(__hmul2(grad_h, __hsub2(h_seq[t], v_v_seq[t])), reciprocal_tau_half2), sdata[threadIdx.x]); } - - grad_v_last[index] = __hmul2(grad_x_seq[index], one_sub_reciprocal_tau_half2); - } - else - { - sdata[threadIdx.x] = __float2half2_rn(0.0f); - } - int threadx = blockDim.x; - #pragma unroll - for (int stride = threadx >> 1; stride > 0; stride = stride >> 1) - { - // Synchronize all thread before next loop - __syncthreads(); - if (threadIdx.x < stride) - { - sdata[threadIdx.x] = __hadd2(sdata[threadIdx.x], sdata[threadIdx.x + stride]); - } - } - __syncthreads(); - if (threadIdx.x == 0) - { - //grad_reciprocal_tau[0] = __hadd(__low2half(sdata[0]), __high2half(sdata[0])); - - /* - The 32-bit floating-point version of atomicAdd() is only supported by devices of compute capability 2.x and higher. - - The 64-bit floating-point version of atomicAdd() is only supported by devices of compute capability 6.x and higher. - - The 32-bit __half2 floating-point version of atomicAdd() is only supported by devices of compute capability 6.x and higher. The atomicity of the __half2 or __nv_bfloat162 add operation is guaranteed separately for each of the two __half or __nv_bfloat16 elements; the entire __half2 or __nv_bfloat162 is not guaranteed to be atomic as a single 32-bit access. - - The 16-bit __half floating-point version of atomicAdd() is only supported by devices of compute capability 7.x and higher. - - The 16-bit __nv_bfloat16 floating-point version of atomicAdd() is only supported by devices of compute capability 8.x and higher. 
- */ - - atomicAdd(grad_reciprocal_tau, __hadd(__low2half(sdata[0]), __high2half(sdata[0]))); - + grad_v_last[index] = __hmul2(__hfma2(h2exp(__hmul2(__hsub2(v_v_seq[index], theta_rh_half2), reciprocal_delta_T_half2)), reciprocal_tau_half2, one_sub_reciprocal_tau_half2), grad_x_seq[index]); } } -// MultiStepParametricLIFNodePTT fptt Sigmoid, hard_reset=False, dtype=fp32 +// MultiStepEIFNodePTT fptt S2NN, hard_reset=False, dtype=fp32 extern "C" __global__ - void ParametricLIFNode_fptt_softReset_fp32(const float* x_seq, float* v_v_seq, float* h_seq, float* spike_seq, + void EIFNode_fptt_softReset_fp32(const float* x_seq, float* v_v_seq, float* h_seq, float* spike_seq, const float & reciprocal_tau, - const float & v_threshold, + const float & delta_T, + const float & theta_rh, + const float & v_threshold, + const float & v_rest, const int & neuron_num, const int & numel) { @@ -3851,14 +16342,14 @@ for(int mem_offset = 0; mem_offset < numel; mem_offset += neuron_num) { const int t = index + mem_offset; - - h_seq[t] = v_v_seq[t] + reciprocal_tau * (x_seq[t] - v_v_seq[t]); + h_seq[t] = v_v_seq[t] + reciprocal_tau * (x_seq[t] - v_v_seq[t] + v_rest + delta_T * expf((v_v_seq[t] - theta_rh) / delta_T)); if (h_seq[t] >= v_threshold) { spike_seq[t] = 1.0f; + v_v_seq[t + dt] = h_seq[t] - v_threshold; - } + } else { spike_seq[t] = 0.0f; @@ -3869,134 +16360,92 @@ } } -// MultiStepParametricLIFNodePTT bptt Sigmoid, hard_reset=False, dtype=fp32, detach_reset=True +// MultiStepEIFNodePTT bptt S2NN, hard_reset=False, dtype=fp32, detach_reset=True extern "C" __global__ - void ParametricLIFNode_bptt_softReset_detachReset_fp32( + void EIFNode_bptt_softReset_detachReset_fp32( const float* grad_spike_seq, const float* grad_v_seq, const float* h_seq, const float* spike_seq, const float* v_v_seq, - float* grad_x_seq, float* grad_v_last, float* grad_reciprocal_tau, + float* grad_x_seq, float* grad_v_last, + const float & theta_rh, const float & reciprocal_delta_T, const float & reciprocal_tau, const float & one_sub_reciprocal_tau, const float & v_threshold, const int & neuron_num, const int & numel) { const int index = blockIdx.x * blockDim.x + threadIdx.x; - __shared__ float sdata[1024]; if (index < neuron_num) { float grad_h = 0.0f; // grad_h will be used recursively - sdata[threadIdx.x] = 0.0f; for(int mem_offset = numel - neuron_num; mem_offset >= 0; mem_offset -= neuron_num) { const int t = index + mem_offset; const float over_th = h_seq[t] - v_threshold; - // start: spikingjelly.clock_driven.surrogate.Sigmoid.cuda_code + // start: spikingjelly.clock_driven.surrogate.S2NN.cuda_code - const float sg_Sigmoid_sigmoid_ax = 1.0f / (1.0f + expf(- 1.0f * over_th)); - const float grad_s_to_h = (1.0f - sg_Sigmoid_sigmoid_ax) * sg_Sigmoid_sigmoid_ax * 1.0f; + const float sg_S2NN_sigmoid_ax = 1.0f / (1.0f + expf(- 4.0f * over_th)); + const float sg_S2NN_mask_l = (float)(over_th < 0.0f); + const float grad_s_to_h = (1.0f - sg_S2NN_sigmoid_ax) * sg_S2NN_sigmoid_ax * 4.0f * sg_S2NN_mask_l + 1.0f / (over_th + 1.0f) * (1.0f - sg_S2NN_mask_l); - // end: spikingjelly.clock_driven.surrogate.Sigmoid.cuda_code + // end: spikingjelly.clock_driven.surrogate.S2NN.cuda_code const float grad_v_to_h = 1.0f; - grad_h = grad_spike_seq[t] * grad_s_to_h + (grad_v_seq[t] + grad_h * one_sub_reciprocal_tau) * grad_v_to_h; - // grad_h = fmaf(grad_spike_seq[t], grad_s_to_h, fmaf(grad_h, one_sub_reciprocal_tau, grad_v_seq[t]) * grad_v_to_h); + grad_h = grad_spike_seq[t] * grad_s_to_h + (grad_v_seq[t] + grad_h * (one_sub_reciprocal_tau + 
reciprocal_tau * expf((v_v_seq[t + neuron_num] - theta_rh) * reciprocal_delta_T))) * grad_v_to_h; grad_x_seq[t] = grad_h * reciprocal_tau; - sdata[threadIdx.x] += grad_h * (h_seq[t] - v_v_seq[t]) / reciprocal_tau; } - grad_v_last[index] = grad_x_seq[index] * one_sub_reciprocal_tau; - } - else - { - sdata[threadIdx.x] = 0.0f; - } - int threadx = blockDim.x; - #pragma unroll - for (int stride = threadx >> 1; stride > 0; stride = stride >> 1) - { - // Synchronize all thread before next loop - __syncthreads(); - if (threadIdx.x < stride) - { - sdata[threadIdx.x] += sdata[threadIdx.x + stride]; - } - } - __syncthreads(); - if (threadIdx.x == 0) - { - atomicAdd(grad_reciprocal_tau, sdata[0]); + grad_v_last[index] = grad_x_seq[index] * (one_sub_reciprocal_tau + reciprocal_tau * expf((v_v_seq[index] - theta_rh) * reciprocal_delta_T)); } } -// MultiStepParametricLIFNodePTT bptt Sigmoid, hard_reset=False, dtype=fp32, detach_reset=False +// MultiStepEIFNodePTT bptt S2NN, hard_reset=False, dtype=fp32, detach_reset=False extern "C" __global__ - void ParametricLIFNode_bptt_softReset__fp32( + void EIFNode_bptt_softReset__fp32( const float* grad_spike_seq, const float* grad_v_seq, const float* h_seq, const float* spike_seq, const float* v_v_seq, - float* grad_x_seq, float* grad_v_last, float* grad_reciprocal_tau, + float* grad_x_seq, float* grad_v_last, + const float & theta_rh, const float & reciprocal_delta_T, const float & reciprocal_tau, const float & one_sub_reciprocal_tau, const float & v_threshold, const int & neuron_num, const int & numel) { const int index = blockIdx.x * blockDim.x + threadIdx.x; - __shared__ float sdata[1024]; if (index < neuron_num) { float grad_h = 0.0f; // grad_h will be used recursively - sdata[threadIdx.x] = 0.0f; for(int mem_offset = numel - neuron_num; mem_offset >= 0; mem_offset -= neuron_num) { const int t = index + mem_offset; const float over_th = h_seq[t] - v_threshold; - // start: spikingjelly.clock_driven.surrogate.Sigmoid.cuda_code + // start: spikingjelly.clock_driven.surrogate.S2NN.cuda_code - const float sg_Sigmoid_sigmoid_ax = 1.0f / (1.0f + expf(- 1.0f * over_th)); - const float grad_s_to_h = (1.0f - sg_Sigmoid_sigmoid_ax) * sg_Sigmoid_sigmoid_ax * 1.0f; + const float sg_S2NN_sigmoid_ax = 1.0f / (1.0f + expf(- 4.0f * over_th)); + const float sg_S2NN_mask_l = (float)(over_th < 0.0f); + const float grad_s_to_h = (1.0f - sg_S2NN_sigmoid_ax) * sg_S2NN_sigmoid_ax * 4.0f * sg_S2NN_mask_l + 1.0f / (over_th + 1.0f) * (1.0f - sg_S2NN_mask_l); - // end: spikingjelly.clock_driven.surrogate.Sigmoid.cuda_code + // end: spikingjelly.clock_driven.surrogate.S2NN.cuda_code const float grad_v_to_h = 1.0f - v_threshold * grad_s_to_h; - // const float grad_v_to_h = fmaf(-v_threshold, grad_s_to_h, 1.0f); - grad_h = grad_spike_seq[t] * grad_s_to_h + (grad_v_seq[t] + grad_h * one_sub_reciprocal_tau) * grad_v_to_h; - // grad_h = fmaf(grad_spike_seq[t], grad_s_to_h, fmaf(grad_h, one_sub_reciprocal_tau, grad_v_seq[t]) * grad_v_to_h); + grad_h = grad_spike_seq[t] * grad_s_to_h + (grad_v_seq[t] + grad_h * (one_sub_reciprocal_tau + reciprocal_tau * expf((v_v_seq[t + neuron_num] - theta_rh) * reciprocal_delta_T))) * grad_v_to_h; grad_x_seq[t] = grad_h * reciprocal_tau; - sdata[threadIdx.x] += grad_h * (h_seq[t] - v_v_seq[t]) / reciprocal_tau; } - grad_v_last[index] = grad_x_seq[index] * one_sub_reciprocal_tau; - } - else - { - sdata[threadIdx.x] = 0.0f; - } - int threadx = blockDim.x; - #pragma unroll - for (int stride = threadx >> 1; stride > 0; stride = stride >> 1) - { - // Synchronize 
all thread before next loop - __syncthreads(); - if (threadIdx.x < stride) - { - sdata[threadIdx.x] += sdata[threadIdx.x + stride]; - } - } - __syncthreads(); - if (threadIdx.x == 0) - { - atomicAdd(grad_reciprocal_tau, sdata[0]); + grad_v_last[index] = grad_x_seq[index] * (one_sub_reciprocal_tau + reciprocal_tau * expf((v_v_seq[index] - theta_rh) * reciprocal_delta_T)); } } -// MultiStepParametricLIFNodePTT fptt Sigmoid, hard_reset=False, dtype=fp16 +// MultiStepEIFNodePTT fptt S2NN, hard_reset=False, dtype=fp16 #include extern "C" __global__ - void ParametricLIFNode_fptt_softReset_fp16(const half2* x_seq, half2* v_v_seq, half2* h_seq, half2* spike_seq, + void EIFNode_fptt_softReset_fp16(const half2* x_seq, half2* v_v_seq, half2* h_seq, half2* spike_seq, const half & reciprocal_tau, - const half & v_threshold, + const half & delta_T, + const half & theta_rh, + const half & v_threshold, + const half & v_rest, const int & neuron_num, const int & numel) { @@ -4006,27 +16455,32 @@ { const int numel_2 = numel >> 1; const half2 reciprocal_tau_half2 = __half2half2(reciprocal_tau); + const half2 delta_T_half2 = __half2half2(delta_T); + const half2 theta_rh_half2 = __half2half2(theta_rh); const half2 v_threshold_half2 = __half2half2(v_threshold); + const half2 v_rest_half2 = __half2half2(v_rest); for(int mem_offset = 0; mem_offset < numel_2; mem_offset += stride) { const int t = index + mem_offset; - - h_seq[t] = __hfma2(__hsub2(x_seq[t], v_v_seq[t]), reciprocal_tau_half2, v_v_seq[t]); + h_seq[t] = __hfma2(__hfma2(h2exp(__h2div(__hsub2(v_v_seq[t], theta_rh_half2), delta_T_half2)), delta_T_half2, __hadd2(__hsub2(x_seq[t], v_v_seq[t]), v_rest_half2)), reciprocal_tau_half2, v_v_seq[t]); + spike_seq[t] = __hgeu2(h_seq[t], v_threshold_half2); + v_v_seq[t + stride] = __hadd2(__hmul2(spike_seq[t], __hsub2(h_seq[t], v_threshold_half2)), __hmul2(__hsub2(__float2half2_rn(1.0f), spike_seq[t]), h_seq[t])); } } } -// MultiStepParametricLIFNodePTT bptt Sigmoid, hard_reset=False, dtype=fp16, detach_reset=True +// MultiStepEIFNodePTT bptt S2NN, hard_reset=False, dtype=fp16, detach_reset=True #include extern "C" __global__ - void ParametricLIFNode_bptt_softReset_detachReset_fp16( + void EIFNode_bptt_softReset_detachReset_fp16( const half2* grad_spike_seq, const half2* grad_v_seq, const half2* h_seq, const half2* spike_seq, const half2* v_v_seq, - half2* grad_x_seq, half2* grad_v_last, half* grad_reciprocal_tau, + half2* grad_x_seq, half2* grad_v_last, + const half & theta_rh, const half & reciprocal_delta_T, const half & reciprocal_tau, const half & one_sub_reciprocal_tau, const half & v_threshold, const int & neuron_num, const int & numel) @@ -4034,85 +16488,47 @@ { const int index = blockIdx.x * blockDim.x + threadIdx.x; const int stride = neuron_num >> 1; - - __shared__ half2 sdata[1024]; if (index < stride) { const half2 reciprocal_tau_half2 = __half2half2(reciprocal_tau); const half2 one_sub_reciprocal_tau_half2 = __half2half2(one_sub_reciprocal_tau); + const half2 reciprocal_delta_T_half2 = __half2half2(reciprocal_delta_T); + const half2 theta_rh_half2 = __half2half2(theta_rh); const half2 v_threshold_half2 = __half2half2(v_threshold); - half2 grad_h = __float2half2_rn(0.0f); // grad_h will be used recursively - sdata[threadIdx.x] = __float2half2_rn(0.0f); for(int mem_offset = (numel >> 1) - stride; mem_offset >= 0; mem_offset -= stride) { const int t = index + mem_offset; const half2 over_th = __hsub2(h_seq[t], v_threshold_half2); - - // start: spikingjelly.clock_driven.surrogate.Sigmoid.cuda_code + // 
start: spikingjelly.clock_driven.surrogate.S2NN.cuda_code - const half2 sg_Sigmoid_alpha = __float2half2_rn(1.0f); - const half2 sg_Sigmoid_sigmoid_ax = __h2div(__float2half2_rn(1.0f), __hadd2(h2exp(__hneg2(__hmul2(sg_Sigmoid_alpha, over_th))), __float2half2_rn(1.0f))); - const half2 grad_s_to_h = __hmul2(__hmul2(__hsub2(__float2half2_rn(1.0f), sg_Sigmoid_sigmoid_ax), sg_Sigmoid_sigmoid_ax), sg_Sigmoid_alpha); + const half2 sg_S2NN_alpha = __float2half2_rn(4.0f); + const half2 sg_S2NN_sigmoid_ax = __h2div(__float2half2_rn(1.0f), __hadd2(h2exp(__hneg2(__hmul2(sg_S2NN_alpha, over_th))), __float2half2_rn(1.0f))); + const half2 sg_S2NN_mask_l = __hlt2(over_th, __float2half2_rn(0.0f)); + const half2 grad_s_to_h = __hadd2(__hmul2(__hmul2(__hmul2(__hsub2(__float2half2_rn(1.0f), sg_S2NN_sigmoid_ax), sg_S2NN_sigmoid_ax), sg_S2NN_alpha), sg_S2NN_mask_l), __hmul2(__h2div(__float2half2_rn(1.0f), __hadd2(over_th, __float2half2_rn(1.0f))), __hsub2(__float2half2_rn(1.0f), sg_S2NN_mask_l))); - // end: spikingjelly.clock_driven.surrogate.Sigmoid.cuda_code + // end: spikingjelly.clock_driven.surrogate.S2NN.cuda_code const half2 grad_v_to_h = __float2half2_rn(1.0f); - - grad_h = __hfma2(__hfma2(grad_h, one_sub_reciprocal_tau_half2, grad_v_seq[t]), grad_v_to_h, __hmul2(grad_spike_seq[t], grad_s_to_h)); + + grad_h = __hfma2(__hfma2(__hfma2(h2exp(__hmul2(__hsub2(v_v_seq[t + stride], theta_rh_half2), reciprocal_delta_T_half2)), reciprocal_tau_half2, one_sub_reciprocal_tau_half2), grad_h, grad_v_seq[t]), grad_v_to_h, __hmul2(grad_spike_seq[t], grad_s_to_h)); grad_x_seq[t] = __hmul2(grad_h, reciprocal_tau_half2); - sdata[threadIdx.x] = __hadd2(__h2div(__hmul2(grad_h, __hsub2(h_seq[t], v_v_seq[t])), reciprocal_tau_half2), sdata[threadIdx.x]); } - - grad_v_last[index] = __hmul2(grad_x_seq[index], one_sub_reciprocal_tau_half2); - } - else - { - sdata[threadIdx.x] = __float2half2_rn(0.0f); - } - int threadx = blockDim.x; - #pragma unroll - for (int stride = threadx >> 1; stride > 0; stride = stride >> 1) - { - // Synchronize all thread before next loop - __syncthreads(); - if (threadIdx.x < stride) - { - sdata[threadIdx.x] = __hadd2(sdata[threadIdx.x], sdata[threadIdx.x + stride]); - } - } - __syncthreads(); - if (threadIdx.x == 0) - { - //grad_reciprocal_tau[0] = __hadd(__low2half(sdata[0]), __high2half(sdata[0])); - - /* - The 32-bit floating-point version of atomicAdd() is only supported by devices of compute capability 2.x and higher. - - The 64-bit floating-point version of atomicAdd() is only supported by devices of compute capability 6.x and higher. - - The 32-bit __half2 floating-point version of atomicAdd() is only supported by devices of compute capability 6.x and higher. The atomicity of the __half2 or __nv_bfloat162 add operation is guaranteed separately for each of the two __half or __nv_bfloat16 elements; the entire __half2 or __nv_bfloat162 is not guaranteed to be atomic as a single 32-bit access. - - The 16-bit __half floating-point version of atomicAdd() is only supported by devices of compute capability 7.x and higher. - - The 16-bit __nv_bfloat16 floating-point version of atomicAdd() is only supported by devices of compute capability 8.x and higher. 
- */ - - atomicAdd(grad_reciprocal_tau, __hadd(__low2half(sdata[0]), __high2half(sdata[0]))); - + grad_v_last[index] = __hmul2(__hfma2(h2exp(__hmul2(__hsub2(v_v_seq[index], theta_rh_half2), reciprocal_delta_T_half2)), reciprocal_tau_half2, one_sub_reciprocal_tau_half2), grad_x_seq[index]); } } -// MultiStepParametricLIFNodePTT bptt Sigmoid, hard_reset=False, dtype=fp16, detach_reset=False +// MultiStepEIFNodePTT bptt S2NN, hard_reset=False, dtype=fp16, detach_reset=False #include extern "C" __global__ - void ParametricLIFNode_bptt_softReset__fp16( + void EIFNode_bptt_softReset__fp16( const half2* grad_spike_seq, const half2* grad_v_seq, const half2* h_seq, const half2* spike_seq, const half2* v_v_seq, - half2* grad_x_seq, half2* grad_v_last, half* grad_reciprocal_tau, + half2* grad_x_seq, half2* grad_v_last, + const half & theta_rh, const half & reciprocal_delta_T, const half & reciprocal_tau, const half & one_sub_reciprocal_tau, const half & v_threshold, const int & neuron_num, const int & numel) @@ -4120,84 +16536,48 @@ { const int index = blockIdx.x * blockDim.x + threadIdx.x; const int stride = neuron_num >> 1; - - __shared__ half2 sdata[1024]; if (index < stride) { const half2 reciprocal_tau_half2 = __half2half2(reciprocal_tau); const half2 one_sub_reciprocal_tau_half2 = __half2half2(one_sub_reciprocal_tau); + const half2 reciprocal_delta_T_half2 = __half2half2(reciprocal_delta_T); + const half2 theta_rh_half2 = __half2half2(theta_rh); const half2 v_threshold_half2 = __half2half2(v_threshold); - half2 grad_h = __float2half2_rn(0.0f); // grad_h will be used recursively - sdata[threadIdx.x] = __float2half2_rn(0.0f); for(int mem_offset = (numel >> 1) - stride; mem_offset >= 0; mem_offset -= stride) { const int t = index + mem_offset; const half2 over_th = __hsub2(h_seq[t], v_threshold_half2); - - // start: spikingjelly.clock_driven.surrogate.Sigmoid.cuda_code + // start: spikingjelly.clock_driven.surrogate.S2NN.cuda_code - const half2 sg_Sigmoid_alpha = __float2half2_rn(1.0f); - const half2 sg_Sigmoid_sigmoid_ax = __h2div(__float2half2_rn(1.0f), __hadd2(h2exp(__hneg2(__hmul2(sg_Sigmoid_alpha, over_th))), __float2half2_rn(1.0f))); - const half2 grad_s_to_h = __hmul2(__hmul2(__hsub2(__float2half2_rn(1.0f), sg_Sigmoid_sigmoid_ax), sg_Sigmoid_sigmoid_ax), sg_Sigmoid_alpha); + const half2 sg_S2NN_alpha = __float2half2_rn(4.0f); + const half2 sg_S2NN_sigmoid_ax = __h2div(__float2half2_rn(1.0f), __hadd2(h2exp(__hneg2(__hmul2(sg_S2NN_alpha, over_th))), __float2half2_rn(1.0f))); + const half2 sg_S2NN_mask_l = __hlt2(over_th, __float2half2_rn(0.0f)); + const half2 grad_s_to_h = __hadd2(__hmul2(__hmul2(__hmul2(__hsub2(__float2half2_rn(1.0f), sg_S2NN_sigmoid_ax), sg_S2NN_sigmoid_ax), sg_S2NN_alpha), sg_S2NN_mask_l), __hmul2(__h2div(__float2half2_rn(1.0f), __hadd2(over_th, __float2half2_rn(1.0f))), __hsub2(__float2half2_rn(1.0f), sg_S2NN_mask_l))); - // end: spikingjelly.clock_driven.surrogate.Sigmoid.cuda_code + // end: spikingjelly.clock_driven.surrogate.S2NN.cuda_code const half2 grad_v_to_h = __hsub2(__float2half2_rn(1.0f), __hmul2(v_threshold_half2, grad_s_to_h)); - - grad_h = __hfma2(__hfma2(grad_h, one_sub_reciprocal_tau_half2, grad_v_seq[t]), grad_v_to_h, __hmul2(grad_spike_seq[t], grad_s_to_h)); + + grad_h = __hfma2(__hfma2(__hfma2(h2exp(__hmul2(__hsub2(v_v_seq[t + stride], theta_rh_half2), reciprocal_delta_T_half2)), reciprocal_tau_half2, one_sub_reciprocal_tau_half2), grad_h, grad_v_seq[t]), grad_v_to_h, __hmul2(grad_spike_seq[t], grad_s_to_h)); grad_x_seq[t] = __hmul2(grad_h, 
reciprocal_tau_half2); - sdata[threadIdx.x] = __hadd2(__h2div(__hmul2(grad_h, __hsub2(h_seq[t], v_v_seq[t])), reciprocal_tau_half2), sdata[threadIdx.x]); } - - grad_v_last[index] = __hmul2(grad_x_seq[index], one_sub_reciprocal_tau_half2); - } - else - { - sdata[threadIdx.x] = __float2half2_rn(0.0f); - } - int threadx = blockDim.x; - #pragma unroll - for (int stride = threadx >> 1; stride > 0; stride = stride >> 1) - { - // Synchronize all thread before next loop - __syncthreads(); - if (threadIdx.x < stride) - { - sdata[threadIdx.x] = __hadd2(sdata[threadIdx.x], sdata[threadIdx.x + stride]); - } - } - __syncthreads(); - if (threadIdx.x == 0) - { - //grad_reciprocal_tau[0] = __hadd(__low2half(sdata[0]), __high2half(sdata[0])); - - /* - The 32-bit floating-point version of atomicAdd() is only supported by devices of compute capability 2.x and higher. - - The 64-bit floating-point version of atomicAdd() is only supported by devices of compute capability 6.x and higher. - - The 32-bit __half2 floating-point version of atomicAdd() is only supported by devices of compute capability 6.x and higher. The atomicity of the __half2 or __nv_bfloat162 add operation is guaranteed separately for each of the two __half or __nv_bfloat16 elements; the entire __half2 or __nv_bfloat162 is not guaranteed to be atomic as a single 32-bit access. - - The 16-bit __half floating-point version of atomicAdd() is only supported by devices of compute capability 7.x and higher. - - The 16-bit __nv_bfloat16 floating-point version of atomicAdd() is only supported by devices of compute capability 8.x and higher. - */ - - atomicAdd(grad_reciprocal_tau, __hadd(__low2half(sdata[0]), __high2half(sdata[0]))); - + grad_v_last[index] = __hmul2(__hfma2(h2exp(__hmul2(__hsub2(v_v_seq[index], theta_rh_half2), reciprocal_delta_T_half2)), reciprocal_tau_half2, one_sub_reciprocal_tau_half2), grad_x_seq[index]); } } -// MultiStepParametricLIFNodePTT fptt PiecewiseLeakyReLU, hard_reset=True, dtype=fp32 +// MultiStepEIFNodePTT fptt QPseudoSpike, hard_reset=True, dtype=fp32 extern "C" __global__ - void ParametricLIFNode_fptt_hardReset_fp32(const float* x_seq, float* v_v_seq, float* h_seq, float* spike_seq, + void EIFNode_fptt_hardReset_fp32(const float* x_seq, float* v_v_seq, float* h_seq, float* spike_seq, const float & reciprocal_tau, - const float & v_threshold, const float & v_reset, + const float & delta_T, + const float & theta_rh, + const float & v_threshold, + const float & v_rest, const float & v_reset, const int & neuron_num, const int & numel) { @@ -4208,14 +16588,14 @@ for(int mem_offset = 0; mem_offset < numel; mem_offset += neuron_num) { const int t = index + mem_offset; - - h_seq[t] = v_v_seq[t] + reciprocal_tau * (x_seq[t] - v_v_seq[t] + v_reset); + h_seq[t] = v_v_seq[t] + reciprocal_tau * (x_seq[t] - v_v_seq[t] + v_rest + delta_T * expf((v_v_seq[t] - theta_rh) / delta_T)); if (h_seq[t] >= v_threshold) { spike_seq[t] = 1.0f; + v_v_seq[t + dt] = v_reset; - } + } else { spike_seq[t] = 0.0f; @@ -4226,150 +16606,90 @@ } } -// MultiStepParametricLIFNodePTT bptt PiecewiseLeakyReLU, hard_reset=True, dtype=fp32, detach_reset=True +// MultiStepEIFNodePTT bptt QPseudoSpike, hard_reset=True, dtype=fp32, detach_reset=True extern "C" __global__ - void ParametricLIFNode_bptt_hardReset_detachReset_fp32( + void EIFNode_bptt_hardReset_detachReset_fp32( const float* grad_spike_seq, const float* grad_v_seq, const float* h_seq, const float* spike_seq, const float* v_v_seq, - float* grad_x_seq, float* grad_v_last, float* grad_reciprocal_tau, + 
float* grad_x_seq, float* grad_v_last, + const float & theta_rh, const float & reciprocal_delta_T, const float & reciprocal_tau, const float & one_sub_reciprocal_tau, const float & v_threshold, const float & v_reset, const int & neuron_num, const int & numel) { const int index = blockIdx.x * blockDim.x + threadIdx.x; - __shared__ float sdata[1024]; if (index < neuron_num) { float grad_h = 0.0f; // grad_h will be used recursively - sdata[threadIdx.x] = 0.0f; for(int mem_offset = numel - neuron_num; mem_offset >= 0; mem_offset -= neuron_num) { const int t = index + mem_offset; const float over_th = h_seq[t] - v_threshold; - // start: spikingjelly.clock_driven.surrogate.PiecewiseLeakyReLU.cuda_code + // start: spikingjelly.clock_driven.surrogate.QPseudoSpike.cuda_code - const float sg_PiecewiseLeakyReLU_x_abs = fabsf(over_th); - float grad_s_to_h; - if (sg_PiecewiseLeakyReLU_x_abs > 1.0f) - { - grad_s_to_h = 0.01f; - } - else - { - grad_s_to_h = 1.0f; - } + const float sg_QPseudoSpike_base = 1.0f + 2.0f / (2.0f - 1.0f) * fabsf(over_th); + const float grad_s_to_h = powf(sg_QPseudoSpike_base, -2.0f); - // end: spikingjelly.clock_driven.surrogate.PiecewiseLeakyReLU.cuda_code + // end: spikingjelly.clock_driven.surrogate.QPseudoSpike.cuda_code const float grad_v_to_h = 1.0f - spike_seq[t]; - grad_h = grad_spike_seq[t] * grad_s_to_h + (grad_v_seq[t] + grad_h * one_sub_reciprocal_tau) * grad_v_to_h; - // grad_h = fmaf(grad_spike_seq[t], grad_s_to_h, fmaf(grad_h, one_sub_reciprocal_tau, grad_v_seq[t]) * grad_v_to_h); + grad_h = grad_spike_seq[t] * grad_s_to_h + (grad_v_seq[t] + grad_h * (one_sub_reciprocal_tau + reciprocal_tau * expf((v_v_seq[t + neuron_num] - theta_rh) * reciprocal_delta_T))) * grad_v_to_h; grad_x_seq[t] = grad_h * reciprocal_tau; - sdata[threadIdx.x] += grad_h * (h_seq[t] - v_v_seq[t]) / reciprocal_tau; } - grad_v_last[index] = grad_x_seq[index] * one_sub_reciprocal_tau; - } - else - { - sdata[threadIdx.x] = 0.0f; - } - int threadx = blockDim.x; - #pragma unroll - for (int stride = threadx >> 1; stride > 0; stride = stride >> 1) - { - // Synchronize all thread before next loop - __syncthreads(); - if (threadIdx.x < stride) - { - sdata[threadIdx.x] += sdata[threadIdx.x + stride]; - } - } - __syncthreads(); - if (threadIdx.x == 0) - { - atomicAdd(grad_reciprocal_tau, sdata[0]); + grad_v_last[index] = grad_x_seq[index] * (one_sub_reciprocal_tau + reciprocal_tau * expf((v_v_seq[index] - theta_rh) * reciprocal_delta_T)); } } -// MultiStepParametricLIFNodePTT bptt PiecewiseLeakyReLU, hard_reset=True, dtype=fp32, detach_reset=False +// MultiStepEIFNodePTT bptt QPseudoSpike, hard_reset=True, dtype=fp32, detach_reset=False extern "C" __global__ - void ParametricLIFNode_bptt_hardReset__fp32( + void EIFNode_bptt_hardReset__fp32( const float* grad_spike_seq, const float* grad_v_seq, const float* h_seq, const float* spike_seq, const float* v_v_seq, - float* grad_x_seq, float* grad_v_last, float* grad_reciprocal_tau, + float* grad_x_seq, float* grad_v_last, + const float & theta_rh, const float & reciprocal_delta_T, const float & reciprocal_tau, const float & one_sub_reciprocal_tau, const float & v_threshold, const float & v_reset, const int & neuron_num, const int & numel) { const int index = blockIdx.x * blockDim.x + threadIdx.x; - __shared__ float sdata[1024]; if (index < neuron_num) { float grad_h = 0.0f; // grad_h will be used recursively - sdata[threadIdx.x] = 0.0f; for(int mem_offset = numel - neuron_num; mem_offset >= 0; mem_offset -= neuron_num) { const int t = index + mem_offset; 
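For reference, the EIFNode kernels in this hunk (which replace the ParametricLIFNode ones) implement the following charge and backward recursions, transcribed directly from the generated code and matching the EIF section of neuron_kernel.md touched later in this patch:

$$
H[t] = V[t-1] + \frac{1}{\tau}\left(X[t] - V[t-1] + V_{rest} + \Delta_T \exp\left(\frac{V[t-1] - \theta_{rh}}{\Delta_T}\right)\right)
$$

$$
\begin{align}
\frac{\mathrm{d} L}{\mathrm{d} H[t]} &= \frac{\partial L}{\partial S[t]}\frac{\mathrm{d} S[t]}{\mathrm{d} H[t]} + \left(\frac{\partial L}{\partial V[t]} + \frac{\mathrm{d} L}{\mathrm{d} H[t+1]}\left(1 - \frac{1}{\tau} + \frac{1}{\tau}\exp\left(\frac{V[t] - \theta_{rh}}{\Delta_T}\right)\right)\right)\frac{\mathrm{d} V[t]}{\mathrm{d} H[t]}\\
\frac{\mathrm{d} L}{\mathrm{d} X[t]} &= \frac{1}{\tau}\frac{\mathrm{d} L}{\mathrm{d} H[t]}
\end{align}
$$

Here $\frac{\mathrm{d} S[t]}{\mathrm{d} H[t]}$ is the surrogate derivative (S2NN or QPseudoSpike in these kernels) and $\frac{\mathrm{d} V[t]}{\mathrm{d} H[t]}$ depends on the reset mode and detach_reset, as in the LIF/PLIF derivations in neuron_kernel.md. Because $\tau$ is not learnable for the EIF neuron, the grad_reciprocal_tau output, the shared-memory reduction, and the atomicAdd epilogue of the ParametricLIF kernels are removed throughout this hunk.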
const float over_th = h_seq[t] - v_threshold; - // start: spikingjelly.clock_driven.surrogate.PiecewiseLeakyReLU.cuda_code + // start: spikingjelly.clock_driven.surrogate.QPseudoSpike.cuda_code - const float sg_PiecewiseLeakyReLU_x_abs = fabsf(over_th); - float grad_s_to_h; - if (sg_PiecewiseLeakyReLU_x_abs > 1.0f) - { - grad_s_to_h = 0.01f; - } - else - { - grad_s_to_h = 1.0f; - } + const float sg_QPseudoSpike_base = 1.0f + 2.0f / (2.0f - 1.0f) * fabsf(over_th); + const float grad_s_to_h = powf(sg_QPseudoSpike_base, -2.0f); - // end: spikingjelly.clock_driven.surrogate.PiecewiseLeakyReLU.cuda_code + // end: spikingjelly.clock_driven.surrogate.QPseudoSpike.cuda_code const float grad_v_to_h = 1.0f - spike_seq[t] + (v_reset - h_seq[t]) * grad_s_to_h; - // const float grad_v_to_h = fmaf(v_reset - h_seq[t], grad_s_to_h, 1.0f - spike_seq[t]); - grad_h = grad_spike_seq[t] * grad_s_to_h + (grad_v_seq[t] + grad_h * one_sub_reciprocal_tau) * grad_v_to_h; - // grad_h = fmaf(grad_spike_seq[t], grad_s_to_h, fmaf(grad_h, one_sub_reciprocal_tau, grad_v_seq[t]) * grad_v_to_h); + grad_h = grad_spike_seq[t] * grad_s_to_h + (grad_v_seq[t] + grad_h * (one_sub_reciprocal_tau + reciprocal_tau * expf((v_v_seq[t + neuron_num] - theta_rh) * reciprocal_delta_T))) * grad_v_to_h; grad_x_seq[t] = grad_h * reciprocal_tau; - sdata[threadIdx.x] += grad_h * (h_seq[t] - v_v_seq[t]) / reciprocal_tau; } - grad_v_last[index] = grad_x_seq[index] * one_sub_reciprocal_tau; - } - else - { - sdata[threadIdx.x] = 0.0f; - } - int threadx = blockDim.x; - #pragma unroll - for (int stride = threadx >> 1; stride > 0; stride = stride >> 1) - { - // Synchronize all thread before next loop - __syncthreads(); - if (threadIdx.x < stride) - { - sdata[threadIdx.x] += sdata[threadIdx.x + stride]; - } - } - __syncthreads(); - if (threadIdx.x == 0) - { - atomicAdd(grad_reciprocal_tau, sdata[0]); + grad_v_last[index] = grad_x_seq[index] * (one_sub_reciprocal_tau + reciprocal_tau * expf((v_v_seq[index] - theta_rh) * reciprocal_delta_T)); } } -// MultiStepParametricLIFNodePTT fptt PiecewiseLeakyReLU, hard_reset=True, dtype=fp16 +// MultiStepEIFNodePTT fptt QPseudoSpike, hard_reset=True, dtype=fp16 #include extern "C" __global__ - void ParametricLIFNode_fptt_hardReset_fp16(const half2* x_seq, half2* v_v_seq, half2* h_seq, half2* spike_seq, + void EIFNode_fptt_hardReset_fp16(const half2* x_seq, half2* v_v_seq, half2* h_seq, half2* spike_seq, const half & reciprocal_tau, - const half & v_threshold, const half & v_reset, + const half & delta_T, + const half & theta_rh, + const half & v_threshold, + const half & v_rest, const half & v_reset, const int & neuron_num, const int & numel) { @@ -4378,118 +16698,35 @@ if (index < stride) { const int numel_2 = numel >> 1; - const half2 reciprocal_tau_half2 = __half2half2(reciprocal_tau); - const half2 v_threshold_half2 = __half2half2(v_threshold); - - const half2 v_reset_half2 = __half2half2(v_reset); - - for(int mem_offset = 0; mem_offset < numel_2; mem_offset += stride) - { - const int t = index + mem_offset; - - h_seq[t] = __hfma2(__hadd2(__hsub2(x_seq[t], v_v_seq[t]), v_reset_half2), reciprocal_tau_half2, v_v_seq[t]); - spike_seq[t] = __hgeu2(h_seq[t], v_threshold_half2); - v_v_seq[t + stride] = __hadd2(__hmul2(spike_seq[t], v_reset_half2), __hmul2(__hsub2(__float2half2_rn(1.0f), spike_seq[t]), h_seq[t])); - - } - } - } - -// MultiStepParametricLIFNodePTT bptt PiecewiseLeakyReLU, hard_reset=True, dtype=fp16, detach_reset=True - - #include - extern "C" __global__ - void 
ParametricLIFNode_bptt_hardReset_detachReset_fp16( - const half2* grad_spike_seq, const half2* grad_v_seq, const half2* h_seq, const half2* spike_seq, const half2* v_v_seq, - half2* grad_x_seq, half2* grad_v_last, half* grad_reciprocal_tau, - const half & reciprocal_tau, const half & one_sub_reciprocal_tau, - const half & v_threshold, const half & v_reset, - const int & neuron_num, const int & numel) - - { - const int index = blockIdx.x * blockDim.x + threadIdx.x; - const int stride = neuron_num >> 1; - - __shared__ half2 sdata[1024]; - if (index < stride) - { - const half2 reciprocal_tau_half2 = __half2half2(reciprocal_tau); - const half2 one_sub_reciprocal_tau_half2 = __half2half2(one_sub_reciprocal_tau); - const half2 v_threshold_half2 = __half2half2(v_threshold); - - const half2 v_reset_half2 = __half2half2(v_reset); - - - half2 grad_h = __float2half2_rn(0.0f); // grad_h will be used recursively - sdata[threadIdx.x] = __float2half2_rn(0.0f); - for(int mem_offset = (numel >> 1) - stride; mem_offset >= 0; mem_offset -= stride) - { - const int t = index + mem_offset; - - const half2 over_th = __hsub2(h_seq[t], v_threshold_half2); - - - // start: spikingjelly.clock_driven.surrogate.PiecewiseLeakyReLU.cuda_code - - const half2 sg_PiecewiseLeakyReLU_x_abs = __habs2(over_th); - const half2 sg_PiecewiseLeakyReLU_x_abs_ge_w = __hge2(sg_PiecewiseLeakyReLU_x_abs, __float2half2_rn(1.0f)); - half2 grad_s_to_h = __hadd2(__hmul2(__float2half2_rn(0.01f), sg_PiecewiseLeakyReLU_x_abs_ge_w), __hmul2(__hsub2(__float2half2_rn(1.0f), sg_PiecewiseLeakyReLU_x_abs_ge_w), __float2half2_rn(1.0f))); - - // end: spikingjelly.clock_driven.surrogate.PiecewiseLeakyReLU.cuda_code - - const half2 grad_v_to_h = __hsub2(__float2half2_rn(1.0f), spike_seq[t]); - - grad_h = __hfma2(__hfma2(grad_h, one_sub_reciprocal_tau_half2, grad_v_seq[t]), grad_v_to_h, __hmul2(grad_spike_seq[t], grad_s_to_h)); - grad_x_seq[t] = __hmul2(grad_h, reciprocal_tau_half2); - sdata[threadIdx.x] = __hadd2(__h2div(__hmul2(grad_h, __hsub2(h_seq[t], v_v_seq[t])), reciprocal_tau_half2), sdata[threadIdx.x]); - } - - grad_v_last[index] = __hmul2(grad_x_seq[index], one_sub_reciprocal_tau_half2); - } - else - { - sdata[threadIdx.x] = __float2half2_rn(0.0f); - } - int threadx = blockDim.x; - #pragma unroll - for (int stride = threadx >> 1; stride > 0; stride = stride >> 1) - { - // Synchronize all thread before next loop - __syncthreads(); - if (threadIdx.x < stride) - { - sdata[threadIdx.x] = __hadd2(sdata[threadIdx.x], sdata[threadIdx.x + stride]); - } - } - __syncthreads(); - if (threadIdx.x == 0) - { - //grad_reciprocal_tau[0] = __hadd(__low2half(sdata[0]), __high2half(sdata[0])); - - /* - The 32-bit floating-point version of atomicAdd() is only supported by devices of compute capability 2.x and higher. - - The 64-bit floating-point version of atomicAdd() is only supported by devices of compute capability 6.x and higher. - - The 32-bit __half2 floating-point version of atomicAdd() is only supported by devices of compute capability 6.x and higher. The atomicity of the __half2 or __nv_bfloat162 add operation is guaranteed separately for each of the two __half or __nv_bfloat16 elements; the entire __half2 or __nv_bfloat162 is not guaranteed to be atomic as a single 32-bit access. - - The 16-bit __half floating-point version of atomicAdd() is only supported by devices of compute capability 7.x and higher. 
+ const half2 reciprocal_tau_half2 = __half2half2(reciprocal_tau); + const half2 delta_T_half2 = __half2half2(delta_T); + const half2 theta_rh_half2 = __half2half2(theta_rh); + const half2 v_threshold_half2 = __half2half2(v_threshold); + const half2 v_rest_half2 = __half2half2(v_rest); - The 16-bit __nv_bfloat16 floating-point version of atomicAdd() is only supported by devices of compute capability 8.x and higher. - */ + const half2 v_reset_half2 = __half2half2(v_reset); + + for(int mem_offset = 0; mem_offset < numel_2; mem_offset += stride) + { + const int t = index + mem_offset; + h_seq[t] = __hfma2(__hfma2(h2exp(__h2div(__hsub2(v_v_seq[t], theta_rh_half2), delta_T_half2)), delta_T_half2, __hadd2(__hsub2(x_seq[t], v_v_seq[t]), v_rest_half2)), reciprocal_tau_half2, v_v_seq[t]); + + spike_seq[t] = __hgeu2(h_seq[t], v_threshold_half2); - atomicAdd(grad_reciprocal_tau, __hadd(__low2half(sdata[0]), __high2half(sdata[0]))); + v_v_seq[t + stride] = __hadd2(__hmul2(spike_seq[t], v_reset_half2), __hmul2(__hsub2(__float2half2_rn(1.0f), spike_seq[t]), h_seq[t])); + } } } -// MultiStepParametricLIFNodePTT bptt PiecewiseLeakyReLU, hard_reset=True, dtype=fp16, detach_reset=False +// MultiStepEIFNodePTT bptt QPseudoSpike, hard_reset=True, dtype=fp16, detach_reset=True #include extern "C" __global__ - void ParametricLIFNode_bptt_hardReset__fp16( + void EIFNode_bptt_hardReset_detachReset_fp16( const half2* grad_spike_seq, const half2* grad_v_seq, const half2* h_seq, const half2* spike_seq, const half2* v_v_seq, - half2* grad_x_seq, half2* grad_v_last, half* grad_reciprocal_tau, + half2* grad_x_seq, half2* grad_v_last, + const half & theta_rh, const half & reciprocal_delta_T, const half & reciprocal_tau, const half & one_sub_reciprocal_tau, const half & v_threshold, const half & v_reset, const int & neuron_num, const int & numel) @@ -4497,86 +16734,98 @@ { const int index = blockIdx.x * blockDim.x + threadIdx.x; const int stride = neuron_num >> 1; - - __shared__ half2 sdata[1024]; if (index < stride) { const half2 reciprocal_tau_half2 = __half2half2(reciprocal_tau); const half2 one_sub_reciprocal_tau_half2 = __half2half2(one_sub_reciprocal_tau); + const half2 reciprocal_delta_T_half2 = __half2half2(reciprocal_delta_T); + const half2 theta_rh_half2 = __half2half2(theta_rh); const half2 v_threshold_half2 = __half2half2(v_threshold); const half2 v_reset_half2 = __half2half2(v_reset); - half2 grad_h = __float2half2_rn(0.0f); // grad_h will be used recursively - sdata[threadIdx.x] = __float2half2_rn(0.0f); for(int mem_offset = (numel >> 1) - stride; mem_offset >= 0; mem_offset -= stride) { const int t = index + mem_offset; const half2 over_th = __hsub2(h_seq[t], v_threshold_half2); - - // start: spikingjelly.clock_driven.surrogate.PiecewiseLeakyReLU.cuda_code + // start: spikingjelly.clock_driven.surrogate.QPseudoSpike.cuda_code - const half2 sg_PiecewiseLeakyReLU_x_abs = __habs2(over_th); - const half2 sg_PiecewiseLeakyReLU_x_abs_ge_w = __hge2(sg_PiecewiseLeakyReLU_x_abs, __float2half2_rn(1.0f)); - half2 grad_s_to_h = __hadd2(__hmul2(__float2half2_rn(0.01f), sg_PiecewiseLeakyReLU_x_abs_ge_w), __hmul2(__hsub2(__float2half2_rn(1.0f), sg_PiecewiseLeakyReLU_x_abs_ge_w), __float2half2_rn(1.0f))); + const half2 sg_QPseudoSpike_alpha = __float2half2_rn(2.0f); + const half2 sg_QPseudoSpike_base = __hadd2(__float2half2_rn(1.0f), __h2div(__hmul2(__float2half2_rn(2.0f), __habs2(over_th)), __hsub2(sg_QPseudoSpike_alpha, __float2half2_rn(1.0f)))); + const half2 grad_s_to_h = h2exp2(__hmul2(h2log2(sg_QPseudoSpike_base), 
__hneg2(sg_QPseudoSpike_alpha))); // Replace power with combination of log and exp, since CUDA has no power function for FP16. - // end: spikingjelly.clock_driven.surrogate.PiecewiseLeakyReLU.cuda_code + // end: spikingjelly.clock_driven.surrogate.QPseudoSpike.cuda_code - const half2 grad_v_to_h = __hfma2(__hsub2(v_reset_half2, h_seq[t]), grad_s_to_h, __hsub2(__float2half2_rn(1.0f), spike_seq[t])); - - grad_h = __hfma2(__hfma2(grad_h, one_sub_reciprocal_tau_half2, grad_v_seq[t]), grad_v_to_h, __hmul2(grad_spike_seq[t], grad_s_to_h)); + const half2 grad_v_to_h = __hsub2(__float2half2_rn(1.0f), spike_seq[t]); + + grad_h = __hfma2(__hfma2(__hfma2(h2exp(__hmul2(__hsub2(v_v_seq[t + stride], theta_rh_half2), reciprocal_delta_T_half2)), reciprocal_tau_half2, one_sub_reciprocal_tau_half2), grad_h, grad_v_seq[t]), grad_v_to_h, __hmul2(grad_spike_seq[t], grad_s_to_h)); grad_x_seq[t] = __hmul2(grad_h, reciprocal_tau_half2); - sdata[threadIdx.x] = __hadd2(__h2div(__hmul2(grad_h, __hsub2(h_seq[t], v_v_seq[t])), reciprocal_tau_half2), sdata[threadIdx.x]); } - - grad_v_last[index] = __hmul2(grad_x_seq[index], one_sub_reciprocal_tau_half2); - } - else - { - sdata[threadIdx.x] = __float2half2_rn(0.0f); - } - int threadx = blockDim.x; - #pragma unroll - for (int stride = threadx >> 1; stride > 0; stride = stride >> 1) - { - // Synchronize all thread before next loop - __syncthreads(); - if (threadIdx.x < stride) - { - sdata[threadIdx.x] = __hadd2(sdata[threadIdx.x], sdata[threadIdx.x + stride]); + grad_v_last[index] = __hmul2(__hfma2(h2exp(__hmul2(__hsub2(v_v_seq[index], theta_rh_half2), reciprocal_delta_T_half2)), reciprocal_tau_half2, one_sub_reciprocal_tau_half2), grad_x_seq[index]); } } - __syncthreads(); - if (threadIdx.x == 0) - { - //grad_reciprocal_tau[0] = __hadd(__low2half(sdata[0]), __high2half(sdata[0])); - /* - The 32-bit floating-point version of atomicAdd() is only supported by devices of compute capability 2.x and higher. +// MultiStepEIFNodePTT bptt QPseudoSpike, hard_reset=True, dtype=fp16, detach_reset=False - The 64-bit floating-point version of atomicAdd() is only supported by devices of compute capability 6.x and higher. - - The 32-bit __half2 floating-point version of atomicAdd() is only supported by devices of compute capability 6.x and higher. The atomicity of the __half2 or __nv_bfloat162 add operation is guaranteed separately for each of the two __half or __nv_bfloat16 elements; the entire __half2 or __nv_bfloat162 is not guaranteed to be atomic as a single 32-bit access. - - The 16-bit __half floating-point version of atomicAdd() is only supported by devices of compute capability 7.x and higher. + #include + extern "C" __global__ + void EIFNode_bptt_hardReset__fp16( + const half2* grad_spike_seq, const half2* grad_v_seq, const half2* h_seq, const half2* spike_seq, const half2* v_v_seq, + half2* grad_x_seq, half2* grad_v_last, + const half & theta_rh, const half & reciprocal_delta_T, + const half & reciprocal_tau, const half & one_sub_reciprocal_tau, + const half & v_threshold, const half & v_reset, + const int & neuron_num, const int & numel) - The 16-bit __nv_bfloat16 floating-point version of atomicAdd() is only supported by devices of compute capability 8.x and higher. 
- */ + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + const int stride = neuron_num >> 1; + if (index < stride) + { + const half2 reciprocal_tau_half2 = __half2half2(reciprocal_tau); + const half2 one_sub_reciprocal_tau_half2 = __half2half2(one_sub_reciprocal_tau); + const half2 reciprocal_delta_T_half2 = __half2half2(reciprocal_delta_T); + const half2 theta_rh_half2 = __half2half2(theta_rh); + const half2 v_threshold_half2 = __half2half2(v_threshold); - atomicAdd(grad_reciprocal_tau, __hadd(__low2half(sdata[0]), __high2half(sdata[0]))); + const half2 v_reset_half2 = __half2half2(v_reset); + half2 grad_h = __float2half2_rn(0.0f); // grad_h will be used recursively + for(int mem_offset = (numel >> 1) - stride; mem_offset >= 0; mem_offset -= stride) + { + const int t = index + mem_offset; + + const half2 over_th = __hsub2(h_seq[t], v_threshold_half2); + + // start: spikingjelly.clock_driven.surrogate.QPseudoSpike.cuda_code + + const half2 sg_QPseudoSpike_alpha = __float2half2_rn(2.0f); + const half2 sg_QPseudoSpike_base = __hadd2(__float2half2_rn(1.0f), __h2div(__hmul2(__float2half2_rn(2.0f), __habs2(over_th)), __hsub2(sg_QPseudoSpike_alpha, __float2half2_rn(1.0f)))); + const half2 grad_s_to_h = h2exp2(__hmul2(h2log2(sg_QPseudoSpike_base), __hneg2(sg_QPseudoSpike_alpha))); // Replace power with combination of log and exp, since CUDA has no power function for FP16. + + // end: spikingjelly.clock_driven.surrogate.QPseudoSpike.cuda_code + + const half2 grad_v_to_h = __hfma2(__hsub2(v_reset_half2, h_seq[t]), grad_s_to_h, __hsub2(__float2half2_rn(1.0f), spike_seq[t])); + + grad_h = __hfma2(__hfma2(__hfma2(h2exp(__hmul2(__hsub2(v_v_seq[t + stride], theta_rh_half2), reciprocal_delta_T_half2)), reciprocal_tau_half2, one_sub_reciprocal_tau_half2), grad_h, grad_v_seq[t]), grad_v_to_h, __hmul2(grad_spike_seq[t], grad_s_to_h)); + grad_x_seq[t] = __hmul2(grad_h, reciprocal_tau_half2); + } + grad_v_last[index] = __hmul2(__hfma2(h2exp(__hmul2(__hsub2(v_v_seq[index], theta_rh_half2), reciprocal_delta_T_half2)), reciprocal_tau_half2, one_sub_reciprocal_tau_half2), grad_x_seq[index]); } } -// MultiStepParametricLIFNodePTT fptt PiecewiseLeakyReLU, hard_reset=False, dtype=fp32 +// MultiStepEIFNodePTT fptt QPseudoSpike, hard_reset=False, dtype=fp32 extern "C" __global__ - void ParametricLIFNode_fptt_softReset_fp32(const float* x_seq, float* v_v_seq, float* h_seq, float* spike_seq, + void EIFNode_fptt_softReset_fp32(const float* x_seq, float* v_v_seq, float* h_seq, float* spike_seq, const float & reciprocal_tau, - const float & v_threshold, + const float & delta_T, + const float & theta_rh, + const float & v_threshold, + const float & v_rest, const int & neuron_num, const int & numel) { @@ -4587,14 +16836,14 @@ for(int mem_offset = 0; mem_offset < numel; mem_offset += neuron_num) { const int t = index + mem_offset; - - h_seq[t] = v_v_seq[t] + reciprocal_tau * (x_seq[t] - v_v_seq[t]); + h_seq[t] = v_v_seq[t] + reciprocal_tau * (x_seq[t] - v_v_seq[t] + v_rest + delta_T * expf((v_v_seq[t] - theta_rh) / delta_T)); if (h_seq[t] >= v_threshold) { spike_seq[t] = 1.0f; + v_v_seq[t + dt] = h_seq[t] - v_threshold; - } + } else { spike_seq[t] = 0.0f; @@ -4605,150 +16854,90 @@ } } -// MultiStepParametricLIFNodePTT bptt PiecewiseLeakyReLU, hard_reset=False, dtype=fp32, detach_reset=True +// MultiStepEIFNodePTT bptt QPseudoSpike, hard_reset=False, dtype=fp32, detach_reset=True extern "C" __global__ - void ParametricLIFNode_bptt_softReset_detachReset_fp32( + void EIFNode_bptt_softReset_detachReset_fp32( const 
float* grad_spike_seq, const float* grad_v_seq, const float* h_seq, const float* spike_seq, const float* v_v_seq, - float* grad_x_seq, float* grad_v_last, float* grad_reciprocal_tau, + float* grad_x_seq, float* grad_v_last, + const float & theta_rh, const float & reciprocal_delta_T, const float & reciprocal_tau, const float & one_sub_reciprocal_tau, const float & v_threshold, const int & neuron_num, const int & numel) { const int index = blockIdx.x * blockDim.x + threadIdx.x; - __shared__ float sdata[1024]; if (index < neuron_num) { float grad_h = 0.0f; // grad_h will be used recursively - sdata[threadIdx.x] = 0.0f; for(int mem_offset = numel - neuron_num; mem_offset >= 0; mem_offset -= neuron_num) { const int t = index + mem_offset; const float over_th = h_seq[t] - v_threshold; - // start: spikingjelly.clock_driven.surrogate.PiecewiseLeakyReLU.cuda_code + // start: spikingjelly.clock_driven.surrogate.QPseudoSpike.cuda_code - const float sg_PiecewiseLeakyReLU_x_abs = fabsf(over_th); - float grad_s_to_h; - if (sg_PiecewiseLeakyReLU_x_abs > 1.0f) - { - grad_s_to_h = 0.01f; - } - else - { - grad_s_to_h = 1.0f; - } + const float sg_QPseudoSpike_base = 1.0f + 2.0f / (2.0f - 1.0f) * fabsf(over_th); + const float grad_s_to_h = powf(sg_QPseudoSpike_base, -2.0f); - // end: spikingjelly.clock_driven.surrogate.PiecewiseLeakyReLU.cuda_code + // end: spikingjelly.clock_driven.surrogate.QPseudoSpike.cuda_code const float grad_v_to_h = 1.0f; - grad_h = grad_spike_seq[t] * grad_s_to_h + (grad_v_seq[t] + grad_h * one_sub_reciprocal_tau) * grad_v_to_h; - // grad_h = fmaf(grad_spike_seq[t], grad_s_to_h, fmaf(grad_h, one_sub_reciprocal_tau, grad_v_seq[t]) * grad_v_to_h); + grad_h = grad_spike_seq[t] * grad_s_to_h + (grad_v_seq[t] + grad_h * (one_sub_reciprocal_tau + reciprocal_tau * expf((v_v_seq[t + neuron_num] - theta_rh) * reciprocal_delta_T))) * grad_v_to_h; grad_x_seq[t] = grad_h * reciprocal_tau; - sdata[threadIdx.x] += grad_h * (h_seq[t] - v_v_seq[t]) / reciprocal_tau; } - grad_v_last[index] = grad_x_seq[index] * one_sub_reciprocal_tau; - } - else - { - sdata[threadIdx.x] = 0.0f; - } - int threadx = blockDim.x; - #pragma unroll - for (int stride = threadx >> 1; stride > 0; stride = stride >> 1) - { - // Synchronize all thread before next loop - __syncthreads(); - if (threadIdx.x < stride) - { - sdata[threadIdx.x] += sdata[threadIdx.x + stride]; - } - } - __syncthreads(); - if (threadIdx.x == 0) - { - atomicAdd(grad_reciprocal_tau, sdata[0]); + grad_v_last[index] = grad_x_seq[index] * (one_sub_reciprocal_tau + reciprocal_tau * expf((v_v_seq[index] - theta_rh) * reciprocal_delta_T)); } } -// MultiStepParametricLIFNodePTT bptt PiecewiseLeakyReLU, hard_reset=False, dtype=fp32, detach_reset=False +// MultiStepEIFNodePTT bptt QPseudoSpike, hard_reset=False, dtype=fp32, detach_reset=False extern "C" __global__ - void ParametricLIFNode_bptt_softReset__fp32( + void EIFNode_bptt_softReset__fp32( const float* grad_spike_seq, const float* grad_v_seq, const float* h_seq, const float* spike_seq, const float* v_v_seq, - float* grad_x_seq, float* grad_v_last, float* grad_reciprocal_tau, + float* grad_x_seq, float* grad_v_last, + const float & theta_rh, const float & reciprocal_delta_T, const float & reciprocal_tau, const float & one_sub_reciprocal_tau, const float & v_threshold, const int & neuron_num, const int & numel) { const int index = blockIdx.x * blockDim.x + threadIdx.x; - __shared__ float sdata[1024]; if (index < neuron_num) { float grad_h = 0.0f; // grad_h will be used recursively - sdata[threadIdx.x] = 
0.0f; for(int mem_offset = numel - neuron_num; mem_offset >= 0; mem_offset -= neuron_num) { const int t = index + mem_offset; const float over_th = h_seq[t] - v_threshold; - // start: spikingjelly.clock_driven.surrogate.PiecewiseLeakyReLU.cuda_code + // start: spikingjelly.clock_driven.surrogate.QPseudoSpike.cuda_code - const float sg_PiecewiseLeakyReLU_x_abs = fabsf(over_th); - float grad_s_to_h; - if (sg_PiecewiseLeakyReLU_x_abs > 1.0f) - { - grad_s_to_h = 0.01f; - } - else - { - grad_s_to_h = 1.0f; - } + const float sg_QPseudoSpike_base = 1.0f + 2.0f / (2.0f - 1.0f) * fabsf(over_th); + const float grad_s_to_h = powf(sg_QPseudoSpike_base, -2.0f); - // end: spikingjelly.clock_driven.surrogate.PiecewiseLeakyReLU.cuda_code + // end: spikingjelly.clock_driven.surrogate.QPseudoSpike.cuda_code const float grad_v_to_h = 1.0f - v_threshold * grad_s_to_h; - // const float grad_v_to_h = fmaf(-v_threshold, grad_s_to_h, 1.0f); - grad_h = grad_spike_seq[t] * grad_s_to_h + (grad_v_seq[t] + grad_h * one_sub_reciprocal_tau) * grad_v_to_h; - // grad_h = fmaf(grad_spike_seq[t], grad_s_to_h, fmaf(grad_h, one_sub_reciprocal_tau, grad_v_seq[t]) * grad_v_to_h); + grad_h = grad_spike_seq[t] * grad_s_to_h + (grad_v_seq[t] + grad_h * (one_sub_reciprocal_tau + reciprocal_tau * expf((v_v_seq[t + neuron_num] - theta_rh) * reciprocal_delta_T))) * grad_v_to_h; grad_x_seq[t] = grad_h * reciprocal_tau; - sdata[threadIdx.x] += grad_h * (h_seq[t] - v_v_seq[t]) / reciprocal_tau; } - grad_v_last[index] = grad_x_seq[index] * one_sub_reciprocal_tau; - } - else - { - sdata[threadIdx.x] = 0.0f; - } - int threadx = blockDim.x; - #pragma unroll - for (int stride = threadx >> 1; stride > 0; stride = stride >> 1) - { - // Synchronize all thread before next loop - __syncthreads(); - if (threadIdx.x < stride) - { - sdata[threadIdx.x] += sdata[threadIdx.x + stride]; - } - } - __syncthreads(); - if (threadIdx.x == 0) - { - atomicAdd(grad_reciprocal_tau, sdata[0]); + grad_v_last[index] = grad_x_seq[index] * (one_sub_reciprocal_tau + reciprocal_tau * expf((v_v_seq[index] - theta_rh) * reciprocal_delta_T)); } } -// MultiStepParametricLIFNodePTT fptt PiecewiseLeakyReLU, hard_reset=False, dtype=fp16 +// MultiStepEIFNodePTT fptt QPseudoSpike, hard_reset=False, dtype=fp16 #include extern "C" __global__ - void ParametricLIFNode_fptt_softReset_fp16(const half2* x_seq, half2* v_v_seq, half2* h_seq, half2* spike_seq, + void EIFNode_fptt_softReset_fp16(const half2* x_seq, half2* v_v_seq, half2* h_seq, half2* spike_seq, const half & reciprocal_tau, - const half & v_threshold, + const half & delta_T, + const half & theta_rh, + const half & v_threshold, + const half & v_rest, const int & neuron_num, const int & numel) { @@ -4758,27 +16947,32 @@ { const int numel_2 = numel >> 1; const half2 reciprocal_tau_half2 = __half2half2(reciprocal_tau); + const half2 delta_T_half2 = __half2half2(delta_T); + const half2 theta_rh_half2 = __half2half2(theta_rh); const half2 v_threshold_half2 = __half2half2(v_threshold); + const half2 v_rest_half2 = __half2half2(v_rest); for(int mem_offset = 0; mem_offset < numel_2; mem_offset += stride) { const int t = index + mem_offset; - - h_seq[t] = __hfma2(__hsub2(x_seq[t], v_v_seq[t]), reciprocal_tau_half2, v_v_seq[t]); + h_seq[t] = __hfma2(__hfma2(h2exp(__h2div(__hsub2(v_v_seq[t], theta_rh_half2), delta_T_half2)), delta_T_half2, __hadd2(__hsub2(x_seq[t], v_v_seq[t]), v_rest_half2)), reciprocal_tau_half2, v_v_seq[t]); + spike_seq[t] = __hgeu2(h_seq[t], v_threshold_half2); + v_v_seq[t + stride] = __hadd2(__hmul2(spike_seq[t], 
__hsub2(h_seq[t], v_threshold_half2)), __hmul2(__hsub2(__float2half2_rn(1.0f), spike_seq[t]), h_seq[t])); } } } -// MultiStepParametricLIFNodePTT bptt PiecewiseLeakyReLU, hard_reset=False, dtype=fp16, detach_reset=True +// MultiStepEIFNodePTT bptt QPseudoSpike, hard_reset=False, dtype=fp16, detach_reset=True #include extern "C" __global__ - void ParametricLIFNode_bptt_softReset_detachReset_fp16( + void EIFNode_bptt_softReset_detachReset_fp16( const half2* grad_spike_seq, const half2* grad_v_seq, const half2* h_seq, const half2* spike_seq, const half2* v_v_seq, - half2* grad_x_seq, half2* grad_v_last, half* grad_reciprocal_tau, + half2* grad_x_seq, half2* grad_v_last, + const half & theta_rh, const half & reciprocal_delta_T, const half & reciprocal_tau, const half & one_sub_reciprocal_tau, const half & v_threshold, const int & neuron_num, const int & numel) @@ -4786,85 +16980,46 @@ { const int index = blockIdx.x * blockDim.x + threadIdx.x; const int stride = neuron_num >> 1; - - __shared__ half2 sdata[1024]; if (index < stride) { const half2 reciprocal_tau_half2 = __half2half2(reciprocal_tau); const half2 one_sub_reciprocal_tau_half2 = __half2half2(one_sub_reciprocal_tau); + const half2 reciprocal_delta_T_half2 = __half2half2(reciprocal_delta_T); + const half2 theta_rh_half2 = __half2half2(theta_rh); const half2 v_threshold_half2 = __half2half2(v_threshold); - half2 grad_h = __float2half2_rn(0.0f); // grad_h will be used recursively - sdata[threadIdx.x] = __float2half2_rn(0.0f); for(int mem_offset = (numel >> 1) - stride; mem_offset >= 0; mem_offset -= stride) { const int t = index + mem_offset; const half2 over_th = __hsub2(h_seq[t], v_threshold_half2); - - // start: spikingjelly.clock_driven.surrogate.PiecewiseLeakyReLU.cuda_code + // start: spikingjelly.clock_driven.surrogate.QPseudoSpike.cuda_code - const half2 sg_PiecewiseLeakyReLU_x_abs = __habs2(over_th); - const half2 sg_PiecewiseLeakyReLU_x_abs_ge_w = __hge2(sg_PiecewiseLeakyReLU_x_abs, __float2half2_rn(1.0f)); - half2 grad_s_to_h = __hadd2(__hmul2(__float2half2_rn(0.01f), sg_PiecewiseLeakyReLU_x_abs_ge_w), __hmul2(__hsub2(__float2half2_rn(1.0f), sg_PiecewiseLeakyReLU_x_abs_ge_w), __float2half2_rn(1.0f))); + const half2 sg_QPseudoSpike_alpha = __float2half2_rn(2.0f); + const half2 sg_QPseudoSpike_base = __hadd2(__float2half2_rn(1.0f), __h2div(__hmul2(__float2half2_rn(2.0f), __habs2(over_th)), __hsub2(sg_QPseudoSpike_alpha, __float2half2_rn(1.0f)))); + const half2 grad_s_to_h = h2exp2(__hmul2(h2log2(sg_QPseudoSpike_base), __hneg2(sg_QPseudoSpike_alpha))); // Replace power with combination of log and exp, since CUDA has no power function for FP16. 
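The QPseudoSpike surrogate gradient used above is $(1 + \frac{2|x|}{\alpha - 1})^{-\alpha}$ with $\alpha = 2$ hard-coded in the generated code; since CUDA has no half2 pow intrinsic, the fp16 kernels rewrite $\mathrm{base}^{-\alpha}$ as $2^{-\alpha \log_2 \mathrm{base}}$. A minimal PyTorch sketch of both forms (the helper names are illustrative, not part of SpikingJelly):

```python
import torch

def qpseudospike_grad(over_th: torch.Tensor, alpha: float = 2.0) -> torch.Tensor:
    # grad_s_to_h = (1 + 2|x| / (alpha - 1)) ** (-alpha), as in the fp32 kernels.
    base = 1.0 + 2.0 * over_th.abs() / (alpha - 1.0)
    return base.pow(-alpha)

def qpseudospike_grad_fp16_style(over_th: torch.Tensor, alpha: float = 2.0) -> torch.Tensor:
    # The same value computed as exp2(-alpha * log2(base)), mirroring the
    # h2exp2/h2log2 combination used because FP16 has no pow function.
    base = 1.0 + 2.0 * over_th.abs() / (alpha - 1.0)
    return torch.exp2(-alpha * torch.log2(base))
```

Both forms return identical values up to floating-point rounding.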
- // end: spikingjelly.clock_driven.surrogate.PiecewiseLeakyReLU.cuda_code + // end: spikingjelly.clock_driven.surrogate.QPseudoSpike.cuda_code const half2 grad_v_to_h = __float2half2_rn(1.0f); - - grad_h = __hfma2(__hfma2(grad_h, one_sub_reciprocal_tau_half2, grad_v_seq[t]), grad_v_to_h, __hmul2(grad_spike_seq[t], grad_s_to_h)); + + grad_h = __hfma2(__hfma2(__hfma2(h2exp(__hmul2(__hsub2(v_v_seq[t + stride], theta_rh_half2), reciprocal_delta_T_half2)), reciprocal_tau_half2, one_sub_reciprocal_tau_half2), grad_h, grad_v_seq[t]), grad_v_to_h, __hmul2(grad_spike_seq[t], grad_s_to_h)); grad_x_seq[t] = __hmul2(grad_h, reciprocal_tau_half2); - sdata[threadIdx.x] = __hadd2(__h2div(__hmul2(grad_h, __hsub2(h_seq[t], v_v_seq[t])), reciprocal_tau_half2), sdata[threadIdx.x]); } - - grad_v_last[index] = __hmul2(grad_x_seq[index], one_sub_reciprocal_tau_half2); - } - else - { - sdata[threadIdx.x] = __float2half2_rn(0.0f); - } - int threadx = blockDim.x; - #pragma unroll - for (int stride = threadx >> 1; stride > 0; stride = stride >> 1) - { - // Synchronize all thread before next loop - __syncthreads(); - if (threadIdx.x < stride) - { - sdata[threadIdx.x] = __hadd2(sdata[threadIdx.x], sdata[threadIdx.x + stride]); - } - } - __syncthreads(); - if (threadIdx.x == 0) - { - //grad_reciprocal_tau[0] = __hadd(__low2half(sdata[0]), __high2half(sdata[0])); - - /* - The 32-bit floating-point version of atomicAdd() is only supported by devices of compute capability 2.x and higher. - - The 64-bit floating-point version of atomicAdd() is only supported by devices of compute capability 6.x and higher. - - The 32-bit __half2 floating-point version of atomicAdd() is only supported by devices of compute capability 6.x and higher. The atomicity of the __half2 or __nv_bfloat162 add operation is guaranteed separately for each of the two __half or __nv_bfloat16 elements; the entire __half2 or __nv_bfloat162 is not guaranteed to be atomic as a single 32-bit access. - - The 16-bit __half floating-point version of atomicAdd() is only supported by devices of compute capability 7.x and higher. - - The 16-bit __nv_bfloat16 floating-point version of atomicAdd() is only supported by devices of compute capability 8.x and higher. 
- */ - - atomicAdd(grad_reciprocal_tau, __hadd(__low2half(sdata[0]), __high2half(sdata[0]))); - + grad_v_last[index] = __hmul2(__hfma2(h2exp(__hmul2(__hsub2(v_v_seq[index], theta_rh_half2), reciprocal_delta_T_half2)), reciprocal_tau_half2, one_sub_reciprocal_tau_half2), grad_x_seq[index]); } } -// MultiStepParametricLIFNodePTT bptt PiecewiseLeakyReLU, hard_reset=False, dtype=fp16, detach_reset=False +// MultiStepEIFNodePTT bptt QPseudoSpike, hard_reset=False, dtype=fp16, detach_reset=False #include extern "C" __global__ - void ParametricLIFNode_bptt_softReset__fp16( + void EIFNode_bptt_softReset__fp16( const half2* grad_spike_seq, const half2* grad_v_seq, const half2* h_seq, const half2* spike_seq, const half2* v_v_seq, - half2* grad_x_seq, half2* grad_v_last, half* grad_reciprocal_tau, + half2* grad_x_seq, half2* grad_v_last, + const half & theta_rh, const half & reciprocal_delta_T, const half & reciprocal_tau, const half & one_sub_reciprocal_tau, const half & v_threshold, const int & neuron_num, const int & numel) @@ -4872,75 +17027,35 @@ { const int index = blockIdx.x * blockDim.x + threadIdx.x; const int stride = neuron_num >> 1; - - __shared__ half2 sdata[1024]; if (index < stride) { const half2 reciprocal_tau_half2 = __half2half2(reciprocal_tau); const half2 one_sub_reciprocal_tau_half2 = __half2half2(one_sub_reciprocal_tau); + const half2 reciprocal_delta_T_half2 = __half2half2(reciprocal_delta_T); + const half2 theta_rh_half2 = __half2half2(theta_rh); const half2 v_threshold_half2 = __half2half2(v_threshold); - half2 grad_h = __float2half2_rn(0.0f); // grad_h will be used recursively - sdata[threadIdx.x] = __float2half2_rn(0.0f); for(int mem_offset = (numel >> 1) - stride; mem_offset >= 0; mem_offset -= stride) { const int t = index + mem_offset; const half2 over_th = __hsub2(h_seq[t], v_threshold_half2); - - // start: spikingjelly.clock_driven.surrogate.PiecewiseLeakyReLU.cuda_code + // start: spikingjelly.clock_driven.surrogate.QPseudoSpike.cuda_code - const half2 sg_PiecewiseLeakyReLU_x_abs = __habs2(over_th); - const half2 sg_PiecewiseLeakyReLU_x_abs_ge_w = __hge2(sg_PiecewiseLeakyReLU_x_abs, __float2half2_rn(1.0f)); - half2 grad_s_to_h = __hadd2(__hmul2(__float2half2_rn(0.01f), sg_PiecewiseLeakyReLU_x_abs_ge_w), __hmul2(__hsub2(__float2half2_rn(1.0f), sg_PiecewiseLeakyReLU_x_abs_ge_w), __float2half2_rn(1.0f))); + const half2 sg_QPseudoSpike_alpha = __float2half2_rn(2.0f); + const half2 sg_QPseudoSpike_base = __hadd2(__float2half2_rn(1.0f), __h2div(__hmul2(__float2half2_rn(2.0f), __habs2(over_th)), __hsub2(sg_QPseudoSpike_alpha, __float2half2_rn(1.0f)))); + const half2 grad_s_to_h = h2exp2(__hmul2(h2log2(sg_QPseudoSpike_base), __hneg2(sg_QPseudoSpike_alpha))); // Replace power with combination of log and exp, since CUDA has no power function for FP16. 
- // end: spikingjelly.clock_driven.surrogate.PiecewiseLeakyReLU.cuda_code + // end: spikingjelly.clock_driven.surrogate.QPseudoSpike.cuda_code const half2 grad_v_to_h = __hsub2(__float2half2_rn(1.0f), __hmul2(v_threshold_half2, grad_s_to_h)); - - grad_h = __hfma2(__hfma2(grad_h, one_sub_reciprocal_tau_half2, grad_v_seq[t]), grad_v_to_h, __hmul2(grad_spike_seq[t], grad_s_to_h)); + + grad_h = __hfma2(__hfma2(__hfma2(h2exp(__hmul2(__hsub2(v_v_seq[t + stride], theta_rh_half2), reciprocal_delta_T_half2)), reciprocal_tau_half2, one_sub_reciprocal_tau_half2), grad_h, grad_v_seq[t]), grad_v_to_h, __hmul2(grad_spike_seq[t], grad_s_to_h)); grad_x_seq[t] = __hmul2(grad_h, reciprocal_tau_half2); - sdata[threadIdx.x] = __hadd2(__h2div(__hmul2(grad_h, __hsub2(h_seq[t], v_v_seq[t])), reciprocal_tau_half2), sdata[threadIdx.x]); } - - grad_v_last[index] = __hmul2(grad_x_seq[index], one_sub_reciprocal_tau_half2); - } - else - { - sdata[threadIdx.x] = __float2half2_rn(0.0f); - } - int threadx = blockDim.x; - #pragma unroll - for (int stride = threadx >> 1; stride > 0; stride = stride >> 1) - { - // Synchronize all thread before next loop - __syncthreads(); - if (threadIdx.x < stride) - { - sdata[threadIdx.x] = __hadd2(sdata[threadIdx.x], sdata[threadIdx.x + stride]); - } - } - __syncthreads(); - if (threadIdx.x == 0) - { - //grad_reciprocal_tau[0] = __hadd(__low2half(sdata[0]), __high2half(sdata[0])); - - /* - The 32-bit floating-point version of atomicAdd() is only supported by devices of compute capability 2.x and higher. - - The 64-bit floating-point version of atomicAdd() is only supported by devices of compute capability 6.x and higher. - - The 32-bit __half2 floating-point version of atomicAdd() is only supported by devices of compute capability 6.x and higher. The atomicity of the __half2 or __nv_bfloat162 add operation is guaranteed separately for each of the two __half or __nv_bfloat16 elements; the entire __half2 or __nv_bfloat162 is not guaranteed to be atomic as a single 32-bit access. - - The 16-bit __half floating-point version of atomicAdd() is only supported by devices of compute capability 7.x and higher. - - The 16-bit __nv_bfloat16 floating-point version of atomicAdd() is only supported by devices of compute capability 8.x and higher. 
- */ - - atomicAdd(grad_reciprocal_tau, __hadd(__low2half(sdata[0]), __high2half(sdata[0]))); - + grad_v_last[index] = __hmul2(__hfma2(h2exp(__hmul2(__hsub2(v_v_seq[index], theta_rh_half2), reciprocal_delta_T_half2)), reciprocal_tau_half2, one_sub_reciprocal_tau_half2), grad_x_seq[index]); } } \ No newline at end of file diff --git a/spikingjelly/clock_driven/neuron_kernel.md b/spikingjelly/clock_driven/neuron_kernel.md index 9960400..42fa9b4 100644 --- a/spikingjelly/clock_driven/neuron_kernel.md +++ b/spikingjelly/clock_driven/neuron_kernel.md @@ -85,7 +85,7 @@ $$ $$ ## Leaky-Integrate-and-Fire Neuron (LIF Neuron) -For the LIF neuron, the charge function is +For the LIF neuron with decay input, the charge function is $$ H[t] = V[t - 1] + \frac{1}{\tau}(X[t] - (V[t - 1] - V_{reset})) $$ @@ -98,9 +98,22 @@ $$ \end{align} $$ +For the LIF neuron without decay input, the charge function is +$$ +H[t] = V[t - 1] - \frac{1}{\tau}(V[t - 1] - V_{reset}) + X[t] +$$ +Then the gradients are +$$ +\begin{align} +\frac{\mathrm{d} L}{\mathrm{d} H[t]} &=\frac{\partial L}{\partial S[t]}\frac{\mathrm{d} S[t]}{\mathrm{d} H[t]} + (\frac{\partial L}{\partial V[t]}+\frac{\mathrm{d} L}{\mathrm{d} H[t+1]}(1 - \frac{1}{\tau}))\frac{\mathrm{d} V[t]}{\mathrm{d} H[t]}\\ +\frac{\mathrm{d} L}{\mathrm{d} X[t]} &= \frac{\mathrm{d} L}{\mathrm{d} H[t]}\\ +\frac{\mathrm{d} L}{\mathrm{d} V[0]} &= \frac{\mathrm{d} L}{\mathrm{d} H[1]} (1 - \frac{1}{\tau}) +\end{align} +$$ + ## Parametric Leaky-Integrate-and-Fire Neuron (PLIF Neuron) -For the PLIF neuron, the charge function is +For the PLIF neuron with decay input, the charge function is $$ H[t] = V[t - 1] + \frac{1}{\tau}(X[t] - (V[t - 1] - V_{reset})) $$ @@ -114,6 +127,20 @@ $$ \end{align} $$ +For the PLIF neuron without decay input, the charge function is +$$ +H[t] = V[t - 1] - \frac{1}{\tau}(V[t - 1] - V_{reset}) + X[t] +$$ +Then the gradients are +$$ +\begin{align} +\frac{\mathrm{d} L}{\mathrm{d} H[t]} &=\frac{\partial L}{\partial S[t]}\frac{\mathrm{d} S[t]}{\mathrm{d} H[t]} + (\frac{\partial L}{\partial V[t]}+\frac{\mathrm{d} L}{\mathrm{d} H[t+1]}(1 - \frac{1}{\tau}))\frac{\mathrm{d} V[t]}{\mathrm{d} H[t]}\\ +\frac{\mathrm{d} L}{\mathrm{d} X[t]} &= \frac{\mathrm{d} L}{\mathrm{d} H[t]}\\ +\frac{\mathrm{d} L}{\mathrm{d} \frac{1}{\tau}} &= \sum_{t} \frac{\mathrm{d} L}{\mathrm{d} H[t]} (V_{reset} - V[t - 1])\\ +\frac{\mathrm{d} L}{\mathrm{d} V[0]} &= \frac{\mathrm{d} L}{\mathrm{d} H[1]} (1 - \frac{1}{\tau}) +\end{align} +$$ + ## Exponential Integrate-and-Fire Neuron (EIF Neuron) For the EIF neuron, the charge function is diff --git a/spikingjelly/clock_driven/neuron_kernel.py b/spikingjelly/clock_driven/neuron_kernel.py index 61071b3..41ce535 100644 --- a/spikingjelly/clock_driven/neuron_kernel.py +++ b/spikingjelly/clock_driven/neuron_kernel.py @@ -1,1600 +1,1744 @@ +import logging + try: import cupy - import torch - import torch.nn.functional as F - from . import cu_kernel_opt, surrogate - from ..configure import cuda_threads, cuda_compiler_options, cuda_compiler_backend - import numpy as np - +except BaseException as e: + logging.info(f'spikingjelly.clock_driven.neuron_kernel: {e}') + pass +import torch +import torch.nn.functional as F +from . import cu_kernel_opt, surrogate, tensor_cache +from .. 
import configure +import numpy as np + - class MultiStepIFNodePTT(torch.autograd.Function): - @staticmethod - def create_fptt_kernel(hard_reset: bool, dtype: str): - kernel_name = f'IFNode_fptt_{"hard" if hard_reset else "soft"}Reset_{dtype}' - if dtype == 'fp32': - code = rf''' - extern "C" __global__ - void {kernel_name}(const float* x_seq, float* v_v_seq, float* h_seq, float* spike_seq, - const float & v_threshold, {'const float & v_reset,' if hard_reset else ''} - const int & neuron_num, const int & numel) - ''' +class MultiStepIFNodePTT(torch.autograd.Function): + @staticmethod + def create_fptt_kernel(hard_reset: bool, dtype: str): + kernel_name = f'IFNode_fptt_{"hard" if hard_reset else "soft"}Reset_{dtype}' + + if dtype == 'fp32': + code = rf''' + extern "C" __global__ + void {kernel_name}(const float* x_seq, float* v_v_seq, float* h_seq, float* spike_seq, + const float & v_threshold, {'const float & v_reset,' if hard_reset else ''} + const int & neuron_num, const int & numel) + ''' + + code += r''' + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index < neuron_num) + { + const int dt = neuron_num; + for(int mem_offset = 0; mem_offset < numel; mem_offset += neuron_num) + { + const int t = index + mem_offset; + h_seq[t] = v_v_seq[t] + x_seq[t]; + if (h_seq[t] >= v_threshold) + ''' + if hard_reset: code += r''' - { - const int index = blockIdx.x * blockDim.x + threadIdx.x; - if (index < neuron_num) - { - const int dt = neuron_num; - for(int mem_offset = 0; mem_offset < numel; mem_offset += neuron_num) { - const int t = index + mem_offset; - h_seq[t] = v_v_seq[t] + x_seq[t]; - if (h_seq[t] >= v_threshold) + spike_seq[t] = 1.0f; + v_v_seq[t + dt] = v_reset; + } ''' - - if hard_reset: - code += r''' - { - spike_seq[t] = 1.0f; - v_v_seq[t + dt] = v_reset; - } - ''' - else: - code += r''' - { - spike_seq[t] = 1.0f; - v_v_seq[t + dt] = h_seq[t] - v_threshold; - } - ''' - + else: code += r''' - else - { - spike_seq[t] = 0.0f; - v_v_seq[t + dt] = h_seq[t]; - } + { + spike_seq[t] = 1.0f; + v_v_seq[t + dt] = h_seq[t] - v_threshold; } - } - } ''' - elif dtype == 'fp16': - code = rf''' - #include - extern "C" __global__ - void {kernel_name}(const half2* x_seq, half2* v_v_seq, half2* h_seq, half2* spike_seq, - const half & v_threshold, {'const half & v_reset,' if hard_reset else ''} - const int & neuron_num, const int & numel) + code += r''' + else + { + spike_seq[t] = 0.0f; + v_v_seq[t + dt] = h_seq[t]; + } + } + } + } + ''' + + elif dtype == 'fp16': + code = rf''' + #include + extern "C" __global__ + void {kernel_name}(const half2* x_seq, half2* v_v_seq, half2* h_seq, half2* spike_seq, + const half & v_threshold, {'const half & v_reset,' if hard_reset else ''} + const int & neuron_num, const int & numel) + ''' + + code += r''' + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + const int stride = neuron_num >> 1; + if (index < stride) + { + const int numel_2 = numel >> 1; + const half2 v_threshold_half2 = __half2half2(v_threshold); + ''' + + if hard_reset: + code += r''' + const half2 v_reset_half2 = __half2half2(v_reset); ''' - code += r''' - { - const int index = blockIdx.x * blockDim.x + threadIdx.x; - const int stride = neuron_num >> 1; - if (index < stride) + code += r''' + for(int mem_offset = 0; mem_offset < numel_2; mem_offset += stride) { - const int numel_2 = numel >> 1; - const half2 v_threshold_half2 = __half2half2(v_threshold); - ''' + const int t = index + mem_offset; + h_seq[t] = __hadd2(v_v_seq[t], x_seq[t]); - if hard_reset: - code += r''' - 
const half2 v_reset_half2 = __half2half2(v_reset); - ''' + spike_seq[t] = __hgeu2(h_seq[t], v_threshold_half2); + ''' + if hard_reset: code += r''' - for(int mem_offset = 0; mem_offset < numel_2; mem_offset += stride) - { - const int t = index + mem_offset; - h_seq[t] = __hadd2(v_v_seq[t], x_seq[t]); - - spike_seq[t] = __hgeu2(h_seq[t], v_threshold_half2); + v_v_seq[t + stride] = __hadd2(__hmul2(spike_seq[t], v_reset_half2), __hmul2(__hsub2(__float2half2_rn(1.0f), spike_seq[t]), h_seq[t])); ''' - - if hard_reset: - code += r''' - v_v_seq[t + stride] = __hadd2(__hmul2(spike_seq[t], v_reset_half2), __hmul2(__hsub2(__float2half2_rn(1.0f), spike_seq[t]), h_seq[t])); - ''' - else: - code += r''' - v_v_seq[t + stride] = __hadd2(__hmul2(spike_seq[t], __hsub2(h_seq[t], v_threshold_half2)), __hmul2(__hsub2(__float2half2_rn(1.0f), spike_seq[t]), h_seq[t])); - ''' - + else: code += r''' - } - } - } + v_v_seq[t + stride] = __hadd2(__hmul2(spike_seq[t], __hsub2(h_seq[t], v_threshold_half2)), __hmul2(__hsub2(__float2half2_rn(1.0f), spike_seq[t]), h_seq[t])); ''' - else: - raise TypeError - return cupy.RawKernel(code, kernel_name, options=cuda_compiler_options, backend=cuda_compiler_backend) + code += r''' + } + } + } + ''' + else: + raise TypeError - @staticmethod - def create_bptt_kernel(sg_cuda_code_fun, hard_reset: bool, detach_reset: bool, dtype: str): + return cupy.RawKernel(code, kernel_name, options=configure.cuda_compiler_options, backend=configure.cuda_compiler_backend) - kernel_name = f'IFNode_bptt_{"hard" if hard_reset else "soft"}Reset_{"detachReset" if detach_reset else ""}_{dtype}' + @staticmethod + def create_bptt_kernel(sg_cuda_code_fun, hard_reset: bool, detach_reset: bool, dtype: str): - code_grad_s_to_h = sg_cuda_code_fun(x='over_th', y='grad_s_to_h', dtype=dtype) + kernel_name = f'IFNode_bptt_{"hard" if hard_reset else "soft"}Reset_{"detachReset" if detach_reset else ""}_{dtype}' - if dtype == 'fp32': - code = fr''' - extern "C" __global__ - void {kernel_name}( - const float* grad_spike_seq, const float* grad_v_seq, const float* h_seq, const float* spike_seq, - float* grad_x_seq, float* grad_v_last, - const float & v_threshold, {'const float & v_reset,' if hard_reset else ''} - const int & neuron_num, const int & numel) - ''' + code_grad_s_to_h = sg_cuda_code_fun(x='over_th', y='grad_s_to_h', dtype=dtype) - code += r''' - { - const int index = blockIdx.x * blockDim.x + threadIdx.x; - if (index < neuron_num) - { - float grad_h = 0.0f; // grad_h will be used recursively - for(int mem_offset = numel - neuron_num; mem_offset >= 0; mem_offset -= neuron_num) - { - const int t = index + mem_offset; - const float over_th = h_seq[t] - v_threshold; - ''' - code += code_grad_s_to_h - if detach_reset: - if hard_reset: - code_grad_v_to_h = r''' - const float grad_v_to_h = 1.0f - spike_seq[t]; - ''' - else: - code_grad_v_to_h = r''' - const float grad_v_to_h = 1.0f; - ''' - else: - if hard_reset: - code_grad_v_to_h = r''' - const float grad_v_to_h = 1.0f - spike_seq[t] + (v_reset - h_seq[t]) * grad_s_to_h; - // const float grad_v_to_h = fmaf(grad_s_to_h, v_reset - h_seq[t], 1.0f - spike_seq[t]); - ''' - else: - code_grad_v_to_h = r''' - const float grad_v_to_h = 1.0f - v_threshold * grad_s_to_h; - // const float grad_v_to_h = fmaf(-grad_s_to_h, v_threshold, 1.0f); - ''' - - code += code_grad_v_to_h - code += r''' - grad_h = grad_spike_seq[t] * grad_s_to_h + (grad_v_seq[t] + grad_h) * grad_v_to_h; - // grad_h = fmaf(grad_spike_seq[t], grad_s_to_h, (grad_v_seq[t] + grad_h) * grad_v_to_h); - 
grad_x_seq[t] = grad_h; - } - grad_v_last[index] = grad_x_seq[index]; - } - } - ''' + if dtype == 'fp32': + code = fr''' + extern "C" __global__ + void {kernel_name}( + const float* grad_spike_seq, const float* grad_v_seq, const float* h_seq, const float* spike_seq, + float* grad_x_seq, float* grad_v_last, + const float & v_threshold, {'const float & v_reset,' if hard_reset else ''} + const int & neuron_num, const int & numel) + ''' - elif dtype == 'fp16': - code = fr''' - #include - extern "C" __global__ - void {kernel_name}( - const half2* grad_spike_seq, const half2* grad_v_seq, const half2* h_seq, const half2* spike_seq, - half2* grad_x_seq, half2* grad_v_last, - const half & v_threshold, {'const half & v_reset,' if hard_reset else ''} - const int & neuron_num, const int & numel) - ''' - code += r''' - { + code += r''' + { const int index = blockIdx.x * blockDim.x + threadIdx.x; - const int stride = neuron_num >> 1; - if (index < stride) + if (index < neuron_num) { - const half2 v_threshold_half2 = __half2half2(v_threshold); - ''' - - if hard_reset: - code += r''' - const half2 v_reset_half2 = __half2half2(v_reset); - ''' - - code += r''' - half2 grad_h = __float2half2_rn(0.0f); // grad_h will be used recursively - for(int mem_offset = (numel >> 1) - stride; mem_offset >= 0; mem_offset -= stride) + float grad_h = 0.0f; // grad_h will be used recursively + for(int mem_offset = numel - neuron_num; mem_offset >= 0; mem_offset -= neuron_num) { const int t = index + mem_offset; - const half2 over_th = __hsub2(h_seq[t], v_threshold_half2); - ''' - code += code_grad_s_to_h - - if detach_reset: - if hard_reset: - code_grad_v_to_h = r''' - const half2 grad_v_to_h = __hsub2(__float2half2_rn(1.0f), spike_seq[t]); - ''' - else: - code_grad_v_to_h = r''' - const half2 grad_v_to_h = __float2half2_rn(1.0f); - ''' + const float over_th = h_seq[t] - v_threshold; + ''' + code += code_grad_s_to_h + if detach_reset: + if hard_reset: + code_grad_v_to_h = r''' + const float grad_v_to_h = 1.0f - spike_seq[t]; + ''' else: - if hard_reset: - code_grad_v_to_h = r''' - const half2 grad_v_to_h = __hfma2(__hsub2(v_reset_half2, h_seq[t]), grad_s_to_h, __hsub2(__float2half2_rn(1.0f), spike_seq[t])); - ''' - else: - code_grad_v_to_h = r''' - const half2 grad_v_to_h = __hsub2(__float2half2_rn(1.0f), __hmul2(v_threshold_half2, grad_s_to_h)); - ''' - - code += code_grad_v_to_h - code += r''' - grad_h = __hfma2(__hadd2(grad_v_seq[t], grad_h), grad_v_to_h, __hmul2(grad_spike_seq[t], grad_s_to_h)); - grad_x_seq[t] = grad_h; - } - grad_v_last[index] = grad_x_seq[index]; - } - } - ''' - else: - raise TypeError - return cupy.RawKernel(code, kernel_name, options=cuda_compiler_options, backend=cuda_compiler_backend) - - @staticmethod - def forward(ctx, x_seq: torch.Tensor, v_last: torch.Tensor, v_threshold: float, v_reset: float, - detach_reset: bool, sg_cuda_code_fun): - requires_grad = x_seq.requires_grad or v_last.requires_grad - device = x_seq.get_device() - if x_seq.dtype == torch.float32: - dtype = 'fp32' - cp_dtype = np.float32 - elif x_seq.dtype == torch.float16: - dtype = 'fp16' - cp_dtype = np.half + code_grad_v_to_h = r''' + const float grad_v_to_h = 1.0f; + ''' else: - raise NotImplementedError - - use_pad = False - if dtype == 'fp16' and v_last.numel() % 2 != 0: - # only fp16 needs even numel because we use half2 to accelerate - # when numel is odd, we will pad x_seq - use_pad = True - x_seq = F.pad(x_seq, (0, 1)) # [T, N] -> [T, N + 1] - v_last = F.pad(v_last, (0, 1)) # [N] -> [N + 1] - - v_seq = 
torch.zeros_like(x_seq.data) - h_seq = torch.zeros_like(x_seq.data) - spike_seq = torch.zeros_like(x_seq.data) + if hard_reset: + code_grad_v_to_h = r''' + const float grad_v_to_h = 1.0f - spike_seq[t] + (v_reset - h_seq[t]) * grad_s_to_h; + // const float grad_v_to_h = fmaf(grad_s_to_h, v_reset - h_seq[t], 1.0f - spike_seq[t]); + ''' + else: + code_grad_v_to_h = r''' + const float grad_v_to_h = 1.0f - v_threshold * grad_s_to_h; + // const float grad_v_to_h = fmaf(-grad_s_to_h, v_threshold, 1.0f); + ''' - v_v_seq = torch.cat((v_last.unsqueeze(0), v_seq)) + code += code_grad_v_to_h + code += r''' + grad_h = grad_spike_seq[t] * grad_s_to_h + (grad_v_seq[t] + grad_h) * grad_v_to_h; + // grad_h = fmaf(grad_spike_seq[t], grad_s_to_h, (grad_v_seq[t] + grad_h) * grad_v_to_h); + grad_x_seq[t] = grad_h; + } + grad_v_last[index] = grad_h; + } + } + ''' + + elif dtype == 'fp16': + code = fr''' + #include + extern "C" __global__ + void {kernel_name}( + const half2* grad_spike_seq, const half2* grad_v_seq, const half2* h_seq, const half2* spike_seq, + half2* grad_x_seq, half2* grad_v_last, + const half & v_threshold, {'const half & v_reset,' if hard_reset else ''} + const int & neuron_num, const int & numel) + ''' + code += r''' + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + const int stride = neuron_num >> 1; + if (index < stride) + { + const half2 v_threshold_half2 = __half2half2(v_threshold); + ''' + + if hard_reset: + code += r''' + const half2 v_reset_half2 = __half2half2(v_reset); + ''' - with cupy.cuda.Device(device): - numel = x_seq.numel() - neuron_num = numel // x_seq.shape[0] + code += r''' + half2 grad_h = __float2half2_rn(0.0f); // grad_h will be used recursively + for(int mem_offset = (numel >> 1) - stride; mem_offset >= 0; mem_offset -= stride) + { + const int t = index + mem_offset; + const half2 over_th = __hsub2(h_seq[t], v_threshold_half2); + ''' + code += code_grad_s_to_h - threads = cuda_threads - if dtype == 'fp16': - assert neuron_num % 2 == 0 - blocks = cu_kernel_opt.cal_blocks(neuron_num >> 1) - # we will take two neurons to calculate as one neuron in cuda half2 + if detach_reset: + if hard_reset: + code_grad_v_to_h = r''' + const half2 grad_v_to_h = __hsub2(__float2half2_rn(1.0f), spike_seq[t]); + ''' else: - blocks = cu_kernel_opt.cal_blocks(neuron_num) - cp_numel = cupy.asarray(numel) - cp_neuron_num = cupy.asarray(neuron_num) - cp_v_threshold = cupy.asarray(v_threshold, dtype=cp_dtype) - if v_reset is None: - cp_v_reset = None - hard_reset = False - x_seq, v_v_seq, h_seq, spike_seq, cp_v_threshold, cp_neuron_num, cp_numel = cu_kernel_opt.get_contiguous( - x_seq, v_v_seq, h_seq, spike_seq, cp_v_threshold, cp_neuron_num, cp_numel) - kernel_args = [x_seq, v_v_seq, h_seq, spike_seq, cp_v_threshold, cp_neuron_num, cp_numel] + code_grad_v_to_h = r''' + const half2 grad_v_to_h = __float2half2_rn(1.0f); + ''' + else: + if hard_reset: + code_grad_v_to_h = r''' + const half2 grad_v_to_h = __hfma2(__hsub2(v_reset_half2, h_seq[t]), grad_s_to_h, __hsub2(__float2half2_rn(1.0f), spike_seq[t])); + ''' else: - cp_v_reset = cupy.asarray(v_reset, dtype=cp_dtype) - hard_reset = True - x_seq, v_v_seq, h_seq, spike_seq, cp_v_threshold, cp_v_reset, cp_neuron_num, cp_numel = cu_kernel_opt.get_contiguous( - x_seq, v_v_seq, h_seq, spike_seq, cp_v_threshold, cp_v_reset, cp_neuron_num, cp_numel) - kernel_args = [x_seq, v_v_seq, h_seq, spike_seq, cp_v_threshold, cp_v_reset, cp_neuron_num, - cp_numel] - - kernel = MultiStepIFNodePTT.create_fptt_kernel(hard_reset, dtype) - - kernel( - 
(blocks,), (threads,), - cu_kernel_opt.wrap_args_to_raw_kernel( - device, - *kernel_args - ) - ) + code_grad_v_to_h = r''' + const half2 grad_v_to_h = __hsub2(__float2half2_rn(1.0f), __hmul2(v_threshold_half2, grad_s_to_h)); + ''' - if requires_grad: - ctx.use_pad = use_pad - ctx.save_for_backward(h_seq, spike_seq) - ctx.blocks = blocks - ctx.threads = threads - ctx.cp_numel = cp_numel - ctx.cp_neuron_num = cp_neuron_num - ctx.cp_v_threshold = cp_v_threshold - ctx.cp_v_reset = cp_v_reset - ctx.detach_reset = detach_reset - ctx.sg_cuda_code_fun = sg_cuda_code_fun - - if use_pad: - return spike_seq[..., :-1], v_v_seq[1:, ..., :-1] + code += code_grad_v_to_h + code += r''' + grad_h = __hfma2(__hadd2(grad_v_seq[t], grad_h), grad_v_to_h, __hmul2(grad_spike_seq[t], grad_s_to_h)); + grad_x_seq[t] = grad_h; + } + grad_v_last[index] = grad_h; + } + } + ''' + else: + raise TypeError + return cupy.RawKernel(code, kernel_name, options=configure.cuda_compiler_options, backend=configure.cuda_compiler_backend) + + @staticmethod + def forward(ctx, x_seq: torch.Tensor, v_last: torch.Tensor, v_threshold: float, v_reset: float, + detach_reset: bool, sg_cuda_code_fun): + requires_grad = x_seq.requires_grad or v_last.requires_grad + device = x_seq.get_device() + if x_seq.dtype == torch.float32: + dtype = 'fp32' + cp_dtype = np.float32 + elif x_seq.dtype == torch.float16: + dtype = 'fp16' + cp_dtype = np.half + else: + raise NotImplementedError + + use_pad = False + if dtype == 'fp16' and v_last.numel() % 2 != 0: + # only fp16 needs even numel because we use half2 to accelerate + # when numel is odd, we will pad x_seq + use_pad = True + x_seq = F.pad(x_seq, (0, 1)) # [T, N] -> [T, N + 1] + v_last = F.pad(v_last, (0, 1)) # [N] -> [N + 1] + + zero_shape = list(x_seq.shape) + zero_shape[0] *= 3 + v_seq, h_seq, spike_seq = torch.split(torch.zeros(zero_shape, device=x_seq.device, dtype=x_seq.dtype), x_seq.shape[0]) + + v_v_seq = torch.cat((v_last.unsqueeze(0), v_seq)) + + with cu_kernel_opt.DeviceEnvironment(device): + numel = x_seq.numel() + neuron_num = numel // x_seq.shape[0] + + threads = configure.cuda_threads + if dtype == 'fp16': + assert neuron_num % 2 == 0 + blocks = cu_kernel_opt.cal_blocks(neuron_num >> 1) + # we will take two neurons to calculate as one neuron in cuda half2 else: - return spike_seq, v_v_seq[1:, ] - - @staticmethod - def backward(ctx, grad_spike_seq, grad_v_seq): - if ctx.use_pad: - # grad_spike_seq.shape = [T, N] - # grad_v_seq.shape = [T, N] - # h_seq.shape = [T, N + 1] - # spike_seq.shape = [T, N + 1] - grad_spike_seq = F.pad(grad_spike_seq, (0, 1)) - grad_v_seq = F.pad(grad_v_seq, (0, 1)) - - device = grad_spike_seq.get_device() - h_seq, spike_seq = ctx.saved_tensors - grad_x_seq = torch.zeros_like(grad_spike_seq) - grad_v_last = torch.zeros_like(grad_spike_seq[0]) - - if ctx.cp_v_reset is None: + blocks = cu_kernel_opt.cal_blocks(neuron_num) + cp_numel = cupy.asarray(numel) + cp_neuron_num = cupy.asarray(neuron_num) + cp_v_threshold = cupy.asarray(v_threshold, dtype=cp_dtype) + if v_reset is None: + cp_v_reset = None hard_reset = False + x_seq, v_v_seq, h_seq, spike_seq, cp_v_threshold, cp_neuron_num, cp_numel = cu_kernel_opt.get_contiguous( + x_seq, v_v_seq, h_seq, spike_seq, cp_v_threshold, cp_neuron_num, cp_numel) + kernel_args = [x_seq, v_v_seq, h_seq, spike_seq, cp_v_threshold, cp_neuron_num, cp_numel] else: + cp_v_reset = cupy.asarray(v_reset, dtype=cp_dtype) hard_reset = True - - if grad_spike_seq.dtype == torch.float32: - dtype = 'fp32' - elif grad_spike_seq.dtype == 
torch.float16: - dtype = 'fp16' + x_seq, v_v_seq, h_seq, spike_seq, cp_v_threshold, cp_v_reset, cp_neuron_num, cp_numel = cu_kernel_opt.get_contiguous( + x_seq, v_v_seq, h_seq, spike_seq, cp_v_threshold, cp_v_reset, cp_neuron_num, cp_numel) + kernel_args = [x_seq, v_v_seq, h_seq, spike_seq, cp_v_threshold, cp_v_reset, cp_neuron_num, + cp_numel] + + kernel = MultiStepIFNodePTT.create_fptt_kernel(hard_reset, dtype) + + kernel( + (blocks,), (threads,), + cu_kernel_opt.wrap_args_to_raw_kernel( + device, + *kernel_args + ) + ) + + if requires_grad: + ctx.use_pad = use_pad + if configure.save_spike_as_bool_in_neuron_kernel: + ctx.s_shape = spike_seq.shape + ctx.s_tk = tensor_cache.BOOL_TENSOR_CACHE.store_bool(spike_seq) + ctx.save_for_backward(h_seq) else: - raise NotImplementedError + ctx.save_for_backward(h_seq, spike_seq) + ctx.blocks = blocks + ctx.threads = threads + ctx.cp_numel = cp_numel + ctx.cp_neuron_num = cp_neuron_num + ctx.cp_v_threshold = cp_v_threshold + ctx.cp_v_reset = cp_v_reset + ctx.detach_reset = detach_reset + ctx.sg_cuda_code_fun = sg_cuda_code_fun + + if use_pad: + return spike_seq[..., :-1], v_v_seq[1:, ..., :-1] + else: + return spike_seq, v_v_seq[1:, ] + + @staticmethod + def backward(ctx, grad_spike_seq, grad_v_seq): + if ctx.use_pad: + # grad_spike_seq.shape = [T, N] + # grad_v_seq.shape = [T, N] + # h_seq.shape = [T, N + 1] + # spike_seq.shape = [T, N + 1] + grad_spike_seq = F.pad(grad_spike_seq, (0, 1)) + grad_v_seq = F.pad(grad_v_seq, (0, 1)) + + device = grad_spike_seq.get_device() + + if configure.save_spike_as_bool_in_neuron_kernel: + h_seq = ctx.saved_tensors[0] + spike_seq = tensor_cache.BOOL_TENSOR_CACHE.get_float(ctx.s_tk, ctx.s_shape) + else: + h_seq, spike_seq = ctx.saved_tensors - kernel = MultiStepIFNodePTT.create_bptt_kernel(ctx.sg_cuda_code_fun, hard_reset, ctx.detach_reset, dtype) + zero_shape = list(grad_spike_seq.shape) + zero_shape[0] += 1 + zero_data = torch.zeros(zero_shape, device=grad_spike_seq.device, dtype=grad_spike_seq.dtype) + grad_x_seq = zero_data[0: -1] + grad_v_last = zero_data[-1] - with cupy.cuda.Device(device): - if hard_reset: - grad_spike_seq, grad_v_seq, h_seq, spike_seq, grad_x_seq, grad_v_last, ctx.cp_v_threshold, ctx.cp_v_reset, ctx.cp_neuron_num, ctx.cp_numel = cu_kernel_opt.get_contiguous( - grad_spike_seq, grad_v_seq, h_seq, spike_seq, grad_x_seq, grad_v_last, ctx.cp_v_threshold, - ctx.cp_v_reset, ctx.cp_neuron_num, ctx.cp_numel) - kernel_args = [grad_spike_seq, grad_v_seq, h_seq, spike_seq, grad_x_seq, grad_v_last, - ctx.cp_v_threshold, ctx.cp_v_reset, ctx.cp_neuron_num, ctx.cp_numel] - else: - grad_spike_seq, grad_v_seq, h_seq, spike_seq, grad_x_seq, grad_v_last, ctx.cp_v_threshold, ctx.cp_neuron_num, ctx.cp_numel = cu_kernel_opt.get_contiguous( - grad_spike_seq, grad_v_seq, h_seq, spike_seq, grad_x_seq, grad_v_last, ctx.cp_v_threshold, - ctx.cp_neuron_num, ctx.cp_numel) - kernel_args = [grad_spike_seq, grad_v_seq, h_seq, spike_seq, grad_x_seq, grad_v_last, - ctx.cp_v_threshold, ctx.cp_neuron_num, ctx.cp_numel] - - kernel( - (ctx.blocks,), (ctx.threads,), - cu_kernel_opt.wrap_args_to_raw_kernel( - device, - *kernel_args - ) - ) - if ctx.use_pad: - return grad_x_seq[..., :-1], grad_v_last[..., :-1], None, None, None, None - else: - return grad_x_seq, grad_v_last, None, None, None, None + if ctx.cp_v_reset is None: + hard_reset = False + else: + hard_reset = True + if grad_spike_seq.dtype == torch.float32: + dtype = 'fp32' + elif grad_spike_seq.dtype == torch.float16: + dtype = 'fp16' + else: + raise NotImplementedError - 
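When `configure.save_spike_as_bool_in_neuron_kernel` is set, the spike sequence is not kept in `ctx.save_for_backward`; since spikes are exactly 0 or 1, it is stored as a bool tensor and re-expanded to the compute dtype in `backward`. A minimal sketch of the idea (the class and method names below are illustrative, not the actual `tensor_cache` API):

```python
import torch

class SimpleBoolSpikeCache:
    """Illustrative stand-in for the tensor_cache.BOOL_TENSOR_CACHE used above."""
    def __init__(self):
        self._store = {}
        self._next_key = 0

    def store_bool(self, spike_seq: torch.Tensor):
        key = self._next_key
        self._next_key += 1
        # spikes are exactly 0/1, so a bool copy (1 byte per element) loses nothing
        self._store[key] = (spike_seq != 0, spike_seq.dtype)
        return key

    def get_float(self, key, shape):
        spike_bool, dtype = self._store.pop(key)
        return spike_bool.to(dtype).reshape(shape)
```

The saving is the memory held between forward and backward: 4 bytes (fp32) or 2 bytes (fp16) per spike drop to 1 byte.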
class MultiStepLIFNodePTT(torch.autograd.Function): - @staticmethod - def create_fptt_kernel(hard_reset: bool, dtype: str, kernel_name_prefix: str = 'LIFNode'): - kernel_name = f'{kernel_name_prefix}_fptt_{"hard" if hard_reset else "soft"}Reset_{dtype}' + kernel = MultiStepIFNodePTT.create_bptt_kernel(ctx.sg_cuda_code_fun, hard_reset, ctx.detach_reset, dtype) - if dtype == 'fp32': - code = rf''' - extern "C" __global__ - void {kernel_name}(const float* x_seq, float* v_v_seq, float* h_seq, float* spike_seq, - const float & reciprocal_tau, - const float & v_threshold, {'const float & v_reset,' if hard_reset else ''} - const int & neuron_num, const int & numel) - ''' - code += r''' - { - const int index = blockIdx.x * blockDim.x + threadIdx.x; - if (index < neuron_num) + with cu_kernel_opt.DeviceEnvironment(device): + + if hard_reset: + grad_spike_seq, grad_v_seq, h_seq, spike_seq, grad_x_seq, grad_v_last, ctx.cp_v_threshold, ctx.cp_v_reset, ctx.cp_neuron_num, ctx.cp_numel = cu_kernel_opt.get_contiguous( + grad_spike_seq, grad_v_seq, h_seq, spike_seq, grad_x_seq, grad_v_last, ctx.cp_v_threshold, + ctx.cp_v_reset, ctx.cp_neuron_num, ctx.cp_numel) + kernel_args = [grad_spike_seq, grad_v_seq, h_seq, spike_seq, grad_x_seq, grad_v_last, + ctx.cp_v_threshold, ctx.cp_v_reset, ctx.cp_neuron_num, ctx.cp_numel] + else: + grad_spike_seq, grad_v_seq, h_seq, spike_seq, grad_x_seq, grad_v_last, ctx.cp_v_threshold, ctx.cp_neuron_num, ctx.cp_numel = cu_kernel_opt.get_contiguous( + grad_spike_seq, grad_v_seq, h_seq, spike_seq, grad_x_seq, grad_v_last, ctx.cp_v_threshold, + ctx.cp_neuron_num, ctx.cp_numel) + kernel_args = [grad_spike_seq, grad_v_seq, h_seq, spike_seq, grad_x_seq, grad_v_last, + ctx.cp_v_threshold, ctx.cp_neuron_num, ctx.cp_numel] + + kernel( + (ctx.blocks,), (ctx.threads,), + cu_kernel_opt.wrap_args_to_raw_kernel( + device, + *kernel_args + ) + ) + if ctx.use_pad: + return grad_x_seq[..., :-1], grad_v_last[..., :-1], None, None, None, None + else: + return grad_x_seq, grad_v_last, None, None, None, None + + +class MultiStepLIFNodePTT(torch.autograd.Function): + @staticmethod + def create_fptt_kernel(decay_input: bool, hard_reset: bool, dtype: str, kernel_name_prefix: str = 'LIFNode'): + kernel_name = f'{kernel_name_prefix}_fptt_decayInput{decay_input}_{"hard" if hard_reset else "soft"}Reset_{dtype}' + + if dtype == 'fp32': + code = rf''' + extern "C" __global__ + void {kernel_name}(const float* x_seq, float* v_v_seq, float* h_seq, float* spike_seq, + const float & reciprocal_tau, + const float & v_threshold, {'const float & v_reset,' if hard_reset else ''} + const int & neuron_num, const int & numel) + ''' + code += r''' + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index < neuron_num) + { + const int dt = neuron_num; + for(int mem_offset = 0; mem_offset < numel; mem_offset += neuron_num) { - const int dt = neuron_num; - for(int mem_offset = 0; mem_offset < numel; mem_offset += neuron_num) - { - const int t = index + mem_offset; - ''' + const int t = index + mem_offset; + ''' - if hard_reset: + if hard_reset: + if decay_input: code += r''' h_seq[t] = v_v_seq[t] + reciprocal_tau * (x_seq[t] - v_v_seq[t] + v_reset); - if (h_seq[t] >= v_threshold) - { - spike_seq[t] = 1.0f; - v_v_seq[t + dt] = v_reset; - } ''' else: code += r''' - h_seq[t] = v_v_seq[t] + reciprocal_tau * (x_seq[t] - v_v_seq[t]); - if (h_seq[t] >= v_threshold) - { - spike_seq[t] = 1.0f; - v_v_seq[t + dt] = h_seq[t] - v_threshold; - } + h_seq[t] = v_v_seq[t] - reciprocal_tau * (v_v_seq[t] - v_reset) + 
x_seq[t]; ''' - code += r''' - else - { - spike_seq[t] = 0.0f; - v_v_seq[t + dt] = h_seq[t]; - } - + if (h_seq[t] >= v_threshold) + { + spike_seq[t] = 1.0f; + v_v_seq[t + dt] = v_reset; } - } - } - ''' - - elif dtype == 'fp16': - code = rf''' - #include - extern "C" __global__ - void {kernel_name}(const half2* x_seq, half2* v_v_seq, half2* h_seq, half2* spike_seq, - const half & reciprocal_tau, - const half & v_threshold, {'const half & v_reset,' if hard_reset else ''} - const int & neuron_num, const int & numel) ''' - + else: + if decay_input: + code += r''' + h_seq[t] = v_v_seq[t] + reciprocal_tau * (x_seq[t] - v_v_seq[t]); + ''' + else: + code += r''' + h_seq[t] = v_v_seq[t] * (1.0f - reciprocal_tau) + x_seq[t]; + ''' code += r''' - { - const int index = blockIdx.x * blockDim.x + threadIdx.x; - const int stride = neuron_num >> 1; - if (index < stride) - { - const int numel_2 = numel >> 1; - const half2 reciprocal_tau_half2 = __half2half2(reciprocal_tau); - const half2 v_threshold_half2 = __half2half2(v_threshold); + if (h_seq[t] >= v_threshold) + { + spike_seq[t] = 1.0f; + v_v_seq[t + dt] = h_seq[t] - v_threshold; + } ''' - if hard_reset: - code += r''' - const half2 v_reset_half2 = __half2half2(v_reset); - ''' + code += r''' + else + { + spike_seq[t] = 0.0f; + v_v_seq[t + dt] = h_seq[t]; + } + } + } + } + ''' + + elif dtype == 'fp16': + code = rf''' + #include + extern "C" __global__ + void {kernel_name}(const half2* x_seq, half2* v_v_seq, half2* h_seq, half2* spike_seq, + const half & reciprocal_tau, + const half & v_threshold, {'const half & v_reset,' if hard_reset else ''} + const int & neuron_num, const int & numel) + ''' + + code += r''' + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + const int stride = neuron_num >> 1; + if (index < stride) + { + const int numel_2 = numel >> 1; + const half2 reciprocal_tau_half2 = __half2half2(reciprocal_tau); + const half2 v_threshold_half2 = __half2half2(v_threshold); + ''' + + if hard_reset: code += r''' - for(int mem_offset = 0; mem_offset < numel_2; mem_offset += stride) - { - const int t = index + mem_offset; + const half2 v_reset_half2 = __half2half2(v_reset); ''' - if hard_reset: + + code += r''' + for(int mem_offset = 0; mem_offset < numel_2; mem_offset += stride) + { + const int t = index + mem_offset; + ''' + if hard_reset: + if decay_input: code += r''' h_seq[t] = __hfma2(__hadd2(__hsub2(x_seq[t], v_v_seq[t]), v_reset_half2), reciprocal_tau_half2, v_v_seq[t]); - spike_seq[t] = __hgeu2(h_seq[t], v_threshold_half2); - v_v_seq[t + stride] = __hadd2(__hmul2(spike_seq[t], v_reset_half2), __hmul2(__hsub2(__float2half2_rn(1.0f), spike_seq[t]), h_seq[t])); ''' else: code += r''' - h_seq[t] = __hfma2(__hsub2(x_seq[t], v_v_seq[t]), reciprocal_tau_half2, v_v_seq[t]); - spike_seq[t] = __hgeu2(h_seq[t], v_threshold_half2); - v_v_seq[t + stride] = __hadd2(__hmul2(spike_seq[t], __hsub2(h_seq[t], v_threshold_half2)), __hmul2(__hsub2(__float2half2_rn(1.0f), spike_seq[t]), h_seq[t])); + // h_seq[t] = v_v_seq[t] - reciprocal_tau * (v_v_seq[t] - v_reset) + x_seq[t]; + // = reciprocal_tau * (v_reset - v_v_seq[t]) + v_v_seq[t] + x_seq[t]; + h_seq[t] = __hadd2(__hfma2(__hsub2(v_reset_half2, v_v_seq[t]), reciprocal_tau_half2, v_v_seq[t]), x_seq[t]); ''' - code += r''' - } - } - } + spike_seq[t] = __hgeu2(h_seq[t], v_threshold_half2); + v_v_seq[t + stride] = __hadd2(__hmul2(spike_seq[t], v_reset_half2), __hmul2(__hsub2(__float2half2_rn(1.0f), spike_seq[t]), h_seq[t])); ''' else: - raise TypeError - - return cupy.RawKernel(code, kernel_name, 
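The generated forward kernels unroll a single-step LIF update over time. The four charge variants (`decay_input` × hard/soft reset) and the branch-free reset used in the `half2` code correspond to the following plain-PyTorch step (a reading aid, not the implementation):

```python
import torch

def lif_single_step(x, v, tau: float, v_threshold: float,
                    v_reset=None, decay_input: bool = True):
    # Neuronal charge: h is the membrane potential after charging, before reset.
    if v_reset is not None:            # hard reset
        if decay_input:
            h = v + (x - (v - v_reset)) / tau
        else:
            h = v - (v - v_reset) / tau + x
    else:                              # soft reset
        if decay_input:
            h = v + (x - v) / tau
        else:
            h = v * (1. - 1. / tau) + x
    # Neuronal fire + reset, branch-free as in the half2 kernels: spike is a
    # 0/1 mask, so the next state blends the reset target (where a spike fired)
    # with h (where it did not).
    spike = (h >= v_threshold).to(h)
    if v_reset is not None:
        v_next = spike * v_reset + (1. - spike) * h
    else:
        v_next = spike * (h - v_threshold) + (1. - spike) * h
    return h, spike, v_next
```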
options=cuda_compiler_options, backend=cuda_compiler_backend) - - @staticmethod - def create_bptt_kernel(sg_cuda_code_fun, hard_reset: bool, detach_reset: bool, dtype: str): - - kernel_name = f'LIFNode_bptt_{"hard" if hard_reset else "soft"}Reset_{"detachReset" if detach_reset else ""}_{dtype}' - - code_grad_s_to_h = sg_cuda_code_fun(x='over_th', y='grad_s_to_h', dtype=dtype) - - if dtype == 'fp32': - code = fr''' - extern "C" __global__ - void {kernel_name}( - const float* grad_spike_seq, const float* grad_v_seq, const float* h_seq, const float* spike_seq, - float* grad_x_seq, float* grad_v_last, - const float & reciprocal_tau, const float & one_sub_reciprocal_tau, - const float & v_threshold, {'const float & v_reset,' if hard_reset else ''} - const int & neuron_num, const int & numel) - ''' - - code += r''' - { - const int index = blockIdx.x * blockDim.x + threadIdx.x; - if (index < neuron_num) - { - float grad_h = 0.0f; // grad_h will be used recursively - for(int mem_offset = numel - neuron_num; mem_offset >= 0; mem_offset -= neuron_num) - { - const int t = index + mem_offset; - const float over_th = h_seq[t] - v_threshold; - ''' - code += code_grad_s_to_h - if detach_reset: - if hard_reset: - code_grad_v_to_h = r''' - const float grad_v_to_h = 1.0f - spike_seq[t]; - ''' - else: - code_grad_v_to_h = r''' - const float grad_v_to_h = 1.0f; - ''' + if decay_input: + code += r''' + h_seq[t] = __hfma2(__hsub2(x_seq[t], v_v_seq[t]), reciprocal_tau_half2, v_v_seq[t]); + ''' else: - if hard_reset: - code_grad_v_to_h = r''' - const float grad_v_to_h = 1.0f - spike_seq[t] + (v_reset - h_seq[t]) * grad_s_to_h; - // const float grad_v_to_h = fmaf(v_reset - h_seq[t], grad_s_to_h, 1.0f - spike_seq[t]); - ''' - else: - code_grad_v_to_h = r''' - const float grad_v_to_h = 1.0f - v_threshold * grad_s_to_h; - // const float grad_v_to_h = fmaf(-grad_s_to_h, v_threshold, 1.0f); - ''' - - code += code_grad_v_to_h + code += r''' + // h_seq[t] = v_v_seq[t] * (1.0f - reciprocal_tau) + x_seq[t]; + h_seq[t] = __hfma2(__hsub2(__float2half2_rn(1.0f), reciprocal_tau_half2), v_v_seq[t], x_seq[t]); + ''' code += r''' - grad_h = grad_spike_seq[t] * grad_s_to_h + (grad_v_seq[t] + grad_h * one_sub_reciprocal_tau) * grad_v_to_h; - // grad_h = fmaf(grad_spike_seq[t], grad_s_to_h, fmaf(grad_h, one_sub_reciprocal_tau, grad_v_seq[t]) * grad_v_to_h); - grad_x_seq[t] = grad_h * reciprocal_tau; - } - grad_v_last[index] = grad_x_seq[index] * one_sub_reciprocal_tau; - } - } + spike_seq[t] = __hgeu2(h_seq[t], v_threshold_half2); + v_v_seq[t + stride] = __hadd2(__hmul2(spike_seq[t], __hsub2(h_seq[t], v_threshold_half2)), __hmul2(__hsub2(__float2half2_rn(1.0f), spike_seq[t]), h_seq[t])); ''' - elif dtype == 'fp16': - code = fr''' - #include - extern "C" __global__ - void {kernel_name}( - const half2* grad_spike_seq, const half2* grad_v_seq, const half2* h_seq, const half2* spike_seq, - half2* grad_x_seq, half2* grad_v_last, - const half & reciprocal_tau, const half & one_sub_reciprocal_tau, - const half & v_threshold, {'const half & v_reset,' if hard_reset else ''} - const int & neuron_num, const int & numel) - ''' - code += r''' - { + code += r''' + } + } + } + ''' + else: + raise TypeError + return cupy.RawKernel(code, kernel_name, options=configure.cuda_compiler_options, backend=configure.cuda_compiler_backend) + + @staticmethod + def create_bptt_kernel(sg_cuda_code_fun, decay_input: bool, hard_reset: bool, detach_reset: bool, dtype: str): + + kernel_name = f'LIFNode_bptt_decayInput{decay_input}_{"hard" if hard_reset else 
"soft"}Reset_{"detachReset" if detach_reset else ""}_{dtype}' + + code_grad_s_to_h = sg_cuda_code_fun(x='over_th', y='grad_s_to_h', dtype=dtype) + + if dtype == 'fp32': + code = fr''' + extern "C" __global__ + void {kernel_name}( + const float* grad_spike_seq, const float* grad_v_seq, const float* h_seq, const float* spike_seq, + float* grad_x_seq, float* grad_v_last, + const float & reciprocal_tau, const float & one_sub_reciprocal_tau, + const float & v_threshold, {'const float & v_reset,' if hard_reset else ''} + const int & neuron_num, const int & numel) + ''' + + code += r''' + { const int index = blockIdx.x * blockDim.x + threadIdx.x; - const int stride = neuron_num >> 1; - if (index < stride) + if (index < neuron_num) { - const half2 reciprocal_tau_half2 = __half2half2(reciprocal_tau); - const half2 one_sub_reciprocal_tau_half2 = __half2half2(one_sub_reciprocal_tau); - const half2 v_threshold_half2 = __half2half2(v_threshold); - ''' - + float grad_h = 0.0f; // grad_h will be used recursively + for(int mem_offset = numel - neuron_num; mem_offset >= 0; mem_offset -= neuron_num) + { + const int t = index + mem_offset; + const float over_th = h_seq[t] - v_threshold; + ''' + code += code_grad_s_to_h + if detach_reset: if hard_reset: - code += r''' - const half2 v_reset_half2 = __half2half2(v_reset); + code_grad_v_to_h = r''' + const float grad_v_to_h = 1.0f - spike_seq[t]; + ''' + else: + code_grad_v_to_h = r''' + const float grad_v_to_h = 1.0f; + ''' + else: + if hard_reset: + code_grad_v_to_h = r''' + const float grad_v_to_h = 1.0f - spike_seq[t] + (v_reset - h_seq[t]) * grad_s_to_h; + // const float grad_v_to_h = fmaf(v_reset - h_seq[t], grad_s_to_h, 1.0f - spike_seq[t]); + ''' + else: + code_grad_v_to_h = r''' + const float grad_v_to_h = 1.0f - v_threshold * grad_s_to_h; + // const float grad_v_to_h = fmaf(-grad_s_to_h, v_threshold, 1.0f); ''' + code += code_grad_v_to_h + code += r''' + grad_h = grad_spike_seq[t] * grad_s_to_h + (grad_v_seq[t] + grad_h * one_sub_reciprocal_tau) * grad_v_to_h; + // grad_h = fmaf(grad_spike_seq[t], grad_s_to_h, fmaf(grad_h, one_sub_reciprocal_tau, grad_v_seq[t]) * grad_v_to_h); + ''' + if decay_input: code += r''' - half2 grad_h = __float2half2_rn(0.0f); // grad_h will be used recursively - for(int mem_offset = (numel >> 1) - stride; mem_offset >= 0; mem_offset -= stride) - { - const int t = index + mem_offset; - - const half2 over_th = __hsub2(h_seq[t], v_threshold_half2); + grad_x_seq[t] = grad_h * reciprocal_tau; ''' - - code += code_grad_s_to_h - - if detach_reset: - if hard_reset: - code_grad_v_to_h = r''' - const half2 grad_v_to_h = __hsub2(__float2half2_rn(1.0f), spike_seq[t]); - ''' - else: - code_grad_v_to_h = r''' - const half2 grad_v_to_h = __float2half2_rn(1.0f); - ''' - else: - if hard_reset: - code_grad_v_to_h = r''' - const half2 grad_v_to_h = __hfma2(__hsub2(v_reset_half2, h_seq[t]), grad_s_to_h, __hsub2(__float2half2_rn(1.0f), spike_seq[t])); - ''' - else: - code_grad_v_to_h = r''' - const half2 grad_v_to_h = __hsub2(__float2half2_rn(1.0f), __hmul2(v_threshold_half2, grad_s_to_h)); - ''' - - code += code_grad_v_to_h - code += r''' - grad_h = __hfma2(__hfma2(grad_h, one_sub_reciprocal_tau_half2, grad_v_seq[t]), grad_v_to_h, __hmul2(grad_spike_seq[t], grad_s_to_h)); - grad_x_seq[t] = __hmul2(grad_h, reciprocal_tau_half2); - } - grad_v_last[index] = __hmul2(grad_x_seq[index], one_sub_reciprocal_tau_half2); - } + else: + code += r''' + grad_x_seq[t] = grad_h; + ''' + code += r''' } + grad_v_last[index] = grad_h * one_sub_reciprocal_tau; + } 
+ } + ''' + + elif dtype == 'fp16': + code = fr''' + #include + extern "C" __global__ + void {kernel_name}( + const half2* grad_spike_seq, const half2* grad_v_seq, const half2* h_seq, const half2* spike_seq, + half2* grad_x_seq, half2* grad_v_last, + const half & reciprocal_tau, const half & one_sub_reciprocal_tau, + const half & v_threshold, {'const half & v_reset,' if hard_reset else ''} + const int & neuron_num, const int & numel) + ''' + code += r''' + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + const int stride = neuron_num >> 1; + if (index < stride) + { + const half2 reciprocal_tau_half2 = __half2half2(reciprocal_tau); + const half2 one_sub_reciprocal_tau_half2 = __half2half2(one_sub_reciprocal_tau); + const half2 v_threshold_half2 = __half2half2(v_threshold); + ''' + + if hard_reset: + code += r''' + const half2 v_reset_half2 = __half2half2(v_reset); ''' - else: - raise TypeError - return cupy.RawKernel(code, kernel_name, options=cuda_compiler_options, backend=cuda_compiler_backend) - - @staticmethod - def forward(ctx, x_seq: torch.Tensor, v_last: torch.Tensor, tau: float, v_threshold: float, v_reset: float, - detach_reset: bool, sg_cuda_code_fun): - requires_grad = x_seq.requires_grad or v_last.requires_grad - device = x_seq.get_device() - if x_seq.dtype == torch.float32: - dtype = 'fp32' - cp_dtype = np.float32 - elif x_seq.dtype == torch.float16: - dtype = 'fp16' - cp_dtype = np.half - else: - raise NotImplementedError - - use_pad = False - if dtype == 'fp16' and v_last.numel() % 2 != 0: - # only fp16 needs even numel because we use half2 to accelerate - # when numel is odd, we will pad x_seq - use_pad = True - x_seq = F.pad(x_seq, (0, 1)) # [T, N] -> [T, N + 1] - v_last = F.pad(v_last, (0, 1)) # [N] -> [N + 1] - - v_seq = torch.zeros_like(x_seq.data) - h_seq = torch.zeros_like(x_seq.data) - spike_seq = torch.zeros_like(x_seq.data) + code += r''' + half2 grad_h = __float2half2_rn(0.0f); // grad_h will be used recursively + for(int mem_offset = (numel >> 1) - stride; mem_offset >= 0; mem_offset -= stride) + { + const int t = index + mem_offset; - v_v_seq = torch.cat((v_last.unsqueeze(0), v_seq)) + const half2 over_th = __hsub2(h_seq[t], v_threshold_half2); + ''' - with cupy.cuda.Device(device): - numel = x_seq.numel() - neuron_num = numel // x_seq.shape[0] + code += code_grad_s_to_h - threads = cuda_threads - if dtype == 'fp16': - assert neuron_num % 2 == 0 - blocks = cu_kernel_opt.cal_blocks(neuron_num >> 1) - # we will take two neurons to calculate as one neuron in cuda half2 + if detach_reset: + if hard_reset: + code_grad_v_to_h = r''' + const half2 grad_v_to_h = __hsub2(__float2half2_rn(1.0f), spike_seq[t]); + ''' else: - blocks = cu_kernel_opt.cal_blocks(neuron_num) - - cp_numel = cupy.asarray(numel) - cp_neuron_num = cupy.asarray(neuron_num) - cp_v_threshold = cupy.asarray(v_threshold, dtype=cp_dtype) - cp_reciprocal_tau = cupy.asarray(1. / tau, dtype=cp_dtype) - cp_one_sub_reciprocal_tau = cupy.asarray(1. - 1. 
/ tau, dtype=cp_dtype) - - if v_reset is None: - cp_v_reset = None - hard_reset = False - x_seq, v_v_seq, h_seq, spike_seq, cp_reciprocal_tau, cp_v_threshold, cp_neuron_num, cp_numel = cu_kernel_opt.get_contiguous( - x_seq, v_v_seq, h_seq, spike_seq, cp_reciprocal_tau, cp_v_threshold, cp_neuron_num, cp_numel) - kernel_args = [x_seq, v_v_seq, h_seq, spike_seq, cp_reciprocal_tau, cp_v_threshold, cp_neuron_num, - cp_numel] + code_grad_v_to_h = r''' + const half2 grad_v_to_h = __float2half2_rn(1.0f); + ''' + else: + if hard_reset: + code_grad_v_to_h = r''' + const half2 grad_v_to_h = __hfma2(__hsub2(v_reset_half2, h_seq[t]), grad_s_to_h, __hsub2(__float2half2_rn(1.0f), spike_seq[t])); + ''' else: - cp_v_reset = cupy.asarray(v_reset, dtype=cp_dtype) - hard_reset = True - x_seq, v_v_seq, h_seq, spike_seq, cp_reciprocal_tau, cp_v_threshold, cp_v_reset, cp_neuron_num, cp_numel = cu_kernel_opt.get_contiguous( - x_seq, v_v_seq, h_seq, spike_seq, cp_reciprocal_tau, cp_v_threshold, cp_v_reset, cp_neuron_num, - cp_numel) - kernel_args = [x_seq, v_v_seq, h_seq, spike_seq, cp_reciprocal_tau, cp_v_threshold, cp_v_reset, - cp_neuron_num, cp_numel] - - kernel = MultiStepLIFNodePTT.create_fptt_kernel(hard_reset, dtype) - - kernel( - (blocks,), (threads,), - cu_kernel_opt.wrap_args_to_raw_kernel( - device, - *kernel_args - ) - ) + code_grad_v_to_h = r''' + const half2 grad_v_to_h = __hsub2(__float2half2_rn(1.0f), __hmul2(v_threshold_half2, grad_s_to_h)); + ''' - if requires_grad: - ctx.use_pad = use_pad - ctx.save_for_backward(h_seq, spike_seq) - ctx.blocks = blocks - ctx.threads = threads - ctx.cp_numel = cp_numel - ctx.cp_neuron_num = cp_neuron_num - ctx.cp_reciprocal_tau = cp_reciprocal_tau - ctx.cp_one_sub_reciprocal_tau = cp_one_sub_reciprocal_tau - ctx.cp_v_threshold = cp_v_threshold - ctx.cp_v_reset = cp_v_reset - ctx.detach_reset = detach_reset - ctx.sg_cuda_code_fun = sg_cuda_code_fun - - if use_pad: - return spike_seq[..., :-1], v_v_seq[1:, ..., :-1] + code += code_grad_v_to_h + code += r''' + grad_h = __hfma2(__hfma2(grad_h, one_sub_reciprocal_tau_half2, grad_v_seq[t]), grad_v_to_h, __hmul2(grad_spike_seq[t], grad_s_to_h)); + ''' + if decay_input: + code += r''' + grad_x_seq[t] = __hmul2(grad_h, reciprocal_tau_half2); + ''' else: - return spike_seq, v_v_seq[1:, ] - - @staticmethod - def backward(ctx, grad_spike_seq, grad_v_seq): - if ctx.use_pad: - # grad_spike_seq.shape = [T, N] - # grad_v_seq.shape = [T, N] - # h_seq.shape = [T, N + 1] - # spike_seq.shape = [T, N + 1] - grad_spike_seq = F.pad(grad_spike_seq, (0, 1)) - grad_v_seq = F.pad(grad_v_seq, (0, 1)) - - device = grad_spike_seq.get_device() - h_seq, spike_seq = ctx.saved_tensors - grad_x_seq = torch.zeros_like(grad_spike_seq) - grad_v_last = torch.zeros_like(grad_spike_seq[0]) + code += r''' + grad_x_seq[t] = grad_h; + ''' + code += r''' + } + grad_v_last[index] = __hmul2(grad_h, one_sub_reciprocal_tau_half2); + } + } + ''' + + else: + raise TypeError + return cupy.RawKernel(code, kernel_name, options=configure.cuda_compiler_options, backend=configure.cuda_compiler_backend) + + @staticmethod + def forward(ctx, x_seq: torch.Tensor, v_last: torch.Tensor, decay_input: bool, tau: float, v_threshold: float, v_reset: float, + detach_reset: bool, sg_cuda_code_fun): + requires_grad = x_seq.requires_grad or v_last.requires_grad + device = x_seq.get_device() + if x_seq.dtype == torch.float32: + dtype = 'fp32' + cp_dtype = np.float32 + elif x_seq.dtype == torch.float16: + dtype = 'fp16' + cp_dtype = np.half + else: + raise NotImplementedError + + 
use_pad = False + if dtype == 'fp16' and v_last.numel() % 2 != 0: + # only fp16 needs even numel because we use half2 to accelerate + # when numel is odd, we will pad x_seq + use_pad = True + x_seq = F.pad(x_seq, (0, 1)) # [T, N] -> [T, N + 1] + v_last = F.pad(v_last, (0, 1)) # [N] -> [N + 1] + + zero_shape = list(x_seq.shape) + zero_shape[0] *= 3 + v_seq, h_seq, spike_seq = torch.split(torch.zeros(zero_shape, device=x_seq.device, dtype=x_seq.dtype), x_seq.shape[0]) + + v_v_seq = torch.cat((v_last.unsqueeze(0), v_seq)) + + with cu_kernel_opt.DeviceEnvironment(device): + numel = x_seq.numel() + neuron_num = numel // x_seq.shape[0] + + threads = configure.cuda_threads + if dtype == 'fp16': + assert neuron_num % 2 == 0 + blocks = cu_kernel_opt.cal_blocks(neuron_num >> 1) + # we will take two neurons to calculate as one neuron in cuda half2 + else: + blocks = cu_kernel_opt.cal_blocks(neuron_num) + + cp_numel = cupy.asarray(numel) + cp_neuron_num = cupy.asarray(neuron_num) + cp_v_threshold = cupy.asarray(v_threshold, dtype=cp_dtype) + cp_reciprocal_tau = cupy.asarray(1. / tau, dtype=cp_dtype) + cp_one_sub_reciprocal_tau = cupy.asarray(1. - 1. / tau, dtype=cp_dtype) - if ctx.cp_v_reset is None: + if v_reset is None: + cp_v_reset = None hard_reset = False + x_seq, v_v_seq, h_seq, spike_seq, cp_reciprocal_tau, cp_v_threshold, cp_neuron_num, cp_numel = cu_kernel_opt.get_contiguous( + x_seq, v_v_seq, h_seq, spike_seq, cp_reciprocal_tau, cp_v_threshold, cp_neuron_num, cp_numel) + kernel_args = [x_seq, v_v_seq, h_seq, spike_seq, cp_reciprocal_tau, cp_v_threshold, cp_neuron_num, + cp_numel] else: + cp_v_reset = cupy.asarray(v_reset, dtype=cp_dtype) hard_reset = True - - if grad_spike_seq.dtype == torch.float32: - dtype = 'fp32' - elif grad_spike_seq.dtype == torch.float16: - dtype = 'fp16' + x_seq, v_v_seq, h_seq, spike_seq, cp_reciprocal_tau, cp_v_threshold, cp_v_reset, cp_neuron_num, cp_numel = cu_kernel_opt.get_contiguous( + x_seq, v_v_seq, h_seq, spike_seq, cp_reciprocal_tau, cp_v_threshold, cp_v_reset, cp_neuron_num, + cp_numel) + kernel_args = [x_seq, v_v_seq, h_seq, spike_seq, cp_reciprocal_tau, cp_v_threshold, cp_v_reset, + cp_neuron_num, cp_numel] + + kernel = MultiStepLIFNodePTT.create_fptt_kernel(decay_input, hard_reset, dtype) + kernel( + (blocks,), (threads,), + cu_kernel_opt.wrap_args_to_raw_kernel( + device, + *kernel_args + ) + ) + + if requires_grad: + ctx.decay_input = decay_input + ctx.use_pad = use_pad + if configure.save_spike_as_bool_in_neuron_kernel: + ctx.s_shape = spike_seq.shape + ctx.s_tk = tensor_cache.BOOL_TENSOR_CACHE.store_bool(spike_seq) + ctx.save_for_backward(h_seq) else: - raise NotImplementedError - - kernel = MultiStepLIFNodePTT.create_bptt_kernel(ctx.sg_cuda_code_fun, hard_reset, ctx.detach_reset, dtype) - - with cupy.cuda.Device(device): - + ctx.save_for_backward(h_seq, spike_seq) + ctx.blocks = blocks + ctx.threads = threads + ctx.cp_numel = cp_numel + ctx.cp_neuron_num = cp_neuron_num + ctx.cp_reciprocal_tau = cp_reciprocal_tau + ctx.cp_one_sub_reciprocal_tau = cp_one_sub_reciprocal_tau + ctx.cp_v_threshold = cp_v_threshold + ctx.cp_v_reset = cp_v_reset + ctx.detach_reset = detach_reset + ctx.sg_cuda_code_fun = sg_cuda_code_fun + + if use_pad: + return spike_seq[..., :-1], v_v_seq[1:, ..., :-1] + else: + return spike_seq, v_v_seq[1:, ] + + @staticmethod + def backward(ctx, grad_spike_seq, grad_v_seq): + if ctx.use_pad: + # grad_spike_seq.shape = [T, N] + # grad_v_seq.shape = [T, N] + # h_seq.shape = [T, N + 1] + # spike_seq.shape = [T, N + 1] + grad_spike_seq = 
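The launch plumbing is ordinary CuPy: the generated source is compiled once into a `cupy.RawKernel` and invoked with a `(grid, block, args)` call, where `cal_blocks` presumably performs a ceiling division of the worker count by `configure.cuda_threads`. A self-contained toy example of the same machinery; note the real kernels additionally pass scalars as 0-d CuPy arrays because they take them by reference:

```python
import cupy

threads = 256                                # configure.cuda_threads plays this role
n = 1000
blocks = (n + threads - 1) // threads        # ceil division: one thread per element

add_one = cupy.RawKernel(r'''
extern "C" __global__
void add_one(float* x, const int n)
{
    const int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) { x[i] += 1.0f; }
}
''', 'add_one')

x = cupy.zeros(n, dtype=cupy.float32)
add_one((blocks,), (threads,), (x, cupy.int32(n)))   # (grid, block, args); needs a CUDA device
assert bool((x == 1).all())
```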
F.pad(grad_spike_seq, (0, 1)) + grad_v_seq = F.pad(grad_v_seq, (0, 1)) + + device = grad_spike_seq.get_device() + if configure.save_spike_as_bool_in_neuron_kernel: + h_seq = ctx.saved_tensors[0] + spike_seq = tensor_cache.BOOL_TENSOR_CACHE.get_float(ctx.s_tk, ctx.s_shape) + else: + h_seq, spike_seq = ctx.saved_tensors + zero_shape = list(grad_spike_seq.shape) + zero_shape[0] += 1 + zero_data = torch.zeros(zero_shape, device=grad_spike_seq.device, dtype=grad_spike_seq.dtype) + grad_x_seq = zero_data[0: -1] + grad_v_last = zero_data[-1] + + if ctx.cp_v_reset is None: + hard_reset = False + else: + hard_reset = True + + if grad_spike_seq.dtype == torch.float32: + dtype = 'fp32' + elif grad_spike_seq.dtype == torch.float16: + dtype = 'fp16' + else: + raise NotImplementedError + + kernel = MultiStepLIFNodePTT.create_bptt_kernel(ctx.sg_cuda_code_fun, ctx.decay_input, hard_reset, ctx.detach_reset, dtype) + + with cu_kernel_opt.DeviceEnvironment(device): + + if hard_reset: + grad_spike_seq, grad_v_seq, h_seq, spike_seq, grad_x_seq, grad_v_last, ctx.cp_reciprocal_tau, ctx.cp_one_sub_reciprocal_tau, ctx.cp_v_threshold, ctx.cp_v_reset, ctx.cp_neuron_num, ctx.cp_numel = cu_kernel_opt.get_contiguous( + grad_spike_seq, grad_v_seq, h_seq, spike_seq, grad_x_seq, grad_v_last, ctx.cp_reciprocal_tau, + ctx.cp_one_sub_reciprocal_tau, ctx.cp_v_threshold, ctx.cp_v_reset, ctx.cp_neuron_num, + ctx.cp_numel) + kernel_args = [grad_spike_seq, grad_v_seq, h_seq, spike_seq, grad_x_seq, grad_v_last, + ctx.cp_reciprocal_tau, ctx.cp_one_sub_reciprocal_tau, ctx.cp_v_threshold, + ctx.cp_v_reset, ctx.cp_neuron_num, ctx.cp_numel] + else: + grad_spike_seq, grad_v_seq, h_seq, spike_seq, grad_x_seq, grad_v_last, ctx.cp_reciprocal_tau, ctx.cp_one_sub_reciprocal_tau, ctx.cp_v_threshold, ctx.cp_neuron_num, ctx.cp_numel = cu_kernel_opt.get_contiguous( + grad_spike_seq, grad_v_seq, h_seq, spike_seq, grad_x_seq, grad_v_last, ctx.cp_reciprocal_tau, + ctx.cp_one_sub_reciprocal_tau, ctx.cp_v_threshold, ctx.cp_neuron_num, ctx.cp_numel) + kernel_args = [grad_spike_seq, grad_v_seq, h_seq, spike_seq, grad_x_seq, grad_v_last, + ctx.cp_reciprocal_tau, ctx.cp_one_sub_reciprocal_tau, ctx.cp_v_threshold, + ctx.cp_neuron_num, ctx.cp_numel] + + kernel( + (ctx.blocks,), (ctx.threads,), + cu_kernel_opt.wrap_args_to_raw_kernel( + device, + *kernel_args + ) + ) + if ctx.use_pad: + return grad_x_seq[..., :-1], grad_v_last[..., :-1], None, None, None, None, None, None + else: + return grad_x_seq, grad_v_last, None, None, None, None, None, None + + +class MultiStepParametricLIFNodePTT(torch.autograd.Function): + @staticmethod + def create_fptt_kernel(decay_input: bool, hard_reset: bool, dtype: str): + return MultiStepLIFNodePTT.create_fptt_kernel(decay_input, hard_reset, dtype, kernel_name_prefix='ParametricLIFNode') + + @staticmethod + def create_bptt_kernel(sg_cuda_code_fun, decay_input: bool, hard_reset: bool, detach_reset: bool, dtype: str): + kernel_name = f'ParametricLIFNode_bptt_decayInput{decay_input}_{"hard" if hard_reset else "soft"}Reset_{"detachReset" if detach_reset else ""}_{dtype}' + + code_grad_s_to_h = sg_cuda_code_fun(x='over_th', y='grad_s_to_h', dtype=dtype) + + if dtype == 'fp32': + code = fr''' + extern "C" __global__ + void {kernel_name}( + const float* grad_spike_seq, const float* grad_v_seq, const float* h_seq, const float* spike_seq, const float* v_v_seq, + float* grad_x_seq, float* grad_v_last, float* grad_reciprocal_tau, + const float & reciprocal_tau, const float & one_sub_reciprocal_tau, + const float & v_threshold, {'const 
float & v_reset,' if hard_reset else ''} + const int & neuron_num, const int & numel) + ''' + code += r''' + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + ''' + code += f'__shared__ float sdata[{configure.cuda_threads}];' + code += r''' + if (index < neuron_num) + { + float grad_h = 0.0f; // grad_h will be used recursively + sdata[threadIdx.x] = 0.0f; + for(int mem_offset = numel - neuron_num; mem_offset >= 0; mem_offset -= neuron_num) + { + const int t = index + mem_offset; + const float over_th = h_seq[t] - v_threshold; + ''' + code += code_grad_s_to_h + if detach_reset: if hard_reset: - grad_spike_seq, grad_v_seq, h_seq, spike_seq, grad_x_seq, grad_v_last, ctx.cp_reciprocal_tau, ctx.cp_one_sub_reciprocal_tau, ctx.cp_v_threshold, ctx.cp_v_reset, ctx.cp_neuron_num, ctx.cp_numel = cu_kernel_opt.get_contiguous( - grad_spike_seq, grad_v_seq, h_seq, spike_seq, grad_x_seq, grad_v_last, ctx.cp_reciprocal_tau, - ctx.cp_one_sub_reciprocal_tau, ctx.cp_v_threshold, ctx.cp_v_reset, ctx.cp_neuron_num, - ctx.cp_numel) - kernel_args = [grad_spike_seq, grad_v_seq, h_seq, spike_seq, grad_x_seq, grad_v_last, - ctx.cp_reciprocal_tau, ctx.cp_one_sub_reciprocal_tau, ctx.cp_v_threshold, - ctx.cp_v_reset, ctx.cp_neuron_num, ctx.cp_numel] + code_grad_v_to_h = r''' + const float grad_v_to_h = 1.0f - spike_seq[t]; + ''' else: - grad_spike_seq, grad_v_seq, h_seq, spike_seq, grad_x_seq, grad_v_last, ctx.cp_reciprocal_tau, ctx.cp_one_sub_reciprocal_tau, ctx.cp_v_threshold, ctx.cp_neuron_num, ctx.cp_numel = cu_kernel_opt.get_contiguous( - grad_spike_seq, grad_v_seq, h_seq, spike_seq, grad_x_seq, grad_v_last, ctx.cp_reciprocal_tau, - ctx.cp_one_sub_reciprocal_tau, ctx.cp_v_threshold, ctx.cp_neuron_num, ctx.cp_numel) - kernel_args = [grad_spike_seq, grad_v_seq, h_seq, spike_seq, grad_x_seq, grad_v_last, - ctx.cp_reciprocal_tau, ctx.cp_one_sub_reciprocal_tau, ctx.cp_v_threshold, - ctx.cp_neuron_num, ctx.cp_numel] - - kernel( - (ctx.blocks,), (ctx.threads,), - cu_kernel_opt.wrap_args_to_raw_kernel( - device, - *kernel_args - ) - ) - if ctx.use_pad: - return grad_x_seq[..., :-1], grad_v_last[..., :-1], None, None, None, None, None + code_grad_v_to_h = r''' + const float grad_v_to_h = 1.0f; + ''' else: - return grad_x_seq, grad_v_last, None, None, None, None, None - - - class MultiStepParametricLIFNodePTT(torch.autograd.Function): - @staticmethod - def create_fptt_kernel(hard_reset: bool, dtype: str): - return MultiStepLIFNodePTT.create_fptt_kernel(hard_reset, dtype, kernel_name_prefix='ParametricLIFNode') - - @staticmethod - def create_bptt_kernel(sg_cuda_code_fun, hard_reset: bool, detach_reset: bool, dtype: str): - - kernel_name = f'ParametricLIFNode_bptt_{"hard" if hard_reset else "soft"}Reset_{"detachReset" if detach_reset else ""}_{dtype}' - - code_grad_s_to_h = sg_cuda_code_fun(x='over_th', y='grad_s_to_h', dtype=dtype) - - if dtype == 'fp32': - code = fr''' - extern "C" __global__ - void {kernel_name}( - const float* grad_spike_seq, const float* grad_v_seq, const float* h_seq, const float* spike_seq, const float* v_v_seq, - float* grad_x_seq, float* grad_v_last, float* grad_reciprocal_tau, - const float & reciprocal_tau, const float & one_sub_reciprocal_tau, - const float & v_threshold, {'const float & v_reset,' if hard_reset else ''} - const int & neuron_num, const int & numel) - ''' - code += r''' - { - const int index = blockIdx.x * blockDim.x + threadIdx.x; - ''' - code += f'__shared__ float sdata[{cuda_threads}];' - code += r''' - if (index < neuron_num) - { - float grad_h = 0.0f; // grad_h 
will be used recursively - sdata[threadIdx.x] = 0.0f; - for(int mem_offset = numel - neuron_num; mem_offset >= 0; mem_offset -= neuron_num) - { - const int t = index + mem_offset; - const float over_th = h_seq[t] - v_threshold; - ''' - code += code_grad_s_to_h - if detach_reset: - if hard_reset: - code_grad_v_to_h = r''' - const float grad_v_to_h = 1.0f - spike_seq[t]; - ''' - else: - code_grad_v_to_h = r''' - const float grad_v_to_h = 1.0f; - ''' + if hard_reset: + code_grad_v_to_h = r''' + const float grad_v_to_h = 1.0f - spike_seq[t] + (v_reset - h_seq[t]) * grad_s_to_h; + // const float grad_v_to_h = fmaf(v_reset - h_seq[t], grad_s_to_h, 1.0f - spike_seq[t]); + ''' else: - if hard_reset: - code_grad_v_to_h = r''' - const float grad_v_to_h = 1.0f - spike_seq[t] + (v_reset - h_seq[t]) * grad_s_to_h; - // const float grad_v_to_h = fmaf(v_reset - h_seq[t], grad_s_to_h, 1.0f - spike_seq[t]); - ''' - else: - code_grad_v_to_h = r''' - const float grad_v_to_h = 1.0f - v_threshold * grad_s_to_h; - // const float grad_v_to_h = fmaf(-v_threshold, grad_s_to_h, 1.0f); - ''' - - code += code_grad_v_to_h + code_grad_v_to_h = r''' + const float grad_v_to_h = 1.0f - v_threshold * grad_s_to_h; + // const float grad_v_to_h = fmaf(-v_threshold, grad_s_to_h, 1.0f); + ''' + + code += code_grad_v_to_h + code += r''' + grad_h = grad_spike_seq[t] * grad_s_to_h + (grad_v_seq[t] + grad_h * one_sub_reciprocal_tau) * grad_v_to_h; + // grad_h = fmaf(grad_spike_seq[t], grad_s_to_h, fmaf(grad_h, one_sub_reciprocal_tau, grad_v_seq[t]) * grad_v_to_h); + ''' + if decay_input: code += r''' - grad_h = grad_spike_seq[t] * grad_s_to_h + (grad_v_seq[t] + grad_h * one_sub_reciprocal_tau) * grad_v_to_h; - // grad_h = fmaf(grad_spike_seq[t], grad_s_to_h, fmaf(grad_h, one_sub_reciprocal_tau, grad_v_seq[t]) * grad_v_to_h); grad_x_seq[t] = grad_h * reciprocal_tau; sdata[threadIdx.x] += grad_h * (h_seq[t] - v_v_seq[t]) / reciprocal_tau; - } - grad_v_last[index] = grad_x_seq[index] * one_sub_reciprocal_tau; - } - else - { - sdata[threadIdx.x] = 0.0f; - } - int threadx = blockDim.x; - #pragma unroll - for (int stride = threadx >> 1; stride > 0; stride = stride >> 1) - { - // Synchronize all thread before next loop - __syncthreads(); - if (threadIdx.x < stride) - { - sdata[threadIdx.x] += sdata[threadIdx.x + stride]; - } - } - __syncthreads(); - if (threadIdx.x == 0) - { - atomicAdd(grad_reciprocal_tau, sdata[0]); - } - } - ''' - - elif dtype == 'fp16': - code = fr''' - #include - extern "C" __global__ - void {kernel_name}( - const half2* grad_spike_seq, const half2* grad_v_seq, const half2* h_seq, const half2* spike_seq, const half2* v_v_seq, - half2* grad_x_seq, half2* grad_v_last, half* grad_reciprocal_tau, - const half & reciprocal_tau, const half & one_sub_reciprocal_tau, - const half & v_threshold, {'const half & v_reset,' if hard_reset else ''} - const int & neuron_num, const int & numel) ''' - code += r''' - { - const int index = blockIdx.x * blockDim.x + threadIdx.x; - const int stride = neuron_num >> 1; - - ''' - code += f'__shared__ half2 sdata[{cuda_threads}];' - code += r''' - if (index < stride) - { - const half2 reciprocal_tau_half2 = __half2half2(reciprocal_tau); - const half2 one_sub_reciprocal_tau_half2 = __half2half2(one_sub_reciprocal_tau); - const half2 v_threshold_half2 = __half2half2(v_threshold); - ''' - + else: if hard_reset: code += r''' - const half2 v_reset_half2 = __half2half2(v_reset); + grad_x_seq[t] = grad_h; + sdata[threadIdx.x] += grad_h * (v_reset - v_v_seq[t]); ''' - - code += r''' - - half2 grad_h 
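The per-block sum over `sdata` is a standard shared-memory tree reduction: at each round the lower half of the buffer absorbs the upper half, so after log2(blockDim.x) rounds `sdata[0]` holds the block total (the thread count is expected to be a power of two for this scheme). A serial restatement:

```python
import numpy as np

def block_reduce_sum(sdata: np.ndarray) -> float:
    # Serial model of the in-kernel reduction: thread i (i < stride) adds
    # sdata[i + stride] into sdata[i], then the stride halves.
    sdata = sdata.copy()
    stride = sdata.size // 2
    while stride > 0:
        sdata[:stride] += sdata[stride:2 * stride]
        stride //= 2
    return float(sdata[0])

assert block_reduce_sum(np.ones(256, dtype=np.float32)) == 256.0
```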
= __float2half2_rn(0.0f); // grad_h will be used recursively - sdata[threadIdx.x] = __float2half2_rn(0.0f); - for(int mem_offset = (numel >> 1) - stride; mem_offset >= 0; mem_offset -= stride) - { - const int t = index + mem_offset; - - const half2 over_th = __hsub2(h_seq[t], v_threshold_half2); - - ''' - code += code_grad_s_to_h - - if detach_reset: - if hard_reset: - code_grad_v_to_h = r''' - const half2 grad_v_to_h = __hsub2(__float2half2_rn(1.0f), spike_seq[t]); - ''' - else: - code_grad_v_to_h = r''' - const half2 grad_v_to_h = __float2half2_rn(1.0f); - ''' else: - if hard_reset: - code_grad_v_to_h = r''' - const half2 grad_v_to_h = __hfma2(__hsub2(v_reset_half2, h_seq[t]), grad_s_to_h, __hsub2(__float2half2_rn(1.0f), spike_seq[t])); - ''' - else: - code_grad_v_to_h = r''' - const half2 grad_v_to_h = __hsub2(__float2half2_rn(1.0f), __hmul2(v_threshold_half2, grad_s_to_h)); - ''' - - code += code_grad_v_to_h - code += r''' - grad_h = __hfma2(__hfma2(grad_h, one_sub_reciprocal_tau_half2, grad_v_seq[t]), grad_v_to_h, __hmul2(grad_spike_seq[t], grad_s_to_h)); - grad_x_seq[t] = __hmul2(grad_h, reciprocal_tau_half2); - sdata[threadIdx.x] = __hadd2(__h2div(__hmul2(grad_h, __hsub2(h_seq[t], v_v_seq[t])), reciprocal_tau_half2), sdata[threadIdx.x]); - } - - grad_v_last[index] = __hmul2(grad_x_seq[index], one_sub_reciprocal_tau_half2); - } - else - { - sdata[threadIdx.x] = __float2half2_rn(0.0f); - } - int threadx = blockDim.x; - #pragma unroll - for (int stride = threadx >> 1; stride > 0; stride = stride >> 1) - { - // Synchronize all thread before next loop - __syncthreads(); - if (threadIdx.x < stride) - { - sdata[threadIdx.x] = __hadd2(sdata[threadIdx.x], sdata[threadIdx.x + stride]); - } - } - __syncthreads(); - if (threadIdx.x == 0) - { - //grad_reciprocal_tau[0] = __hadd(__low2half(sdata[0]), __high2half(sdata[0])); - - /* - The 32-bit floating-point version of atomicAdd() is only supported by devices of compute capability 2.x and higher. - - The 64-bit floating-point version of atomicAdd() is only supported by devices of compute capability 6.x and higher. - - The 32-bit __half2 floating-point version of atomicAdd() is only supported by devices of compute capability 6.x and higher. The atomicity of the __half2 or __nv_bfloat162 add operation is guaranteed separately for each of the two __half or __nv_bfloat16 elements; the entire __half2 or __nv_bfloat162 is not guaranteed to be atomic as a single 32-bit access. - - The 16-bit __half floating-point version of atomicAdd() is only supported by devices of compute capability 7.x and higher. - - The 16-bit __nv_bfloat16 floating-point version of atomicAdd() is only supported by devices of compute capability 8.x and higher. 
- */ - - atomicAdd(grad_reciprocal_tau, __hadd(__low2half(sdata[0]), __high2half(sdata[0]))); - - } + code += r''' + grad_x_seq[t] = grad_h; + sdata[threadIdx.x] -= grad_h * v_v_seq[t]; + ''' + code += r''' } + grad_v_last[index] = grad_h * one_sub_reciprocal_tau; + } + else + { + sdata[threadIdx.x] = 0.0f; + } + int threadx = blockDim.x; + #pragma unroll + for (int stride = threadx >> 1; stride > 0; stride = stride >> 1) + { + // Synchronize all thread before next loop + __syncthreads(); + if (threadIdx.x < stride) + { + sdata[threadIdx.x] += sdata[threadIdx.x + stride]; + } + } + __syncthreads(); + if (threadIdx.x == 0) + { + atomicAdd(grad_reciprocal_tau, sdata[0]); + } + } + ''' + + elif dtype == 'fp16': + code = fr''' + #include + extern "C" __global__ + void {kernel_name}( + const half2* grad_spike_seq, const half2* grad_v_seq, const half2* h_seq, const half2* spike_seq, const half2* v_v_seq, + half2* grad_x_seq, half2* grad_v_last, float* grad_reciprocal_tau, + const half & reciprocal_tau, const half & one_sub_reciprocal_tau, + const half & v_threshold, {'const half & v_reset,' if hard_reset else ''} + const int & neuron_num, const int & numel)\ + // note that grad_reciprocal_tau is float to avoid overflow + ''' + code += r''' + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + const int stride = neuron_num >> 1; + + ''' + code += f'__shared__ float sdata[{configure.cuda_threads}];' + code += r''' + if (index < stride) + { + const half2 reciprocal_tau_half2 = __half2half2(reciprocal_tau); + const half2 one_sub_reciprocal_tau_half2 = __half2half2(one_sub_reciprocal_tau); + const half2 v_threshold_half2 = __half2half2(v_threshold); + ''' + + if hard_reset: + code += r''' + const half2 v_reset_half2 = __half2half2(v_reset); ''' - else: - raise TypeError - - return cupy.RawKernel(code, kernel_name, options=cuda_compiler_options, backend=cuda_compiler_backend) - - @staticmethod - def forward(ctx, x_seq: torch.Tensor, v_last: torch.Tensor, reciprocal_tau: torch.Tensor, v_threshold: float, - v_reset: float, detach_reset: bool, sg_cuda_code_fun): - requires_grad = x_seq.requires_grad or v_last.requires_grad - device = x_seq.get_device() - if x_seq.dtype == torch.float32: - dtype = 'fp32' - cp_dtype = np.float32 - elif x_seq.dtype == torch.float16: - dtype = 'fp16' - cp_dtype = np.half - assert torch.cuda.get_device_capability(device)[0] >= 7, "MultiStepParametricLIFNodePTT can not run in the current device with float16 because the 16-bit __half floating-point version of atomicAdd() is only supported by devices of compute capability 7.x and higher." - - else: - raise NotImplementedError - - use_pad = False - if dtype == 'fp16' and v_last.numel() % 2 != 0: - # only fp16 needs even numel because we use half2 to accelerate - # when numel is odd, we will pad x_seq - use_pad = True - x_seq = F.pad(x_seq, (0, 1)) # [T, N] -> [T, N + 1] - v_last = F.pad(v_last, (0, 1)) # [N] -> [N + 1] + code += r''' - v_seq = torch.zeros_like(x_seq.data) - h_seq = torch.zeros_like(x_seq.data) - spike_seq = torch.zeros_like(x_seq.data) + half2 grad_h = __float2half2_rn(0.0f); // grad_h will be used recursively + sdata[threadIdx.x] = 0.0f; + for(int mem_offset = (numel >> 1) - stride; mem_offset >= 0; mem_offset -= stride) + { + const int t = index + mem_offset; - v_v_seq = torch.cat((v_last.unsqueeze(0), v_seq)) - tau = 1. 
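The parametric LIF kernels also need the gradient of the learnable `reciprocal_tau` (1/τ): each thread accumulates `grad_h[t] * ∂h[t]/∂(1/τ)` into shared memory, the block reduces it, and a single `atomicAdd` per block updates the global sum. The per-step term follows directly from the charge equations; a PyTorch sketch of the reduced quantity (all tensors `[T, N]`, `v_v_seq` holding the membrane potential before each step):

```python
import torch

def reciprocal_tau_grad(grad_h_seq, h_seq, v_v_seq, reciprocal_tau: float,
                        v_reset=None, decay_input=True):
    if decay_input:
        # h = v + (1/tau)*(x - v [+ v_reset])  =>  dh/d(1/tau) = (h - v) / (1/tau)
        per_step = grad_h_seq * (h_seq - v_v_seq) / reciprocal_tau
    elif v_reset is not None:
        # h = v - (1/tau)*(v - v_reset) + x    =>  dh/d(1/tau) = v_reset - v
        per_step = grad_h_seq * (v_reset - v_v_seq)
    else:
        # h = v*(1 - 1/tau) + x                =>  dh/d(1/tau) = -v
        per_step = -grad_h_seq * v_v_seq
    return per_step.sum()    # reduced over all neurons and time steps
```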
/ reciprocal_tau.item() + const half2 over_th = __hsub2(h_seq[t], v_threshold_half2); - with cupy.cuda.Device(device): - numel = x_seq.numel() - neuron_num = numel // x_seq.shape[0] + ''' + code += code_grad_s_to_h - threads = cuda_threads - if dtype == 'fp16': - assert neuron_num % 2 == 0 - blocks = cu_kernel_opt.cal_blocks(neuron_num >> 1) - # we will take two neurons to calculate as one neuron in cuda half2 + if detach_reset: + if hard_reset: + code_grad_v_to_h = r''' + const half2 grad_v_to_h = __hsub2(__float2half2_rn(1.0f), spike_seq[t]); + ''' else: - blocks = cu_kernel_opt.cal_blocks(neuron_num) - - cp_numel = cupy.asarray(numel) - cp_neuron_num = cupy.asarray(neuron_num) - cp_v_threshold = cupy.asarray(v_threshold, dtype=cp_dtype) - cp_reciprocal_tau = cupy.asarray(1. / tau, dtype=cp_dtype) - cp_one_sub_reciprocal_tau = cupy.asarray(1. - 1. / tau, dtype=cp_dtype) - - if v_reset is None: - cp_v_reset = None - hard_reset = False - x_seq, v_v_seq, h_seq, spike_seq, cp_reciprocal_tau, cp_v_threshold, cp_neuron_num, cp_numel = cu_kernel_opt.get_contiguous( - x_seq, v_v_seq, h_seq, spike_seq, cp_reciprocal_tau, cp_v_threshold, cp_neuron_num, cp_numel) - kernel_args = [x_seq, v_v_seq, h_seq, spike_seq, cp_reciprocal_tau, cp_v_threshold, cp_neuron_num, - cp_numel] + code_grad_v_to_h = r''' + const half2 grad_v_to_h = __float2half2_rn(1.0f); + ''' + else: + if hard_reset: + code_grad_v_to_h = r''' + const half2 grad_v_to_h = __hfma2(__hsub2(v_reset_half2, h_seq[t]), grad_s_to_h, __hsub2(__float2half2_rn(1.0f), spike_seq[t])); + ''' else: - cp_v_reset = cupy.asarray(v_reset, dtype=cp_dtype) - hard_reset = True - x_seq, v_v_seq, h_seq, spike_seq, cp_reciprocal_tau, cp_v_threshold, cp_v_reset, cp_neuron_num, cp_numel = cu_kernel_opt.get_contiguous( - x_seq, v_v_seq, h_seq, spike_seq, cp_reciprocal_tau, cp_v_threshold, cp_v_reset, cp_neuron_num, - cp_numel) - kernel_args = [x_seq, v_v_seq, h_seq, spike_seq, cp_reciprocal_tau, cp_v_threshold, cp_v_reset, - cp_neuron_num, cp_numel] - - kernel = MultiStepParametricLIFNodePTT.create_fptt_kernel(hard_reset, dtype) - - kernel( - (blocks,), (threads,), - cu_kernel_opt.wrap_args_to_raw_kernel( - device, - *kernel_args - ) - ) + code_grad_v_to_h = r''' + const half2 grad_v_to_h = __hsub2(__float2half2_rn(1.0f), __hmul2(v_threshold_half2, grad_s_to_h)); + ''' - if requires_grad: - ctx.use_pad = use_pad - ctx.save_for_backward(h_seq, spike_seq, v_v_seq) - ctx.blocks = blocks - ctx.threads = threads - ctx.cp_numel = cp_numel - ctx.cp_neuron_num = cp_neuron_num - ctx.cp_reciprocal_tau = cp_reciprocal_tau - ctx.cp_one_sub_reciprocal_tau = cp_one_sub_reciprocal_tau - ctx.cp_v_threshold = cp_v_threshold - ctx.cp_v_reset = cp_v_reset - ctx.detach_reset = detach_reset - ctx.sg_cuda_code_fun = sg_cuda_code_fun - - if use_pad: - return spike_seq[..., :-1], v_v_seq[1:, ..., :-1] + code += code_grad_v_to_h + code += r''' + grad_h = __hfma2(__hfma2(grad_h, one_sub_reciprocal_tau_half2, grad_v_seq[t]), grad_v_to_h, __hmul2(grad_spike_seq[t], grad_s_to_h)); + ''' + if decay_input: + code += r''' + grad_x_seq[t] = __hmul2(grad_h, reciprocal_tau_half2); + half2 temp_sum = __h2div(__hmul2(grad_h, __hsub2(h_seq[t], v_v_seq[t])), reciprocal_tau_half2); + sdata[threadIdx.x] += __half2float(__hadd(__low2half(temp_sum), __high2half(temp_sum))); + ''' else: - return spike_seq, v_v_seq[1:, ] - - @staticmethod - def backward(ctx, grad_spike_seq, grad_v_seq): - if ctx.use_pad: - # grad_spike_seq.shape = [T, N] - # grad_v_seq.shape = [T, N] - # h_seq.shape = [T, N + 1] - # 
spike_seq.shape = [T, N + 1] - grad_spike_seq = F.pad(grad_spike_seq, (0, 1)) - grad_v_seq = F.pad(grad_v_seq, (0, 1)) - - device = grad_spike_seq.get_device() - h_seq, spike_seq, v_v_seq = ctx.saved_tensors - grad_x_seq = torch.zeros_like(grad_spike_seq) - grad_v_last = torch.zeros_like(grad_spike_seq[0]) - grad_reciprocal_tau = torch.as_tensor(0.).to(grad_spike_seq) + if hard_reset: + code += r''' + grad_x_seq[t] = grad_h; + half2 temp_sum = __hmul2(grad_h, __hsub2(v_reset_half2, v_v_seq[t])); + sdata[threadIdx.x] += __half2float(__hadd(__low2half(temp_sum), __high2half(temp_sum))); + ''' + else: + code += r''' + grad_x_seq[t] = grad_h; + half2 temp_sum = __hmul2(grad_h, __hneg2(v_v_seq[t])); + sdata[threadIdx.x] += __half2float(__hadd(__low2half(temp_sum), __high2half(temp_sum))); + ''' + code += r''' + } + grad_v_last[index] = __hmul2(grad_h, one_sub_reciprocal_tau_half2); + } + else + { + sdata[threadIdx.x] = 0.0f; + } + int threadx = blockDim.x; + #pragma unroll + for (int i = threadx >> 1; i > 0; i = i >> 1) + { + // Synchronize all thread before next loop + __syncthreads(); + if (threadIdx.x < i) + { + sdata[threadIdx.x] += sdata[threadIdx.x + i]; + } + } + __syncthreads(); + if (threadIdx.x == 0) + { + /* + The 32-bit floating-point version of atomicAdd() is only supported by devices of compute capability 2.x and higher. + + The 64-bit floating-point version of atomicAdd() is only supported by devices of compute capability 6.x and higher. + + The 32-bit __half2 floating-point version of atomicAdd() is only supported by devices of compute capability 6.x and higher. The atomicity of the __half2 or __nv_bfloat162 add operation is guaranteed separately for each of the two __half or __nv_bfloat16 elements; the entire __half2 or __nv_bfloat162 is not guaranteed to be atomic as a single 32-bit access. + + The 16-bit __half floating-point version of atomicAdd() is only supported by devices of compute capability 7.x and higher. + + The 16-bit __nv_bfloat16 floating-point version of atomicAdd() is only supported by devices of compute capability 8.x and higher. + */ + + atomicAdd(grad_reciprocal_tau, sdata[0]); + + } + } + ''' + else: + raise TypeError + + return cupy.RawKernel(code, kernel_name, options=configure.cuda_compiler_options, backend=configure.cuda_compiler_backend) + + @staticmethod + def forward(ctx, x_seq: torch.Tensor, v_last: torch.Tensor, reciprocal_tau: torch.Tensor, decay_input: bool, v_threshold: float, + v_reset: float, detach_reset: bool, sg_cuda_code_fun): + # reciprocal_tau.dtype is float32 even when using amp + requires_grad = x_seq.requires_grad or v_last.requires_grad + device = x_seq.get_device() + if x_seq.dtype == torch.float32: + dtype = 'fp32' + cp_dtype = np.float32 + elif x_seq.dtype == torch.float16: + dtype = 'fp16' + cp_dtype = np.half + # assert torch.cuda.get_device_capability(device)[0] >= 7, "MultiStepParametricLIFNodePTT can not run in the current device with float16 because the 16-bit __half floating-point version of atomicAdd() is only supported by devices of compute capability 7.x and higher." 
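As the kernel comment above notes, `grad_reciprocal_tau` is accumulated in float even when the rest of the computation runs in fp16, and the backward pass allocates it as a float32 scalar. The reason is the narrow range and coarse spacing of half precision, which a sum taken over every neuron and time step would quickly run into:

```python
import torch

half = torch.float16
print(torch.tensor(65504., dtype=half) * 2)                             # inf: fp16 overflows past ~6.5e4
print(torch.tensor(2048., dtype=half) + torch.tensor(1., dtype=half))   # 2048.: the small addend is rounded away
print(torch.tensor(2048., dtype=torch.float32) + 1.)                    # 2049.: float32 keeps it
```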
+ + else: + raise NotImplementedError + + use_pad = False + if dtype == 'fp16' and v_last.numel() % 2 != 0: + # only fp16 needs even numel because we use half2 to accelerate + # when numel is odd, we will pad x_seq + use_pad = True + x_seq = F.pad(x_seq, (0, 1)) # [T, N] -> [T, N + 1] + v_last = F.pad(v_last, (0, 1)) # [N] -> [N + 1] + + zero_shape = list(x_seq.shape) + zero_shape[0] *= 3 + v_seq, h_seq, spike_seq = torch.split(torch.zeros(zero_shape, device=x_seq.device, dtype=x_seq.dtype), x_seq.shape[0]) + + v_v_seq = torch.cat((v_last.unsqueeze(0), v_seq)) + tau = 1. / reciprocal_tau.item() + + with cu_kernel_opt.DeviceEnvironment(device): + numel = x_seq.numel() + neuron_num = numel // x_seq.shape[0] + + threads = configure.cuda_threads + if dtype == 'fp16': + assert neuron_num % 2 == 0 + blocks = cu_kernel_opt.cal_blocks(neuron_num >> 1) + # we will take two neurons to calculate as one neuron in cuda half2 + else: + blocks = cu_kernel_opt.cal_blocks(neuron_num) - if ctx.cp_v_reset is None: + cp_numel = cupy.asarray(numel) + cp_neuron_num = cupy.asarray(neuron_num) + cp_v_threshold = cupy.asarray(v_threshold, dtype=cp_dtype) + cp_reciprocal_tau = cupy.asarray(1. / tau, dtype=cp_dtype) + cp_one_sub_reciprocal_tau = cupy.asarray(1. - 1. / tau, dtype=cp_dtype) + + if v_reset is None: + cp_v_reset = None hard_reset = False + x_seq, v_v_seq, h_seq, spike_seq, cp_reciprocal_tau, cp_v_threshold, cp_neuron_num, cp_numel = cu_kernel_opt.get_contiguous( + x_seq, v_v_seq, h_seq, spike_seq, cp_reciprocal_tau, cp_v_threshold, cp_neuron_num, cp_numel) + kernel_args = [x_seq, v_v_seq, h_seq, spike_seq, cp_reciprocal_tau, cp_v_threshold, cp_neuron_num, + cp_numel] else: + cp_v_reset = cupy.asarray(v_reset, dtype=cp_dtype) hard_reset = True - - if grad_spike_seq.dtype == torch.float32: - dtype = 'fp32' - elif grad_spike_seq.dtype == torch.float16: - dtype = 'fp16' - else: - raise NotImplementedError - - kernel = MultiStepParametricLIFNodePTT.create_bptt_kernel(ctx.sg_cuda_code_fun, hard_reset, - ctx.detach_reset, dtype) - - with cupy.cuda.Device(device): - - if hard_reset: - grad_spike_seq, grad_v_seq, h_seq, spike_seq, v_v_seq, grad_x_seq, grad_v_last, grad_reciprocal_tau, ctx.cp_reciprocal_tau, ctx.cp_one_sub_reciprocal_tau, ctx.cp_v_threshold, ctx.cp_v_reset, ctx.cp_neuron_num, ctx.cp_numel = cu_kernel_opt.get_contiguous( - grad_spike_seq, grad_v_seq, h_seq, spike_seq, v_v_seq, grad_x_seq, grad_v_last, - grad_reciprocal_tau, ctx.cp_reciprocal_tau, ctx.cp_one_sub_reciprocal_tau, ctx.cp_v_threshold, - ctx.cp_v_reset, ctx.cp_neuron_num, ctx.cp_numel) - kernel_args = [grad_spike_seq, grad_v_seq, h_seq, spike_seq, v_v_seq, grad_x_seq, grad_v_last, - grad_reciprocal_tau, ctx.cp_reciprocal_tau, ctx.cp_one_sub_reciprocal_tau, - ctx.cp_v_threshold, ctx.cp_v_reset, ctx.cp_neuron_num, ctx.cp_numel] - else: - grad_spike_seq, grad_v_seq, h_seq, spike_seq, v_v_seq, grad_x_seq, grad_v_last, grad_reciprocal_tau, ctx.cp_reciprocal_tau, ctx.cp_one_sub_reciprocal_tau, ctx.cp_v_threshold, ctx.cp_neuron_num, ctx.cp_numel = cu_kernel_opt.get_contiguous( - grad_spike_seq, grad_v_seq, h_seq, spike_seq, v_v_seq, grad_x_seq, grad_v_last, - grad_reciprocal_tau, ctx.cp_reciprocal_tau, ctx.cp_one_sub_reciprocal_tau, ctx.cp_v_threshold, - ctx.cp_neuron_num, ctx.cp_numel) - kernel_args = [grad_spike_seq, grad_v_seq, h_seq, spike_seq, v_v_seq, grad_x_seq, grad_v_last, - grad_reciprocal_tau, ctx.cp_reciprocal_tau, ctx.cp_one_sub_reciprocal_tau, - ctx.cp_v_threshold, ctx.cp_neuron_num, ctx.cp_numel] - - kernel( - (ctx.blocks,), 
(ctx.threads,), - cu_kernel_opt.wrap_args_to_raw_kernel( - device, - *kernel_args - ) + x_seq, v_v_seq, h_seq, spike_seq, cp_reciprocal_tau, cp_v_threshold, cp_v_reset, cp_neuron_num, cp_numel = cu_kernel_opt.get_contiguous( + x_seq, v_v_seq, h_seq, spike_seq, cp_reciprocal_tau, cp_v_threshold, cp_v_reset, cp_neuron_num, + cp_numel) + kernel_args = [x_seq, v_v_seq, h_seq, spike_seq, cp_reciprocal_tau, cp_v_threshold, cp_v_reset, + cp_neuron_num, cp_numel] + + kernel = MultiStepParametricLIFNodePTT.create_fptt_kernel(decay_input, hard_reset, dtype) + + kernel( + (blocks,), (threads,), + cu_kernel_opt.wrap_args_to_raw_kernel( + device, + *kernel_args ) - - if ctx.use_pad: - return grad_x_seq[..., :-1], grad_v_last[..., :-1], grad_reciprocal_tau, None, None, None, None + ) + + if requires_grad: + ctx.decay_input = decay_input + ctx.use_pad = use_pad + if configure.save_spike_as_bool_in_neuron_kernel: + ctx.s_shape = spike_seq.shape + ctx.s_tk = tensor_cache.BOOL_TENSOR_CACHE.store_bool(spike_seq) + ctx.save_for_backward(h_seq, v_v_seq) else: - return grad_x_seq, grad_v_last, grad_reciprocal_tau, None, None, None, None - - - def check_multi_step_neuron_output_and_grad(device, multi_step_neuron, *neu_args, **neu_kwargs): - @torch.no_grad() - def max_error(x, y): - return (x - y).abs().max().item() - - def fbptt(m, x: torch.Tensor): - x = x.detach() - x.requires_grad_(True) - m(x) - (m.spike_seq * m.v_seq ** 2).sum().backward() - ret = { - 'spike_seq': m.spike_seq.detach().clone(), - 'v_seq': m.v_seq.detach().clone(), - 'x.grad': x.grad.clone() - } - for i, param in enumerate(m.parameters()): - ret[f'param_{i}.grad'] = param.grad.detach().clone() - param.grad.zero_() - x.grad.zero_() - m.reset() - return ret - - shape = [65, 15, 2047] - for hard_reset in [True, False]: - for detach_reset in [False, True]: - for dtype in ['fp32', 'fp16']: - x = (torch.rand(shape, device=device) - 0.5) * 3. - if dtype == 'fp16': - x = x.half() - print(f'hard_reset={hard_reset}, detach_reset={detach_reset}, dtype={dtype}') - model = multi_step_neuron(v_reset=0. 
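`check_multi_step_neuron_output_and_grad` runs the same input through the `torch` and `cupy` backends and prints the maximum absolute difference of spikes, membrane potentials, and gradients. A hypothetical invocation (assuming a CUDA device and that `spikingjelly.clock_driven.neuron.MultiStepLIFNode` is importable; extra keyword arguments such as `tau` are forwarded to the neuron's constructor):

```python
from spikingjelly.clock_driven import neuron, neuron_kernel

neuron_kernel.check_multi_step_neuron_output_and_grad(
    'cuda:0', neuron.MultiStepLIFNode, shape=[8, 4, 16], tau=2.)
```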
if hard_reset else None, detach_reset=detach_reset, *neu_args, - **neu_kwargs) - # print(model) - model.to(device) - model.backend = 'torch' - y_torch = fbptt(model, x) - - model.backend = 'cupy' - y_cupy = fbptt(model, x) - - for key in y_torch.keys(): - print(key, 'max error', max_error(y_torch[key], y_cupy[key])) - print('\n') - - - def save_cuda_codes(cu_file_path: str = './spikingjelly/clock_driven/neuron_kernel.cu'): - # save all cuda codes to files - with open(cu_file_path, 'w+') as cu_file: - cu_file.write('// This file is created by spikingjelly.clock_driven.neuron_kernel.save_cuda_codes.\n') - for ms_neu in [MultiStepIFNodePTT, MultiStepLIFNodePTT, MultiStepParametricLIFNodePTT]: - cu_file.write('\n// ' + ms_neu.__name__ + '\n') - for sg in [surrogate.ATan, surrogate.Sigmoid, surrogate.PiecewiseLeakyReLU]: - for hard_reset in [True, False]: - for dtype in ['fp32', 'fp16']: - cu_file.write( - f'\n// {ms_neu.__name__} fptt {sg.__name__}, hard_reset={hard_reset}, dtype={dtype}\n') - fp_codes = ms_neu.create_fptt_kernel(hard_reset, dtype).code - cu_file.write(fp_codes) - for detach_reset in [True, False]: - cu_file.write( - f'\n// {ms_neu.__name__} bptt {sg.__name__}, hard_reset={hard_reset}, dtype={dtype}, detach_reset={detach_reset}\n') - bp_codes = ms_neu.create_bptt_kernel(sg().cuda_code, hard_reset, detach_reset, - dtype).code - cu_file.write(bp_codes) - - - class MultiStepEIFNodePTT(torch.autograd.Function): - @staticmethod - def create_fptt_kernel(hard_reset: bool, dtype: str): - kernel_name = f'EIFNode_fptt_{"hard" if hard_reset else "soft"}Reset_{dtype}' - - if dtype == 'fp32': - code = rf''' - extern "C" __global__ - void {kernel_name}(const float* x_seq, float* v_v_seq, float* h_seq, float* spike_seq, - const float & reciprocal_tau, - const float & delta_T, - const float & theta_rh, - const float & v_threshold, - const float & v_rest, {'const float & v_reset,' if hard_reset else ''} - const int & neuron_num, const int & numel) - ''' - code += r''' - { - const int index = blockIdx.x * blockDim.x + threadIdx.x; - if (index < neuron_num) + ctx.save_for_backward(h_seq, spike_seq, v_v_seq) + ctx.blocks = blocks + ctx.threads = threads + ctx.cp_numel = cp_numel + ctx.cp_neuron_num = cp_neuron_num + ctx.cp_reciprocal_tau = cp_reciprocal_tau + ctx.cp_one_sub_reciprocal_tau = cp_one_sub_reciprocal_tau + ctx.cp_v_threshold = cp_v_threshold + ctx.cp_v_reset = cp_v_reset + ctx.detach_reset = detach_reset + ctx.sg_cuda_code_fun = sg_cuda_code_fun + + if use_pad: + return spike_seq[..., :-1], v_v_seq[1:, ..., :-1] + else: + return spike_seq, v_v_seq[1:, ] + + @staticmethod + def backward(ctx, grad_spike_seq, grad_v_seq): + if ctx.use_pad: + # grad_spike_seq.shape = [T, N] + # grad_v_seq.shape = [T, N] + # h_seq.shape = [T, N + 1] + # spike_seq.shape = [T, N + 1] + grad_spike_seq = F.pad(grad_spike_seq, (0, 1)) + grad_v_seq = F.pad(grad_v_seq, (0, 1)) + + device = grad_spike_seq.get_device() + if configure.save_spike_as_bool_in_neuron_kernel: + spike_seq = tensor_cache.BOOL_TENSOR_CACHE.get_float(ctx.s_tk, ctx.s_shape) + h_seq, v_v_seq = ctx.saved_tensors + else: + h_seq, spike_seq, v_v_seq = ctx.saved_tensors + zero_shape = list(grad_spike_seq.shape) + zero_shape[0] += 1 + zero_data = torch.zeros(zero_shape, device=grad_spike_seq.device, dtype=grad_spike_seq.dtype) + grad_x_seq = zero_data[0: -1] + grad_v_last = zero_data[-1] + grad_reciprocal_tau = torch.as_tensor(0., device=grad_spike_seq.device, dtype=torch.float32) + + if ctx.cp_v_reset is None: + hard_reset = False + else: + 
hard_reset = True + + if grad_spike_seq.dtype == torch.float32: + dtype = 'fp32' + elif grad_spike_seq.dtype == torch.float16: + dtype = 'fp16' + else: + raise NotImplementedError + + kernel = MultiStepParametricLIFNodePTT.create_bptt_kernel(ctx.sg_cuda_code_fun, ctx.decay_input, hard_reset, + ctx.detach_reset, dtype) + + with cu_kernel_opt.DeviceEnvironment(device): + + if hard_reset: + grad_spike_seq, grad_v_seq, h_seq, spike_seq, v_v_seq, grad_x_seq, grad_v_last, grad_reciprocal_tau, ctx.cp_reciprocal_tau, ctx.cp_one_sub_reciprocal_tau, ctx.cp_v_threshold, ctx.cp_v_reset, ctx.cp_neuron_num, ctx.cp_numel = cu_kernel_opt.get_contiguous( + grad_spike_seq, grad_v_seq, h_seq, spike_seq, v_v_seq, grad_x_seq, grad_v_last, + grad_reciprocal_tau, ctx.cp_reciprocal_tau, ctx.cp_one_sub_reciprocal_tau, ctx.cp_v_threshold, + ctx.cp_v_reset, ctx.cp_neuron_num, ctx.cp_numel) + kernel_args = [grad_spike_seq, grad_v_seq, h_seq, spike_seq, v_v_seq, grad_x_seq, grad_v_last, + grad_reciprocal_tau, ctx.cp_reciprocal_tau, ctx.cp_one_sub_reciprocal_tau, + ctx.cp_v_threshold, ctx.cp_v_reset, ctx.cp_neuron_num, ctx.cp_numel] + else: + grad_spike_seq, grad_v_seq, h_seq, spike_seq, v_v_seq, grad_x_seq, grad_v_last, grad_reciprocal_tau, ctx.cp_reciprocal_tau, ctx.cp_one_sub_reciprocal_tau, ctx.cp_v_threshold, ctx.cp_neuron_num, ctx.cp_numel = cu_kernel_opt.get_contiguous( + grad_spike_seq, grad_v_seq, h_seq, spike_seq, v_v_seq, grad_x_seq, grad_v_last, + grad_reciprocal_tau, ctx.cp_reciprocal_tau, ctx.cp_one_sub_reciprocal_tau, ctx.cp_v_threshold, + ctx.cp_neuron_num, ctx.cp_numel) + kernel_args = [grad_spike_seq, grad_v_seq, h_seq, spike_seq, v_v_seq, grad_x_seq, grad_v_last, + grad_reciprocal_tau, ctx.cp_reciprocal_tau, ctx.cp_one_sub_reciprocal_tau, + ctx.cp_v_threshold, ctx.cp_neuron_num, ctx.cp_numel] + + kernel( + (ctx.blocks,), (ctx.threads,), + cu_kernel_opt.wrap_args_to_raw_kernel( + device, + *kernel_args + ) + ) + + if ctx.use_pad: + return grad_x_seq[..., :-1], grad_v_last[..., :-1], grad_reciprocal_tau, None, None, None, None, None + else: + return grad_x_seq, grad_v_last, grad_reciprocal_tau, None, None, None, None, None + + +def check_multi_step_neuron_output_and_grad(device, multi_step_neuron, shape = [65, 15, 511], *neu_args, **neu_kwargs): + @torch.no_grad() + def max_error(x, y): + return (x - y).abs().max().item() + + def fbptt(m, x: torch.Tensor): + x = x.detach() + x.requires_grad_(True) + spike_seq = m(x) + (spike_seq * m.v_seq ** 2).sum().backward() + ret = { + 'spike_seq': spike_seq.detach().clone(), + 'v_seq': m.v_seq.detach().clone(), + 'x.grad': x.grad.clone() + } + for i, param in enumerate(m.parameters()): + ret[f'param_{i}.grad'] = param.grad.detach().clone() + param.grad.zero_() + x.grad.zero_() + m.reset() + return ret + + for hard_reset in [True, False]: + for detach_reset in [False, True]: + for dtype in ['fp32', 'fp16']: + x = (torch.rand(shape, device=device) - 0.5) * 3. + if dtype == 'fp16': + x = x.half() + print(f'hard_reset={hard_reset}, detach_reset={detach_reset}, dtype={dtype}') + model = multi_step_neuron(v_reset=0. 
if hard_reset else None, detach_reset=detach_reset, *neu_args, + **neu_kwargs) + # print(model) + model.to(device) + if dtype == 'fp16': + model = model.half() + model.backend = 'torch' + y_torch = fbptt(model, x) + + model.backend = 'cupy' + y_cupy = fbptt(model, x) + + for key in y_torch.keys(): + me = max_error(y_torch[key], y_cupy[key]) + print(key, 'max error', me) + if me > 0.5: + print(f'y_torch[{key}]={y_torch[key]}, y_cupy[{key}]={y_cupy[key]}') + print('\n') + +class MultiStepEIFNodePTT(torch.autograd.Function): + @staticmethod + def create_fptt_kernel(hard_reset: bool, dtype: str): + kernel_name = f'EIFNode_fptt_{"hard" if hard_reset else "soft"}Reset_{dtype}' + + if dtype == 'fp32': + code = rf''' + extern "C" __global__ + void {kernel_name}(const float* x_seq, float* v_v_seq, float* h_seq, float* spike_seq, + const float & reciprocal_tau, + const float & delta_T, + const float & theta_rh, + const float & v_threshold, + const float & v_rest, {'const float & v_reset,' if hard_reset else ''} + const int & neuron_num, const int & numel) + ''' + code += r''' + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index < neuron_num) + { + const int dt = neuron_num; + for(int mem_offset = 0; mem_offset < numel; mem_offset += neuron_num) { - const int dt = neuron_num; - for(int mem_offset = 0; mem_offset < numel; mem_offset += neuron_num) + const int t = index + mem_offset; + h_seq[t] = v_v_seq[t] + reciprocal_tau * (x_seq[t] - v_v_seq[t] + v_rest + delta_T * expf((v_v_seq[t] - theta_rh) / delta_T)); + if (h_seq[t] >= v_threshold) { - const int t = index + mem_offset; - h_seq[t] = v_v_seq[t] + reciprocal_tau * (x_seq[t] - v_v_seq[t] + v_rest + delta_T * expf((v_v_seq[t] - theta_rh) / delta_T)); - if (h_seq[t] >= v_threshold) - { - spike_seq[t] = 1.0f; - ''' - - if hard_reset: - code += r''' - v_v_seq[t + dt] = v_reset; - ''' - else: - code += r''' - v_v_seq[t + dt] = h_seq[t] - v_threshold; - ''' + spike_seq[t] = 1.0f; + ''' + if hard_reset: code += r''' - } - else - { - spike_seq[t] = 0.0f; - v_v_seq[t + dt] = h_seq[t]; - } - - } - } - } - ''' - - elif dtype == 'fp16': - code = rf''' - #include - extern "C" __global__ - void {kernel_name}(const half2* x_seq, half2* v_v_seq, half2* h_seq, half2* spike_seq, - const half & reciprocal_tau, - const half & delta_T, - const half & theta_rh, - const half & v_threshold, - const half & v_rest, {'const half & v_reset,' if hard_reset else ''} - const int & neuron_num, const int & numel) + v_v_seq[t + dt] = v_reset; ''' - + else: code += r''' - { - const int index = blockIdx.x * blockDim.x + threadIdx.x; - const int stride = neuron_num >> 1; - if (index < stride) - { - const int numel_2 = numel >> 1; - const half2 reciprocal_tau_half2 = __half2half2(reciprocal_tau); - const half2 delta_T_half2 = __half2half2(delta_T); - const half2 theta_rh_half2 = __half2half2(theta_rh); - const half2 v_threshold_half2 = __half2half2(v_threshold); - const half2 v_rest_half2 = __half2half2(v_rest); + v_v_seq[t + dt] = h_seq[t] - v_threshold; ''' - if hard_reset: - code += r''' - const half2 v_reset_half2 = __half2half2(v_reset); - ''' - - code += r''' - for(int mem_offset = 0; mem_offset < numel_2; mem_offset += stride) + code += r''' + } + else { - const int t = index + mem_offset; - h_seq[t] = __hfma2(__hfma2(h2exp(__h2div(__hsub2(v_v_seq[t], theta_rh_half2), delta_T_half2)), delta_T_half2, __hadd2(__hsub2(x_seq[t], v_v_seq[t]), v_rest_half2)), reciprocal_tau_half2, v_v_seq[t]); + spike_seq[t] = 0.0f; + v_v_seq[t + dt] = h_seq[t]; + } - 
spike_seq[t] = __hgeu2(h_seq[t], v_threshold_half2); + } + } + } + ''' + + elif dtype == 'fp16': + code = rf''' + #include + extern "C" __global__ + void {kernel_name}(const half2* x_seq, half2* v_v_seq, half2* h_seq, half2* spike_seq, + const half & reciprocal_tau, + const half & delta_T, + const half & theta_rh, + const half & v_threshold, + const half & v_rest, {'const half & v_reset,' if hard_reset else ''} + const int & neuron_num, const int & numel) + ''' + + code += r''' + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + const int stride = neuron_num >> 1; + if (index < stride) + { + const int numel_2 = numel >> 1; + const half2 reciprocal_tau_half2 = __half2half2(reciprocal_tau); + const half2 delta_T_half2 = __half2half2(delta_T); + const half2 theta_rh_half2 = __half2half2(theta_rh); + const half2 v_threshold_half2 = __half2half2(v_threshold); + const half2 v_rest_half2 = __half2half2(v_rest); + ''' + + if hard_reset: + code += r''' + const half2 v_reset_half2 = __half2half2(v_reset); ''' - - if hard_reset: - code += r''' - v_v_seq[t + stride] = __hadd2(__hmul2(spike_seq[t], v_reset_half2), __hmul2(__hsub2(__float2half2_rn(1.0f), spike_seq[t]), h_seq[t])); - ''' - else: - code += r''' - v_v_seq[t + stride] = __hadd2(__hmul2(spike_seq[t], __hsub2(h_seq[t], v_threshold_half2)), __hmul2(__hsub2(__float2half2_rn(1.0f), spike_seq[t]), h_seq[t])); - ''' + code += r''' + for(int mem_offset = 0; mem_offset < numel_2; mem_offset += stride) + { + const int t = index + mem_offset; + h_seq[t] = __hfma2(__hfma2(h2exp(__h2div(__hsub2(v_v_seq[t], theta_rh_half2), delta_T_half2)), delta_T_half2, __hadd2(__hsub2(x_seq[t], v_v_seq[t]), v_rest_half2)), reciprocal_tau_half2, v_v_seq[t]); + + spike_seq[t] = __hgeu2(h_seq[t], v_threshold_half2); + ''' + + if hard_reset: code += r''' - } - } - } + v_v_seq[t + stride] = __hadd2(__hmul2(spike_seq[t], v_reset_half2), __hmul2(__hsub2(__float2half2_rn(1.0f), spike_seq[t]), h_seq[t])); ''' else: - raise TypeError + code += r''' + v_v_seq[t + stride] = __hadd2(__hmul2(spike_seq[t], __hsub2(h_seq[t], v_threshold_half2)), __hmul2(__hsub2(__float2half2_rn(1.0f), spike_seq[t]), h_seq[t])); + ''' - return cupy.RawKernel(code, kernel_name, options=cuda_compiler_options, backend=cuda_compiler_backend) + code += r''' + } + } + } + ''' + else: + raise TypeError - @staticmethod - def create_bptt_kernel(sg_cuda_code_fun, hard_reset: bool, detach_reset: bool, dtype: str): + return cupy.RawKernel(code, kernel_name, options=configure.cuda_compiler_options, backend=configure.cuda_compiler_backend) - kernel_name = f'EIFNode_bptt_{"hard" if hard_reset else "soft"}Reset_{"detachReset" if detach_reset else ""}_{dtype}' + @staticmethod + def create_bptt_kernel(sg_cuda_code_fun, hard_reset: bool, detach_reset: bool, dtype: str): - code_grad_s_to_h = sg_cuda_code_fun(x='over_th', y='grad_s_to_h', dtype=dtype) + kernel_name = f'EIFNode_bptt_{"hard" if hard_reset else "soft"}Reset_{"detachReset" if detach_reset else ""}_{dtype}' - if dtype == 'fp32': - code = fr''' - extern "C" __global__ - void {kernel_name}( - const float* grad_spike_seq, const float* grad_v_seq, const float* h_seq, const float* spike_seq, const float* v_v_seq, - float* grad_x_seq, float* grad_v_last, - const float & theta_rh, const float & reciprocal_delta_T, - const float & reciprocal_tau, const float & one_sub_reciprocal_tau, - const float & v_threshold, {'const float & v_reset,' if hard_reset else ''} - const int & neuron_num, const int & numel) - ''' + code_grad_s_to_h = 
sg_cuda_code_fun(x='over_th', y='grad_s_to_h', dtype=dtype) - code += r''' - { - const int index = blockIdx.x * blockDim.x + threadIdx.x; - if (index < neuron_num) - { - float grad_h = 0.0f; // grad_h will be used recursively - for(int mem_offset = numel - neuron_num; mem_offset >= 0; mem_offset -= neuron_num) - { - const int t = index + mem_offset; - const float over_th = h_seq[t] - v_threshold; - ''' - code += code_grad_s_to_h - if detach_reset: - if hard_reset: - code_grad_v_to_h = r''' - const float grad_v_to_h = 1.0f - spike_seq[t]; - ''' - else: - code_grad_v_to_h = r''' - const float grad_v_to_h = 1.0f; - ''' - else: - if hard_reset: - code_grad_v_to_h = r''' - const float grad_v_to_h = 1.0f - spike_seq[t] + (v_reset - h_seq[t]) * grad_s_to_h; - ''' - else: - code_grad_v_to_h = r''' - const float grad_v_to_h = 1.0f - v_threshold * grad_s_to_h; - ''' - - code += code_grad_v_to_h - code += r''' - grad_h = grad_spike_seq[t] * grad_s_to_h + (grad_v_seq[t] + grad_h * (one_sub_reciprocal_tau + reciprocal_tau * expf((v_v_seq[t + neuron_num] - theta_rh) * reciprocal_delta_T))) * grad_v_to_h; - grad_x_seq[t] = grad_h * reciprocal_tau; - } - grad_v_last[index] = grad_x_seq[index] * (one_sub_reciprocal_tau + reciprocal_tau * expf((v_v_seq[index] - theta_rh) * reciprocal_delta_T)); - } - } - ''' + if dtype == 'fp32': + code = fr''' + extern "C" __global__ + void {kernel_name}( + const float* grad_spike_seq, const float* grad_v_seq, const float* h_seq, const float* spike_seq, const float* v_v_seq, + float* grad_x_seq, float* grad_v_last, + const float & theta_rh, const float & reciprocal_delta_T, + const float & reciprocal_tau, const float & one_sub_reciprocal_tau, + const float & v_threshold, {'const float & v_reset,' if hard_reset else ''} + const int & neuron_num, const int & numel) + ''' - elif dtype == 'fp16': - code = fr''' - #include - extern "C" __global__ - void {kernel_name}( - const half2* grad_spike_seq, const half2* grad_v_seq, const half2* h_seq, const half2* spike_seq, const half2* v_v_seq, - half2* grad_x_seq, half2* grad_v_last, - const half & theta_rh, const half & reciprocal_delta_T, - const half & reciprocal_tau, const half & one_sub_reciprocal_tau, - const half & v_threshold, {'const half & v_reset,' if hard_reset else ''} - const int & neuron_num, const int & numel) - ''' - code += r''' - { + code += r''' + { const int index = blockIdx.x * blockDim.x + threadIdx.x; - const int stride = neuron_num >> 1; - if (index < stride) + if (index < neuron_num) { - const half2 reciprocal_tau_half2 = __half2half2(reciprocal_tau); - const half2 one_sub_reciprocal_tau_half2 = __half2half2(one_sub_reciprocal_tau); - const half2 reciprocal_delta_T_half2 = __half2half2(reciprocal_delta_T); - const half2 theta_rh_half2 = __half2half2(theta_rh); - const half2 v_threshold_half2 = __half2half2(v_threshold); - ''' - - if hard_reset: - code += r''' - const half2 v_reset_half2 = __half2half2(v_reset); - ''' - - code += r''' - half2 grad_h = __float2half2_rn(0.0f); // grad_h will be used recursively - for(int mem_offset = (numel >> 1) - stride; mem_offset >= 0; mem_offset -= stride) + float grad_h = 0.0f; // grad_h will be used recursively + for(int mem_offset = numel - neuron_num; mem_offset >= 0; mem_offset -= neuron_num) { const int t = index + mem_offset; - - const half2 over_th = __hsub2(h_seq[t], v_threshold_half2); - ''' - code += code_grad_s_to_h - - if detach_reset: - if hard_reset: - code_grad_v_to_h = r''' - const half2 grad_v_to_h = __hsub2(__float2half2_rn(1.0f), spike_seq[t]); - ''' - 
else: - code_grad_v_to_h = r''' - const half2 grad_v_to_h = __float2half2_rn(1.0f); - ''' + const float over_th = h_seq[t] - v_threshold; + ''' + code += code_grad_s_to_h + if detach_reset: + if hard_reset: + code_grad_v_to_h = r''' + const float grad_v_to_h = 1.0f - spike_seq[t]; + ''' else: - if hard_reset: - code_grad_v_to_h = r''' - const half2 grad_v_to_h = __hfma2(__hsub2(v_reset_half2, h_seq[t]), grad_s_to_h, __hsub2(__float2half2_rn(1.0f), spike_seq[t])); - ''' - else: - code_grad_v_to_h = r''' - const half2 grad_v_to_h = __hsub2(__float2half2_rn(1.0f), __hmul2(v_threshold_half2, grad_s_to_h)); - ''' - - code += code_grad_v_to_h - code += r''' - grad_h = __hfma2(__hfma2(__hfma2(h2exp(__hmul2(__hsub2(v_v_seq[t + stride], theta_rh_half2), reciprocal_delta_T_half2)), reciprocal_tau_half2, one_sub_reciprocal_tau_half2), grad_h, grad_v_seq[t]), grad_v_to_h, __hmul2(grad_spike_seq[t], grad_s_to_h)); - grad_x_seq[t] = __hmul2(grad_h, reciprocal_tau_half2); - } - grad_v_last[index] = __hmul2(__hfma2(h2exp(__hmul2(__hsub2(v_v_seq[index], theta_rh_half2), reciprocal_delta_T_half2)), reciprocal_tau_half2, one_sub_reciprocal_tau_half2), grad_x_seq[index]); - } - } - ''' - else: - raise TypeError - return cupy.RawKernel(code, kernel_name, options=cuda_compiler_options, backend=cuda_compiler_backend) - - @staticmethod - def forward(ctx, x_seq: torch.Tensor, v_last: torch.Tensor, tau: float, v_threshold: float, v_reset: float, v_rest: float, theta_rh: float, delta_T: float, detach_reset: bool, sg_cuda_code_fun): - requires_grad = x_seq.requires_grad or v_last.requires_grad - device = x_seq.get_device() - if x_seq.dtype == torch.float32: - dtype = 'fp32' - cp_dtype = np.float32 - elif x_seq.dtype == torch.float16: - dtype = 'fp16' - cp_dtype = np.half + code_grad_v_to_h = r''' + const float grad_v_to_h = 1.0f; + ''' else: - raise NotImplementedError - - use_pad = False - if dtype == 'fp16' and v_last.numel() % 2 != 0: - # only fp16 needs even numel because we use half2 to accelerate - # when numel is odd, we will pad x_seq - use_pad = True - x_seq = F.pad(x_seq, (0, 1)) # [T, N] -> [T, N + 1] - v_last = F.pad(v_last, (0, 1)) # [N] -> [N + 1] + if hard_reset: + code_grad_v_to_h = r''' + const float grad_v_to_h = 1.0f - spike_seq[t] + (v_reset - h_seq[t]) * grad_s_to_h; + ''' + else: + code_grad_v_to_h = r''' + const float grad_v_to_h = 1.0f - v_threshold * grad_s_to_h; + ''' - v_seq = torch.zeros_like(x_seq.data) - h_seq = torch.zeros_like(x_seq.data) - spike_seq = torch.zeros_like(x_seq.data) + code += code_grad_v_to_h + code += r''' + grad_h = grad_spike_seq[t] * grad_s_to_h + (grad_v_seq[t] + grad_h * (one_sub_reciprocal_tau + reciprocal_tau * expf((v_v_seq[t + neuron_num] - theta_rh) * reciprocal_delta_T))) * grad_v_to_h; + grad_x_seq[t] = grad_h * reciprocal_tau; + } + grad_v_last[index] = grad_x_seq[index] * (one_sub_reciprocal_tau + reciprocal_tau * expf((v_v_seq[index] - theta_rh) * reciprocal_delta_T)); + } + } + ''' + + elif dtype == 'fp16': + code = fr''' + #include + extern "C" __global__ + void {kernel_name}( + const half2* grad_spike_seq, const half2* grad_v_seq, const half2* h_seq, const half2* spike_seq, const half2* v_v_seq, + half2* grad_x_seq, half2* grad_v_last, + const half & theta_rh, const half & reciprocal_delta_T, + const half & reciprocal_tau, const half & one_sub_reciprocal_tau, + const half & v_threshold, {'const half & v_reset,' if hard_reset else ''} + const int & neuron_num, const int & numel) + ''' + code += r''' + { + const int index = blockIdx.x * blockDim.x + 
threadIdx.x; + const int stride = neuron_num >> 1; + if (index < stride) + { + const half2 reciprocal_tau_half2 = __half2half2(reciprocal_tau); + const half2 one_sub_reciprocal_tau_half2 = __half2half2(one_sub_reciprocal_tau); + const half2 reciprocal_delta_T_half2 = __half2half2(reciprocal_delta_T); + const half2 theta_rh_half2 = __half2half2(theta_rh); + const half2 v_threshold_half2 = __half2half2(v_threshold); + ''' + + if hard_reset: + code += r''' + const half2 v_reset_half2 = __half2half2(v_reset); + ''' - v_v_seq = torch.cat((v_last.unsqueeze(0), v_seq)) + code += r''' + half2 grad_h = __float2half2_rn(0.0f); // grad_h will be used recursively + for(int mem_offset = (numel >> 1) - stride; mem_offset >= 0; mem_offset -= stride) + { + const int t = index + mem_offset; - with cupy.cuda.Device(device): - numel = x_seq.numel() - neuron_num = numel // x_seq.shape[0] + const half2 over_th = __hsub2(h_seq[t], v_threshold_half2); + ''' + code += code_grad_s_to_h - threads = cuda_threads - if dtype == 'fp16': - assert neuron_num % 2 == 0 - blocks = cu_kernel_opt.cal_blocks(neuron_num >> 1) - # we will take two neurons to calculate as one neuron in cuda half2 + if detach_reset: + if hard_reset: + code_grad_v_to_h = r''' + const half2 grad_v_to_h = __hsub2(__float2half2_rn(1.0f), spike_seq[t]); + ''' else: - blocks = cu_kernel_opt.cal_blocks(neuron_num) - - cp_numel = cupy.asarray(numel) - cp_neuron_num = cupy.asarray(neuron_num) - cp_v_threshold = cupy.asarray(v_threshold, dtype=cp_dtype) - cp_v_rest = cupy.asarray(v_rest, dtype=cp_dtype) - cp_theta_rh = cupy.asarray(theta_rh, dtype=cp_dtype) - cp_delta_T = cupy.asarray(delta_T, dtype=cp_dtype) - cp_reciprocal_delta_T = cupy.asarray(1. / delta_T, dtype=cp_dtype) - cp_reciprocal_tau = cupy.asarray(1./tau, dtype=cp_dtype) - cp_one_sub_reciprocal_tau = cupy.asarray(1. 
- 1./tau, dtype=cp_dtype) - - if v_reset is None: - cp_v_reset = None - hard_reset = False - x_seq, v_v_seq, h_seq, spike_seq, cp_reciprocal_tau, cp_delta_T, cp_theta_rh, cp_v_threshold, cp_v_rest, cp_neuron_num, cp_numel = cu_kernel_opt.get_contiguous(x_seq, v_v_seq, h_seq, spike_seq, cp_reciprocal_tau, cp_delta_T, cp_theta_rh, cp_v_threshold, cp_v_rest, cp_neuron_num, cp_numel) - kernel_args = [x_seq, v_v_seq, h_seq, spike_seq, cp_reciprocal_tau, cp_delta_T, cp_theta_rh, cp_v_threshold, cp_v_rest, cp_neuron_num, cp_numel] + code_grad_v_to_h = r''' + const half2 grad_v_to_h = __float2half2_rn(1.0f); + ''' + else: + if hard_reset: + code_grad_v_to_h = r''' + const half2 grad_v_to_h = __hfma2(__hsub2(v_reset_half2, h_seq[t]), grad_s_to_h, __hsub2(__float2half2_rn(1.0f), spike_seq[t])); + ''' else: - cp_v_reset = cupy.asarray(v_reset, dtype=cp_dtype) - hard_reset = True - x_seq, v_v_seq, h_seq, spike_seq, cp_reciprocal_tau, cp_delta_T, cp_theta_rh, cp_v_threshold, cp_v_rest, cp_v_reset, cp_neuron_num, cp_numel = cu_kernel_opt.get_contiguous(x_seq, v_v_seq, h_seq, spike_seq, cp_reciprocal_tau, cp_delta_T, cp_theta_rh, cp_v_threshold, cp_v_rest, cp_v_reset, cp_neuron_num, cp_numel) - kernel_args = [x_seq, v_v_seq, h_seq, spike_seq, cp_reciprocal_tau, cp_delta_T, cp_theta_rh, cp_v_threshold, cp_v_rest, cp_v_reset, cp_neuron_num, cp_numel] - - kernel = MultiStepEIFNodePTT.create_fptt_kernel(hard_reset, dtype) - - - kernel( - (blocks,), (threads,), - cu_kernel_opt.wrap_args_to_raw_kernel( - device, - *kernel_args - ) - ) + code_grad_v_to_h = r''' + const half2 grad_v_to_h = __hsub2(__float2half2_rn(1.0f), __hmul2(v_threshold_half2, grad_s_to_h)); + ''' - if requires_grad: - ctx.use_pad = use_pad - ctx.save_for_backward(h_seq, spike_seq, v_v_seq) - ctx.blocks = blocks - ctx.threads = threads - ctx.cp_numel = cp_numel - ctx.cp_neuron_num = cp_neuron_num - ctx.cp_reciprocal_tau = cp_reciprocal_tau - ctx.cp_one_sub_reciprocal_tau = cp_one_sub_reciprocal_tau - ctx.cp_theta_rh = cp_theta_rh - ctx.cp_reciprocal_delta_T = cp_reciprocal_delta_T - ctx.cp_v_threshold = cp_v_threshold - ctx.cp_v_reset = cp_v_reset - ctx.detach_reset = detach_reset - ctx.sg_cuda_code_fun = sg_cuda_code_fun - - if use_pad: - return spike_seq[..., :-1], v_v_seq[1:, ..., :-1] + code += code_grad_v_to_h + code += r''' + grad_h = __hfma2(__hfma2(__hfma2(h2exp(__hmul2(__hsub2(v_v_seq[t + stride], theta_rh_half2), reciprocal_delta_T_half2)), reciprocal_tau_half2, one_sub_reciprocal_tau_half2), grad_h, grad_v_seq[t]), grad_v_to_h, __hmul2(grad_spike_seq[t], grad_s_to_h)); + grad_x_seq[t] = __hmul2(grad_h, reciprocal_tau_half2); + } + grad_v_last[index] = __hmul2(__hfma2(h2exp(__hmul2(__hsub2(v_v_seq[index], theta_rh_half2), reciprocal_delta_T_half2)), reciprocal_tau_half2, one_sub_reciprocal_tau_half2), grad_x_seq[index]); + } + } + ''' + else: + raise TypeError + return cupy.RawKernel(code, kernel_name, options=configure.cuda_compiler_options, backend=configure.cuda_compiler_backend) + + @staticmethod + def forward(ctx, x_seq: torch.Tensor, v_last: torch.Tensor, tau: float, v_threshold: float, v_reset: float, v_rest: float, theta_rh: float, delta_T: float, detach_reset: bool, sg_cuda_code_fun): + requires_grad = x_seq.requires_grad or v_last.requires_grad + device = x_seq.get_device() + if x_seq.dtype == torch.float32: + dtype = 'fp32' + cp_dtype = np.float32 + elif x_seq.dtype == torch.float16: + dtype = 'fp16' + cp_dtype = np.half + else: + raise NotImplementedError + + use_pad = False + if dtype == 'fp16' and v_last.numel() % 2 
!= 0: + # only fp16 needs even numel because we use half2 to accelerate + # when numel is odd, we will pad x_seq + use_pad = True + x_seq = F.pad(x_seq, (0, 1)) # [T, N] -> [T, N + 1] + v_last = F.pad(v_last, (0, 1)) # [N] -> [N + 1] + + zero_shape = list(x_seq.shape) + zero_shape[0] *= 3 + v_seq, h_seq, spike_seq = torch.split(torch.zeros(zero_shape, device=x_seq.device, dtype=x_seq.dtype), x_seq.shape[0]) + + v_v_seq = torch.cat((v_last.unsqueeze(0), v_seq)) + + with cu_kernel_opt.DeviceEnvironment(device): + numel = x_seq.numel() + neuron_num = numel // x_seq.shape[0] + + threads = configure.cuda_threads + if dtype == 'fp16': + assert neuron_num % 2 == 0 + blocks = cu_kernel_opt.cal_blocks(neuron_num >> 1) + # we will take two neurons to calculate as one neuron in cuda half2 else: - return spike_seq, v_v_seq[1:, ] - - @staticmethod - def backward(ctx, grad_spike_seq, grad_v_seq): - if ctx.use_pad: - # grad_spike_seq.shape = [T, N] - # grad_v_seq.shape = [T, N] - # h_seq.shape = [T, N + 1] - # spike_seq.shape = [T, N + 1] - grad_spike_seq = F.pad(grad_spike_seq, (0, 1)) - grad_v_seq = F.pad(grad_v_seq, (0, 1)) - - device = grad_spike_seq.get_device() - h_seq, spike_seq, v_v_seq = ctx.saved_tensors - grad_x_seq = torch.zeros_like(grad_spike_seq) - grad_v_last = torch.zeros_like(grad_spike_seq[0]) - - if ctx.cp_v_reset is None: + blocks = cu_kernel_opt.cal_blocks(neuron_num) + + cp_numel = cupy.asarray(numel) + cp_neuron_num = cupy.asarray(neuron_num) + cp_v_threshold = cupy.asarray(v_threshold, dtype=cp_dtype) + cp_v_rest = cupy.asarray(v_rest, dtype=cp_dtype) + cp_theta_rh = cupy.asarray(theta_rh, dtype=cp_dtype) + cp_delta_T = cupy.asarray(delta_T, dtype=cp_dtype) + cp_reciprocal_delta_T = cupy.asarray(1. / delta_T, dtype=cp_dtype) + cp_reciprocal_tau = cupy.asarray(1./tau, dtype=cp_dtype) + cp_one_sub_reciprocal_tau = cupy.asarray(1. 
- 1./tau, dtype=cp_dtype) + + if v_reset is None: + cp_v_reset = None hard_reset = False + x_seq, v_v_seq, h_seq, spike_seq, cp_reciprocal_tau, cp_delta_T, cp_theta_rh, cp_v_threshold, cp_v_rest, cp_neuron_num, cp_numel = cu_kernel_opt.get_contiguous(x_seq, v_v_seq, h_seq, spike_seq, cp_reciprocal_tau, cp_delta_T, cp_theta_rh, cp_v_threshold, cp_v_rest, cp_neuron_num, cp_numel) + kernel_args = [x_seq, v_v_seq, h_seq, spike_seq, cp_reciprocal_tau, cp_delta_T, cp_theta_rh, cp_v_threshold, cp_v_rest, cp_neuron_num, cp_numel] else: + cp_v_reset = cupy.asarray(v_reset, dtype=cp_dtype) hard_reset = True + x_seq, v_v_seq, h_seq, spike_seq, cp_reciprocal_tau, cp_delta_T, cp_theta_rh, cp_v_threshold, cp_v_rest, cp_v_reset, cp_neuron_num, cp_numel = cu_kernel_opt.get_contiguous(x_seq, v_v_seq, h_seq, spike_seq, cp_reciprocal_tau, cp_delta_T, cp_theta_rh, cp_v_threshold, cp_v_rest, cp_v_reset, cp_neuron_num, cp_numel) + kernel_args = [x_seq, v_v_seq, h_seq, spike_seq, cp_reciprocal_tau, cp_delta_T, cp_theta_rh, cp_v_threshold, cp_v_rest, cp_v_reset, cp_neuron_num, cp_numel] - if grad_spike_seq.dtype == torch.float32: - dtype = 'fp32' - elif grad_spike_seq.dtype == torch.float16: - dtype = 'fp16' - else: - raise NotImplementedError - - kernel = MultiStepEIFNodePTT.create_bptt_kernel(ctx.sg_cuda_code_fun, hard_reset, ctx.detach_reset, dtype) + kernel = MultiStepEIFNodePTT.create_fptt_kernel(hard_reset, dtype) - with cupy.cuda.Device(device): - if hard_reset: - grad_spike_seq, grad_v_seq, h_seq, spike_seq, v_v_seq, grad_x_seq, grad_v_last, ctx.cp_theta_rh, ctx.cp_reciprocal_delta_T, ctx.cp_reciprocal_tau, ctx.cp_one_sub_reciprocal_tau, ctx.cp_v_threshold, ctx.cp_v_reset, ctx.cp_neuron_num, ctx.cp_numel = cu_kernel_opt.get_contiguous(grad_spike_seq, grad_v_seq, h_seq, spike_seq, v_v_seq, grad_x_seq, grad_v_last, ctx.cp_theta_rh, ctx.cp_reciprocal_delta_T, ctx.cp_reciprocal_tau, ctx.cp_one_sub_reciprocal_tau, ctx.cp_v_threshold, ctx.cp_v_reset, ctx.cp_neuron_num, ctx.cp_numel) - kernel_args = [grad_spike_seq, grad_v_seq, h_seq, spike_seq, v_v_seq, grad_x_seq, grad_v_last, ctx.cp_theta_rh, ctx.cp_reciprocal_delta_T, ctx.cp_reciprocal_tau, ctx.cp_one_sub_reciprocal_tau, ctx.cp_v_threshold, ctx.cp_v_reset, ctx.cp_neuron_num, ctx.cp_numel] - else: - grad_spike_seq, grad_v_seq, h_seq, spike_seq, v_v_seq, grad_x_seq, grad_v_last, ctx.cp_theta_rh, ctx.cp_reciprocal_delta_T, ctx.cp_reciprocal_tau, ctx.cp_one_sub_reciprocal_tau, ctx.cp_v_threshold, ctx.cp_neuron_num, ctx.cp_numel = cu_kernel_opt.get_contiguous(grad_spike_seq, grad_v_seq, h_seq, spike_seq, v_v_seq, grad_x_seq, grad_v_last, ctx.cp_theta_rh, ctx.cp_reciprocal_delta_T, ctx.cp_reciprocal_tau, ctx.cp_one_sub_reciprocal_tau, ctx.cp_v_threshold, ctx.cp_neuron_num, ctx.cp_numel) - kernel_args = [grad_spike_seq, grad_v_seq, h_seq, spike_seq, v_v_seq, grad_x_seq, grad_v_last, ctx.cp_theta_rh, ctx.cp_reciprocal_delta_T, ctx.cp_reciprocal_tau, ctx.cp_one_sub_reciprocal_tau, ctx.cp_v_threshold, ctx.cp_neuron_num, ctx.cp_numel] - - kernel( - (ctx.blocks,), (ctx.threads,), - cu_kernel_opt.wrap_args_to_raw_kernel( - device, - *kernel_args - ) + kernel( + (blocks,), (threads,), + cu_kernel_opt.wrap_args_to_raw_kernel( + device, + *kernel_args ) - if ctx.use_pad: - return grad_x_seq[..., :-1], grad_v_last[..., :-1], None, None, None, None, None, None, None, None + ) + + if requires_grad: + ctx.use_pad = use_pad + if configure.save_spike_as_bool_in_neuron_kernel: + ctx.s_shape = spike_seq.shape + ctx.s_tk = tensor_cache.BOOL_TENSOR_CACHE.store_bool(spike_seq) + 
ctx.save_for_backward(h_seq, v_v_seq) else: - return grad_x_seq, grad_v_last, None, None, None, None, None, None, None, None - - - - - - -except ImportError: - pass - - - + ctx.save_for_backward(h_seq, spike_seq, v_v_seq) + ctx.blocks = blocks + ctx.threads = threads + ctx.cp_numel = cp_numel + ctx.cp_neuron_num = cp_neuron_num + ctx.cp_reciprocal_tau = cp_reciprocal_tau + ctx.cp_one_sub_reciprocal_tau = cp_one_sub_reciprocal_tau + ctx.cp_theta_rh = cp_theta_rh + ctx.cp_reciprocal_delta_T = cp_reciprocal_delta_T + ctx.cp_v_threshold = cp_v_threshold + ctx.cp_v_reset = cp_v_reset + ctx.detach_reset = detach_reset + ctx.sg_cuda_code_fun = sg_cuda_code_fun + + if use_pad: + return spike_seq[..., :-1], v_v_seq[1:, ..., :-1] + else: + return spike_seq, v_v_seq[1:, ] + + @staticmethod + def backward(ctx, grad_spike_seq, grad_v_seq): + if ctx.use_pad: + # grad_spike_seq.shape = [T, N] + # grad_v_seq.shape = [T, N] + # h_seq.shape = [T, N + 1] + # spike_seq.shape = [T, N + 1] + grad_spike_seq = F.pad(grad_spike_seq, (0, 1)) + grad_v_seq = F.pad(grad_v_seq, (0, 1)) + + device = grad_spike_seq.get_device() + if configure.save_spike_as_bool_in_neuron_kernel: + spike_seq = tensor_cache.BOOL_TENSOR_CACHE.get_float(ctx.s_tk, ctx.s_shape) + h_seq, v_v_seq = ctx.saved_tensors + else: + h_seq, spike_seq, v_v_seq = ctx.saved_tensors + zero_shape = list(grad_spike_seq.shape) + zero_shape[0] += 1 + zero_data = torch.zeros(zero_shape, device=grad_spike_seq.device, dtype=grad_spike_seq.dtype) + grad_x_seq = zero_data[0: -1] + grad_v_last = zero_data[-1] + + if ctx.cp_v_reset is None: + hard_reset = False + else: + hard_reset = True + + if grad_spike_seq.dtype == torch.float32: + dtype = 'fp32' + elif grad_spike_seq.dtype == torch.float16: + dtype = 'fp16' + else: + raise NotImplementedError + + kernel = MultiStepEIFNodePTT.create_bptt_kernel(ctx.sg_cuda_code_fun, hard_reset, ctx.detach_reset, dtype) + + with cu_kernel_opt.DeviceEnvironment(device): + + if hard_reset: + grad_spike_seq, grad_v_seq, h_seq, spike_seq, v_v_seq, grad_x_seq, grad_v_last, ctx.cp_theta_rh, ctx.cp_reciprocal_delta_T, ctx.cp_reciprocal_tau, ctx.cp_one_sub_reciprocal_tau, ctx.cp_v_threshold, ctx.cp_v_reset, ctx.cp_neuron_num, ctx.cp_numel = cu_kernel_opt.get_contiguous(grad_spike_seq, grad_v_seq, h_seq, spike_seq, v_v_seq, grad_x_seq, grad_v_last, ctx.cp_theta_rh, ctx.cp_reciprocal_delta_T, ctx.cp_reciprocal_tau, ctx.cp_one_sub_reciprocal_tau, ctx.cp_v_threshold, ctx.cp_v_reset, ctx.cp_neuron_num, ctx.cp_numel) + kernel_args = [grad_spike_seq, grad_v_seq, h_seq, spike_seq, v_v_seq, grad_x_seq, grad_v_last, ctx.cp_theta_rh, ctx.cp_reciprocal_delta_T, ctx.cp_reciprocal_tau, ctx.cp_one_sub_reciprocal_tau, ctx.cp_v_threshold, ctx.cp_v_reset, ctx.cp_neuron_num, ctx.cp_numel] + else: + grad_spike_seq, grad_v_seq, h_seq, spike_seq, v_v_seq, grad_x_seq, grad_v_last, ctx.cp_theta_rh, ctx.cp_reciprocal_delta_T, ctx.cp_reciprocal_tau, ctx.cp_one_sub_reciprocal_tau, ctx.cp_v_threshold, ctx.cp_neuron_num, ctx.cp_numel = cu_kernel_opt.get_contiguous(grad_spike_seq, grad_v_seq, h_seq, spike_seq, v_v_seq, grad_x_seq, grad_v_last, ctx.cp_theta_rh, ctx.cp_reciprocal_delta_T, ctx.cp_reciprocal_tau, ctx.cp_one_sub_reciprocal_tau, ctx.cp_v_threshold, ctx.cp_neuron_num, ctx.cp_numel) + kernel_args = [grad_spike_seq, grad_v_seq, h_seq, spike_seq, v_v_seq, grad_x_seq, grad_v_last, ctx.cp_theta_rh, ctx.cp_reciprocal_delta_T, ctx.cp_reciprocal_tau, ctx.cp_one_sub_reciprocal_tau, ctx.cp_v_threshold, ctx.cp_neuron_num, ctx.cp_numel] + + kernel( + (ctx.blocks,), 
(ctx.threads,), + cu_kernel_opt.wrap_args_to_raw_kernel( + device, + *kernel_args + ) + ) + if ctx.use_pad: + return grad_x_seq[..., :-1], grad_v_last[..., :-1], None, None, None, None, None, None, None, None + else: + return grad_x_seq, grad_v_last, None, None, None, None, None, None, None, None + + +def save_cuda_codes(cu_file_path: str = './spikingjelly/clock_driven/neuron_kernel.cu'): + # save all cuda codes to files + with open(cu_file_path, 'w+') as cu_file: + cu_file.write('// This file is created by spikingjelly.clock_driven.neuron_kernel.save_cuda_codes.\n') + cu_file.write('// Note that codes in this file will not be executed This file is just created for reading.\n') + for ms_neu in [MultiStepIFNodePTT, MultiStepLIFNodePTT, MultiStepParametricLIFNodePTT, MultiStepEIFNodePTT]: + cu_file.write('\n// ' + ms_neu.__name__ + '\n') + for sg in surrogate._has_cuda_: + for hard_reset in [True, False]: + for dtype in ['fp32', 'fp16']: + if ms_neu == MultiStepLIFNodePTT or ms_neu == MultiStepParametricLIFNodePTT: + for decay_input in [True, False]: + cu_file.write( + f'\n// {ms_neu.__name__} fptt {sg.__name__}, decay_input={decay_input}, hard_reset={hard_reset}, dtype={dtype}\n') + fp_codes = ms_neu.create_fptt_kernel(decay_input, hard_reset, dtype).code + cu_file.write(fp_codes) + for detach_reset in [True, False]: + cu_file.write( + f'\n// {ms_neu.__name__} bptt {sg.__name__}, decay_input={decay_input}, hard_reset={hard_reset}, dtype={dtype}, detach_reset={detach_reset}\n') + bp_codes = ms_neu.create_bptt_kernel(sg().cuda_code, decay_input, hard_reset, detach_reset, + dtype).code + cu_file.write(bp_codes) + else: + cu_file.write( + f'\n// {ms_neu.__name__} fptt {sg.__name__}, hard_reset={hard_reset}, dtype={dtype}\n') + fp_codes = ms_neu.create_fptt_kernel(hard_reset, dtype).code + cu_file.write(fp_codes) + for detach_reset in [True, False]: + cu_file.write( + f'\n// {ms_neu.__name__} bptt {sg.__name__}, hard_reset={hard_reset}, dtype={dtype}, detach_reset={detach_reset}\n') + bp_codes = ms_neu.create_bptt_kernel(sg().cuda_code, hard_reset, detach_reset, + dtype).code + cu_file.write(bp_codes) \ No newline at end of file diff --git a/spikingjelly/clock_driven/spike_op.py b/spikingjelly/clock_driven/spike_op.py new file mode 100644 index 0000000..ebd74af --- /dev/null +++ b/spikingjelly/clock_driven/spike_op.py @@ -0,0 +1,506 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch.utils.cpp_extension import load_inline +from torch.cuda.amp import custom_fwd, custom_bwd +import logging +from . import tensor_cache + +from torch import Tensor +from typing import Optional, Union +from torch.types import _int, _size +from torch.nn.modules.utils import _single, _pair, _triple + +try: + import cupy +except BaseException as e: + logging.info(f'spikingjelly.clock_driven.spike_op: {e}') + cupy = None + + +try: + logging.info('spikingjelly.clock_driven.spike_op: try to use `torch.utils.cpp_extension.load_inline` to load cudnn functions.') + logging.info(f'If it is hanging, pleast try to delete torch_extensions cache directory. 
(In most cases, the directory is {torch.utils.cpp_extension._get_build_directory("", False)}.)') + cpp_wrapper = load_inline( + name='cpp_wrapper', + cpp_sources='using namespace at;', + functions=[ + 'cudnn_convolution_backward', + 'cudnn_convolution_backward_input', + 'cudnn_convolution_backward_weight' + ], + with_cuda=True + ) +except BaseException as e: + logging.info(f'spikingjelly.clock_driven.spike_op: {e}') + cpp_wrapper = None + +''' +aten/src/ATen/native/cudnn/ConvPlaceholders.cpp + +at::Tensor cudnn_convolution( + const at::Tensor& input, const at::Tensor& weight, + IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, + int64_t groups, bool benchmark, bool deterministic, bool allow_tf32) + +There are two overloaded C++ methods `cudnn_convolution`. So, we need to use an alternative syntax to cast the overloaded function. +Refer to https://pybind11.readthedocs.io/en/stable/classes.html#overloaded-methods and https://github.com/pytorch/pytorch/issues/39518 for more details. + +aten/src/ATen/native/cudnn/ConvShared.cpp + +Tensor cudnn_convolution_forward( + CheckedFrom c, + const TensorArg& input, const TensorArg& weight, + IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, + bool benchmark, bool deterministic, bool allow_tf32) + +aten/src/ATen/native/cudnn/ConvPlaceholders.cpp + +std::tuple cudnn_convolution_backward( + const at::Tensor& input, const at::Tensor& grad_output, const at::Tensor& weight, + IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, + bool benchmark, bool deterministic, bool allow_tf32, std::array output_mask) + +aten/src/ATen/native/cudnn/ConvShared.cpp + +at::Tensor cudnn_convolution_backward_input( + IntArrayRef input_size, const at::Tensor& grad_output, const at::Tensor& weight, + IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, + bool benchmark, bool deterministic, bool allow_tf32) + +aten/src/ATen/native/cudnn/ConvShared.cpp + +at::Tensor cudnn_convolution_backward_weight( + IntArrayRef weight_size, const at::Tensor& grad_output, const at::Tensor& input, + IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, + bool benchmark, bool deterministic, bool allow_tf32) +''' + +class spikeConvolution(torch.autograd.Function): + # Pytorch only provides cudnn_convolution without bias. + # Refer to https://github.com/pytorch/pytorch/issues/3823 for more details. 
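+    # Forward simply calls the ordinary F.conv1d/conv2d/conv3d. When the weight needs a
+    # gradient, the 0/1 spike input is cached in a compact bool form via tensor_cache
+    # instead of keeping the float tensor, which is where the training-time memory saving
+    # comes from. Backward expands the cached spikes back to float and feeds them to the
+    # cudnn backward wrappers; the bias gradient is computed by hand (a sum of grad_output
+    # over all non-channel dimensions) because the cudnn functions above do not cover bias.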
+ @staticmethod + @custom_fwd + def forward(ctx, spike, weight, bias, stride, padding, dilation, groups): + if ctx.needs_input_grad[0] or ctx.needs_input_grad[1] or ctx.needs_input_grad[2]: + if ctx.needs_input_grad[1]: + ctx.s_shape = spike.shape + ctx.s_tk = tensor_cache.BOOL_TENSOR_CACHE.store_bool(spike) + + if ctx.needs_input_grad[0]: + ctx.save_for_backward(weight) + + ctx.padding = padding + ctx.stride = stride + ctx.dilation = dilation + ctx.groups = groups + ctx.weight_shape = weight.shape + + if spike.dim() == 3: + return F.conv1d(input=spike, weight=weight, bias=bias, stride=stride, padding=padding, dilation=dilation, groups=groups) + elif spike.dim() == 4: + return F.conv2d(input=spike, weight=weight, bias=bias, stride=stride, padding=padding, dilation=dilation, groups=groups) + elif spike.dim() == 5: + return F.conv3d(input=spike, weight=weight, bias=bias, stride=stride, padding=padding, dilation=dilation, groups=groups) + + + + @staticmethod + @custom_bwd + def backward(ctx, grad_output): + grad_spike = None + grad_weight = None + grad_bias = None + if ctx.needs_input_grad[0] and ctx.needs_input_grad[1]: + weight = ctx.saved_tensors[0] + spike = tensor_cache.BOOL_TENSOR_CACHE.get_float(ctx.s_tk, ctx.s_shape) + weight = weight.to(grad_output.dtype) + grad_spike, grad_weight = cpp_wrapper.cudnn_convolution_backward(spike, grad_output, weight, ctx.padding, + ctx.stride, ctx.dilation, ctx.groups, + torch.backends.cudnn.benchmark, + torch.backends.cudnn.deterministic, + torch.backends.cudnn.allow_tf32, ( + True, + True)) + + elif not ctx.needs_input_grad[0] and ctx.needs_input_grad[1]: + spike = tensor_cache.BOOL_TENSOR_CACHE.get_float(ctx.s_tk, ctx.s_shape) + grad_weight = cpp_wrapper.cudnn_convolution_backward_weight(ctx.weight_shape, grad_output, spike, ctx.padding, + ctx.stride, ctx.dilation, ctx.groups, + torch.backends.cudnn.benchmark, + torch.backends.cudnn.deterministic, + torch.backends.cudnn.allow_tf32) + + elif ctx.needs_input_grad[0] and not ctx.needs_input_grad[1]: + weight = ctx.saved_tensors[0] + weight = weight.to(grad_output.dtype) + grad_spike = cpp_wrapper.cudnn_convolution_backward_input(ctx.spike_shape, grad_output, weight, ctx.padding, + ctx.stride, ctx.dilation, ctx.groups, + torch.backends.cudnn.benchmark, + torch.backends.cudnn.deterministic, + torch.backends.cudnn.allow_tf32) + + if ctx.needs_input_grad[2]: + # grad_output.shape = [N, C, *] + out_channels = grad_output.shape[1] + grad_bias = grad_output.transpose(0, 1).reshape(out_channels, -1).sum(1) + return grad_spike, grad_weight, grad_bias, None, None, None, None + +class spikeLinear(torch.autograd.Function): + @staticmethod + @custom_fwd + def forward(ctx, spike, weight, bias=None): + # spike.shape = [N, *, in_features] + # weight.shape = [out_features, in_features] + # bias.shape = [out_features] + if ctx.needs_input_grad[0] or ctx.needs_input_grad[1] or ctx.needs_input_grad[2]: + if ctx.needs_input_grad[1]: + ctx.s_shape = spike.shape + ctx.s_tk = tensor_cache.BOOL_TENSOR_CACHE.store_bool(spike) + if ctx.needs_input_grad[1]: + ctx.save_for_backward(weight) + return F.linear(spike, weight, bias) + + @staticmethod + @custom_bwd + def backward(ctx, grad_output): + # grad_output.shape = [N, *, out_features] + if ctx.needs_input_grad[1]: + weight = ctx.saved_tensors[0] + if ctx.needs_input_grad[0]: + spike = tensor_cache.BOOL_TENSOR_CACHE.get_float(ctx.s_tk, ctx.s_shape) + + grad_spike = grad_weight = grad_bias = None + + if ctx.needs_input_grad[0]: + grad_spike = F.linear(grad_output, weight.t(), 
bias=None) + if ctx.needs_input_grad[1]: + in_features = spike.shape[-1] + out_features = grad_output.shape[-1] + # grad_output.reshape(-1, out_features).t().shape = [out_features, N*] + # spike.reshape(-1, in_features).shape = [N*, in_features] + grad_weight = torch.mm(grad_output.reshape(-1, out_features).t(), spike.reshape(-1, in_features).to(grad_output.dtype)) + if ctx.needs_input_grad[2]: + out_features = grad_output.shape[-1] + grad_bias = grad_output.reshape(-1, out_features).sum(0) + return grad_spike, grad_weight, grad_bias + +def spike_linear(spike: Tensor, weight: Tensor, bias: Optional[Tensor] = None) -> Tensor: + """ + * :ref:`API in English ` + + .. _spike_linear-cn: + + :class:`torch.nn.functional.linear` 在输入为脉冲时的特例。 + + .. note:: + + 在CUDA设备上训练时拥有比 :class:`torch.nn.functional.linear` 更低的显存消耗。 + + .. warning:: + + `spike` 中的任何元素都必须为0或1。 + + * :ref:`中文API ` + + .. _spike_linear-en: + + A specific case of :class:`torch.nn.functional.linear` with inputs are spikes. + + .. admonition:: Note + :class: note + + This function has less memory consumption than :class:`torch.nn.functional.linear` when training on CUDA devices. + + .. admonition:: Warning + :class: warning + + Any element in `spike` must be 0 or 1. + """ + if spike.get_device() < 0: + return F.linear(spike, weight, bias) + else: + return spikeLinear.apply(spike, weight, bias) + +def spike_conv1d(spike: Tensor, weight: Tensor, bias: Tensor=None, stride: Union[_int, _size]=1, padding: str="valid", dilation: Union[_int, _size]=1, groups: _int=1) -> Tensor: + """ + * :ref:`API in English ` + + .. _spike_conv1d-cn: + + :class:`torch.nn.functional.conv1d` 在输入为脉冲时的特例。 + + .. note:: + + 在CUDA设备上训练时拥有比 :class:`torch.nn.functional.conv1d` 更低的显存消耗。 + + .. warning:: + + `spike` 中的任何元素都必须为0或1。 + + * :ref:`中文API ` + + .. _spike_conv1d-en: + + A specific case of :class:`torch.nn.functional.conv1d` with inputs are spikes. + + .. admonition:: Note + :class: note + + This function has less memory consumption than :class:`torch.nn.functional.conv1d` when training on CUDA devices. + + .. admonition:: Warning + :class: warning + + Any element in `spike` must be 0 or 1. + """ + if spike.get_device() < 0: + return F.conv1d(spike, weight, bias, stride, padding, dilation, groups) + else: + return spikeConvolution.apply(spike, weight, bias, stride, padding, dilation, groups) + +def spike_conv2d(spike: Tensor, weight: Tensor, bias: Optional[Tensor]=None, stride: Union[_int, _size]=1, padding: str="valid", dilation: Union[_int, _size]=1, groups: _int=1) -> Tensor: + """ + * :ref:`API in English ` + + .. _spike_conv2d-cn: + + :class:`torch.nn.functional.conv2d` 在输入为脉冲时的特例。 + + .. note:: + + 在CUDA设备上训练时拥有比 :class:`torch.nn.functional.conv2d` 更低的显存消耗。 + + .. warning:: + + `spike` 中的任何元素都必须为0或1。 + + * :ref:`中文API ` + + .. _spike_conv2d-en: + + A specific case of :class:`torch.nn.functional.conv2d` with inputs are spikes. + + .. admonition:: Note + :class: note + + This function has less memory consumption than :class:`torch.nn.functional.conv2d` when training on CUDA devices. + + .. admonition:: Warning + :class: warning + + Any element in `spike` must be 0 or 1. 
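+
+    A minimal usage sketch (the shapes below are only illustrative; on CPU this simply
+    falls back to :class:`torch.nn.functional.conv2d`):
+
+    .. code-block:: python
+
+        import torch
+        from spikingjelly.clock_driven.spike_op import spike_conv2d
+
+        spike = (torch.rand([4, 3, 32, 32], device='cuda') > 0.5).to(torch.float32)  # 0/1 spikes
+        weight = torch.randn([8, 3, 3, 3], device='cuda')
+        y = spike_conv2d(spike, weight)  # same value as F.conv2d(spike, weight)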
+ """ + if spike.get_device() < 0: + return F.conv2d(spike, weight, bias, stride, padding, dilation, groups) + else: + return spikeConvolution.apply(spike, weight, bias, stride, padding, dilation, groups) + +def spike_conv3d(spike: Tensor, weight: Tensor, bias: Optional[Tensor]=None, stride: Union[_int, _size]=1, padding: str="valid", dilation: Union[_int, _size]=1, groups: _int=1) -> Tensor: + """ + * :ref:`API in English ` + + .. _spike_conv3d-cn: + + :class:`torch.nn.functional.conv3d` 在输入为脉冲时的特例。 + + .. note:: + + 在CUDA设备上训练时拥有比 :class:`torch.nn.functional.conv3d` 更低的显存消耗。 + + .. warning:: + + `spike` 中的任何元素都必须为0或1。 + + * :ref:`中文API ` + + .. _spike_conv3d-en: + + A specific case of :class:`torch.nn.functional.conv3d` with inputs are spikes. + + .. admonition:: Note + :class: note + + This function has less memory consumption than :class:`torch.nn.functional.conv3d` when training on CUDA devices. + + .. admonition:: Warning + :class: warning + + Any element in `spike` must be 0 or 1. + """ + if spike.get_device() < 0: + return F.conv3d(spike, weight, bias, stride, padding, dilation, groups) + else: + return spikeConvolution.apply(spike, weight, bias, stride, padding, dilation, groups) + + +class SpikeLinear(nn.Linear): + """ + * :ref:`API in English ` + + .. _SpikeLinear-cn: + + :class:`torch.nn.Linear` 在输入为脉冲时的特例。 + + .. note:: + + 在CUDA设备上运行时拥有比 :class:`torch.nn.Linear` 更低的显存消耗。 + + .. warning:: + + `spike` 中的任何元素都必须为0或1。 + + * :ref:`中文API ` + + .. _SpikeLinear-en: + + A specific case of :class:`torch.nn.Linear` with inputs are spikes. + + .. admonition:: Note + :class: note + + This function has less memory consumption than :class:`torch.nn.Linear` when training on CUDA devices. + + .. admonition:: Warning + :class: warning + + Any element in `spike` must be 0 or 1. + """ + + def forward(self, spike: Tensor) -> Tensor: + return spike_linear(spike, self.weight, self.bias) + + +class SpikeConv1d(nn.Conv1d): + """ + * :ref:`API in English ` + + .. _SpikeConv1d-cn: + + :class:`torch.nn.Conv1d` 在输入为脉冲时的特例。 + + .. note:: + + 在CUDA设备上运行时拥有比 :class:`torch.nn.Conv1d` 更低的显存消耗。 + + .. warning:: + + `spike` 中的任何元素都必须为0或1。 + + * :ref:`中文API ` + + .. _SpikeConv1d-en: + + A specific case of :class:`torch.nn.Conv1d` with inputs are spikes. + + .. admonition:: Note + :class: note + + This function has less memory consumption than :class:`torch.nn.Conv1d` when training on CUDA devices. + + .. admonition:: Warning + :class: warning + + Any element in `spike` must be 0 or 1. + """ + + def _conv_forward(self, spike: Tensor, weight: Tensor, bias: Optional[Tensor]): + if self.padding_mode != 'zeros': + return spike_conv1d(F.pad(spike, self._reversed_padding_repeated_twice, mode=self.padding_mode), + weight, bias, self.stride, + _single(0), self.dilation, self.groups) + return spike_conv1d(spike, weight, bias, self.stride, + self.padding, self.dilation, self.groups) + + +class SpikeConv2d(nn.Conv2d): + """ + * :ref:`API in English ` + + .. _SpikeConv2d-cn: + + :class:`torch.nn.Conv2d` 在输入为脉冲时的特例。 + + .. note:: + + 在CUDA设备上运行时拥有比 :class:`torch.nn.Conv2d` 更低的显存消耗。 + + .. warning:: + + `spike` 中的任何元素都必须为0或1。 + + * :ref:`中文API ` + + .. _SpikeConv2d-en: + + A specific case of :class:`torch.nn.Conv2d` with inputs are spikes. + + .. admonition:: Note + :class: note + + This function has less memory consumption than :class:`torch.nn.Conv2d` when training on CUDA devices. + + .. admonition:: Warning + :class: warning + + Any element in `spike` must be 0 or 1. 
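+
+    A minimal sketch of using this layer as a drop-in replacement for :class:`torch.nn.Conv2d`
+    (layer sizes are only illustrative):
+
+    .. code-block:: python
+
+        import torch
+        from spikingjelly.clock_driven.spike_op import SpikeConv2d
+
+        conv = SpikeConv2d(3, 8, kernel_size=3, padding=1).cuda()
+        spike = (torch.rand([4, 3, 32, 32], device='cuda') > 0.5).to(torch.float32)  # 0/1 spikes
+        y = conv(spike)  # [4, 8, 32, 32], same value as nn.Conv2d with the same weights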
+ """ + + def _conv_forward(self, spike: Tensor, weight: Tensor, bias: Optional[Tensor]): + if self.padding_mode != 'zeros': + return spike_conv2d(F.pad(spike, self._reversed_padding_repeated_twice, mode=self.padding_mode), + weight, bias, self.stride, + _pair(0), self.dilation, self.groups) + return spike_conv2d(spike, weight, bias, self.stride, + self.padding, self.dilation, self.groups) + + +class SpikeConv3d(nn.Conv3d): + """ + * :ref:`API in English ` + + .. _SpikeConv3d-cn: + + :class:`torch.nn.Conv3d` 在输入为脉冲时的特例。 + + .. note:: + + 在CUDA设备上运行时拥有比 :class:`torch.nn.Conv3d` 更低的显存消耗。 + + .. warning:: + + `spike` 中的任何元素都必须为0或1。 + + * :ref:`中文API ` + + .. _SpikeConv3d-en: + + A specific case of :class:`torch.nn.Conv3d` with inputs are spikes. + + .. admonition:: Note + :class: note + + This function has less memory consumption than :class:`torch.nn.Conv3d` when training on CUDA devices. + + .. admonition:: Warning + :class: warning + + Any element in `spike` must be 0 or 1. + """ + + def _conv_forward(self, spike: Tensor, weight: Tensor, bias: Optional[Tensor]): + if self.padding_mode != "zeros": + return spike_conv3d( + F.pad( + spike, self._reversed_padding_repeated_twice, mode=self.padding_mode + ), + weight, + bias, + self.stride, + _triple(0), + self.dilation, + self.groups, + ) + return spike_conv3d( + spike, weight, bias, self.stride, self.padding, self.dilation, self.groups + ) diff --git a/spikingjelly/clock_driven/surrogate.py b/spikingjelly/clock_driven/surrogate.py index f627653..a79e899 100644 --- a/spikingjelly/clock_driven/surrogate.py +++ b/spikingjelly/clock_driven/surrogate.py @@ -45,14 +45,12 @@ def heaviside(x: torch.Tensor): ''' return (x >= 0).to(x) -def check_manual_grad(primitive_function, spiking_function, eps=1e-5): +def check_manual_grad(primitive_function, spiking_function, *args, **kwargs): ''' :param primitive_function: 梯度替代函数的原函数 :type primitive_function: callable :param spiking_function: 梯度替代函数 :type spiking_function: callable - :param eps: 最大误差 - :type eps: float 梯度替代函数的反向传播一般是手写的,可以用此函数去检查手写梯度是否正确。 @@ -62,18 +60,54 @@ def check_manual_grad(primitive_function, spiking_function, eps=1e-5): .. code-block:: python - surrogate.check_manual_grad(surrogate.ATan.primitive_function, surrogate.atan.apply) + def s2nn_apply(x, alpha, beta): + return surrogate.s2nn.apply(x, alpha, beta) + + surrogate.check_manual_grad(surrogate.S2NN.primitive_function, s2nn_apply, alpha=4., beta=1.) ''' - alpha = torch.tensor(1.0, dtype=torch.float) - x = torch.arange(-16, 16, 32 / 8192) + x = torch.arange(-2, 2, 32 / 8192) + # x = torch.as_tensor([-1., 0., 1.]) x.requires_grad_(True) - primitive_function(x, alpha).sum().backward() + primitive_function(x, *args, **kwargs).sum().backward() x_grad_auto = x.grad.clone() x.grad.zero_() - spiking_function(x, alpha).sum().backward() + spiking_function(x, *args, **kwargs).sum().backward() x_grad_manual = x.grad.clone() - assert (x_grad_manual - x_grad_auto).abs().max().item() <= eps, 'x.grad is wrong!' - print('grad check pass') + print('auto grad', x_grad_auto) + print('manual grad', x_grad_manual) + abs_error = (x_grad_manual - x_grad_auto).abs() + idx = abs_error.argmax() + print('max error', abs_error[idx], 'occurs at') + print(f'x[{idx}] = {x[idx]}') + print('auto grad', x_grad_auto[idx]) + print('manual grad', x_grad_manual[idx]) + +def check_cuda_grad(neu: nn.Module, surrogate_function, device, *args, **kwargs): + # check_cuda_grad(neuron.MultiStepIFNode, surrogate.S2NN, device='cuda:1', alpha=4., beta=1.) 
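+    # The given neuron is run twice on the same input sequence: once with the pure PyTorch
+    # backend and once with the CuPy (CUDA kernel) backend, and the position where the two
+    # input gradients differ the most is printed. Both torch.float and torch.half are
+    # checked because the fp16 kernels use a separate half2 code path.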
+ for dtype in [torch.float, torch.half]: + print(dtype) + net = neu(surrogate_function=surrogate_function(*args, **kwargs)) + net.to(device) + x = torch.arange(-2, 2, 32 / 8192, device=device, dtype=dtype) + x = x.unsqueeze(-1) + x.requires_grad_(True) + net.backend = 'torch' + net(x).sum().backward() + x_grad_py = x.grad.clone() + x.grad.zero_() + net.reset() + net.backend = 'cupy' + net(x).sum().backward() + x_grad_cp = x.grad.clone() + # print('python grad', x_grad_py) + # print('cupy grad', x_grad_cp) + abs_error = (x_grad_cp - x_grad_py).abs() + idx = abs_error.argmax() + print('max error', abs_error[idx], 'occurs at') + print(f'x[{idx}] = {x[idx]}') + print('python grad', x_grad_py[idx]) + print('cupy grad', x_grad_cp[idx]) + class SurrogateFunctionBase(nn.Module): def __init__(self, alpha, spiking=True): @@ -133,7 +167,8 @@ class piecewise_quadratic(torch.autograd.Function): @staticmethod def forward(ctx, x, alpha): if x.requires_grad: - ctx.save_for_backward(x, alpha) + ctx.save_for_backward(x) + ctx.alpha = alpha return heaviside(x) @staticmethod @@ -1221,5 +1256,292 @@ class SquarewaveFourierSeries(MultiArgsSurrogateFunctionBase): # plt.savefig('./docs/source/_static/API/clock_driven/surrogate/SquarewaveFourierSeries2.pdf') # plt.savefig('./docs/source/_static/API/clock_driven/surrogate/SquarewaveFourierSeries2.svg') +class s2nn(torch.autograd.Function): + @staticmethod + def forward(ctx, x: torch.Tensor, alpha: float, beta: float): + if x.requires_grad: + ctx.save_for_backward(x) + ctx.alpha = alpha + ctx.beta = beta + return heaviside(x) + + @staticmethod + def backward(ctx, grad_output): + x = ctx.saved_tensors[0] + sgax = torch.sigmoid(ctx.alpha * x) + grad_x = torch.where(x < 0., ctx.alpha * sgax * (1. - sgax), ctx.beta / (x + 1.)) + return grad_x * grad_output, None, None + +class S2NN(MultiArgsSurrogateFunctionBase): + def __init__(self, alpha=4., beta=1., spiking=True): + """ + * :ref:`API in English ` + .. _S2NN.__init__-cn: + + :param alpha: 控制 ``x < 0`` 时梯度的参数 + :param beta: 控制 ``x >= 0`` 时梯度的参数 + :param spiking: 是否输出脉冲,默认为 ``True``,在前向传播时使用 ``heaviside`` 而在反向传播使用替代梯度。若为 ``False`` + 则不使用替代梯度,前向传播时,使用反向传播时的梯度替代函数对应的原函数 + + `S2NN: Time Step Reduction of Spiking Surrogate Gradients for Training Energy Efficient Single-Step Neural Networks `_ 提出的S2NN替代函数。反向传播为 + + .. math:: + g'(x) = \\begin{cases} + \\alpha * (1 - \\mathrm{sigmoid} (\\alpha x)) \\mathrm{sigmoid} (\\alpha x), x < 0 \\\\ + \\beta (x + 1), x \ge 0 + \\end{cases} + + 对应的原函数为 + + .. math:: + g(x) = \\begin{cases} + \\mathrm{sigmoid} (\\alpha x), x < 0 \\\\ + \\beta \\mathrm{ln}(x + 1) + 1, x \ge 0 + \\end{cases} + + .. image:: ./_static/API/clock_driven/surrogate/S2NN.* + :width: 100% + + + * :ref:`中文API ` + .. _S2NN.__init__-en: + + :param alpha: the param that controls the gradient when ``x < 0`` + :param beta: the param that controls the gradient when ``x >= 0`` + :param spiking: whether output spikes. The default is ``True`` which means that using ``heaviside`` in forward + propagation and using surrogate gradient in backward propagation. If ``False``, in forward propagation, + using the primitive function of the surrogate gradient function used in backward propagation + + The S2NN surrogate spiking function, which is proposed by `S2NN: Time Step Reduction of Spiking Surrogate Gradients for Training Energy Efficient Single-Step Neural Networks `_. The gradient is defined by + + .. 
math:: + g'(x) = \\begin{cases} + \\alpha * (1 - \\mathrm{sigmoid} (\\alpha x)) \\mathrm{sigmoid} (\\alpha x), x < 0 \\\\ + \\beta (x + 1), x \ge 0 + \\end{cases} + + The primitive function is defined by + + .. math:: + g(x) = \\begin{cases} + \\mathrm{sigmoid} (\\alpha x), x < 0 \\\\ + \\beta \\mathrm{ln}(x + 1) + 1, x \ge 0 + \\end{cases} + + .. image:: ./_static/API/clock_driven/surrogate/S2NN.* + :width: 100% + """ + super().__init__(spiking) + self.alpha = alpha + self.beta = beta + self.spiking = spiking + if spiking: + self.f = self.spiking_function + else: + self.f = self.primitive_function + + def forward(self, x): + return self.f(x, self.alpha, self.beta) + + @staticmethod + def spiking_function(x: torch.Tensor, alpha, beta): + return s2nn.apply(x, alpha, beta) + + @staticmethod + def primitive_function(x: torch.Tensor, alpha, beta): + return torch.where(x < 0., torch.sigmoid(x * alpha), beta * torch.log((x + 1.).abs_() + 1e-5) + 0.5) + # abs and 1e-5 are used to avoid nan + + def cuda_code(self, x: str, y: str, dtype='fp32'): + sg_name = 'sg_' + self._get_name() + alpha = str(self.alpha) + 'f' + beta = str(self.beta) + 'f' + code = f''' + {tab4_str}{self.cuda_code_start_comments()} + ''' + + if dtype == 'fp32': + code += f''' + {tab4_str}const float {sg_name}_sigmoid_ax = 1.0f / (1.0f + expf(- {alpha} * {x})); + {tab4_str}const float {sg_name}_mask_l = (float)({x} < 0.0f); + {tab4_str}const float {y} = (1.0f - {sg_name}_sigmoid_ax) * {sg_name}_sigmoid_ax * {alpha} * {sg_name}_mask_l + {beta} / ({x} + 1.0f) * (1.0f - {sg_name}_mask_l); + ''' + elif dtype == 'fp16': + code += f''' + {tab4_str}const half2 {sg_name}_alpha = __float2half2_rn({alpha}); + {tab4_str}const half2 {sg_name}_sigmoid_ax = __h2div(__float2half2_rn(1.0f), __hadd2(h2exp(__hneg2(__hmul2({sg_name}_alpha, {x}))), __float2half2_rn(1.0f))); + {tab4_str}const half2 {sg_name}_mask_l = __hlt2({x}, __float2half2_rn(0.0f)); + {tab4_str}const half2 {y} = __hadd2(__hmul2(__hmul2(__hmul2(__hsub2(__float2half2_rn(1.0f), {sg_name}_sigmoid_ax), {sg_name}_sigmoid_ax), {sg_name}_alpha), {sg_name}_mask_l), __hmul2(__h2div(__float2half2_rn({beta}), __hadd2({x}, __float2half2_rn(1.0f))), __hsub2(__float2half2_rn(1.0f), {sg_name}_mask_l))); + ''' + else: + raise NotImplementedError + code += f''' + {tab4_str}{self.cuda_code_end_comments()} + ''' + return code + + # plt.style.use(['science', 'muted', 'grid']) + # fig = plt.figure(dpi=200, figsize=(6, 4)) + # x = torch.arange(-2.5, 2.5, 0.001) + # plt.plot(x.data, surrogate.heaviside(x), label='Heaviside', linestyle='-.') + # surrogate_function = surrogate.S2NN(alpha=4., beta=1., spiking=False) + # y = surrogate_function(x) + # plt.plot(x.data, y.data, label='Primitive, $\\alpha=4, \\beta=1$') + # + # surrogate_function = surrogate.S2NN(alpha=4, beta=1., spiking=True) + # x.requires_grad_(True) + # y = surrogate_function(x) + # z = y.sum() + # z.backward() + # plt.plot(x.data, x.grad, label='Gradient, $\\alpha=4, \\beta=1$') + # plt.xlim(-2, 2) + # plt.legend() + # plt.title('S2NN surrogate function') + # plt.xlabel('Input') + # plt.ylabel('Output') + # plt.grid(linestyle='--') + # # plt.show() + # plt.savefig('./S2NN.svg') + # plt.savefig('./S2NN.pdf') + +class q_pseudo_spike(torch.autograd.Function): + @staticmethod + def forward(ctx, x, alpha): + if x.requires_grad: + ctx.save_for_backward(x) + ctx.alpha = alpha + return heaviside(x) + + @staticmethod + def backward(ctx, grad_output): + grad_x = None + x = ctx.saved_tensors[0] + if ctx.needs_input_grad[0]: + grad_x = ((1 + 2 / 
(ctx.alpha - 1) * x.abs()).pow_(-ctx.alpha)) * grad_output + return grad_x, None +class QPseudoSpike(SurrogateFunctionBase): + def __init__(self, alpha=2.0, spiking=True): + ''' + * :ref:`API in English ` + .. _QPseudoSpike.__init__-cn: + + :param alpha: 控制反向传播时梯度函数尾部厚度的参数 + :param spiking: 是否输出脉冲,默认为 ``True``,在前向传播时使用 ``heaviside`` 而在反向传播使用替代梯度。若为 ``False`` + 则不使用替代梯度,前向传播时,使用反向传播时的梯度替代函数对应的原函数 + + `Surrogate Gradients Design `_ 提出的 :math:`q`-PseudoSpike替代函数。反向传播为 + + .. math:: + g'(x) = (1+\\frac{2|x|}{\\alpha-1})^{-\\alpha} + + 其中 :math:`\\alpha>1` 对应原文中的 :math:`q`。 + + 对应的原函数为 + .. math:: + g(x) = + \\begin{cases} + \\frac{1}{2}(1-\\frac{2x}{\\alpha-1})^{1-\\alpha}, & x < 0 \\\\ + 1 - \\frac{1}{2}(1+\\frac{2x}{\\alpha-1})^{1-\\alpha}, & x \\geq 0. + \\end{cases} + + .. image:: ./_static/API/clock_driven/surrogate/QPseudoSpike.* + :width: 100% + + * :ref:`中文API ` + .. _QPseudoSpike.__init__-en: + + :param alpha: parameter to control tail fatness of gradient + :param spiking: whether output spikes. The default is ``True`` which means that using ``heaviside`` in forward + propagation and using surrogate gradient in backward propagation. If ``False``, in forward propagation, + using the primitive function of the surrogate gradient function used in backward propagation + + The :math:`q`-PseudoSpike surrogate spiking function, which is first proposed in `Surrogate Gradients Design `_. The gradient is defined by + + .. math:: + g'(x) = (1+\\frac{2|x|}{\\alpha-1})^{-\\alpha} + + where :math:`\\alpha>1` corresponds to :math:`q` in paper. + + The primitive function is defined by + + .. math:: + g(x) = + \\begin{cases} + \\frac{1}{2}(1-\\frac{2x}{\\alpha-1})^{1-\\alpha}, & x < 0 \\\\ + 1 - \\frac{1}{2}(1+\\frac{2x}{\\alpha-1})^{1-\\alpha}, & x \\geq 0. + \\end{cases} + + .. image:: ./_static/API/clock_driven/surrogate/QPseudoSpike.* + :width: 100% + ''' + super().__init__(alpha, spiking) + + + @staticmethod + def spiking_function(x, alpha): + return q_pseudo_spike.apply(x, alpha) + + @staticmethod + def primitive_function(x: torch.Tensor, alpha): + mask_nonnegative = heaviside(x) + mask_sign = mask_nonnegative * 2. - 1. + + return mask_nonnegative - mask_sign * (0.5 * ((1. + 2. / (alpha - 1.) * x * mask_sign).pow_(1. - alpha))) + + def cuda_code(self, x: str, y: str, dtype='fp32'): + sg_name = 'sg_' + self._get_name() + alpha = str(self.alpha) + 'f' + code = f''' + {tab4_str}{self.cuda_code_start_comments()} + ''' + + if dtype == 'fp32': + code += f''' + {tab4_str}const float {sg_name}_base = 1.0f + 2.0f / ({alpha} - 1.0f) * fabsf({x}); + {tab4_str}const float {y} = powf({sg_name}_base, -{alpha}); + ''' + elif dtype == 'fp16': + code += f''' + {tab4_str}const half2 {sg_name}_alpha = __float2half2_rn({alpha}); + {tab4_str}const half2 {sg_name}_base = __hadd2(__float2half2_rn(1.0f), __h2div(__hmul2(__float2half2_rn(2.0f), __habs2({x})), __hsub2({sg_name}_alpha, __float2half2_rn(1.0f)))); + {tab4_str}const half2 {y} = h2exp2(__hmul2(h2log2({sg_name}_base), __hneg2({sg_name}_alpha))); // Replace power with combination of log and exp, since CUDA has no power function for FP16. 
+ ''' + else: + raise NotImplementedError + code += f''' + {tab4_str}{self.cuda_code_end_comments()} + ''' + return code + + # plt.style.use(['science', 'muted', 'grid']) + # fig = plt.figure(dpi=200, figsize=(6, 4)) + # x = torch.arange(-2.5, 2.5, 0.001) + # plt.plot(x.data, surrogate.heaviside(x), label='Heaviside', linestyle='-.') + # surrogate_function = surrogate.QPseudoSpike(alpha=2, spiking=False) + # y = surrogate_function(x) + # plt.plot(x.data, y.data, label='Primitive, $\\alpha=2$') + + # surrogate_function = surrogate.QPseudoSpike(alpha=2, spiking=True) + # x.requires_grad_(True) + # y = surrogate_function(x) + # z = y.sum() + # z.backward() + # plt.plot(x.data, x.grad, label='Gradient, $\\alpha=2$') + # plt.xlim(-2, 2) + # plt.legend() + # plt.title('QPseudoSpike surrogate function') + # plt.xlabel('Input') + # plt.ylabel('Output') + # plt.grid(linestyle='--') + # # plt.savefig('QPseudoSpike.svg') + # # plt.savefig('QPseudoSpike.pdf') + +_has_cuda_ = [ + ATan, + Sigmoid, + PiecewiseLeakyReLU, + S2NN, + QPseudoSpike +] diff --git a/spikingjelly/clock_driven/tensor_cache.py b/spikingjelly/clock_driven/tensor_cache.py new file mode 100644 index 0000000..d4d7fa5 --- /dev/null +++ b/spikingjelly/clock_driven/tensor_cache.py @@ -0,0 +1,212 @@ +import torch +import torch.nn.functional as F +import threading +from .. import configure +from . import cu_kernel_opt +import logging +try: + import cupy +except BaseException as e: + logging.info(f'spikingjelly.clock_driven.tensor_cache: {e}') + cupy = None + +class DataTypeConvertCUDACode: + float2bool = r''' + extern "C" __global__ + void float2bool(const float* fs, unsigned char* bs, const int &N) + { + // assert N == numel / 8 and numel % 8 == 0 + const int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index < N) + { + bs[index] = 0; + const int mem_offset = (index << 3); + #pragma unroll + for(int i = 0; i < 8; i++) + { + bs[index] += ( ((unsigned char) fs[mem_offset + i]) << i); + } + } + } + ''' + + half2bool = r''' + #include + extern "C" __global__ + void half2bool(const half* fs, unsigned char* bs, const int &N) + { + // assert N == numel / 8 and numel % 8 == 0 + const int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index < N) + { + bs[index] = 0; + const int mem_offset = (index << 3); + #pragma unroll + for(int i = 0; i < 8; i++) + { + bs[index] += ( ((unsigned char) __half2float(fs[mem_offset + i])) << i); + } + } + } + ''' + + bool2float = r''' + extern "C" __global__ + void bool2float(const unsigned char* bs, float* fs, const int &N) + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index < N) + { + const int mem_offset = (index << 3); + unsigned char compressed_v = bs[index]; + #pragma unroll + for(int i = 0; i < 8; i++) + { + fs[mem_offset + i] = (float) (compressed_v % 2); + compressed_v = (compressed_v >> 1); + } + } + } + ''' + + bool2half = r''' + #include + extern "C" __global__ + void bool2half(const unsigned char* bs, half* fs, const int &N) + { + const int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index < N) + { + const int mem_offset = (index << 3); + unsigned char compressed_v = bs[index]; + #pragma unroll + for(int i = 0; i < 8; i++) + { + fs[mem_offset + i] = __float2half((float) (compressed_v % 2)); + compressed_v = (compressed_v >> 1); + } + } + } + ''' +def float_spike_to_bool(spike: torch.Tensor): + s_dtype = spike.dtype + if s_dtype == torch.float: + kernel_codes = DataTypeConvertCUDACode.float2bool + kernel_name = 'float2bool' + elif s_dtype == torch.half: + 
kernel_codes = DataTypeConvertCUDACode.half2bool + kernel_name = 'half2bool' + else: + raise NotImplementedError + + s_shape = spike.shape + + spike = spike.flatten() + s_padding = 8 - spike.numel() % 8 + if s_padding != 0: + spike = F.pad(spike, (0, s_padding)) + device_id = spike.get_device() + spike_b = torch.zeros([spike.numel() // 8], device=spike.device, dtype=torch.uint8) + with cu_kernel_opt.DeviceEnvironment(device_id): + numel = spike_b.numel() + blocks = cu_kernel_opt.cal_blocks(numel) + numel = cupy.asarray(numel) + spike, spike_b, numel = cu_kernel_opt.get_contiguous(spike, spike_b, numel) + kernel_args = [spike, spike_b, numel] + kernel = cupy.RawKernel( + kernel_codes, + kernel_name, + options=configure.cuda_compiler_options, backend=configure.cuda_compiler_backend + ) + kernel( + (blocks,), (configure.cuda_threads,), + cu_kernel_opt.wrap_args_to_raw_kernel( + device_id, + *kernel_args + ) + ) + return spike_b, s_dtype, s_shape, s_padding + +def bool_spike_to_float(spike_b: torch.Tensor, s_dtype: torch.dtype, s_shape: torch.Size, s_padding: int = 0): + device_id = spike_b.get_device() + spike = torch.zeros(spike_b.numel() * 8, device=spike_b.device, dtype=s_dtype) + if s_dtype == torch.float: + kernel_codes = DataTypeConvertCUDACode.bool2float + kernel_name = 'bool2float' + elif s_dtype == torch.half: + kernel_codes = DataTypeConvertCUDACode.bool2half + kernel_name = 'bool2half' + else: + raise NotImplementedError + with cu_kernel_opt.DeviceEnvironment(device_id): + numel = spike_b.numel() + blocks = cu_kernel_opt.cal_blocks(numel) + numel = cupy.asarray(numel) + spike_b, spike, numel = cu_kernel_opt.get_contiguous(spike_b, spike, numel) + kernel_args = [spike_b, spike, numel] + kernel = cupy.RawKernel( + kernel_codes, + kernel_name, + options=configure.cuda_compiler_options, backend=configure.cuda_compiler_backend + ) + kernel( + (blocks,), (configure.cuda_threads,), + cu_kernel_opt.wrap_args_to_raw_kernel( + device_id, + *kernel_args + ) + ) + if s_padding is not None and s_padding != 0: + spike = spike[0: spike.numel() - s_padding] + return spike.reshape(s_shape) + + +def tensor_key(x: torch.Tensor): + x = x.flatten() + return x.data_ptr(), x[-1].data_ptr(), x.numel() + +class BoolTensorCache: + def __init__(self): + super().__init__() + self.cache_dict = {} + self.cache_refcount_dict = {} + self.lock = threading.Lock() + + def store_bool(self, spike: torch.FloatTensor or torch.HalfTensor): + tk = tensor_key(spike) + + self.lock.acquire() + if tk not in self.cache_dict: + if configure.save_bool_spike_level == 0: + self.cache_dict[tk] = (spike.bool(), spike.dtype) + elif configure.save_bool_spike_level == 1: + self.cache_dict[tk] = float_spike_to_bool(spike) + else: + raise NotImplementedError + self.cache_refcount_dict[tk] = 1 + else: + self.cache_refcount_dict[tk] += 1 + self.lock.release() + + return tk + + def get_float(self, tk, spike_shape: torch.Size): + if configure.save_bool_spike_level == 0: + spike, s_dtype = self.cache_dict[tk] + spike = spike.to(s_dtype) + elif configure.save_bool_spike_level == 1: + spike = bool_spike_to_float(*self.cache_dict[tk]) + else: + raise NotImplementedError + + self.lock.acquire() + self.cache_refcount_dict[tk] -= 1 + if self.cache_refcount_dict[tk] == 0: + del self.cache_refcount_dict[tk] + del self.cache_dict[tk] + self.lock.release() + + return spike.view(spike_shape) + + +BOOL_TENSOR_CACHE = BoolTensorCache() diff --git a/spikingjelly/configure.py b/spikingjelly/configure.py index 763e400..65a1833 100644 --- 
a/spikingjelly/configure.py +++ b/spikingjelly/configure.py @@ -1,7 +1,17 @@ -# This py file defines some variables used in SpikingJelly. -# The user can change them and install SpikingJelly manually. +''' +This py file defines some variables used in SpikingJelly. +Here is an example of how you can change them to make effect in your codes: + + import spikingjelly + spikingjelly.configure.cuda_threads = 512 + +Do not change them in this way, which will not make effect: + + from spikingjelly.configure import cuda_threads + cuda_threads = 512 -max_threads_number_for_datasets_preprocess = 4 +''' +max_threads_number_for_datasets_preprocess = 16 ''' `max_threads_number_for_datasets_preprocess` defines the maximum threads for datasets preprocessing, which is 1. reading binary events and saving them to numpy format @@ -41,4 +51,19 @@ If `save_datasets_compressed == True`, events and frames in spikingjelly.dataset The compressed npz file consumes less memory in disk but more time in reading. ''' +save_spike_as_bool_in_neuron_kernel = False +''' +If `save_spike_as_bool_in_neuron_kernel == True`, the neuron kernel used in the neuron's cupy backend will save the spike as a bool, rather than float/half tensor for backward, which can reduce the memory consumption. +''' + +save_bool_spike_level = 0 +''' +`save_bool_spike_level` take effects on SpikeConv/SpikeLinear, and on neuron's cupy kernel when `save_spike_as_bool_in_neuron_kernel == True`. + +If `save_bool_spike_level == 0`, spikes will be saved in bool. Note that bool uses 8-bit, rather than 1-bit. + +If `save_bool_spike_level == 1`, spikes will be saved in uint8 with each 8-bit storing 8 spikes. + +A larger `save_bool_spike_level` means less memory consumption but slower speed. +''' diff --git a/spikingjelly/datasets/__init__.py b/spikingjelly/datasets/__init__.py index aebfae6..f410f93 100644 --- a/spikingjelly/datasets/__init__.py +++ b/spikingjelly/datasets/__init__.py @@ -1,6 +1,5 @@ -import torchvision.transforms from torchvision.datasets import DatasetFolder -from typing import Any, Callable, cast, Dict, List, Optional, Tuple +from typing import Callable, Dict, Optional, Tuple from abc import abstractmethod import scipy.io import struct @@ -10,18 +9,19 @@ import torch.utils.data import os from concurrent.futures import ThreadPoolExecutor import time -import multiprocessing from torchvision import transforms import torch from matplotlib import pyplot as plt import math import tqdm -from ..configure import max_threads_number_for_datasets_preprocess, cuda_threads, cuda_compiler_options, cuda_compiler_backend, save_datasets_compressed -np_savez = np.savez_compressed if save_datasets_compressed else np.savez +import shutil +from .. import configure +import logging +np_savez = np.savez_compressed if configure.save_datasets_compressed else np.savez try: import cupy - from spikingjelly.clock_driven import cu_kernel_opt + from ..clock_driven import cu_kernel_opt padded_sequence_mask_kernel_code = r''' extern "C" __global__ @@ -37,7 +37,8 @@ try: } } ''' -except ImportError: +except BaseException as e: + logging.info(f'spikingjelly.dataset.__init__: {e}') cupy = None pass @@ -91,7 +92,6 @@ def load_aedat_v3(file_name: str) -> Dict: :type file_name: str :return: a dict whose keys are ``['t', 'x', 'y', 'p']`` and values are ``numpy.ndarray`` :rtype: Dict - This function is written by referring to https://gitlab.com/inivation/dv/dv-python . It can be used for DVS128 Gesture. 
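The new `tensor_cache.py` above packs binary spikes eight per byte with these CUDA kernels, and `configure.save_bool_spike_level = 1` selects that packed layout for `SpikeConv`/`SpikeLinear` (and, together with `save_spike_as_bool_in_neuron_kernel = True`, for the neurons' cupy kernels). A round-trip sketch, assuming a CUDA device and cupy are available:

```python
import torch
import spikingjelly
from spikingjelly.clock_driven import tensor_cache

# cache spikes for backward as bit-packed uint8 instead of 8-bit bool tensors
spikingjelly.configure.save_bool_spike_level = 1

# the packing that level 1 uses under the hood
spike = (torch.rand([4, 8, 16, 16], device='cuda:0') > 0.5).float()
packed, s_dtype, s_shape, s_padding = tensor_cache.float_spike_to_bool(spike)
print(packed.dtype, packed.numel())   # torch.uint8, roughly spike.numel() / 8
restored = tensor_cache.bool_spike_to_float(packed, s_dtype, s_shape, s_padding)
print(torch.equal(spike, restored))   # True
```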
''' with open(file_name, 'rb') as bin_f: @@ -156,19 +156,12 @@ def load_ATIS_bin(file_name: str) -> Dict: :type file_name: str :return: a dict whose keys are ``['t', 'x', 'y', 'p']`` and values are ``numpy.ndarray`` :rtype: Dict - This function is written by referring to https://github.com/jackd/events-tfds . - Each ATIS binary example is a separate binary file consisting of a list of events. Each event occupies 40 bits as described below: - bit 39 - 32: Xaddress (in pixels) - bit 31 - 24: Yaddress (in pixels) - bit 23: Polarity (0 for OFF, 1 for ON) - bit 22 - 0: Timestamp (in microseconds) - ''' with open(file_name, 'rb') as bin_f: # `& 128` 是取一个8位二进制数的最高位 @@ -191,10 +184,14 @@ def load_npz_frames(file_name: str) -> np.ndarray: ''' return np.load(file_name, allow_pickle=True)['frames'] -def integrate_events_segment_to_frame(events: Dict, H: int, W: int, j_l: int = 0, j_r: int = -1) -> np.ndarray: +def integrate_events_segment_to_frame(x: np.ndarray, y: np.ndarray, p: np.ndarray, H: int, W: int, j_l: int = 0, j_r: int = -1) -> np.ndarray: ''' - :param events: a dict whose keys are ``['t', 'x', 'y', 'p']`` and values are ``numpy.ndarray`` - :type events: Dict + :param x: x-coordinate of events + :type x: numpy.ndarray + :param y: y-coordinate of events + :type y: numpy.ndarray + :param p: polarity of events + :type p: numpy.ndarray :param H: height of the frame :type H: int :param W: weight of the frame @@ -205,13 +202,9 @@ def integrate_events_segment_to_frame(events: Dict, H: int, W: int, j_l: int = 0 :type j_r: :return: frames :rtype: np.ndarray - Denote a two channels frame as :math:`F` and a pixel at :math:`(p, x, y)` as :math:`F(p, x, y)`, the pixel value is integrated from the events data whose indices are in :math:`[j_{l}, j_{r})`: - .. math:: - F(p, x, y) = \sum_{i = j_{l}}^{j_{r} - 1} \mathcal{I}_{p, x, y}(p_{i}, x_{i}, y_{i}) - where :math:`\lfloor \cdot \rfloor` is the floor operation, :math:`\mathcal{I}_{p, x, y}(p_{i}, x_{i}, y_{i})` is an indicator function and it equals 1 only when :math:`(p, x, y) = (p_{i}, x_{i}, y_{i})`. ''' # 累计脉冲需要用bitcount而不能直接相加,原因可参考下面的示例代码,以及 @@ -253,9 +246,9 @@ def integrate_events_segment_to_frame(events: Dict, H: int, W: int, j_l: int = 0 # print('correct accumulation by bincount\n', frames) frame = np.zeros(shape=[2, H * W]) - x = events['x'][j_l: j_r].astype(int) # avoid overflow - y = events['y'][j_l: j_r].astype(int) - p = events['p'][j_l: j_r] + x = x[j_l: j_r].astype(int) # avoid overflow + y = y[j_l: j_r].astype(int) + p = p[j_l: j_r] mask = [] mask.append(p == 0) mask.append(np.logical_not(mask[0])) @@ -275,17 +268,12 @@ def cal_fixed_frames_number_segment_index(events_t: np.ndarray, split_by: str, f :type frames_num: int :return: a tuple ``(j_l, j_r)`` :rtype: tuple - Denote ``frames_num`` as :math:`M`, if ``split_by`` is ``'time'``, then - .. math:: - \\Delta T & = [\\frac{t_{N-1} - t_{0}}{M}] \\\\ j_{l} & = \\mathop{\\arg\\min}\\limits_{k} \\{t_{k} | t_{k} \\geq t_{0} + \\Delta T \\cdot j\\} \\\\ j_{r} & = \\begin{cases} \\mathop{\\arg\\max}\\limits_{k} \\{t_{k} | t_{k} < t_{0} + \\Delta T \\cdot (j + 1)\\} + 1, & j < M - 1 \\cr N, & j = M - 1 \\end{cases} - If ``split_by`` is ``'number'``, then - .. 
math:: j_{l} & = [\\frac{N}{M}] \\cdot j \\\\ j_{r} & = \\begin{cases} [\\frac{N}{M}] \\cdot (j + 1), & j < M - 1 \\cr N, & j = M - 1 \\end{cases} @@ -332,17 +320,19 @@ def integrate_events_by_fixed_frames_number(events: Dict, split_by: str, frames_ :type W: int :return: frames :rtype: np.ndarray - Integrate events to frames by fixed frames number. See :class:`cal_fixed_frames_number_segment_index` and :class:`integrate_events_segment_to_frame` for more details. ''' - j_l, j_r = cal_fixed_frames_number_segment_index(events['t'], split_by, frames_num) + t, x, y, p = (events[key] for key in ('t', 'x', 'y', 'p')) + j_l, j_r = cal_fixed_frames_number_segment_index(t, split_by, frames_num) frames = np.zeros([frames_num, 2, H, W]) for i in range(frames_num): - frames[i] = integrate_events_segment_to_frame(events, H, W, j_l[i], j_r[i]) + frames[i] = integrate_events_segment_to_frame(x, y, p, H, W, j_l[i], j_r[i]) return frames -def integrate_events_file_to_frames_file_by_fixed_frames_number(events_np_file: str, output_dir: str, split_by: str, frames_num: int, H: int, W: int, print_save: bool = False) -> None: +def integrate_events_file_to_frames_file_by_fixed_frames_number(loader: Callable, events_np_file: str, output_dir: str, split_by: str, frames_num: int, H: int, W: int, print_save: bool = False) -> None: ''' + :param loader: a function that can load events from `events_np_file` + :type loader: Callable :param events_np_file: path of the events np file :type events_np_file: str :param output_dir: output directory for saving the frames @@ -358,11 +348,10 @@ def integrate_events_file_to_frames_file_by_fixed_frames_number(events_np_file: :param print_save: If ``True``, this function will print saved files' paths. :type print_save: bool :return: None - Integrate a events file to frames by fixed frames number and save it. See :class:`cal_fixed_frames_number_segment_index` and :class:`integrate_events_segment_to_frame` for more details. ''' fname = os.path.join(output_dir, os.path.basename(events_np_file)) - np_savez(fname, frames=integrate_events_by_fixed_frames_number(np.load(events_np_file), split_by, frames_num, H, W)) + np_savez(fname, frames=integrate_events_by_fixed_frames_number(loader(events_np_file), split_by, frames_num, H, W)) if print_save: print(f'Frames [{fname}] saved.') @@ -380,10 +369,12 @@ def integrate_events_by_fixed_duration(events: Dict, duration: int, H: int, W: i :type W: int :return: frames :rtype: np.ndarray - Integrate events to frames by fixed time duration of each frame. 
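The frame-integration helpers now take an explicit `loader` instead of calling `np.load` themselves, which is what lets ES-ImageNet (added later in this patch) plug in its own event reader. A hypothetical call for an npz-based dataset, with placeholder paths:

```python
import numpy as np
from spikingjelly import datasets as sjds

sjds.integrate_events_file_to_frames_file_by_fixed_frames_number(
    loader=np.load,   # npz-based datasets can keep using np.load
    events_np_file='D:/datasets/DVS128Gesture/events_np/train/0/sample.npz',   # placeholder
    output_dir='D:/datasets/DVS128Gesture/frames_number_20_split_by_number/train/0',
    split_by='number', frames_num=20, H=128, W=128, print_save=True)
```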
''' + x = events['x'] + y = events['y'] t = events['t'] + p = events['p'] N = t.size frames = [] @@ -397,15 +388,17 @@ def integrate_events_by_fixed_duration(events: Dict, duration: int, H: int, W: i else: right += 1 # integrate from index [left, right) - frames.append(np.expand_dims(integrate_events_segment_to_frame(events, H, W, left, right), 0)) + frames.append(np.expand_dims(integrate_events_segment_to_frame(x, y, p, H, W, left, right), 0)) left = right if right == N: return np.concatenate(frames) -def integrate_events_file_to_frames_file_by_fixed_duration(events_np_file: str, output_dir: str, duration: int, H: int, W: int, print_save: bool = False) -> None: +def integrate_events_file_to_frames_file_by_fixed_duration(loader: Callable, events_np_file: str, output_dir: str, duration: int, H: int, W: int, print_save: bool = False) -> None: ''' + :param loader: a function that can load events from `events_np_file` + :type loader: Callable :param events_np_file: path of the events np file :type events_np_file: str :param output_dir: output directory for saving the frames @@ -419,10 +412,9 @@ def integrate_events_file_to_frames_file_by_fixed_duration(events_np_file: str, :param print_save: If ``True``, this function will print saved files' paths. :type print_save: bool :return: None - Integrate events to frames by fixed time duration of each frame. ''' - frames = integrate_events_by_fixed_duration(np.load(events_np_file), duration, H, W) + frames = integrate_events_by_fixed_duration(loader(events_np_file), duration, H, W) fname, _ = os.path.splitext(os.path.basename(events_np_file)) fname = os.path.join(output_dir, f'{fname}_{frames.shape[0]}.npz') np_savez(fname, frames=frames) @@ -441,7 +433,6 @@ def create_same_directory_structure(source_dir: str, target_dir: str) -> None: :param target_dir: Path of the directory that be copied to :type target_dir: str :return: None - Create the same directory structure in ``target_dir`` with that of ``source_dir``. ''' for sub_dir_name in os.listdir(source_dir): @@ -492,40 +483,32 @@ def split_to_train_test_set(train_ratio: float, origin_dataset: torch.utils.data def pad_sequence_collate(batch: list): ''' - :param batch: a list of samples that contains ``(x, y)``, where ``x.shape=[T, *]`` and ``y`` is the label + :param batch: a list of samples that contains ``(x, y)``, where ``x`` is a list containing sequences with different length and ``y`` is the label :type batch: list - :return: batched samples, where ``x`` is padded with the same length + :return: batched samples ``(x_p, y, x_len), where ``x_p`` is padded ``x`` with the same length, `y`` is the label, and ``x_len`` is the length of the ``x`` :rtype: tuple - This function can be use as the ``collate_fn`` for ``DataLoader`` to process the dataset with variable length, e.g., a ``NeuromorphicDatasetFolder`` with fixed duration to integrate events to frames. - Here is an example: - .. 
code-block:: python - - class RandomLengthDataset(torch.utils.data.Dataset): - def __init__(self, n=1000): - super().__init__() - self.n = n - - def __getitem__(self, i): - return torch.rand([random.randint(1, 10), 28, 28]), random.randint(0, 10) - - def __len__(self): - return self.n - - loader = torch.utils.data.DataLoader(RandomLengthDataset(n=32), batch_size=16, collate_fn=pad_sequence_collate) - - for x, y, z in loader: - print(x.shape, y.shape, z) - + class VariableLengthDataset(torch.utils.data.Dataset): + def __init__(self, n=1000): + super().__init__() + self.n = n + def __getitem__(self, i): + return torch.rand([i + 1, 2]), self.n - i - 1 + def __len__(self): + return self.n + loader = torch.utils.data.DataLoader(VariableLengthDataset(n=32), batch_size=2, collate_fn=pad_sequence_collate, + shuffle=True) + for i, (x_p, label, x_len) in enumerate(loader): + print(f'x_p.shape={x_p.shape}, label={label}, x_len={x_len}') + if i == 2: + break And the outputs are: - .. code-block:: bash - - torch.Size([10, 16, 28, 28]) torch.Size([16]) tensor([ 1, 9, 3, 4, 1, 2, 9, 7, 2, 1, 5, 7, 4, 10, 9, 5]) - torch.Size([10, 16, 28, 28]) torch.Size([16]) tensor([ 1, 8, 7, 10, 3, 10, 6, 7, 5, 9, 10, 5, 9, 6, 7, 6]) - + x_p.shape=torch.Size([2, 18, 2]), label=tensor([14, 30]), x_len=tensor([18, 2]) + x_p.shape=torch.Size([2, 29, 2]), label=tensor([3, 6]), x_len=tensor([29, 26]) + x_p.shape=torch.Size([2, 23, 2]), label=tensor([ 9, 23]), x_len=tensor([23, 9]) ''' x_list = [] x_len_list = [] @@ -545,11 +528,8 @@ def padded_sequence_mask(sequence_len: torch.Tensor, T=None): :type T: int :return: a bool mask with shape = [T, N], where the padded position is ``False`` :rtype: torch.Tensor - Here is an example: - .. code-block:: python - x1 = torch.rand([2, 6]) x2 = torch.rand([3, 6]) x3 = torch.rand([4, 6]) @@ -559,11 +539,8 @@ def padded_sequence_mask(sequence_len: torch.Tensor, T=None): mask = padded_sequence_mask(x_len) print('mask.shape=', mask.shape) print('mask=\\n', mask) - And the outputs are: - .. code-block:: bash - x.shape= torch.Size([4, 3, 6]) mask.shape= torch.Size([4, 3]) mask= @@ -571,7 +548,6 @@ def padded_sequence_mask(sequence_len: torch.Tensor, T=None): [ True, True, True], [False, True, True], [False, False, True]]) - ''' if T is None: T = sequence_len.max().item() @@ -580,15 +556,15 @@ def padded_sequence_mask(sequence_len: torch.Tensor, T=None): if device_id >= 0 and cupy is not None: mask = torch.zeros([T, N], dtype=bool, device=sequence_len.device) - with cupy.cuda.Device(device_id): + with cu_kernel_opt.DeviceEnvironment(device_id): T = cupy.asarray(T) N = cupy.asarray(N) sequence_len, mask, T, N = cu_kernel_opt.get_contiguous(sequence_len.to(torch.int), mask, T, N) kernel_args = [sequence_len, mask, T, N] - kernel = cupy.RawKernel(padded_sequence_mask_kernel_code, 'padded_sequence_mask_kernel', options=cuda_compiler_options, backend=cuda_compiler_backend) + kernel = cupy.RawKernel(padded_sequence_mask_kernel_code, 'padded_sequence_mask_kernel', options=configure.cuda_compiler_options, backend=configure.cuda_compiler_backend) blocks = cu_kernel_opt.cal_blocks(N) kernel( - (blocks,), (cuda_threads,), + (blocks,), (configure.cuda_threads,), cu_kernel_opt.wrap_args_to_raw_kernel( device_id, *kernel_args @@ -645,27 +621,20 @@ class NeuromorphicDatasetFolder(DatasetFolder): :param target_transform: a function/transform that takes in the target and transforms it. :type target_transform: callable - The base class for neuromorphic dataset. 
Users can define a new dataset by inheriting this class and implementing all abstract methods. Users can refer to :class:`spikingjelly.datasets.dvs128_gesture.DVS128Gesture`. - If ``data_type == 'event'`` the sample in this dataset is a dict whose keys are ``['t', 'x', 'y', 'p']`` and values are ``numpy.ndarray``. - If ``data_type == 'frame'`` and ``frames_number`` is not ``None`` events will be integrated to frames with fixed frames number. ``split_by`` will define how to split events. See :class:`cal_fixed_frames_number_segment_index` for more details. - If ``data_type == 'frame'``, ``frames_number`` is ``None``, and ``duration`` is not ``None`` events will be integrated to frames with fixed time duration. - If ``data_type == 'frame'``, ``frames_number`` is ``None``, ``duration`` is ``None``, and ``custom_integrate_function`` is not ``None``: events will be integrated by the user-defined function and saved to the ``custom_integrated_frames_dir_name`` directory in ``root`` directory. Here is an example from SpikingJelly's tutorials: - .. code-block:: python - from spikingjelly.datasets.dvs128_gesture import DVS128Gesture from typing import Dict import numpy as np @@ -673,13 +642,12 @@ class NeuromorphicDatasetFolder(DatasetFolder): def integrate_events_to_2_frames_randomly(events: Dict, H: int, W: int): index_split = np.random.randint(low=0, high=events['t'].__len__()) frames = np.zeros([2, 2, H, W]) - frames[0] = sjds.integrate_events_segment_to_frame(events, H, W, 0, index_split) - frames[1] = sjds.integrate_events_segment_to_frame(events, H, W, index_split, events['t'].__len__()) + t, x, y, p = (events[key] for key in ('t', 'x', 'y', 'p')) + frames[0] = sjds.integrate_events_segment_to_frame(x, y, p, H, W, 0, index_split) + frames[1] = sjds.integrate_events_segment_to_frame(x, y, p, H, W, index_split, events['t'].__len__()) return frames - root_dir = 'D:/datasets/DVS128Gesture' train_set = DVS128Gesture(root_dir, train=True, data_type='frame', custom_integrate_function=integrate_events_to_2_frames_randomly) - from spikingjelly.datasets import play_frame frame, label = train_set[500] play_frame(frame) @@ -773,7 +741,7 @@ class NeuromorphicDatasetFolder(DatasetFolder): # use multi-thread to accelerate t_ckp = time.time() - with ThreadPoolExecutor(max_workers=min(multiprocessing.cpu_count(), max_threads_number_for_datasets_preprocess)) as tpe: + with ThreadPoolExecutor(max_workers=configure.max_threads_number_for_datasets_preprocess) as tpe: print(f'Start ThreadPoolExecutor with max workers = [{tpe._max_workers}].') for e_root, e_dirs, e_files in os.walk(events_np_root): if e_files.__len__() > 0: @@ -781,7 +749,7 @@ class NeuromorphicDatasetFolder(DatasetFolder): for e_file in e_files: events_np_file = os.path.join(e_root, e_file) print(f'Start to integrate [{events_np_file}] to frames and save to [{output_dir}].') - tpe.submit(integrate_events_file_to_frames_file_by_fixed_frames_number, events_np_file, output_dir, split_by, frames_number, H, W, True) + tpe.submit(integrate_events_file_to_frames_file_by_fixed_frames_number, self.load_events_np, events_np_file, output_dir, split_by, frames_number, H, W, True) print(f'Used time = [{round(time.time() - t_ckp, 2)}s].') @@ -803,7 +771,7 @@ class NeuromorphicDatasetFolder(DatasetFolder): create_same_directory_structure(events_np_root, frames_np_root) # use multi-thread to accelerate t_ckp = time.time() - with ThreadPoolExecutor(max_workers=min(multiprocessing.cpu_count(), max_threads_number_for_datasets_preprocess)) as tpe: + with 
ThreadPoolExecutor(max_workers=configure.max_threads_number_for_datasets_preprocess) as tpe: print(f'Start ThreadPoolExecutor with max workers = [{tpe._max_workers}].') for e_root, e_dirs, e_files in os.walk(events_np_root): if e_files.__len__() > 0: @@ -811,7 +779,7 @@ class NeuromorphicDatasetFolder(DatasetFolder): for e_file in e_files: events_np_file = os.path.join(e_root, e_file) print(f'Start to integrate [{events_np_file}] to frames and save to [{output_dir}].') - tpe.submit(integrate_events_file_to_frames_file_by_fixed_duration, events_np_file, output_dir, duration, H, W, True) + tpe.submit(integrate_events_file_to_frames_file_by_fixed_duration, self.load_events_np, events_np_file, output_dir, duration, H, W, True) print(f'Used time = [{round(time.time() - t_ckp, 2)}s].') @@ -834,7 +802,7 @@ class NeuromorphicDatasetFolder(DatasetFolder): create_same_directory_structure(events_np_root, frames_np_root) # use multi-thread to accelerate t_ckp = time.time() - with ThreadPoolExecutor(max_workers=min(multiprocessing.cpu_count(), max_threads_number_for_datasets_preprocess)) as tpe: + with ThreadPoolExecutor(max_workers=configure.max_threads_number_for_datasets_preprocess) as tpe: print(f'Start ThreadPoolExecutor with max workers = [{tpe._max_workers}].') for e_root, e_dirs, e_files in os.walk(events_np_root): if e_files.__len__() > 0: @@ -865,19 +833,6 @@ class NeuromorphicDatasetFolder(DatasetFolder): super().__init__(root=_root, loader=_loader, extensions=('.npz', ), transform=_transform, target_transform=_target_transform) - @staticmethod - @abstractmethod - def load_origin_data(file_name: str) -> Dict: - ''' - :param file_name: path of the events file - :type file_name: str - :return: a dict whose keys are ``['t', 'x', 'y', 'p']`` and values are ``numpy.ndarray`` - :rtype: Dict - - This function defines how to read the origin binary data. - ''' - pass - @staticmethod @abstractmethod def resource_url_md5() -> list: @@ -905,7 +860,6 @@ class NeuromorphicDatasetFolder(DatasetFolder): :param extract_root: Root directory path which saves extracted files from downloaded files :type extract_root: str :return: None - This function defines how to extract download files. ''' pass @@ -919,7 +873,6 @@ class NeuromorphicDatasetFolder(DatasetFolder): :param events_np_root: Root directory path which saves events files in the ``npz`` format :type events_np_root: :return: None - This function defines how to convert the origin binary data in ``extract_root`` to ``npz`` format and save converted files in ``events_np_root``. ''' pass @@ -934,6 +887,16 @@ class NeuromorphicDatasetFolder(DatasetFolder): ''' pass + @staticmethod + def load_events_np(fname: str): + ''' + :param fname: file name + :return: a dict whose keys are ``['t', 'x', 'y', 'p']`` and values are ``numpy.ndarray`` + This function defines how to load a sample from `events_np`. In most cases, this function is `np.load`. + But for some datasets, e.g., ES-ImageNet, it can be different. + ''' + return np.load(fname) + def random_temporal_delete(x_seq: torch.Tensor or np.ndarray, T_remain: int, batch_first): @@ -946,13 +909,9 @@ def random_temporal_delete(x_seq: torch.Tensor or np.ndarray, T_remain: int, bat :type batch_first: bool :return: the sequence with length `T_remain`, which is obtained by randomly removing `T - T_remain` slices :rtype: torch.Tensor or np.ndarray - The random temporal delete data augmentation used in `Deep Residual Learning in Spiking Neural Networks `_. - Codes example: - .. 
code-block:: python - import torch from spikingjelly.datasets import random_temporal_delete T = 8 @@ -961,11 +920,8 @@ def random_temporal_delete(x_seq: torch.Tensor or np.ndarray, T_remain: int, bat x_seq = torch.arange(0, N*T).view([N, T]) print('x_seq=\\n', x_seq) print('random_temporal_delete(x_seq)=\\n', random_temporal_delete(x_seq, T_remain, batch_first=True)) - Outputs: - .. code-block:: shell - x_seq= tensor([[ 0, 1, 2, 3, 4, 5, 6, 7], [ 8, 9, 10, 11, 12, 13, 14, 15], @@ -994,9 +950,7 @@ class RandomTemporalDelete(torch.nn.Module): :type T_remain: int :type T_remain: int :param batch_first: if `True`, `x_seq` will be regarded as `shape = [N, T, *]` - The random temporal delete data augmentation used in `Deep Residual Learning in Spiking Neural Networks `_. - Refer to :class:`random_temporal_delete` for more details. """ super().__init__() @@ -1007,3 +961,46 @@ class RandomTemporalDelete(torch.nn.Module): return random_temporal_delete(x_seq, self.T_remain, self.batch_first) +def create_sub_dataset(source_dir: str, target_dir:str, ratio: float, use_soft_link=True, randomly=False): + """ + :param source_dir: the directory path of the origin dataset + :type source_dir: str + :param target_dir: the directory path of the sub dataset + :type target_dir: str + :param ratio: the ratio of samples sub dataset will copy from the origin dataset + :type ratio: float + :param use_soft_link: if ``True``, the sub dataset will use soft link to copy; else, the sub dataset will copy files + :type use_soft_link: bool + :param randomly: if ``True``, the files copy from the origin dataset will be picked up randomly. The randomness is controlled by + ``numpy.random.seed`` + :type randomly: bool + Create a sub dataset with copy ``ratio`` of samples from the origin dataset. 
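`create_sub_dataset` is convenient for debugging on a fraction of a large neuromorphic dataset without duplicating it on disk; as the docstring notes, the random pick is controlled by `numpy.random.seed`. A sketch with placeholder paths:

```python
import numpy as np
from spikingjelly import datasets as sjds

np.random.seed(0)   # make the random file selection reproducible
sjds.create_sub_dataset(
    source_dir='D:/datasets/DVS128Gesture/events_np',          # placeholder paths
    target_dir='D:/datasets/DVS128Gesture_subset/events_np',
    ratio=0.1, use_soft_link=True, randomly=True)
```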
+ """ + if not os.path.exists(target_dir): + os.makedirs(target_dir) + print(f'Mkdir [{target_dir}].') + create_same_directory_structure(source_dir, target_dir) + warnings_info = [] + for e_root, e_dirs, e_files in os.walk(source_dir, followlinks=True): + if e_files.__len__() > 0: + output_dir = os.path.join(target_dir, os.path.relpath(e_root, source_dir)) + samples_number = int(ratio * e_files.__len__()) + if samples_number == 0: + warnings_info.append(f'Warning: the samples number is 0 in [{output_dir}].') + if randomly: + np.random.shuffle(e_files) + for i, e_file in enumerate(e_files): + if i >= samples_number: + break + source_file = os.path.join(e_root, e_file) + target_file = os.path.join(output_dir, os.path.basename(source_file)) + if use_soft_link: + os.symlink(source_file, target_file) + # print(f'symlink {source_file} -> {target_file}') + else: + shutil.copyfile(source_file, target_file) + # print(f'copyfile {source_file} -> {target_file}') + print(f'[{samples_number}] files in [{e_root}] have been copied to [{output_dir}].') + + for i in range(warnings_info.__len__()): + print(warnings_info[i]) \ No newline at end of file diff --git a/spikingjelly/datasets/asl_dvs.py b/spikingjelly/datasets/asl_dvs.py index 343bc02..5e93b5c 100644 --- a/spikingjelly/datasets/asl_dvs.py +++ b/spikingjelly/datasets/asl_dvs.py @@ -1,5 +1,4 @@ -from typing import Any, Callable, cast, Dict, List, Optional, Tuple -import numpy as np +from typing import Callable, Dict, Optional, Tuple import spikingjelly.datasets as sjds from torchvision.datasets.utils import extract_archive import os @@ -7,8 +6,8 @@ import multiprocessing from concurrent.futures import ThreadPoolExecutor import time import shutil -from ..configure import max_threads_number_for_datasets_preprocess -from spikingjelly.datasets import np_savez +from .. import configure +from ..datasets import np_savez class ASLDVS(sjds.NeuromorphicDatasetFolder): def __init__( @@ -23,69 +22,11 @@ class ASLDVS(sjds.NeuromorphicDatasetFolder): transform: Optional[Callable] = None, target_transform: Optional[Callable] = None, ) -> None: - ''' - :param root: root path of the dataset - :type root: str - :param data_type: `event` or `frame` - :type data_type: str - :param frames_number: the integrated frame number - :type frames_number: int - :param split_by: `time` or `number` - :type split_by: str - :param duration: the time duration of each frame - :type duration: int - :param custom_integrate_function: a user-defined function that inputs are ``events, H, W``. - ``events`` is a dict whose keys are ``['t', 'x', 'y', 'p']`` and values are ``numpy.ndarray`` - ``H`` is the height of the data and ``W`` is the weight of the data. - For example, H=128 and W=128 for the DVS128 Gesture dataset. - The user should define how to integrate events to frames, and return frames. - :type custom_integrate_function: Callable - :param custom_integrated_frames_dir_name: The name of directory for saving frames integrating by ``custom_integrate_function``. - If ``custom_integrated_frames_dir_name`` is ``None``, it will be set to ``custom_integrate_function.__name__`` - :type custom_integrated_frames_dir_name: str or None - :param transform: a function/transform that takes in - a sample and returns a transformed version. - E.g, ``transforms.RandomCrop`` for images. - :type transform: callable - :param target_transform: a function/transform that takes - in the target and transforms it. 
- :type target_transform: callable - - If ``data_type == 'event'`` - the sample in this dataset is a dict whose keys are ``['t', 'x', 'y', 'p']`` and values are ``numpy.ndarray``. - - If ``data_type == 'frame'`` and ``frames_number`` is not ``None`` - events will be integrated to frames with fixed frames number. ``split_by`` will define how to split events. - See :class:`spikingjelly.datasets.cal_fixed_frames_number_segment_index` for - more details. - - If ``data_type == 'frame'``, ``frames_number`` is ``None``, and ``duration`` is not ``None`` - events will be integrated to frames with fixed time duration. - - If ``data_type == 'frame'``, ``frames_number`` is ``None``, ``duration`` is ``None``, and ``custom_integrate_function`` is not ``None``: - events will be integrated by the user-defined function and saved to the ``custom_integrated_frames_dir_name`` directory in ``root`` directory. - Here is an example from SpikingJelly's tutorials: - - .. code-block:: python - - from spikingjelly.datasets.dvs128_gesture import DVS128Gesture - from typing import Dict - import numpy as np - import spikingjelly.datasets as sjds - def integrate_events_to_2_frames_randomly(events: Dict, H: int, W: int): - index_split = np.random.randint(low=0, high=events['t'].__len__()) - frames = np.zeros([2, 2, H, W]) - frames[0] = sjds.integrate_events_segment_to_frame(events, H, W, 0, index_split) - frames[1] = sjds.integrate_events_segment_to_frame(events, H, W, index_split, events['t'].__len__()) - return frames - - root_dir = 'D:/datasets/DVS128Gesture' - train_set = DVS128Gesture(root_dir, train=True, data_type='frame', custom_integrate_function=integrate_events_to_2_frames_randomly) - - from spikingjelly.datasets import play_frame - frame, label = train_set[500] - play_frame(frame) - ''' + """ + The ASL-DVS dataset, which is proposed by `Graph-based Object Classification for Neuromorphic Vision Sensing `_. + + Refer to :class:`spikingjelly.datasets.NeuromorphicDatasetFolder` for more details about params information. + """ super().__init__(root, None, data_type, frames_number, split_by, duration, custom_integrate_function, custom_integrated_frames_dir_name, transform, target_transform) @staticmethod @@ -179,7 +120,7 @@ class ASLDVS(sjds.NeuromorphicDatasetFolder): This function defines how to convert the origin binary data in ``extract_root`` to ``npz`` format and save converted files in ``events_np_root``. ''' t_ckp = time.time() - with ThreadPoolExecutor(max_workers=min(multiprocessing.cpu_count(), max_threads_number_for_datasets_preprocess)) as tpe: + with ThreadPoolExecutor(max_workers=min(multiprocessing.cpu_count(), configure.max_threads_number_for_datasets_preprocess)) as tpe: for class_name in os.listdir(extract_root): mat_dir = os.path.join(extract_root, class_name) np_dir = os.path.join(events_np_root, class_name) diff --git a/spikingjelly/datasets/cifar10_dvs.py b/spikingjelly/datasets/cifar10_dvs.py index 7e67140..ceaaf0c 100644 --- a/spikingjelly/datasets/cifar10_dvs.py +++ b/spikingjelly/datasets/cifar10_dvs.py @@ -1,13 +1,13 @@ -from typing import Any, Callable, cast, Dict, List, Optional, Tuple +from typing import Callable, Dict, Optional, Tuple import numpy as np -import spikingjelly.datasets as sjds +from .. 
import datasets as sjds from torchvision.datasets.utils import extract_archive import os import multiprocessing from concurrent.futures import ThreadPoolExecutor import time -from ..configure import max_threads_number_for_datasets_preprocess -from spikingjelly.datasets import np_savez +from .. import configure +from ..datasets import np_savez # https://github.com/jackd/events-tfds/blob/master/events_tfds/data_io/aedat.py @@ -119,69 +119,12 @@ class CIFAR10DVS(sjds.NeuromorphicDatasetFolder): transform: Optional[Callable] = None, target_transform: Optional[Callable] = None, ) -> None: - ''' - :param root: root path of the dataset - :type root: str - :param data_type: `event` or `frame` - :type data_type: str - :param frames_number: the integrated frame number - :type frames_number: int - :param split_by: `time` or `number` - :type split_by: str - :param duration: the time duration of each frame - :type duration: int - :param custom_integrate_function: a user-defined function that inputs are ``events, H, W``. - ``events`` is a dict whose keys are ``['t', 'x', 'y', 'p']`` and values are ``numpy.ndarray`` - ``H`` is the height of the data and ``W`` is the weight of the data. - For example, H=128 and W=128 for the DVS128 Gesture dataset. - The user should define how to integrate events to frames, and return frames. - :type custom_integrate_function: Callable - :param custom_integrated_frames_dir_name: The name of directory for saving frames integrating by ``custom_integrate_function``. - If ``custom_integrated_frames_dir_name`` is ``None``, it will be set to ``custom_integrate_function.__name__`` - :type custom_integrated_frames_dir_name: str or None - :param transform: a function/transform that takes in - a sample and returns a transformed version. - E.g, ``transforms.RandomCrop`` for images. - :type transform: callable - :param target_transform: a function/transform that takes - in the target and transforms it. - :type target_transform: callable - - If ``data_type == 'event'`` - the sample in this dataset is a dict whose keys are ``['t', 'x', 'y', 'p']`` and values are ``numpy.ndarray``. - - If ``data_type == 'frame'`` and ``frames_number`` is not ``None`` - events will be integrated to frames with fixed frames number. ``split_by`` will define how to split events. - See :class:`spikingjelly.datasets.cal_fixed_frames_number_segment_index` for - more details. - - If ``data_type == 'frame'``, ``frames_number`` is ``None``, and ``duration`` is not ``None`` - events will be integrated to frames with fixed time duration. - - If ``data_type == 'frame'``, ``frames_number`` is ``None``, ``duration`` is ``None``, and ``custom_integrate_function`` is not ``None``: - events will be integrated by the user-defined function and saved to the ``custom_integrated_frames_dir_name`` directory in ``root`` directory. - Here is an example from SpikingJelly's tutorials: + """ + The CIFAR10-DVS dataset, which is proposed by `CIFAR10-DVS: An Event-Stream Dataset for Object Classification + `_. - .. 
code-block:: python - - from spikingjelly.datasets.dvs128_gesture import DVS128Gesture - from typing import Dict - import numpy as np - import spikingjelly.datasets as sjds - def integrate_events_to_2_frames_randomly(events: Dict, H: int, W: int): - index_split = np.random.randint(low=0, high=events['t'].__len__()) - frames = np.zeros([2, 2, H, W]) - frames[0] = sjds.integrate_events_segment_to_frame(events, H, W, 0, index_split) - frames[1] = sjds.integrate_events_segment_to_frame(events, H, W, index_split, events['t'].__len__()) - return frames - - root_dir = 'D:/datasets/DVS128Gesture' - train_set = DVS128Gesture(root_dir, train=True, data_type='frame', custom_integrate_function=integrate_events_to_2_frames_randomly) - - from spikingjelly.datasets import play_frame - frame, label = train_set[500] - play_frame(frame) - ''' + Refer to :class:`spikingjelly.datasets.NeuromorphicDatasetFolder` for more details about params information. + """ super().__init__(root, None, data_type, frames_number, split_by, duration, custom_integrate_function, custom_integrated_frames_dir_name, transform, target_transform) @staticmethod @@ -283,7 +226,7 @@ class CIFAR10DVS(sjds.NeuromorphicDatasetFolder): This function defines how to convert the origin binary data in ``extract_root`` to ``npz`` format and save converted files in ``events_np_root``. ''' t_ckp = time.time() - with ThreadPoolExecutor(max_workers=min(multiprocessing.cpu_count(), max_threads_number_for_datasets_preprocess)) as tpe: + with ThreadPoolExecutor(max_workers=min(multiprocessing.cpu_count(), configure.max_threads_number_for_datasets_preprocess)) as tpe: for class_name in os.listdir(extract_root): aedat_dir = os.path.join(extract_root, class_name) np_dir = os.path.join(events_np_root, class_name) diff --git a/spikingjelly/datasets/dvs128_gesture.py b/spikingjelly/datasets/dvs128_gesture.py index 0f1180a..4ed764f 100644 --- a/spikingjelly/datasets/dvs128_gesture.py +++ b/spikingjelly/datasets/dvs128_gesture.py @@ -1,13 +1,13 @@ -from typing import Any, Callable, cast, Dict, List, Optional, Tuple +from typing import Callable, Dict, Optional, Tuple import numpy as np -import spikingjelly.datasets as sjds +from .. import datasets as sjds from torchvision.datasets.utils import extract_archive import os import multiprocessing from concurrent.futures import ThreadPoolExecutor import time -from ..configure import max_threads_number_for_datasets_preprocess -from spikingjelly.datasets import np_savez +from .. import configure +from ..datasets import np_savez class DVS128Gesture(sjds.NeuromorphicDatasetFolder): def __init__( @@ -23,71 +23,11 @@ class DVS128Gesture(sjds.NeuromorphicDatasetFolder): transform: Optional[Callable] = None, target_transform: Optional[Callable] = None, ) -> None: - ''' - :param root: root path of the dataset - :type root: str - :param train: whether use the train set - :type train: bool - :param data_type: `event` or `frame` - :type data_type: str - :param frames_number: the integrated frame number - :type frames_number: int - :param split_by: `time` or `number` - :type split_by: str - :param duration: the time duration of each frame - :type duration: int - :param custom_integrate_function: a user-defined function that inputs are ``events, H, W``. - ``events`` is a dict whose keys are ``['t', 'x', 'y', 'p']`` and values are ``numpy.ndarray`` - ``H`` is the height of the data and ``W`` is the weight of the data. - For example, H=128 and W=128 for the DVS128 Gesture dataset. 
- The user should define how to integrate events to frames, and return frames. - :type custom_integrate_function: Callable - :param custom_integrated_frames_dir_name: The name of directory for saving frames integrating by ``custom_integrate_function``. - If ``custom_integrated_frames_dir_name`` is ``None``, it will be set to ``custom_integrate_function.__name__`` - :type custom_integrated_frames_dir_name: str or None - :param transform: a function/transform that takes in - a sample and returns a transformed version. - E.g, ``transforms.RandomCrop`` for images. - :type transform: callable - :param target_transform: a function/transform that takes - in the target and transforms it. - :type target_transform: callable - - If ``data_type == 'event'`` - the sample in this dataset is a dict whose keys are ``['t', 'x', 'y', 'p']`` and values are ``numpy.ndarray``. - - If ``data_type == 'frame'`` and ``frames_number`` is not ``None`` - events will be integrated to frames with fixed frames number. ``split_by`` will define how to split events. - See :class:`spikingjelly.datasets.cal_fixed_frames_number_segment_index` for - more details. - - If ``data_type == 'frame'``, ``frames_number`` is ``None``, and ``duration`` is not ``None`` - events will be integrated to frames with fixed time duration. - - If ``data_type == 'frame'``, ``frames_number`` is ``None``, ``duration`` is ``None``, and ``custom_integrate_function`` is not ``None``: - events will be integrated by the user-defined function and saved to the ``custom_integrated_frames_dir_name`` directory in ``root`` directory. - Here is an example from SpikingJelly's tutorials: + """ + The DVS128 Gesture dataset, which is proposed by `A Low Power, Fully Event-Based Gesture Recognition System `_. - .. code-block:: python - - from spikingjelly.datasets.dvs128_gesture import DVS128Gesture - from typing import Dict - import numpy as np - import spikingjelly.datasets as sjds - def integrate_events_to_2_frames_randomly(events: Dict, H: int, W: int): - index_split = np.random.randint(low=0, high=events['t'].__len__()) - frames = np.zeros([2, 2, H, W]) - frames[0] = sjds.integrate_events_segment_to_frame(events, H, W, 0, index_split) - frames[1] = sjds.integrate_events_segment_to_frame(events, H, W, index_split, events['t'].__len__()) - return frames - - root_dir = 'D:/datasets/DVS128Gesture' - train_set = DVS128Gesture(root_dir, train=True, data_type='frame', custom_integrate_function=integrate_events_to_2_frames_randomly) - - from spikingjelly.datasets import play_frame - frame, label = train_set[500] - play_frame(frame) - ''' + Refer to :class:`spikingjelly.datasets.NeuromorphicDatasetFolder` for more details about params information. 
+ """ assert train is not None super().__init__(root, train, data_type, frames_number, split_by, duration, custom_integrate_function, custom_integrated_frames_dir_name, transform, target_transform) @staticmethod @@ -234,7 +174,7 @@ class DVS128Gesture(sjds.NeuromorphicDatasetFolder): os.path.join(aedat_dir, 'trials_to_test.txt')) as trials_to_test_txt: # use multi-thread to accelerate t_ckp = time.time() - with ThreadPoolExecutor(max_workers=min(multiprocessing.cpu_count(), max_threads_number_for_datasets_preprocess)) as tpe: + with ThreadPoolExecutor(max_workers=min(multiprocessing.cpu_count(), configure.max_threads_number_for_datasets_preprocess)) as tpe: print(f'Start the ThreadPoolExecutor with max workers = [{tpe._max_workers}].') for fname in trials_to_train_txt.readlines(): diff --git a/spikingjelly/datasets/es_imagenet.py b/spikingjelly/datasets/es_imagenet.py new file mode 100644 index 0000000..f31631b --- /dev/null +++ b/spikingjelly/datasets/es_imagenet.py @@ -0,0 +1,217 @@ +from typing import Callable, Dict, Optional, Tuple +import numpy as np +from .. import datasets as sjds +import os +import rarfile +import time + + +def load_events(fname: str): + events = np.load(fname) + e_pos = events['pos'] + e_neg = events['neg'] + e_pos = np.hstack((e_pos, np.ones((e_pos.shape[0], 1)))) + e_neg = np.hstack((e_neg, np.zeros((e_neg.shape[0], 1)))) + events = np.vstack((e_pos, e_neg)) # shape = [N, 4], N * (x, y, t, p) + idx = np.argsort(events[:, 2]) + events = events[idx] + return { + 'x': events[:, 1], + 'y': events[:, 0], + 't': events[:, 2], + 'p': events[:, 3] + } + + +class ESImageNet(sjds.NeuromorphicDatasetFolder): + def __init__( + self, + root: str, + train: bool = None, + data_type: str = 'event', + frames_number: int = None, + split_by: str = None, + duration: int = None, + custom_integrate_function: Callable = None, + custom_integrated_frames_dir_name: str = None, + transform: Optional[Callable] = None, + target_transform: Optional[Callable] = None, + ) -> None: + """ + The ES-ImageNet dataset, which is proposed by `ES-ImageNet: A Million Event-Stream Classification Dataset for Spiking Neural Networks `_. + + Refer to :class:`spikingjelly.datasets.NeuromorphicDatasetFolder` for more details about params information. 
+ """ + assert train is not None + super().__init__(root, train, data_type, frames_number, split_by, duration, custom_integrate_function, custom_integrated_frames_dir_name, transform, target_transform) + + if data_type == 'event': + self.loader = load_events + + @staticmethod + def load_events_np(fname: str): + return load_events(fname) + + @staticmethod + def resource_url_md5() -> list: + ''' + :return: A list ``url`` that ``url[i]`` is a tuple, which contains the i-th file's name, download link, and MD5 + :rtype: list + ''' + urls = [ + ('ES-imagenet-0.18.part01.rar', + 'https://cloud.tsinghua.edu.cn/d/94873ab4ec2a4eb497b3/files/?p=%2FES-imagenet-0.18.part01.rar&dl=1', + '900bdd57b5641f7d81cd4620283fef76'), + ('ES-imagenet-0.18.part02.rar', + 'https://cloud.tsinghua.edu.cn/d/94873ab4ec2a4eb497b3/files/?p=%2FES-imagenet-0.18.part02.rar&dl=1', + '5982532009e863a8f4e18e793314c54b'), + ('ES-imagenet-0.18.part03.rar', + 'https://cloud.tsinghua.edu.cn/d/94873ab4ec2a4eb497b3/files/?p=%2FES-imagenet-0.18.part03.rar&dl=1', + '8f408c1f5a1d4604e48d0d062a8289a0'), + ('ES-imagenet-0.18.part04.rar', + 'https://cloud.tsinghua.edu.cn/d/94873ab4ec2a4eb497b3/files/?p=%2FES-imagenet-0.18.part04.rar&dl=1', + '5c5b5cf0a55954eb639964e3da510097'), + ('ES-imagenet-0.18.part05.rar', + 'https://cloud.tsinghua.edu.cn/d/94873ab4ec2a4eb497b3/files/?p=%2FES-imagenet-0.18.part05.rar&dl=1', + '51feb661b4c9fa87860b63e76b914673'), + ('ES-imagenet-0.18.part06.rar', + 'https://cloud.tsinghua.edu.cn/d/94873ab4ec2a4eb497b3/files/?p=%2FES-imagenet-0.18.part06.rar&dl=1', + 'fcd007a2b17b7c13f338734c53f6db31'), + ('ES-imagenet-0.18.part07.rar', + 'https://cloud.tsinghua.edu.cn/d/94873ab4ec2a4eb497b3/files/?p=%2FES-imagenet-0.18.part07.rar&dl=1', + 'd3e74b96d9c5df15714bbc3abcd329fc'), + ('ES-imagenet-0.18.part08.rar', + 'https://cloud.tsinghua.edu.cn/d/94873ab4ec2a4eb497b3/files/?p=%2FES-imagenet-0.18.part08.rar&dl=1', + '65b9cf7fa63e18d2e7d92ff45a42a5e5'), + ('ES-imagenet-0.18.part09.rar', + 'https://cloud.tsinghua.edu.cn/d/94873ab4ec2a4eb497b3/files/?p=%2FES-imagenet-0.18.part09.rar&dl=1', + '241c9a37a83ff9efd305fe46d012211e'), + ('ES-imagenet-0.18.part10.rar', + 'https://cloud.tsinghua.edu.cn/d/94873ab4ec2a4eb497b3/files/?p=%2FES-imagenet-0.18.part10.rar&dl=1', + 'ceee96971008e30d0cdc34086c49fd75'), + ('ES-imagenet-0.18.part11.rar', + 'https://cloud.tsinghua.edu.cn/d/94873ab4ec2a4eb497b3/files/?p=%2FES-imagenet-0.18.part11.rar&dl=1', + '4fbfefbe6e48758fbb72427c81f119cf'), + ('ES-imagenet-0.18.part12.rar', + 'https://cloud.tsinghua.edu.cn/d/94873ab4ec2a4eb497b3/files/?p=%2FES-imagenet-0.18.part12.rar&dl=1', + 'c8cc163be4e5f6451201dccbded4ec24'), + ('ES-imagenet-0.18.part13.rar', + 'https://cloud.tsinghua.edu.cn/d/94873ab4ec2a4eb497b3/files/?p=%2FES-imagenet-0.18.part13.rar&dl=1', + '08c9dff32f6b42c49ef7cd78e37c728e'), + ('ES-imagenet-0.18.part14.rar', + 'https://cloud.tsinghua.edu.cn/d/94873ab4ec2a4eb497b3/files/?p=%2FES-imagenet-0.18.part14.rar&dl=1', + '43aa157dc5bd5fcea81315a46e0322cf'), + ('ES-imagenet-0.18.part15.rar', + 'https://cloud.tsinghua.edu.cn/d/94873ab4ec2a4eb497b3/files/?p=%2FES-imagenet-0.18.part15.rar&dl=1', + '480a69b050f465ef01efcc44ae29f7df'), + ('ES-imagenet-0.18.part16.rar', + 'https://cloud.tsinghua.edu.cn/d/94873ab4ec2a4eb497b3/files/?p=%2FES-imagenet-0.18.part16.rar&dl=1', + '11abd24d92b93e7f85acd63abd4a18ab'), + ('ES-imagenet-0.18.part17.rar', + 'https://cloud.tsinghua.edu.cn/d/94873ab4ec2a4eb497b3/files/?p=%2FES-imagenet-0.18.part17.rar&dl=1', + '3891486a6862c63a325c5f16cd01fdd1'), + 
('ES-imagenet-0.18.part18.rar', + 'https://cloud.tsinghua.edu.cn/d/94873ab4ec2a4eb497b3/files/?p=%2FES-imagenet-0.18.part18.rar&dl=1', + 'cf8bb0525b514f411bca9d7c2d681f7c'), + ('ES-imagenet-0.18.part19.rar', + 'https://cloud.tsinghua.edu.cn/d/94873ab4ec2a4eb497b3/files/?p=%2FES-imagenet-0.18.part19.rar&dl=1', + '3766bc35572ccacc03f0f293c571d0ae'), + ('ES-imagenet-0.18.part20.rar', + 'https://cloud.tsinghua.edu.cn/d/94873ab4ec2a4eb497b3/files/?p=%2FES-imagenet-0.18.part20.rar&dl=1', + 'bf73a5e338644122220e41da7b5630e6'), + ('ES-imagenet-0.18.part21.rar', + 'https://cloud.tsinghua.edu.cn/d/94873ab4ec2a4eb497b3/files/?p=%2FES-imagenet-0.18.part21.rar&dl=1', + '564de4a2609cbb0bb67ffa1bc51f2487'), + ('ES-imagenet-0.18.part22.rar', + 'https://cloud.tsinghua.edu.cn/d/94873ab4ec2a4eb497b3/files/?p=%2FES-imagenet-0.18.part22.rar&dl=1', + '60a9e52db1acadfccc9a9809073f0b04'), + ('ES-imagenet-0.18.part23.rar', + 'https://cloud.tsinghua.edu.cn/d/94873ab4ec2a4eb497b3/files/?p=%2FES-imagenet-0.18.part23.rar&dl=1', + '373b5484826d40d7ec35f0e1605cb6ea'), + ('ES-imagenet-0.18.part24.rar', + 'https://cloud.tsinghua.edu.cn/d/94873ab4ec2a4eb497b3/files/?p=%2FES-imagenet-0.18.part24.rar&dl=1', + 'a50612e889b20f99cc7b2725dfd72e9e'), + ('ES-imagenet-0.18.part25.rar', + 'https://cloud.tsinghua.edu.cn/d/94873ab4ec2a4eb497b3/files/?p=%2FES-imagenet-0.18.part25.rar&dl=1', + '0802ccdeb0cff29237faf55164524101') + ] + + return urls + + + @staticmethod + def downloadable() -> bool: + ''' + :return: Whether the dataset can be directly downloaded by python codes. If not, the user have to download it manually + :rtype: bool + ''' + return True + + @staticmethod + def extract_downloaded_files(download_root: str, extract_root: str): + ''' + :param download_root: Root directory path which saves downloaded dataset files + :type download_root: str + :param extract_root: Root directory path which saves extracted files from downloaded files + :type extract_root: str + :return: None + + This function defines how to extract download files. + ''' + rar_file = os.path.join(download_root, 'ES-imagenet-0.18.part01.rar') + print(f'Extract [{rar_file}] to [{extract_root}].') + rar_file = rarfile.RarFile(rar_file) + rar_file.extractall(extract_root) + rar_file.close() + + + + @staticmethod + def create_events_np_files(extract_root: str, events_np_root: str): + ''' + :param extract_root: Root directory path which saves extracted files from downloaded files + :type extract_root: str + :param events_np_root: Root directory path which saves events files in the ``npz`` format + :type events_np_root: + :return: None + + This function defines how to convert the origin binary data in ``extract_root`` to ``npz`` format and save converted files in ``events_np_root``. 
+ '''
+ t_ckp = time.time()
+ train_dir = os.path.join(events_np_root, 'train')
+ os.mkdir(train_dir)
+ print(f'Mkdir [{train_dir}].')
+ sjds.create_same_directory_structure(os.path.join(extract_root, 'ES-imagenet-0.18/train'), train_dir)
+ for class_dir in os.listdir(os.path.join(extract_root, 'ES-imagenet-0.18/train')):
+ source_dir = os.path.join(extract_root, 'ES-imagenet-0.18/train', class_dir)
+ target_dir = os.path.join(train_dir, class_dir)
+ print(f'Create soft links from [{source_dir}] to [{target_dir}].')
+ for class_sample in os.listdir(source_dir):
+ os.symlink(os.path.join(source_dir, class_sample),
+ os.path.join(target_dir, class_sample))
+
+
+
+
+ val_label = np.loadtxt(os.path.join(extract_root, 'ES-imagenet-0.18/vallabel.txt'), delimiter=' ', usecols=(1, ), dtype=int)
+ val_fname = np.loadtxt(os.path.join(extract_root, 'ES-imagenet-0.18/vallabel.txt'), delimiter=' ', usecols=(0, ), dtype=str)
+ source_dir = os.path.join(extract_root, 'ES-imagenet-0.18/val')
+ target_dir = os.path.join(events_np_root, 'test')
+ os.mkdir(target_dir)
+ print(f'Mkdir [{target_dir}].')
+ sjds.create_same_directory_structure(train_dir, target_dir)
+
+ for i in range(val_fname.__len__()):
+ os.symlink(os.path.join(source_dir, val_fname[i]), os.path.join(target_dir, f'class{val_label[i]}/{val_fname[i]}'))
+
+ print(f'Used time = [{round(time.time() - t_ckp, 2)}s].')
+ print(f'Note that files in [{events_np_root}] are soft links whose source files are in [{extract_root}]. If you want to use events, do not delete [{extract_root}].')
+
+ @staticmethod
+ def get_H_W() -> Tuple:
+ '''
+ :return: A tuple ``(H, W)``, where ``H`` is the height of the data and ``W`` is the width of the data.
+ For example, this function returns ``(128, 128)`` for the DVS128 Gesture dataset.
+ :rtype: tuple
+ '''
+ return 256, 256
\ No newline at end of file
diff --git a/spikingjelly/datasets/n_caltech101.py b/spikingjelly/datasets/n_caltech101.py
index e1823ce..7ce037f 100644
--- a/spikingjelly/datasets/n_caltech101.py
+++ b/spikingjelly/datasets/n_caltech101.py
@@ -1,13 +1,12 @@
-from typing import Any, Callable, cast, Dict, List, Optional, Tuple
-import numpy as np
-import spikingjelly.datasets as sjds
+from typing import Callable, Dict, Optional, Tuple
+from .. import datasets as sjds
 from torchvision.datasets.utils import extract_archive
 import os
 import multiprocessing
 from concurrent.futures import ThreadPoolExecutor
 import time
-from ..configure import max_threads_number_for_datasets_preprocess
-from spikingjelly.datasets import np_savez
+from .. import configure
+from ..datasets import np_savez
 class NCaltech101(sjds.NeuromorphicDatasetFolder):
 def __init__(
@@ -22,69 +21,11 @@ class NCaltech101(sjds.NeuromorphicDatasetFolder):
 transform: Optional[Callable] = None,
 target_transform: Optional[Callable] = None,
 ) -> None:
- '''
- :param root: root path of the dataset
- :type root: str
- :param data_type: `event` or `frame`
- :type data_type: str
- :param frames_number: the integrated frame number
- :type frames_number: int
- :param split_by: `time` or `number`
- :type split_by: str
- :param duration: the time duration of each frame
- :type duration: int
- :param custom_integrate_function: a user-defined function that inputs are ``events, H, W``.
- ``events`` is a dict whose keys are ``['t', 'x', 'y', 'p']`` and values are ``numpy.ndarray``
- ``H`` is the height of the data and ``W`` is the weight of the data.
- For example, H=128 and W=128 for the DVS128 Gesture dataset.
- The user should define how to integrate events to frames, and return frames.
- :type custom_integrate_function: Callable
- :param custom_integrated_frames_dir_name: The name of directory for saving frames integrating by ``custom_integrate_function``.
- If ``custom_integrated_frames_dir_name`` is ``None``, it will be set to ``custom_integrate_function.__name__``
- :type custom_integrated_frames_dir_name: str or None
- :param transform: a function/transform that takes in
- a sample and returns a transformed version.
- E.g, ``transforms.RandomCrop`` for images.
- :type transform: callable
- :param target_transform: a function/transform that takes
- in the target and transforms it.
- :type target_transform: callable
-
- If ``data_type == 'event'``
- the sample in this dataset is a dict whose keys are ``['t', 'x', 'y', 'p']`` and values are ``numpy.ndarray``.
-
- If ``data_type == 'frame'`` and ``frames_number`` is not ``None``
- events will be integrated to frames with fixed frames number. ``split_by`` will define how to split events.
- See :class:`spikingjelly.datasets.cal_fixed_frames_number_segment_index` for
- more details.
-
- If ``data_type == 'frame'``, ``frames_number`` is ``None``, and ``duration`` is not ``None``
- events will be integrated to frames with fixed time duration.
-
- If ``data_type == 'frame'``, ``frames_number`` is ``None``, ``duration`` is ``None``, and ``custom_integrate_function`` is not ``None``:
- events will be integrated by the user-defined function and saved to the ``custom_integrated_frames_dir_name`` directory in ``root`` directory.
- Here is an example from SpikingJelly's tutorials:
-
- .. code-block:: python
-
- from spikingjelly.datasets.dvs128_gesture import DVS128Gesture
- from typing import Dict
- import numpy as np
- import spikingjelly.datasets as sjds
- def integrate_events_to_2_frames_randomly(events: Dict, H: int, W: int):
- index_split = np.random.randint(low=0, high=events['t'].__len__())
- frames = np.zeros([2, 2, H, W])
- frames[0] = sjds.integrate_events_segment_to_frame(events, H, W, 0, index_split)
- frames[1] = sjds.integrate_events_segment_to_frame(events, H, W, index_split, events['t'].__len__())
- return frames
-
- root_dir = 'D:/datasets/DVS128Gesture'
- train_set = DVS128Gesture(root_dir, train=True, data_type='frame', custom_integrate_function=integrate_events_to_2_frames_randomly)
-
- from spikingjelly.datasets import play_frame
- frame, label = train_set[500]
- play_frame(frame)
- '''
+ """
+ The N-Caltech101 dataset, which is proposed by `Converting Static Image Datasets to Spiking Neuromorphic Datasets Using Saccades <https://www.frontiersin.org/articles/10.3389/fnins.2015.00437/full>`_.
+
+ Refer to :class:`spikingjelly.datasets.NeuromorphicDatasetFolder` for more details about the parameters.
+ """ super().__init__(root, None, data_type, frames_number, split_by, duration, custom_integrate_function, custom_integrated_frames_dir_name, transform, target_transform) @staticmethod def resource_url_md5() -> list: @@ -171,7 +112,7 @@ class NCaltech101(sjds.NeuromorphicDatasetFolder): ''' t_ckp = time.time() extract_root = os.path.join(extract_root, 'Caltech101') - with ThreadPoolExecutor(max_workers=min(multiprocessing.cpu_count(), max_threads_number_for_datasets_preprocess)) as tpe: + with ThreadPoolExecutor(max_workers=min(multiprocessing.cpu_count(), configure.max_threads_number_for_datasets_preprocess)) as tpe: # too many threads will make the disk overload for class_name in os.listdir(extract_root): bin_dir = os.path.join(extract_root, class_name) diff --git a/spikingjelly/datasets/n_mnist.py b/spikingjelly/datasets/n_mnist.py index f17d64b..2b16356 100644 --- a/spikingjelly/datasets/n_mnist.py +++ b/spikingjelly/datasets/n_mnist.py @@ -1,13 +1,12 @@ -from typing import Any, Callable, cast, Dict, List, Optional, Tuple -import numpy as np -import spikingjelly.datasets as sjds +from typing import Callable, Dict, Optional, Tuple +from .. import datasets as sjds from torchvision.datasets.utils import extract_archive import os import multiprocessing from concurrent.futures import ThreadPoolExecutor import time -from ..configure import max_threads_number_for_datasets_preprocess -from spikingjelly.datasets import np_savez +from .. import configure +from ..datasets import np_savez class NMNIST(sjds.NeuromorphicDatasetFolder): def __init__( @@ -23,72 +22,11 @@ class NMNIST(sjds.NeuromorphicDatasetFolder): transform: Optional[Callable] = None, target_transform: Optional[Callable] = None, ) -> None: - ''' - :param root: root path of the dataset - :type root: str - :param train: whether use the train set - :type train: bool - :param data_type: `event` or `frame` - :type data_type: str - :param frames_number: the integrated frame number - :type frames_number: int - :param split_by: `time` or `number` - :type split_by: str - :param duration: the time duration of each frame - :type duration: int - :param custom_integrate_function: a user-defined function that inputs are ``events, H, W``. - ``events`` is a dict whose keys are ``['t', 'x', 'y', 'p']`` and values are ``numpy.ndarray`` - ``H`` is the height of the data and ``W`` is the weight of the data. - For example, H=128 and W=128 for the DVS128 Gesture dataset. - The user should define how to integrate events to frames, and return frames. - :type custom_integrate_function: Callable - :param custom_integrated_frames_dir_name: The name of directory for saving frames integrating by ``custom_integrate_function``. - If ``custom_integrated_frames_dir_name`` is ``None``, it will be set to ``custom_integrate_function.__name__`` - :type custom_integrated_frames_dir_name: str or None - :param transform: a function/transform that takes in - a sample and returns a transformed version. - E.g, ``transforms.RandomCrop`` for images. - :type transform: callable - :param target_transform: a function/transform that takes - in the target and transforms it. - :type target_transform: callable - - If ``data_type == 'event'`` - the sample in this dataset is a dict whose keys are ``['t', 'x', 'y', 'p']`` and values are ``numpy.ndarray``. - - If ``data_type == 'frame'`` and ``frames_number`` is not ``None`` - events will be integrated to frames with fixed frames number. ``split_by`` will define how to split events. 
- See :class:`spikingjelly.datasets.cal_fixed_frames_number_segment_index` for
- more details.
-
- If ``data_type == 'frame'``, ``frames_number`` is ``None``, and ``duration`` is not ``None``
- events will be integrated to frames with fixed time duration.
-
- If ``data_type == 'frame'``, ``frames_number`` is ``None``, ``duration`` is ``None``, and ``custom_integrate_function`` is not ``None``:
- events will be integrated by the user-defined function and saved to the ``custom_integrated_frames_dir_name`` directory in ``root`` directory.
- Here is an example from SpikingJelly's tutorials:
-
- .. code-block:: python
-
- from spikingjelly.datasets.dvs128_gesture import DVS128Gesture
- from typing import Dict
- import numpy as np
- import spikingjelly.datasets as sjds
- def integrate_events_to_2_frames_randomly(events: Dict, H: int, W: int):
- index_split = np.random.randint(low=0, high=events['t'].__len__())
- frames = np.zeros([2, 2, H, W])
- frames[0] = sjds.integrate_events_segment_to_frame(events, H, W, 0, index_split)
- frames[1] = sjds.integrate_events_segment_to_frame(events, H, W, index_split, events['t'].__len__())
- return frames
-
- root_dir = 'D:/datasets/DVS128Gesture'
- train_set = DVS128Gesture(root_dir, train=True, data_type='frame', custom_integrate_function=integrate_events_to_2_frames_randomly)
-
- from spikingjelly.datasets import play_frame
- frame, label = train_set[500]
- play_frame(frame)
+ """
+ The N-MNIST dataset, which is proposed by `Converting Static Image Datasets to Spiking Neuromorphic Datasets Using Saccades <https://www.frontiersin.org/articles/10.3389/fnins.2015.00437/full>`_.
- '''
+ Refer to :class:`spikingjelly.datasets.NeuromorphicDatasetFolder` for more details about the parameters.
+ """
 assert train is not None
 super().__init__(root, train, data_type, frames_number, split_by, duration, custom_integrate_function, custom_integrated_frames_dir_name, transform, target_transform)
 @staticmethod
@@ -175,7 +113,7 @@ class NMNIST(sjds.NeuromorphicDatasetFolder):
 This function defines how to convert the origin binary data in ``extract_root`` to ``npz`` format and save converted files in ``events_np_root``.
 '''
 t_ckp = time.time()
- with ThreadPoolExecutor(max_workers=min(multiprocessing.cpu_count(), max_threads_number_for_datasets_preprocess)) as tpe:
+ with ThreadPoolExecutor(max_workers=min(multiprocessing.cpu_count(), configure.max_threads_number_for_datasets_preprocess)) as tpe:
 # too many threads will make the disk overload
 for train_test_dir in ['Train', 'Test']:
 source_dir = os.path.join(extract_root, train_test_dir)
diff --git a/spikingjelly/datasets/nav_gesture.py b/spikingjelly/datasets/nav_gesture.py
new file mode 100644
index 0000000..b950f61
--- /dev/null
+++ b/spikingjelly/datasets/nav_gesture.py
@@ -0,0 +1,331 @@
+
+# Codes from the source dataset:
+# ---------------------------------------------------------------------------------------------
+#!/usr/bin/python
+# -*- coding: utf8 -*
+#####################
+# read_td_events.py #
+#####################
+# Feb 2017 - Jean-Matthieu Maro
+# Email: jean-matthieu dot maro, hosted at inserm, which is located in FRance.
+# Thanks to Germain Haessig and Laurent Dardelet.
+ +from struct import unpack, pack +import numpy as np +import sys + + +def peek(f, length=1): + pos = f.tell() + data = f.read(length) + f.seek(pos) + return data + +def readATIS_tddat(file_name, orig_at_zero = True, drop_negative_dt = True, verbose = True, events_restriction = [0, np.inf]): + + """ + reads ATIS td events in .dat format + + input: + filename: string, path to the .dat file + orig_at_zero: bool, if True, timestamps will start at 0 + drop_negative_dt: bool, if True, events with a timestamp greater than the previous event are dismissed + verbose: bool, if True, verbose mode. + events_restriction: list [min ts, max ts], will return only events with ts in the defined boundaries + + output: + timestamps: numpy array of length (number of events), timestamps + coords: numpy array of size (number of events, 2), spatial coordinates: col 0 is x, col 1 is y. + polarities: numpy array of length (number of events), polarities + removed_events: integer, number of removed events (negative delta-ts) + + """ + + polmask = 0x0002000000000000 + xmask = 0x000001FF00000000 + ymask = 0x0001FE0000000000 + polpadding = 49 + ypadding = 41 + xpadding = 32 + + # This one read _td.dat files generated by kAER + if verbose: + print('Reading _td dat file... (' + file_name + ')') + file = open(file_name,'rb') + + header = False + while peek(file) == b'%': + file.readline() + header = True + if header: + ev_type = unpack('B',file.read(1))[0] + ev_size = unpack('B',file.read(1))[0] + if verbose: + print('> Header exists. Event type is ' + str(ev_type) + ', event size is ' + str(ev_size)) + if ev_size != 8: + print('Wrong event size. Aborting.') + return -1, -1, -1, -1 + else: # set default ev type and size + if verbose: + print('> No header. Setting default event type and size.') + ev_size = 8 + ev_type = 0 + + # Compute number of events in the file + start = file.tell() + file.seek(0,2) + stop = file.tell() + file.seek(start) + + Nevents = int( (stop-start)/ev_size ) + dNEvents = Nevents/100 + if verbose: + print("> The file contains %d events." 
%Nevents)
+
+ # store read data
+ timestamps = np.zeros(Nevents, dtype = int)
+ polarities = np.zeros(Nevents, dtype = int)
+ coords = np.zeros((Nevents, 2), dtype = int)
+
+ ActualEvents = 0
+ for i in np.arange(0, int(Nevents)):
+
+ event = unpack('Q',file.read(8))
+ ts = event[0] & 0x00000000FFFFFFFF
+ # padding = event[0] & 0xFFFC000000000000
+ pol = (event[0] & polmask) >> polpadding
+ y = (event[0] & ymask) >> ypadding
+ x = (event[0] & xmask) >> xpadding
+ if i >= events_restriction[0] and ts>=timestamps[max(0,i-1)]:
+ ActualEvents += 1
+ timestamps[i] = ts
+ polarities[i] = pol
+ coords[i, 0] = x
+ coords[i, 1] = y
+
+ if verbose and i%dNEvents == 0:
+ sys.stdout.write("> "+str(i/dNEvents)+"% \r")
+ sys.stdout.flush()
+ if i > events_restriction[1]:
+ break
+ file.close()
+ if verbose:
+ print ("> After loading events, actually found {0} events.".format(ActualEvents))
+
+ timestamps = timestamps[:ActualEvents]
+ coords = coords[:ActualEvents, :]
+ polarities = polarities[:ActualEvents]
+
+ #check for negative timestamps
+ for ts in timestamps:
+ if ts < 0:
+ print('Found a negative timestamp.')
+
+ if orig_at_zero:
+ timestamps = timestamps - timestamps[0]
+
+ drop_sum = 0
+ if drop_negative_dt:
+ if verbose:
+ print('> Looking for negative dts...')
+ # first check if negative TS differences
+ just_dropped = True
+ nPasses = 0
+ while just_dropped:
+ nPasses += 1
+ index_neg = []
+ just_dropped = False
+ ii = 0
+ while ii < (timestamps.size - 1):
+ dt = timestamps[ii+1] - timestamps[ii]
+ if dt < 0: # then the timestamp at ii+1 is smaller than the one at ii
+ index_neg += [ii+1]
+ ii += 1
+ just_dropped = True
+ if verbose and ii%dNEvents == 0:
+ sys.stdout.write("> "+str(ii/dNEvents)+"% (pass "+str(nPasses)+") \r")
+ sys.stdout.flush()
+ ii += 1
+ if len(index_neg) > 0:
+ drop_sum += len(index_neg)
+ index_neg = np.array(index_neg)
+ timestamps = np.delete(timestamps, index_neg)
+ polarities = np.delete(polarities, index_neg)
+ coords = np.delete(coords, index_neg, axis = 0)
+ if verbose:
+ print('> Removed {0} events in {1} passes.'.format(drop_sum, nPasses))
+ removed_events = drop_sum
+ else:
+ removed_events = -1
+ if verbose:
+ print("> Sequence duration: {0:.2f}s, ts[0] = {1}, ts[{2}] = {3}.".format(float(timestamps[-1] - timestamps[0]) / 1e6, timestamps[0], len(timestamps)-1, timestamps[-1]))
+
+
+ return timestamps, coords, polarities, removed_events
+# ---------------------------------------------------------------------------------------------
+
+from typing import Callable, Dict, Optional, Tuple
+from .. import datasets as sjds
+from torchvision.datasets.utils import extract_archive
+import os
+import multiprocessing
+from concurrent.futures import ThreadPoolExecutor
+import shutil
+import time
+from .. import configure
+from ..datasets import np_savez
+
+
+
+class NAVGestureWalk(sjds.NeuromorphicDatasetFolder):
+ # 6 gestures: left, right, up, down, home, select.
+ # 10 subjects, holding the phone in one hand (selfie mode) while walking indoor and outdoor
+ def __init__(
+ self,
+ root: str,
+ data_type: str = 'event',
+ frames_number: int = None,
+ split_by: str = None,
+ duration: int = None,
+ custom_integrate_function: Callable = None,
+ custom_integrated_frames_dir_name: str = None,
+ transform: Optional[Callable] = None,
+ target_transform: Optional[Callable] = None,
+ ) -> None:
+ """
+ The Nav Gesture dataset, which is proposed by `Event-Based Gesture Recognition With Dynamic Background Suppression Using Smartphone Computational Capabilities <https://www.frontiersin.org/articles/10.3389/fnins.2020.00275/full>`_.
+
+ Refer to :class:`spikingjelly.datasets.NeuromorphicDatasetFolder` for more details about the parameters.
+ """
+ super().__init__(root, None, data_type, frames_number, split_by, duration, custom_integrate_function, custom_integrated_frames_dir_name, transform, target_transform)
+
+ @staticmethod
+ def resource_url_md5() -> list:
+ '''
+ :return: A list ``url`` that ``url[i]`` is a tuple, which contains the i-th file's name, download link, and MD5
+ :rtype: list
+ '''
+ return [('navgesture-walk.zip', 'https://www.neuromorphic-vision.com/public/downloads/navgesture/navgesture-walk.zip', '5d305266f13005401959e819abe206f0')]
+
+ @staticmethod
+ def downloadable() -> bool:
+ '''
+ :return: Whether the dataset can be directly downloaded by python codes. If not, the user has to download it manually
+ :rtype: bool
+ '''
+ return True
+
+ @staticmethod
+ def extract_downloaded_files(download_root: str, extract_root: str):
+ '''
+ :param download_root: Root directory path which saves downloaded dataset files
+ :type download_root: str
+ :param extract_root: Root directory path which saves extracted files from downloaded files
+ :type extract_root: str
+ :return: None
+
+ This function defines how to extract downloaded files.
+ '''
+ temp_ext_dir = os.path.join(download_root, 'temp_ext')
+ os.mkdir(temp_ext_dir)
+ print(f'Mkdir [{temp_ext_dir}].')
+ extract_archive(os.path.join(download_root, 'navgesture-walk.zip'), temp_ext_dir)
+ with ThreadPoolExecutor(max_workers=min(multiprocessing.cpu_count(), 4)) as tpe:
+ for zip_file in os.listdir(temp_ext_dir):
+ if os.path.splitext(zip_file)[1] == '.zip':
+ zip_file = os.path.join(temp_ext_dir, zip_file)
+ print(f'Extract [{zip_file}] to [{extract_root}].')
+ tpe.submit(extract_archive, zip_file, extract_root)
+
+ shutil.rmtree(temp_ext_dir)
+ print(f'Rmtree [{temp_ext_dir}].')
+
+ @staticmethod
+ def get_H_W() -> Tuple:
+ '''
+ :return: A tuple ``(H, W)``, where ``H`` is the height of the data and ``W`` is the width of the data.
+ For example, this function returns ``(128, 128)`` for the DVS128 Gesture dataset.
+ :rtype: tuple
+ '''
+ return 240, 304 # this camera is 240*320, but x.max() = 303. So, I set W = 304.
+
+ @staticmethod
+ def read_aedat_save_to_np(bin_file: str, np_file: str):
+ t, xy, p, _ = readATIS_tddat(bin_file, verbose=False)
+ x = xy[:, 0]
+ y = 239 - xy[:, 1]
+ np_savez(np_file,
+ t=t,
+ x=x,
+ y=y,
+ p=p
+ )
+ print(f'Save [{bin_file}] to [{np_file}].')
+
+ @staticmethod
+ def create_events_np_files(extract_root: str, events_np_root: str):
+ '''
+ :param extract_root: Root directory path which saves extracted files from downloaded files
+ :type extract_root: str
+ :param events_np_root: Root directory path which saves events files in the ``npz`` format
+ :type events_np_root: str
+ :return: None
+
+ This function defines how to convert the origin binary data in ``extract_root`` to ``npz`` format and save converted files in ``events_np_root``.
+ ''' + t_ckp = time.time() + np_dir_dict = {} + for label in ['le', 'ri', 'up', 'do', 'ho', 'se']: + np_dir = os.path.join(events_np_root, label) + os.mkdir(np_dir) + print(f'Mkdir [{np_dir}].') + np_dir_dict[label] = np_dir + + with ThreadPoolExecutor(max_workers=min(multiprocessing.cpu_count(), + configure.max_threads_number_for_datasets_preprocess)) as tpe: + for user_name in os.listdir(extract_root): + aedat_dir = os.path.join(extract_root, user_name) + for bin_file in os.listdir(aedat_dir): + base_name = os.path.splitext(bin_file)[0] + label = base_name.split('_')[1] + source_file = os.path.join(aedat_dir, bin_file) + target_file = os.path.join(np_dir_dict[label], base_name + '.npz') + print(f'Start to convert [{source_file}] to [{target_file}].') + tpe.submit(NAVGestureWalk.read_aedat_save_to_np, source_file, + target_file) + print(f'Used time = [{round(time.time() - t_ckp, 2)}s].') + + +class NAVGestureSit(NAVGestureWalk): + @staticmethod + def resource_url_md5() -> list: + ''' + :return: A list ``url`` that ``url[i]`` is a tuple, which contains the i-th file's name, download link, and MD5 + :rtype: list + ''' + return [('navgesture-sit.zip', 'https://www.neuromorphic-vision.com/public/downloads/navgesture/navgesture-sit.zip', '1571753ace4d9e0946e6503313712c22')] + + @staticmethod + def extract_downloaded_files(download_root: str, extract_root: str): + ''' + :param download_root: Root directory path which saves downloaded dataset files + :type download_root: str + :param extract_root: Root directory path which saves extracted files from downloaded files + :type extract_root: str + :return: None + + This function defines how to extract download files. + ''' + temp_ext_dir = os.path.join(download_root, 'temp_ext') + os.mkdir(temp_ext_dir) + print(f'Mkdir [{temp_ext_dir}].') + extract_archive(os.path.join(download_root, 'navgesture-sit.zip'), temp_ext_dir) + with ThreadPoolExecutor(max_workers=min(multiprocessing.cpu_count(), 4)) as tpe: + for zip_file in os.listdir(temp_ext_dir): + if os.path.splitext(zip_file)[1] == '.zip': + zip_file = os.path.join(temp_ext_dir, zip_file) + print(f'Extract [{zip_file}] to [{extract_root}].') + tpe.submit(extract_archive, zip_file, extract_root) + + shutil.rmtree(temp_ext_dir) + print(f'Rmtree [{temp_ext_dir}].') diff --git a/spikingjelly/datasets/speechcommands.py b/spikingjelly/datasets/speechcommands.py index 0c73dc3..067d742 100644 --- a/spikingjelly/datasets/speechcommands.py +++ b/spikingjelly/datasets/speechcommands.py @@ -10,7 +10,6 @@ from torchaudio.datasets.utils import ( download_url, extract_archive ) -from torchvision import transforms from torchvision.datasets.utils import verify_str_arg import numpy as np from random import choice
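
Beyond the patch itself, a short usage sketch may help readers try the new `ESImageNet` folder. This is not part of the diff: it only assumes the constructor signature shown above and the usual `NeuromorphicDatasetFolder` behaviour, and the root path is hypothetical (the archives must already be obtainable as defined by `resource_url_md5` and `extract_downloaded_files`).

```python
from spikingjelly.datasets.es_imagenet import ESImageNet

root_dir = 'D:/datasets/ESImageNet'  # hypothetical path

# raw events: each sample is a dict with keys 'x', 'y', 't', 'p', as returned by load_events
train_events = ESImageNet(root_dir, train=True, data_type='event')
events, label = train_events[0]
print(events['t'].shape, label)

# integrated frames: 8 frames per sample, events split into equal-sized groups by event count
train_frames = ESImageNet(root_dir, train=True, data_type='frame', frames_number=8, split_by='number')
frame, label = train_frames[0]
print(frame.shape, label)  # frames are built on the 256x256 grid returned by get_H_W()
```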
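A similar sketch for the NavGesture classes. `NAVGestureWalk` and `NAVGestureSit` pass `train=None` to the parent class, so there is no train/test split; the root path and the microsecond reading of `duration` are assumptions, not guarantees from the patch.

```python
from spikingjelly.datasets.nav_gesture import NAVGestureWalk

root_dir = 'D:/datasets/NAVGestureWalk'  # hypothetical path

# events decoded from the ATIS .dat recordings (x taken from column 0, y flipped to 239 - y)
walk_events = NAVGestureWalk(root_dir, data_type='event')
events, label = walk_events[0]
print(events['t'].shape, label)  # labels index the 'le', 'ri', 'up', 'do', 'ho', 'se' class folders

# fixed-duration frames, assuming timestamps are in microseconds (100000 ~ 100 ms per frame)
walk_frames = NAVGestureWalk(root_dir, data_type='frame', duration=100000)
frames, label = walk_frames[0]
print(frames.shape)  # H, W = 240, 304 as returned by get_H_W()
```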
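The ATIS reader above decodes each 8-byte event word with bitmasks: the timestamp sits in the low 32 bits, x in bits 32-40, y in bits 41-48 and the polarity in bit 49. A tiny self-contained check of that layout, restating only the constants that appear in `readATIS_tddat` (the packed word itself is synthetic):

```python
# constants copied from readATIS_tddat above
polmask = 0x0002000000000000
xmask = 0x000001FF00000000
ymask = 0x0001FE0000000000
polpadding, ypadding, xpadding = 49, 41, 32

# pack a synthetic event word (ts=1234, x=303, y=120, p=1) and decode it back
word = 1234 | (303 << xpadding) | (120 << ypadding) | (1 << polpadding)

ts = word & 0x00000000FFFFFFFF
x = (word & xmask) >> xpadding
y = (word & ymask) >> ypadding
p = (word & polmask) >> polpadding
print(ts, x, y, p)  # 1234 303 120 1
```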
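Several hunks above also replace `from ..configure import max_threads_number_for_datasets_preprocess` with `from .. import configure` and read the attribute at call time. One plausible consequence (an assumption about intent, not something stated in the patch) is that the preprocessing thread cap can now be adjusted at runtime before a dataset is first converted:

```python
from spikingjelly import configure
from spikingjelly.datasets.n_mnist import NMNIST

# cap the number of preprocessing threads before the events are converted to npz;
# this only matters the first time the dataset folder is built from the raw files
configure.max_threads_number_for_datasets_preprocess = 4

train_set = NMNIST('D:/datasets/NMNIST', train=True, data_type='event')  # hypothetical path
```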