- /**************************************************************************************
- * Copyright (c) 2018-2020 ["Peking University Shenzhen Graduate School",
- * "Peng Cheng Laboratory", and "Guangdong Bohua UHD Innovation Corporation"]
- *
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- * 3. All advertising materials mentioning features or use of this software
- * must display the following acknowledgement:
- * This product includes the software uAVS3d developed by
- * Peking University Shenzhen Graduate School, Peng Cheng Laboratory
- * and Guangdong Bohua UHD Innovation Corporation.
- * 4. Neither the name of the organizations (Peking University Shenzhen Graduate School,
- * Peng Cheng Laboratory and Guangdong Bohua UHD Innovation Corporation) nor the
- * names of its contributors may be used to endorse or promote products
- * derived from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
- * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- * For more information, contact us at rgwang@pkusz.edu.cn.
- **************************************************************************************/
-
- #include "avx2.h"
-
- #if (BIT_DEPTH == 8)
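- /* The pel_avrg_* kernels below average two prediction blocks for bi-prediction:
-  * dst[x] = (src1[x] + src2[x] + 1) >> 1 (the rounding of _mm256_avg_epu8).
-  * src1/src2 are packed buffers whose stride equals the block width and must be
-  * 32-byte aligned (aligned 256-bit loads are used); only dst uses the i_dst stride.
-  * Narrow widths process 4 rows per iteration, wide widths 1-2 rows. */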
- void uavs3e_pel_avrg_8_avx2(pel *dst, int i_dst, pel *src1, pel *src2, int height)
- {
- int i_dst2 = i_dst << 1;
- int i_dst3 = i_dst + i_dst2;
- int i_dst4 = i_dst << 2;
- __m256i s0, s1, d0;
- __m128i m0, m1;
-
- while (height) {
- s0 = _mm256_load_si256((const __m256i *)(src1));
- s1 = _mm256_load_si256((const __m256i *)(src2));
-
- d0 = _mm256_avg_epu8(s0, s1);
- m0 = _mm256_castsi256_si128(d0);
- m1 = _mm256_extracti128_si256(d0, 1);
-
- _mm_storel_epi64((__m128i *)dst, m0);
- _mm_storel_epi64((__m128i *)(dst + i_dst), _mm_srli_si128(m0, 8));
- _mm_storel_epi64((__m128i *)(dst + i_dst2), m1);
- _mm_storel_epi64((__m128i *)(dst + i_dst3), _mm_srli_si128(m1, 8));
-
- src1 += 32;
- src2 += 32;
- dst += i_dst4;
- height -= 4;
- }
- }
-
- void uavs3e_pel_avrg_16_avx2(pel *dst, int i_dst, pel *src1, pel *src2, int height)
- {
- int i_dst2 = i_dst << 1;
- int i_dst3 = i_dst + i_dst2;
- int i_dst4 = i_dst << 2;
- __m256i s0, s1, s2, s3, d0, d1;
- __m128i m0, m1, m2, m3;
-
- while (height) {
- s0 = _mm256_load_si256((const __m256i *)(src1));
- s1 = _mm256_load_si256((const __m256i *)(src1 + 32));
- s2 = _mm256_load_si256((const __m256i *)(src2));
- s3 = _mm256_load_si256((const __m256i *)(src2 + 32));
-
- d0 = _mm256_avg_epu8(s0, s2);
- d1 = _mm256_avg_epu8(s1, s3);
- m0 = _mm256_castsi256_si128(d0);
- m1 = _mm256_extractf128_si256(d0, 1);
- m2 = _mm256_castsi256_si128(d1);
- m3 = _mm256_extractf128_si256(d1, 1);
-
- _mm_storeu_si128((__m128i *)dst, m0);
- _mm_storeu_si128((__m128i *)(dst + i_dst), m1);
- _mm_storeu_si128((__m128i *)(dst + i_dst2), m2);
- _mm_storeu_si128((__m128i *)(dst + i_dst3), m3);
-
- src1 += 64;
- src2 += 64;
- dst += i_dst4;
- height -= 4;
- }
- }
-
- void uavs3e_pel_avrg_32_avx2(pel *dst, int i_dst, pel *src1, pel *src2, int height)
- {
- int i_dst2 = i_dst << 1;
- __m256i s0, s1, s2, s3, d0, d1;
-
- while (height) {
- s0 = _mm256_load_si256((const __m256i *)(src1));
- s1 = _mm256_load_si256((const __m256i *)(src1 + 32));
- s2 = _mm256_load_si256((const __m256i *)(src2));
- s3 = _mm256_load_si256((const __m256i *)(src2 + 32));
-
- d0 = _mm256_avg_epu8(s0, s2);
- d1 = _mm256_avg_epu8(s1, s3);
-
- _mm256_storeu_si256((__m256i *)(dst), d0);
- _mm256_storeu_si256((__m256i *)(dst + i_dst), d1);
-
- src1 += 64;
- src2 += 64;
- dst += i_dst2;
- height -= 2;
- }
- }
-
- void uavs3e_pel_avrg_64_avx2(pel *dst, int i_dst, pel *src1, pel *src2, int height)
- {
- __m256i s0, s1, s2, s3, d0, d1;
-
- while (height--) {
- s0 = _mm256_load_si256((const __m256i *)(src1));
- s1 = _mm256_load_si256((const __m256i *)(src1 + 32));
- s2 = _mm256_load_si256((const __m256i *)(src2));
- s3 = _mm256_load_si256((const __m256i *)(src2 + 32));
-
- d0 = _mm256_avg_epu8(s0, s2);
- d1 = _mm256_avg_epu8(s1, s3);
-
- _mm256_store_si256((__m256i *)(dst), d0);
- _mm256_store_si256((__m256i *)(dst + 32), d1);
-
- src1 += 64;
- src2 += 64;
- dst += i_dst;
- }
- }
-
- void uavs3e_pel_avrg_128_avx2(pel *dst, int i_dst, pel *src1, pel *src2, int height)
- {
- __m256i s0, s1, s2, s3, s4, s5, s6, s7;
- __m256i d0, d1, d2, d3;
-
- while (height--) {
- s0 = _mm256_load_si256((const __m256i *)(src1));
- s1 = _mm256_load_si256((const __m256i *)(src1 + 32));
- s2 = _mm256_load_si256((const __m256i *)(src1 + 64));
- s3 = _mm256_load_si256((const __m256i *)(src1 + 96));
- s4 = _mm256_load_si256((const __m256i *)(src2));
- s5 = _mm256_load_si256((const __m256i *)(src2 + 32));
- s6 = _mm256_load_si256((const __m256i *)(src2 + 64));
- s7 = _mm256_load_si256((const __m256i *)(src2 + 96));
-
- d0 = _mm256_avg_epu8(s0, s4);
- d1 = _mm256_avg_epu8(s1, s5);
- d2 = _mm256_avg_epu8(s2, s6);
- d3 = _mm256_avg_epu8(s3, s7);
-
- _mm256_store_si256((__m256i *)(dst), d0);
- _mm256_store_si256((__m256i *)(dst + 32), d1);
- _mm256_store_si256((__m256i *)(dst + 64), d2);
- _mm256_store_si256((__m256i *)(dst + 96), d3);
-
- src1 += 128;
- src2 += 128;
- dst += i_dst;
- }
- }
-
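- /* Residual of one 16-pel row at column offset d*16: widen the 8-bit org/pred pels
-  * to 16 bits with _mm256_cvtepu8_epi16, subtract, and store 16 s16 residuals.
-  * Scalar equivalent: p_resi[x] = (s16)p_org[x] - (s16)p_pred[x]. */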
- #define PEL_DIFF_W16(d) { \
- __m256i A = _mm256_cvtepu8_epi16(_mm_loadu_si128((__m128i*)(p_org + d * 16))); \
- __m256i B = _mm256_cvtepu8_epi16(_mm_loadu_si128((__m128i*)(p_pred + d * 16))); \
- __m256i C = _mm256_sub_epi16(A, B);\
- _mm256_storeu_si256((__m256i*)(p_resi + d * 16), C);\
- }
-
- void uavs3e_pel_diff_16_avx2(pel *p_org, int i_org, pel *p_pred, int i_pred, s16 *p_resi, int i_resi, int height)
- {
- while (height--) {
- PEL_DIFF_W16(0);
- p_org += i_org;
- p_pred += i_pred;
- p_resi += i_resi;
- }
- }
-
- void uavs3e_pel_diff_32_avx2(pel *p_org, int i_org, pel *p_pred, int i_pred, s16 *p_resi, int i_resi, int height)
- {
- while (height--) {
- PEL_DIFF_W16(0);
- PEL_DIFF_W16(1);
- p_org += i_org;
- p_pred += i_pred;
- p_resi += i_resi;
- }
- }
-
- void uavs3e_pel_diff_64_avx2(pel *p_org, int i_org, pel *p_pred, int i_pred, s16 *p_resi, int i_resi, int height)
- {
- while (height--) {
- PEL_DIFF_W16(0);
- PEL_DIFF_W16(1);
- PEL_DIFF_W16(2);
- PEL_DIFF_W16(3);
- p_org += i_org;
- p_pred += i_pred;
- p_resi += i_resi;
- }
- }
-
- void uavs3e_pel_diff_128_avx2(pel *p_org, int i_org, pel *p_pred, int i_pred, s16 *p_resi, int i_resi, int height)
- {
- while (height--) {
- PEL_DIFF_W16(0);
- PEL_DIFF_W16(1);
- PEL_DIFF_W16(2);
- PEL_DIFF_W16(3);
- PEL_DIFF_W16(4);
- PEL_DIFF_W16(5);
- PEL_DIFF_W16(6);
- PEL_DIFF_W16(7);
-
- p_org += i_org;
- p_pred += i_pred;
- p_resi += i_resi;
- }
- }
-
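- /* The recon_* kernels below reconstruct 8-bit output: rec[x] = clip(pred[x] + resi[x], 0, 255).
-  * When cbf == 0 there is no residual and the prediction is simply copied.
-  * Otherwise the 8-bit prediction is widened to 16 bits, added to the residual with a
-  * saturating 16-bit add, and packed back with _mm256_packus_epi16, which supplies the
-  * [0, 255] clamp. The _mm256_permute4x64_epi64(x, 0xd8) shuffles undo the
-  * per-128-bit-lane ordering of the AVX2 unpack/pack instructions. */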
- void uavs3e_recon_w16_avx2(s16 *resi, pel *pred, int i_pred, int width, int height, pel *rec, int i_rec, int cbf, int bit_depth)
- {
- int i;
- int i_rec2 = i_rec << 1;
- int i_pred2 = i_pred << 1;
-
- if (cbf == 0) {
- __m128i p0, p1;
- for (i = 0; i < height; i += 2) {
- p0 = _mm_loadu_si128((const __m128i *)(pred));
- p1 = _mm_loadu_si128((const __m128i *)(pred + i_pred));
-
- _mm_storeu_si128((__m128i *)(rec), p0);
- _mm_storeu_si128((__m128i *)(rec + i_rec), p1);
- rec += i_rec2;
- pred += i_pred2;
- }
- } else {
- __m128i p0, p1;
- __m256i zero = _mm256_setzero_si256();
- __m256i m0, m1, r0, r1;
- for (i = 0; i < height; i += 2) {
- p0 = _mm_loadu_si128((const __m128i *)(pred));
- p1 = _mm_loadu_si128((const __m128i *)(pred + i_pred));
- r0 = _mm256_load_si256((const __m256i *)(resi));
- r1 = _mm256_load_si256((const __m256i *)(resi + 16));
- m0 = _mm256_set_m128i(p1, p0);
- m0 = _mm256_permute4x64_epi64(m0, 0xd8);
- m1 = _mm256_unpackhi_epi8(m0, zero);
- m0 = _mm256_unpacklo_epi8(m0, zero);
-
- m0 = _mm256_adds_epi16(m0, r0);
- m1 = _mm256_adds_epi16(m1, r1);
- m0 = _mm256_packus_epi16(m0, m1);
- m0 = _mm256_permute4x64_epi64(m0, 0xd8);
-
- _mm_storeu_si128((__m128i *)(rec), _mm256_castsi256_si128(m0));
- _mm_storeu_si128((__m128i *)(rec + i_rec), _mm256_extracti128_si256(m0, 1));
-
- pred += i_pred2;
- rec += i_rec2;
- resi += 32;
- }
- }
- }
-
- void uavs3e_recon_w32_avx2(s16 *resi, pel *pred, int i_pred, int width, int height, pel *rec, int i_rec, int cbf, int bit_depth)
- {
- int i;
- if (cbf == 0) {
- int i_rec2 = i_rec << 1;
- int i_pred2 = i_pred << 1;
- __m256i p0, p1;
- for (i = 0; i < height; i += 2) {
- p0 = _mm256_loadu_si256((const __m256i *)(pred));
- p1 = _mm256_loadu_si256((const __m256i *)(pred + i_pred));
-
- _mm256_storeu_si256((__m256i *)(rec), p0);
- _mm256_storeu_si256((__m256i *)(rec + i_rec), p1);
- rec += i_rec2;
- pred += i_pred2;
- }
- } else {
- __m256i zero = _mm256_setzero_si256();
- __m256i m0, m1, r0, r1;
- for (i = 0; i < height; i++) {
- m0 = _mm256_loadu_si256((const __m256i *)(pred));
- r0 = _mm256_load_si256((const __m256i *)(resi));
- r1 = _mm256_load_si256((const __m256i *)(resi + 16));
- m0 = _mm256_permute4x64_epi64(m0, 0xd8);
- m1 = _mm256_unpackhi_epi8(m0, zero);
- m0 = _mm256_unpacklo_epi8(m0, zero);
-
- m0 = _mm256_adds_epi16(m0, r0);
- m1 = _mm256_adds_epi16(m1, r1);
- m0 = _mm256_packus_epi16(m0, m1);
- m0 = _mm256_permute4x64_epi64(m0, 0xd8);
-
- _mm256_storeu_si256((__m256i *)(rec), m0);
-
- pred += i_pred;
- rec += i_rec;
- resi += 32;
- }
- }
- }
-
- void uavs3e_recon_w64_avx2(s16 *resi, pel *pred, int i_pred, int width, int height, pel *rec, int i_rec, int cbf, int bit_depth)
- {
- int i;
- if (cbf == 0) {
- __m256i p0, p1, p2, p3;
- int i_rec2 = i_rec << 1;
- int i_pred2 = i_pred << 1;
- for (i = 0; i < height; i += 2) {
- p0 = _mm256_loadu_si256((const __m256i *)(pred));
- p1 = _mm256_loadu_si256((const __m256i *)(pred + 32));
- p2 = _mm256_loadu_si256((const __m256i *)(pred + i_pred));
- p3 = _mm256_loadu_si256((const __m256i *)(pred + i_pred + 32));
-
- _mm256_storeu_si256((__m256i *)(rec), p0);
- _mm256_storeu_si256((__m256i *)(rec + 32), p1);
- _mm256_storeu_si256((__m256i *)(rec + i_rec), p2);
- _mm256_storeu_si256((__m256i *)(rec + i_rec + 32), p3);
- rec += i_rec2;
- pred += i_pred2;
- }
- } else {
- __m256i zero = _mm256_setzero_si256();
- __m256i m0, m1, m2, m3, r0, r1, r2, r3;
- for (i = 0; i < height; i++) {
- m0 = _mm256_loadu_si256((const __m256i *)(pred));
- m1 = _mm256_loadu_si256((const __m256i *)(pred + 32));
- r0 = _mm256_load_si256((const __m256i *)(resi));
- r1 = _mm256_load_si256((const __m256i *)(resi + 16));
- r2 = _mm256_load_si256((const __m256i *)(resi + 32));
- r3 = _mm256_load_si256((const __m256i *)(resi + 48));
- m0 = _mm256_permute4x64_epi64(m0, 0xd8);
- m1 = _mm256_permute4x64_epi64(m1, 0xd8);
- m2 = _mm256_unpacklo_epi8(m1, zero);
- m3 = _mm256_unpackhi_epi8(m1, zero);
- m1 = _mm256_unpackhi_epi8(m0, zero);
- m0 = _mm256_unpacklo_epi8(m0, zero);
-
- m0 = _mm256_adds_epi16(m0, r0);
- m1 = _mm256_adds_epi16(m1, r1);
- m2 = _mm256_adds_epi16(m2, r2);
- m3 = _mm256_adds_epi16(m3, r3);
- m0 = _mm256_packus_epi16(m0, m1);
- m2 = _mm256_packus_epi16(m2, m3);
- m0 = _mm256_permute4x64_epi64(m0, 0xd8);
- m2 = _mm256_permute4x64_epi64(m2, 0xd8);
-
- _mm256_storeu_si256((__m256i *)(rec), m0);
- _mm256_storeu_si256((__m256i *)(rec + 32), m2);
-
- pred += i_pred;
- rec += i_rec;
- resi += 64;
- }
- }
- }
-
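- /* Convert an 8-bit semi-planar picture (separate luma plane plus interleaved UVUV...
-  * chroma, e.g. NV12 when uv_shift == 1) into three planar buffers dst[0..2].
-  * Luma rows are copied with memcpy; chroma rows are de-interleaved 32 bytes at a time
-  * by masking the even bytes (U) and the odd bytes (V), packing them into the two
-  * 128-bit halves of one register, and storing each half to its plane. The scalar
-  * tail handles widths that are not a multiple of 32 bytes. */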
- void uavs3e_conv_fmt_8bit_avx2(unsigned char *src_y, unsigned char *src_uv, unsigned char *dst[3], int width, int height, int src_stride, int src_stridec, int dst_stride[3], int uv_shift)
- {
- __m256i uv_mask = _mm256_set1_epi16(0xff), msrc, msrcu, msrcv;
- unsigned char *p_src = src_y;
- unsigned char *p_dst_y = dst[0];
- unsigned char *p_dst_u = dst[1];
- unsigned char *p_dst_v = dst[2];
- int i, j, j2;
- int width32;
- for (i = 0; i < height; i++) {
- memcpy(p_dst_y, p_src, width);
- p_src += src_stride;
- p_dst_y += dst_stride[0];
- }
- width >>= uv_shift;
- height >>= uv_shift;
-
- p_src = src_uv;
- width <<= 1;
- width32 = (width >> 5) << 5;
- for (i = 0; i < height; i++) {
- for (j = 0; j < width32; j += 32) {
- j2 = j >> 1;
- msrc = _mm256_loadu_si256((const __m256i *)(p_src + j));
- msrcu = _mm256_and_si256(msrc, uv_mask);
- msrcv = _mm256_and_si256(_mm256_srli_si256(msrc, 1), uv_mask);
- msrc = _mm256_packus_epi16(msrcu, msrcv);
- msrc = _mm256_permute4x64_epi64(msrc, 0xd8);
- _mm_storeu_si128((__m128i *)(p_dst_u + j2), _mm256_castsi256_si128(msrc));
- _mm_storeu_si128((__m128i *)(p_dst_v + j2), _mm256_extracti128_si256(msrc, 1));
- }
- for (; j < width; j += 2) {
- p_dst_u[j >> 1] = p_src[j];
- p_dst_v[j >> 1] = p_src[j + 1];
- }
- p_src += src_stridec;
- p_dst_u += dst_stride[1];
- p_dst_v += dst_stride[2];
- }
- }
-
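- /* Same conversion for 16-bit samples: width is first doubled to a byte count for the
-  * luma memcpy, then the interleaved chroma is de-interleaved with 32-bit masks and
-  * _mm256_packus_epi32 instead of the byte-wise variant above. */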
- void uavs3e_conv_fmt_16bit_avx2(unsigned char *src_y, unsigned char *src_uv, unsigned char *dst[3], int width, int height, int src_stride, int src_stridec, int dst_stride[3], int uv_shift)
- {
- __m256i uv_mask = _mm256_set1_epi32(0xffff), msrc, msrcu, msrcv;
- unsigned char *p_src = src_y;
- unsigned char *p_dst_y = dst[0];
- unsigned char *p_dst_u = dst[1];
- unsigned char *p_dst_v = dst[2];
- int i, j, j2;
- int width32;
- width <<= 1;
- for (i = 0; i < height; i++) {
- memcpy(p_dst_y, p_src, width);
- p_src += src_stride;
- p_dst_y += dst_stride[0];
- }
- width >>= uv_shift;
- height >>= uv_shift;
-
- p_src = src_uv;
- width <<= 1;
- width32 = (width >> 5) << 5;
- for (i = 0; i < height; i++) {
- for (j = 0; j < width32; j += 32) {
- j2 = j >> 1;
- msrc = _mm256_loadu_si256((const __m256i *)(p_src + j));
- msrcu = _mm256_and_si256(msrc, uv_mask);
- msrcv = _mm256_and_si256(_mm256_srli_si256(msrc, 2), uv_mask);
- msrc = _mm256_packus_epi32(msrcu, msrcv);
- msrc = _mm256_permute4x64_epi64(msrc, 0xd8);
- _mm_storeu_si128((__m128i *)(p_dst_u + j2), _mm256_castsi256_si128(msrc));
- _mm_storeu_si128((__m128i *)(p_dst_v + j2), _mm256_extracti128_si256(msrc, 1));
- }
- for (; j < width; j += 4) {
- j2 = j >> 1; // byte offset converted to a 16-bit sample index
- ((short *)p_dst_u)[j2 >> 1] = ((short *)p_src)[j2];
- ((short *)p_dst_v)[j2 >> 1] = ((short *)p_src)[j2 + 1];
- }
- p_src += src_stridec;
- p_dst_u += dst_stride[1];
- p_dst_v += dst_stride[2];
- }
- }
-
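- /* Convert 16-bit semi-planar input to 8-bit planar output. Luma is narrowed 32
-  * samples at a time with _mm256_packus_epi16 (saturating) plus a lane-fixup permute;
-  * chroma is de-interleaved and narrowed in one pass. The scalar tails truncate
-  * instead of saturating, which only matters if a sample exceeds 255. */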
- void uavs3e_conv_fmt_16to8bit_avx2(unsigned char *src_y, unsigned char *src_uv, unsigned char *dst[3], int width, int height, int src_stride, int src_stridec, int dst_stride[3], int uv_shift)
- {
- __m256i uv_mask = _mm256_set1_epi32(0xffff);
- __m256i m0, m1, m2, m3, m4, m5;
- unsigned char *p_src = src_y;
- unsigned char *p_dst_y = dst[0];
- unsigned char *p_dst_u = dst[1];
- unsigned char *p_dst_v = dst[2];
- int i, j, j2;
- int width64 = (width >> 5) << 6; // byte count of the widest multiple of 32 luma samples (16-bit source)
-
- width <<= 1; // luma row width in source bytes (16-bit samples)
- for (i = 0; i < height; i++) {
- for (j = 0; j < width64; j += 64) {
- j2 = j >> 1;
- m0 = _mm256_loadu_si256((const __m256i *)(p_src + j));
- m1 = _mm256_loadu_si256((const __m256i *)(p_src + j + 32));
- m2 = _mm256_packus_epi16(m0, m1);
- m2 = _mm256_permute4x64_epi64(m2, 0xd8);
- _mm256_storeu_si256((__m256i *)(p_dst_y + j2), m2);
- }
- for (; j < width; j += 2) {
- p_dst_y[j >> 1] = (unsigned char)((short *)p_src)[j >> 1];
- }
- p_src += src_stride;
- p_dst_y += dst_stride[0];
- }
- width >>= uv_shift;
- height >>= uv_shift;
-
- p_src = src_uv;
- width <<= 1;
- width64 = (width >> 6) << 6;
- for (i = 0; i < height; i++) {
- for (j = 0; j < width64; j += 64) {
- j2 = j >> 1;
- m0 = _mm256_loadu_si256((const __m256i *)(p_src + j));
- m1 = _mm256_loadu_si256((const __m256i *)(p_src + j + 32));
- m2 = _mm256_and_si256(m0, uv_mask); // U samples 0..7  (32-bit lanes)
- m3 = _mm256_and_si256(m1, uv_mask); // U samples 8..15
- m4 = _mm256_and_si256(_mm256_srli_si256(m0, 2), uv_mask); // V samples 0..7  (32-bit lanes)
- m5 = _mm256_and_si256(_mm256_srli_si256(m1, 2), uv_mask); // V samples 8..15
- m2 = _mm256_packus_epi32(m2, m3);
- m4 = _mm256_packus_epi32(m4, m5);
- m2 = _mm256_permute4x64_epi64(m2, 0xd8);
- m4 = _mm256_permute4x64_epi64(m4, 0xd8);
- m2 = _mm256_packus_epi16(m2, m4); // U and V narrowed to 8 bits, still interleaved per 128-bit lane
- m2 = _mm256_permute4x64_epi64(m2, 0xd8); // reorder to [U0..15 | V0..15]
- _mm_storeu_si128((__m128i *)(p_dst_u + (j2 >> 1)), _mm256_castsi256_si128(m2));
- _mm_storeu_si128((__m128i *)(p_dst_v + (j2 >> 1)), _mm256_extracti128_si256(m2, 1));
- }
- for (; j < width; j += 4) {
- j2 = j >> 1; // byte offset converted to a 16-bit sample index
- p_dst_u[j2 >> 1] = (unsigned char)((short *)p_src)[j2];
- p_dst_v[j2 >> 1] = (unsigned char)((short *)p_src)[j2 + 1];
- }
- p_src += src_stridec;
- p_dst_u += dst_stride[1];
- p_dst_v += dst_stride[2];
- }
- }
-
- #elif (BIT_DEPTH == 10)
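- /* 10-bit variants: pels are 16 bits wide, so all pointer offsets below are in pel
-  * (16-bit) units and one __m256i holds 16 pels. _mm256_avg_epu16 again computes the
-  * rounded average dst[x] = (src1[x] + src2[x] + 1) >> 1. */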
- void uavs3e_pel_avrg_4_avx2(pel *dst, int i_dst, pel *src1, pel *src2, int height)
- {
- int i_dst2 = i_dst << 1;
- int i_dst3 = i_dst + i_dst2;
- int i_dst4 = i_dst << 2;
- __m256i s0, s1, d0;
- __m128i m0, m1, m2, m3;
-
- while (height) {
- s0 = _mm256_load_si256((const __m256i *)(src1));
- s1 = _mm256_load_si256((const __m256i *)(src2));
-
- d0 = _mm256_avg_epu16(s0, s1);
- m0 = _mm256_castsi256_si128(d0);
- m2 = _mm256_extracti128_si256(d0, 1);
- m1 = _mm_srli_si128(m0, 8);
- m3 = _mm_srli_si128(m2, 8);
- _mm_storel_epi64((__m128i *)dst, m0);
- _mm_storel_epi64((__m128i *)(dst + i_dst), m1);
- _mm_storel_epi64((__m128i *)(dst + i_dst2), m2);
- _mm_storel_epi64((__m128i *)(dst + i_dst3), m3);
-
- src1 += 16;
- src2 += 16;
- dst += i_dst4;
- height -= 4;
- }
- }
-
- void uavs3e_pel_avrg_8_avx2(pel *dst, int i_dst, pel *src1, pel *src2, int height)
- {
- int i_dst2 = i_dst << 1;
- int i_dst3 = i_dst + i_dst2;
- int i_dst4 = i_dst << 2;
- __m256i s0, s1, s2, s3, d0, d1;
-
- while (height) {
- s0 = _mm256_load_si256((const __m256i *)(src1));
- s1 = _mm256_load_si256((const __m256i *)(src1 + 16));
- s2 = _mm256_load_si256((const __m256i *)(src2));
- s3 = _mm256_load_si256((const __m256i *)(src2 + 16));
-
- d0 = _mm256_avg_epu16(s0, s2);
- d1 = _mm256_avg_epu16(s1, s3);
-
- _mm_storeu_si128((__m128i *)dst, _mm256_castsi256_si128(d0));
- _mm_storeu_si128((__m128i *)(dst + i_dst), _mm256_extracti128_si256(d0, 1));
- _mm_storeu_si128((__m128i *)(dst + i_dst2), _mm256_castsi256_si128(d1));
- _mm_storeu_si128((__m128i *)(dst + i_dst3), _mm256_extracti128_si256(d1, 1));
-
- src1 += 32;
- src2 += 32;
- dst += i_dst4;
- height -= 4;
- }
- }
-
- void uavs3e_pel_avrg_16_avx2(pel *dst, int i_dst, pel *src1, pel *src2, int height)
- {
- int i_dst2 = i_dst << 1;
- __m256i s0, s1, s2, s3, d0, d1;
-
- while (height) {
- s0 = _mm256_load_si256((const __m256i *)(src1));
- s1 = _mm256_load_si256((const __m256i *)(src1 + 16));
- s2 = _mm256_load_si256((const __m256i *)(src2));
- s3 = _mm256_load_si256((const __m256i *)(src2 + 16));
-
- d0 = _mm256_avg_epu16(s0, s2);
- d1 = _mm256_avg_epu16(s1, s3);
-
- _mm256_store_si256((__m256i *)(dst), d0);
- _mm256_store_si256((__m256i *)(dst + i_dst), d1);
-
- src1 += 32;
- src2 += 32;
- dst += i_dst2;
- height -= 2;
- }
- }
-
- void uavs3e_pel_avrg_32_avx2(pel *dst, int i_dst, pel *src1, pel *src2, int height)
- {
- __m256i s0, s1, s2, s3, d0, d1;
-
- while (height--) {
- s0 = _mm256_load_si256((const __m256i *)(src1));
- s1 = _mm256_load_si256((const __m256i *)(src1 + 16));
- s2 = _mm256_load_si256((const __m256i *)(src2));
- s3 = _mm256_load_si256((const __m256i *)(src2 + 16));
-
- d0 = _mm256_avg_epu16(s0, s2);
- d1 = _mm256_avg_epu16(s1, s3);
-
- _mm256_storeu_si256((__m256i *)(dst), d0);
- _mm256_storeu_si256((__m256i *)(dst + 16), d1);
-
- src1 += 32;
- src2 += 32;
- dst += i_dst;
- }
- }
-
- void uavs3e_pel_avrg_64_avx2(pel *dst, int i_dst, pel *src1, pel *src2, int height)
- {
- __m256i s0, s1, s2, s3, s4, s5, s6, s7;
- __m256i d0, d1, d2, d3;
-
- while (height--) {
- s0 = _mm256_load_si256((const __m256i *)(src1));
- s1 = _mm256_load_si256((const __m256i *)(src1 + 16));
- s2 = _mm256_load_si256((const __m256i *)(src1 + 32));
- s3 = _mm256_load_si256((const __m256i *)(src1 + 48));
- s4 = _mm256_load_si256((const __m256i *)(src2));
- s5 = _mm256_load_si256((const __m256i *)(src2 + 16));
- s6 = _mm256_load_si256((const __m256i *)(src2 + 32));
- s7 = _mm256_load_si256((const __m256i *)(src2 + 48));
-
- d0 = _mm256_avg_epu16(s0, s4);
- d1 = _mm256_avg_epu16(s1, s5);
- d2 = _mm256_avg_epu16(s2, s6);
- d3 = _mm256_avg_epu16(s3, s7);
-
- _mm256_store_si256((__m256i *)(dst), d0);
- _mm256_store_si256((__m256i *)(dst + 16), d1);
- _mm256_store_si256((__m256i *)(dst + 32), d2);
- _mm256_store_si256((__m256i *)(dst + 48), d3);
-
- src1 += 64;
- src2 += 64;
- dst += i_dst;
- }
- }
-
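- /* 128-pel rows are processed in two 64-pel halves; the loads for the second half are
-  * issued before the first half is stored, overlapping memory latency. */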
- void uavs3e_pel_avrg_128_avx2(pel *dst, int i_dst, pel *src1, pel *src2, int height)
- {
- __m256i s0, s1, s2, s3, s4, s5, s6, s7;
- __m256i d0, d1, d2, d3;
-
- while (height--) {
- s0 = _mm256_load_si256((const __m256i *)(src1));
- s1 = _mm256_load_si256((const __m256i *)(src1 + 16));
- s2 = _mm256_load_si256((const __m256i *)(src1 + 32));
- s3 = _mm256_load_si256((const __m256i *)(src1 + 48));
- s4 = _mm256_load_si256((const __m256i *)(src2));
- s5 = _mm256_load_si256((const __m256i *)(src2 + 16));
- s6 = _mm256_load_si256((const __m256i *)(src2 + 32));
- s7 = _mm256_load_si256((const __m256i *)(src2 + 48));
-
- d0 = _mm256_avg_epu16(s0, s4);
- d1 = _mm256_avg_epu16(s1, s5);
- d2 = _mm256_avg_epu16(s2, s6);
- d3 = _mm256_avg_epu16(s3, s7);
-
- s0 = _mm256_load_si256((const __m256i *)(src1 + 16*4));
- s1 = _mm256_load_si256((const __m256i *)(src1 + 16*5));
- s2 = _mm256_load_si256((const __m256i *)(src1 + 16*6));
- s3 = _mm256_load_si256((const __m256i *)(src1 + 16*7));
- s4 = _mm256_load_si256((const __m256i *)(src2 + 16*4));
- s5 = _mm256_load_si256((const __m256i *)(src2 + 16*5));
- s6 = _mm256_load_si256((const __m256i *)(src2 + 16*6));
- s7 = _mm256_load_si256((const __m256i *)(src2 + 16*7));
-
- _mm256_store_si256((__m256i *)(dst), d0);
- _mm256_store_si256((__m256i *)(dst + 16), d1);
- _mm256_store_si256((__m256i *)(dst + 32), d2);
- _mm256_store_si256((__m256i *)(dst + 48), d3);
-
- d0 = _mm256_avg_epu16(s0, s4);
- d1 = _mm256_avg_epu16(s1, s5);
- d2 = _mm256_avg_epu16(s2, s6);
- d3 = _mm256_avg_epu16(s3, s7);
- _mm256_store_si256((__m256i *)(dst + 64), d0);
- _mm256_store_si256((__m256i *)(dst + 80), d1);
- _mm256_store_si256((__m256i *)(dst + 96), d2);
- _mm256_store_si256((__m256i *)(dst + 112), d3);
-
- src1 += 128;
- src2 += 128;
- dst += i_dst;
- }
- }
-
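- /* Residual of one 16-pel row at column offset d*16: pels are already 16-bit here,
-  * so the difference is a direct _mm256_sub_epi16 without widening. */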
- #define PEL_DIFF_W16(d) { \
- __m256i A = _mm256_loadu_si256((__m256i*)(p_org + d * 16)); \
- __m256i B = _mm256_loadu_si256((__m256i*)(p_pred + d * 16)); \
- __m256i C = _mm256_sub_epi16(A, B);\
- _mm256_storeu_si256((__m256i*)(p_resi + d * 16), C);\
- }
-
- void uavs3e_pel_diff_16_avx2(pel *p_org, int i_org, pel *p_pred, int i_pred, s16 *p_resi, int i_resi, int height)
- {
- while (height--) {
- PEL_DIFF_W16(0);
- p_org += i_org;
- p_pred += i_pred;
- p_resi += i_resi;
- }
- }
-
- void uavs3e_pel_diff_32_avx2(pel *p_org, int i_org, pel *p_pred, int i_pred, s16 *p_resi, int i_resi, int height)
- {
- while (height--) {
- PEL_DIFF_W16(0);
- PEL_DIFF_W16(1);
- p_org += i_org;
- p_pred += i_pred;
- p_resi += i_resi;
- }
- }
-
- void uavs3e_pel_diff_64_avx2(pel *p_org, int i_org, pel *p_pred, int i_pred, s16 *p_resi, int i_resi, int height)
- {
- while (height--) {
- PEL_DIFF_W16(0);
- PEL_DIFF_W16(1);
- PEL_DIFF_W16(2);
- PEL_DIFF_W16(3);
- p_org += i_org;
- p_pred += i_pred;
- p_resi += i_resi;
- }
- }
-
- void uavs3e_pel_diff_128_avx2(pel *p_org, int i_org, pel *p_pred, int i_pred, s16 *p_resi, int i_resi, int height)
- {
- while (height--) {
- PEL_DIFF_W16(0);
- PEL_DIFF_W16(1);
- PEL_DIFF_W16(2);
- PEL_DIFF_W16(3);
- PEL_DIFF_W16(4);
- PEL_DIFF_W16(5);
- PEL_DIFF_W16(6);
- PEL_DIFF_W16(7);
-
- p_org += i_org;
- p_pred += i_pred;
- p_resi += i_resi;
- }
- }
-
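- /* High-bit-depth reconstruction: rec[x] = clip(pred[x] + resi[x], 0, (1 << bit_depth) - 1).
-  * Unlike the 8-bit path, _mm256_packus_epi16 would clamp to 255, so the saturating
-  * sum is clipped explicitly with _mm256_max_epi16 / _mm256_min_epi16 against 0 and
-  * the bit-depth maximum. */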
- void uavs3e_recon_w8_avx2(s16 *resi, pel *pred, int i_pred, int width, int height, pel *rec, int i_rec, int cbf, int bit_depth)
- {
- int i;
- int i_rec2 = i_rec << 1;
- int i_pred2 = i_pred << 1;
-
- if (cbf == 0) {
- __m128i p0, p1;
- for (i = 0; i < height; i += 2) {
- p0 = _mm_loadu_si128((const __m128i *)(pred));
- p1 = _mm_loadu_si128((const __m128i *)(pred + i_pred));
-
- _mm_storeu_si128((__m128i *)(rec), p0);
- _mm_storeu_si128((__m128i *)(rec + i_rec), p1);
- rec += i_rec2;
- pred += i_pred2;
- }
- }
- else {
- __m256i m0, p0, r0;
- __m256i min = _mm256_setzero_si256();
- __m256i max = _mm256_set1_epi16((1 << bit_depth) - 1);
- for (i = 0; i < height; i += 2) {
- p0 = _mm256_loadu2_m128i((const __m128i *)(pred + i_pred), (const __m128i *)(pred));
- r0 = _mm256_load_si256((const __m256i *)(resi));
-
- m0 = _mm256_adds_epi16(p0, r0);
-
- m0 = _mm256_max_epi16(m0, min);
- m0 = _mm256_min_epi16(m0, max);
-
- _mm_storeu_si128((__m128i *)(rec), _mm256_castsi256_si128(m0));
- _mm_storeu_si128((__m128i *)(rec + i_rec), _mm256_extracti128_si256(m0, 1));
-
- pred += i_pred2;
- rec += i_rec2;
- resi += 16;
- }
- }
- }
-
- void uavs3e_recon_w16_avx2(s16 *resi, pel *pred, int i_pred, int width, int height, pel *rec, int i_rec, int cbf, int bit_depth)
- {
- int i;
- int i_rec2 = i_rec << 1;
- int i_pred2 = i_pred << 1;
-
- if (cbf == 0) {
- __m256i p0, p1;
- for (i = 0; i < height; i += 2) {
- p0 = _mm256_loadu_si256((const __m256i *)(pred));
- p1 = _mm256_loadu_si256((const __m256i *)(pred + i_pred));
-
- _mm256_storeu_si256((__m256i *)(rec), p0);
- _mm256_storeu_si256((__m256i *)(rec + i_rec), p1);
- rec += i_rec2;
- pred += i_pred2;
- }
- }
- else {
- __m256i p0, p1;
- __m256i m0, m1, r0, r1;
- __m256i min = _mm256_setzero_si256();
- __m256i max = _mm256_set1_epi16((1 << bit_depth) - 1);
- for (i = 0; i < height; i += 2) {
- p0 = _mm256_loadu_si256((const __m256i *)(pred));
- p1 = _mm256_loadu_si256((const __m256i *)(pred + i_pred));
- r0 = _mm256_load_si256((const __m256i *)(resi));
- r1 = _mm256_load_si256((const __m256i *)(resi + 16));
-
- m0 = _mm256_adds_epi16(p0, r0);
- m1 = _mm256_adds_epi16(p1, r1);
-
- m0 = _mm256_max_epi16(m0, min);
- m1 = _mm256_max_epi16(m1, min);
- m0 = _mm256_min_epi16(m0, max);
- m1 = _mm256_min_epi16(m1, max);
-
- _mm256_storeu_si256((__m256i *)(rec), m0);
- _mm256_storeu_si256((__m256i *)(rec + i_rec), m1);
-
- pred += i_pred2;
- rec += i_rec2;
- resi += 32;
- }
- }
- }
-
- void uavs3e_recon_w32_avx2(s16 *resi, pel *pred, int i_pred, int width, int height, pel *rec, int i_rec, int cbf, int bit_depth)
- {
- int i;
- if (cbf == 0) {
- int i_rec2 = i_rec << 1;
- int i_pred2 = i_pred << 1;
- __m256i p0, p1, p2, p3;
- for (i = 0; i < height; i += 2) {
- p0 = _mm256_loadu_si256((const __m256i *)(pred));
- p1 = _mm256_loadu_si256((const __m256i *)(pred + 16));
- p2 = _mm256_loadu_si256((const __m256i *)(pred + i_pred));
- p3 = _mm256_loadu_si256((const __m256i *)(pred + i_pred + 16));
-
- _mm256_storeu_si256((__m256i *)(rec), p0);
- _mm256_storeu_si256((__m256i *)(rec + 16), p1);
- _mm256_storeu_si256((__m256i *)(rec + i_rec), p2);
- _mm256_storeu_si256((__m256i *)(rec + i_rec + 16), p3);
- rec += i_rec2;
- pred += i_pred2;
- }
- }
- else {
- __m256i m0, m1, r0, r1;
- __m256i min = _mm256_setzero_si256();
- __m256i max = _mm256_set1_epi16((1 << bit_depth) - 1);
- for (i = 0; i < height; i++) {
- m0 = _mm256_loadu_si256((const __m256i *)(pred));
- m1 = _mm256_loadu_si256((const __m256i *)(pred + 16));
- r0 = _mm256_load_si256((const __m256i *)(resi));
- r1 = _mm256_load_si256((const __m256i *)(resi + 16));
-
- m0 = _mm256_adds_epi16(m0, r0);
- m1 = _mm256_adds_epi16(m1, r1);
-
- m0 = _mm256_max_epi16(m0, min);
- m1 = _mm256_max_epi16(m1, min);
- m0 = _mm256_min_epi16(m0, max);
- m1 = _mm256_min_epi16(m1, max);
-
- _mm256_storeu_si256((__m256i *)(rec), m0);
- _mm256_storeu_si256((__m256i *)(rec + 16), m1);
-
- pred += i_pred;
- rec += i_rec;
- resi += 32;
- }
- }
- }
-
- void uavs3e_recon_w64_avx2(s16 *resi, pel *pred, int i_pred, int width, int height, pel *rec, int i_rec, int cbf, int bit_depth)
- {
- int i;
- if (cbf == 0) {
- __m256i p0, p1, p2, p3;
- for (i = 0; i < height; i++) {
- p0 = _mm256_loadu_si256((const __m256i *)(pred));
- p1 = _mm256_loadu_si256((const __m256i *)(pred + 16));
- p2 = _mm256_loadu_si256((const __m256i *)(pred + 32));
- p3 = _mm256_loadu_si256((const __m256i *)(pred + 48));
-
- _mm256_storeu_si256((__m256i *)(rec), p0);
- _mm256_storeu_si256((__m256i *)(rec + 16), p1);
- _mm256_storeu_si256((__m256i *)(rec + 32), p2);
- _mm256_storeu_si256((__m256i *)(rec + 48), p3);
- rec += i_rec;
- pred += i_pred;
- }
- }
- else {
- __m256i m0, m1, m2, m3, r0, r1, r2, r3;
- __m256i min = _mm256_setzero_si256();
- __m256i max = _mm256_set1_epi16((1 << bit_depth) - 1);
- for (i = 0; i < height; i++) {
- m0 = _mm256_loadu_si256((const __m256i *)(pred));
- m1 = _mm256_loadu_si256((const __m256i *)(pred + 16));
- m2 = _mm256_loadu_si256((const __m256i *)(pred + 32));
- m3 = _mm256_loadu_si256((const __m256i *)(pred + 48));
- r0 = _mm256_load_si256((const __m256i *)(resi));
- r1 = _mm256_load_si256((const __m256i *)(resi + 16));
- r2 = _mm256_load_si256((const __m256i *)(resi + 32));
- r3 = _mm256_load_si256((const __m256i *)(resi + 48));
-
- m0 = _mm256_adds_epi16(m0, r0);
- m1 = _mm256_adds_epi16(m1, r1);
- m2 = _mm256_adds_epi16(m2, r2);
- m3 = _mm256_adds_epi16(m3, r3);
-
- m0 = _mm256_max_epi16(m0, min);
- m1 = _mm256_max_epi16(m1, min);
- m2 = _mm256_max_epi16(m2, min);
- m3 = _mm256_max_epi16(m3, min);
- m0 = _mm256_min_epi16(m0, max);
- m1 = _mm256_min_epi16(m1, max);
- m2 = _mm256_min_epi16(m2, max);
- m3 = _mm256_min_epi16(m3, max);
-
- _mm256_storeu_si256((__m256i *)(rec), m0);
- _mm256_storeu_si256((__m256i *)(rec + 16), m1);
- _mm256_storeu_si256((__m256i *)(rec + 32), m2);
- _mm256_storeu_si256((__m256i *)(rec + 48), m3);
-
- pred += i_pred;
- rec += i_rec;
- resi += 64;
- }
- }
- }
-
- #endif