opencv on mbed

Dependencies: mbed


sse_utils.hpp

/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                          License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2015, Itseez Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

#ifndef __OPENCV_CORE_SSE_UTILS_HPP__
#define __OPENCV_CORE_SSE_UTILS_HPP__

#ifndef __cplusplus
#  error sse_utils.hpp header must be compiled as C++
#endif

#include "opencv2/core/cvdef.h"

//! @addtogroup core_utils_sse
//! @{

#if CV_SSE2

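// De-interleave 8-bit elements, two channels. On entry the four registers hold
// 64 consecutive bytes of interleaved data (c0 c1 c0 c1 ...), loaded in
// parameter order; on exit v_r0:v_r1 hold channel 0 and v_g0:v_g1 channel 1,
// in order. Implemented as five unpacklo/unpackhi passes (a byte-wise perfect
// shuffle), so only SSE2 is required -- no SSSE3 byte shuffle is needed.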
inline void _mm_deinterleave_epi8(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, __m128i & v_g1)
{
    __m128i layer1_chunk0 = _mm_unpacklo_epi8(v_r0, v_g0);
    __m128i layer1_chunk1 = _mm_unpackhi_epi8(v_r0, v_g0);
    __m128i layer1_chunk2 = _mm_unpacklo_epi8(v_r1, v_g1);
    __m128i layer1_chunk3 = _mm_unpackhi_epi8(v_r1, v_g1);

    __m128i layer2_chunk0 = _mm_unpacklo_epi8(layer1_chunk0, layer1_chunk2);
    __m128i layer2_chunk1 = _mm_unpackhi_epi8(layer1_chunk0, layer1_chunk2);
    __m128i layer2_chunk2 = _mm_unpacklo_epi8(layer1_chunk1, layer1_chunk3);
    __m128i layer2_chunk3 = _mm_unpackhi_epi8(layer1_chunk1, layer1_chunk3);

    __m128i layer3_chunk0 = _mm_unpacklo_epi8(layer2_chunk0, layer2_chunk2);
    __m128i layer3_chunk1 = _mm_unpackhi_epi8(layer2_chunk0, layer2_chunk2);
    __m128i layer3_chunk2 = _mm_unpacklo_epi8(layer2_chunk1, layer2_chunk3);
    __m128i layer3_chunk3 = _mm_unpackhi_epi8(layer2_chunk1, layer2_chunk3);

    __m128i layer4_chunk0 = _mm_unpacklo_epi8(layer3_chunk0, layer3_chunk2);
    __m128i layer4_chunk1 = _mm_unpackhi_epi8(layer3_chunk0, layer3_chunk2);
    __m128i layer4_chunk2 = _mm_unpacklo_epi8(layer3_chunk1, layer3_chunk3);
    __m128i layer4_chunk3 = _mm_unpackhi_epi8(layer3_chunk1, layer3_chunk3);

    v_r0 = _mm_unpacklo_epi8(layer4_chunk0, layer4_chunk2);
    v_r1 = _mm_unpackhi_epi8(layer4_chunk0, layer4_chunk2);
    v_g0 = _mm_unpacklo_epi8(layer4_chunk1, layer4_chunk3);
    v_g1 = _mm_unpackhi_epi8(layer4_chunk1, layer4_chunk3);
}

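// Three-channel variant (e.g. packed BGR): six registers, 96 interleaved bytes
// in, three planar channel pairs out. The argument names describe where each
// output plane lands; on input the registers are simply six consecutive loads
// of the interleaved stream, in parameter order.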
inline void _mm_deinterleave_epi8(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0,
                                  __m128i & v_g1, __m128i & v_b0, __m128i & v_b1)
{
    __m128i layer1_chunk0 = _mm_unpacklo_epi8(v_r0, v_g1);
    __m128i layer1_chunk1 = _mm_unpackhi_epi8(v_r0, v_g1);
    __m128i layer1_chunk2 = _mm_unpacklo_epi8(v_r1, v_b0);
    __m128i layer1_chunk3 = _mm_unpackhi_epi8(v_r1, v_b0);
    __m128i layer1_chunk4 = _mm_unpacklo_epi8(v_g0, v_b1);
    __m128i layer1_chunk5 = _mm_unpackhi_epi8(v_g0, v_b1);

    __m128i layer2_chunk0 = _mm_unpacklo_epi8(layer1_chunk0, layer1_chunk3);
    __m128i layer2_chunk1 = _mm_unpackhi_epi8(layer1_chunk0, layer1_chunk3);
    __m128i layer2_chunk2 = _mm_unpacklo_epi8(layer1_chunk1, layer1_chunk4);
    __m128i layer2_chunk3 = _mm_unpackhi_epi8(layer1_chunk1, layer1_chunk4);
    __m128i layer2_chunk4 = _mm_unpacklo_epi8(layer1_chunk2, layer1_chunk5);
    __m128i layer2_chunk5 = _mm_unpackhi_epi8(layer1_chunk2, layer1_chunk5);

    __m128i layer3_chunk0 = _mm_unpacklo_epi8(layer2_chunk0, layer2_chunk3);
    __m128i layer3_chunk1 = _mm_unpackhi_epi8(layer2_chunk0, layer2_chunk3);
    __m128i layer3_chunk2 = _mm_unpacklo_epi8(layer2_chunk1, layer2_chunk4);
    __m128i layer3_chunk3 = _mm_unpackhi_epi8(layer2_chunk1, layer2_chunk4);
    __m128i layer3_chunk4 = _mm_unpacklo_epi8(layer2_chunk2, layer2_chunk5);
    __m128i layer3_chunk5 = _mm_unpackhi_epi8(layer2_chunk2, layer2_chunk5);

    __m128i layer4_chunk0 = _mm_unpacklo_epi8(layer3_chunk0, layer3_chunk3);
    __m128i layer4_chunk1 = _mm_unpackhi_epi8(layer3_chunk0, layer3_chunk3);
    __m128i layer4_chunk2 = _mm_unpacklo_epi8(layer3_chunk1, layer3_chunk4);
    __m128i layer4_chunk3 = _mm_unpackhi_epi8(layer3_chunk1, layer3_chunk4);
    __m128i layer4_chunk4 = _mm_unpacklo_epi8(layer3_chunk2, layer3_chunk5);
    __m128i layer4_chunk5 = _mm_unpackhi_epi8(layer3_chunk2, layer3_chunk5);

    v_r0 = _mm_unpacklo_epi8(layer4_chunk0, layer4_chunk3);
    v_r1 = _mm_unpackhi_epi8(layer4_chunk0, layer4_chunk3);
    v_g0 = _mm_unpacklo_epi8(layer4_chunk1, layer4_chunk4);
    v_g1 = _mm_unpackhi_epi8(layer4_chunk1, layer4_chunk4);
    v_b0 = _mm_unpacklo_epi8(layer4_chunk2, layer4_chunk5);
    v_b1 = _mm_unpackhi_epi8(layer4_chunk2, layer4_chunk5);
}

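// Four-channel variant (e.g. packed BGRA): eight registers, 128 interleaved
// bytes in, four planar channel pairs out.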
inline void _mm_deinterleave_epi8(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, __m128i & v_g1,
                                  __m128i & v_b0, __m128i & v_b1, __m128i & v_a0, __m128i & v_a1)
{
    __m128i layer1_chunk0 = _mm_unpacklo_epi8(v_r0, v_b0);
    __m128i layer1_chunk1 = _mm_unpackhi_epi8(v_r0, v_b0);
    __m128i layer1_chunk2 = _mm_unpacklo_epi8(v_r1, v_b1);
    __m128i layer1_chunk3 = _mm_unpackhi_epi8(v_r1, v_b1);
    __m128i layer1_chunk4 = _mm_unpacklo_epi8(v_g0, v_a0);
    __m128i layer1_chunk5 = _mm_unpackhi_epi8(v_g0, v_a0);
    __m128i layer1_chunk6 = _mm_unpacklo_epi8(v_g1, v_a1);
    __m128i layer1_chunk7 = _mm_unpackhi_epi8(v_g1, v_a1);

    __m128i layer2_chunk0 = _mm_unpacklo_epi8(layer1_chunk0, layer1_chunk4);
    __m128i layer2_chunk1 = _mm_unpackhi_epi8(layer1_chunk0, layer1_chunk4);
    __m128i layer2_chunk2 = _mm_unpacklo_epi8(layer1_chunk1, layer1_chunk5);
    __m128i layer2_chunk3 = _mm_unpackhi_epi8(layer1_chunk1, layer1_chunk5);
    __m128i layer2_chunk4 = _mm_unpacklo_epi8(layer1_chunk2, layer1_chunk6);
    __m128i layer2_chunk5 = _mm_unpackhi_epi8(layer1_chunk2, layer1_chunk6);
    __m128i layer2_chunk6 = _mm_unpacklo_epi8(layer1_chunk3, layer1_chunk7);
    __m128i layer2_chunk7 = _mm_unpackhi_epi8(layer1_chunk3, layer1_chunk7);

    __m128i layer3_chunk0 = _mm_unpacklo_epi8(layer2_chunk0, layer2_chunk4);
    __m128i layer3_chunk1 = _mm_unpackhi_epi8(layer2_chunk0, layer2_chunk4);
    __m128i layer3_chunk2 = _mm_unpacklo_epi8(layer2_chunk1, layer2_chunk5);
    __m128i layer3_chunk3 = _mm_unpackhi_epi8(layer2_chunk1, layer2_chunk5);
    __m128i layer3_chunk4 = _mm_unpacklo_epi8(layer2_chunk2, layer2_chunk6);
    __m128i layer3_chunk5 = _mm_unpackhi_epi8(layer2_chunk2, layer2_chunk6);
    __m128i layer3_chunk6 = _mm_unpacklo_epi8(layer2_chunk3, layer2_chunk7);
    __m128i layer3_chunk7 = _mm_unpackhi_epi8(layer2_chunk3, layer2_chunk7);

    __m128i layer4_chunk0 = _mm_unpacklo_epi8(layer3_chunk0, layer3_chunk4);
    __m128i layer4_chunk1 = _mm_unpackhi_epi8(layer3_chunk0, layer3_chunk4);
    __m128i layer4_chunk2 = _mm_unpacklo_epi8(layer3_chunk1, layer3_chunk5);
    __m128i layer4_chunk3 = _mm_unpackhi_epi8(layer3_chunk1, layer3_chunk5);
    __m128i layer4_chunk4 = _mm_unpacklo_epi8(layer3_chunk2, layer3_chunk6);
    __m128i layer4_chunk5 = _mm_unpackhi_epi8(layer3_chunk2, layer3_chunk6);
    __m128i layer4_chunk6 = _mm_unpacklo_epi8(layer3_chunk3, layer3_chunk7);
    __m128i layer4_chunk7 = _mm_unpackhi_epi8(layer3_chunk3, layer3_chunk7);

    v_r0 = _mm_unpacklo_epi8(layer4_chunk0, layer4_chunk4);
    v_r1 = _mm_unpackhi_epi8(layer4_chunk0, layer4_chunk4);
    v_g0 = _mm_unpacklo_epi8(layer4_chunk1, layer4_chunk5);
    v_g1 = _mm_unpackhi_epi8(layer4_chunk1, layer4_chunk5);
    v_b0 = _mm_unpacklo_epi8(layer4_chunk2, layer4_chunk6);
    v_b1 = _mm_unpackhi_epi8(layer4_chunk2, layer4_chunk6);
    v_a0 = _mm_unpacklo_epi8(layer4_chunk3, layer4_chunk7);
    v_a1 = _mm_unpackhi_epi8(layer4_chunk3, layer4_chunk7);
}

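// Interleave 8-bit elements, two channels: the inverse of the de-interleave
// above. Planar data in v_r0:v_r1 / v_g0:v_g1 comes out as interleaved vectors
// in parameter order, ready for consecutive stores. Each pass masks out the
// low byte of every 16-bit lane and shifts down the high byte, then re-packs
// with _mm_packus_epi16; the unsigned saturation never triggers because every
// lane has already been reduced to 8 significant bits.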
inline void _mm_interleave_epi8(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, __m128i & v_g1)
{
    __m128i v_mask = _mm_set1_epi16(0x00ff);

    __m128i layer4_chunk0 = _mm_packus_epi16(_mm_and_si128(v_r0, v_mask), _mm_and_si128(v_r1, v_mask));
    __m128i layer4_chunk2 = _mm_packus_epi16(_mm_srli_epi16(v_r0, 8), _mm_srli_epi16(v_r1, 8));
    __m128i layer4_chunk1 = _mm_packus_epi16(_mm_and_si128(v_g0, v_mask), _mm_and_si128(v_g1, v_mask));
    __m128i layer4_chunk3 = _mm_packus_epi16(_mm_srli_epi16(v_g0, 8), _mm_srli_epi16(v_g1, 8));

    __m128i layer3_chunk0 = _mm_packus_epi16(_mm_and_si128(layer4_chunk0, v_mask), _mm_and_si128(layer4_chunk1, v_mask));
    __m128i layer3_chunk2 = _mm_packus_epi16(_mm_srli_epi16(layer4_chunk0, 8), _mm_srli_epi16(layer4_chunk1, 8));
    __m128i layer3_chunk1 = _mm_packus_epi16(_mm_and_si128(layer4_chunk2, v_mask), _mm_and_si128(layer4_chunk3, v_mask));
    __m128i layer3_chunk3 = _mm_packus_epi16(_mm_srli_epi16(layer4_chunk2, 8), _mm_srli_epi16(layer4_chunk3, 8));

    __m128i layer2_chunk0 = _mm_packus_epi16(_mm_and_si128(layer3_chunk0, v_mask), _mm_and_si128(layer3_chunk1, v_mask));
    __m128i layer2_chunk2 = _mm_packus_epi16(_mm_srli_epi16(layer3_chunk0, 8), _mm_srli_epi16(layer3_chunk1, 8));
    __m128i layer2_chunk1 = _mm_packus_epi16(_mm_and_si128(layer3_chunk2, v_mask), _mm_and_si128(layer3_chunk3, v_mask));
    __m128i layer2_chunk3 = _mm_packus_epi16(_mm_srli_epi16(layer3_chunk2, 8), _mm_srli_epi16(layer3_chunk3, 8));

    __m128i layer1_chunk0 = _mm_packus_epi16(_mm_and_si128(layer2_chunk0, v_mask), _mm_and_si128(layer2_chunk1, v_mask));
    __m128i layer1_chunk2 = _mm_packus_epi16(_mm_srli_epi16(layer2_chunk0, 8), _mm_srli_epi16(layer2_chunk1, 8));
    __m128i layer1_chunk1 = _mm_packus_epi16(_mm_and_si128(layer2_chunk2, v_mask), _mm_and_si128(layer2_chunk3, v_mask));
    __m128i layer1_chunk3 = _mm_packus_epi16(_mm_srli_epi16(layer2_chunk2, 8), _mm_srli_epi16(layer2_chunk3, 8));

    v_r0 = _mm_packus_epi16(_mm_and_si128(layer1_chunk0, v_mask), _mm_and_si128(layer1_chunk1, v_mask));
    v_g0 = _mm_packus_epi16(_mm_srli_epi16(layer1_chunk0, 8), _mm_srli_epi16(layer1_chunk1, 8));
    v_r1 = _mm_packus_epi16(_mm_and_si128(layer1_chunk2, v_mask), _mm_and_si128(layer1_chunk3, v_mask));
    v_g1 = _mm_packus_epi16(_mm_srli_epi16(layer1_chunk2, 8), _mm_srli_epi16(layer1_chunk3, 8));
}

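// Three-channel interleave: planar BGR-style input in six named registers,
// interleaved output across the same six registers in parameter order.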
inline void _mm_interleave_epi8(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0,
                                __m128i & v_g1, __m128i & v_b0, __m128i & v_b1)
{
    __m128i v_mask = _mm_set1_epi16(0x00ff);

    __m128i layer4_chunk0 = _mm_packus_epi16(_mm_and_si128(v_r0, v_mask), _mm_and_si128(v_r1, v_mask));
    __m128i layer4_chunk3 = _mm_packus_epi16(_mm_srli_epi16(v_r0, 8), _mm_srli_epi16(v_r1, 8));
    __m128i layer4_chunk1 = _mm_packus_epi16(_mm_and_si128(v_g0, v_mask), _mm_and_si128(v_g1, v_mask));
    __m128i layer4_chunk4 = _mm_packus_epi16(_mm_srli_epi16(v_g0, 8), _mm_srli_epi16(v_g1, 8));
    __m128i layer4_chunk2 = _mm_packus_epi16(_mm_and_si128(v_b0, v_mask), _mm_and_si128(v_b1, v_mask));
    __m128i layer4_chunk5 = _mm_packus_epi16(_mm_srli_epi16(v_b0, 8), _mm_srli_epi16(v_b1, 8));

    __m128i layer3_chunk0 = _mm_packus_epi16(_mm_and_si128(layer4_chunk0, v_mask), _mm_and_si128(layer4_chunk1, v_mask));
    __m128i layer3_chunk3 = _mm_packus_epi16(_mm_srli_epi16(layer4_chunk0, 8), _mm_srli_epi16(layer4_chunk1, 8));
    __m128i layer3_chunk1 = _mm_packus_epi16(_mm_and_si128(layer4_chunk2, v_mask), _mm_and_si128(layer4_chunk3, v_mask));
    __m128i layer3_chunk4 = _mm_packus_epi16(_mm_srli_epi16(layer4_chunk2, 8), _mm_srli_epi16(layer4_chunk3, 8));
    __m128i layer3_chunk2 = _mm_packus_epi16(_mm_and_si128(layer4_chunk4, v_mask), _mm_and_si128(layer4_chunk5, v_mask));
    __m128i layer3_chunk5 = _mm_packus_epi16(_mm_srli_epi16(layer4_chunk4, 8), _mm_srli_epi16(layer4_chunk5, 8));

    __m128i layer2_chunk0 = _mm_packus_epi16(_mm_and_si128(layer3_chunk0, v_mask), _mm_and_si128(layer3_chunk1, v_mask));
    __m128i layer2_chunk3 = _mm_packus_epi16(_mm_srli_epi16(layer3_chunk0, 8), _mm_srli_epi16(layer3_chunk1, 8));
    __m128i layer2_chunk1 = _mm_packus_epi16(_mm_and_si128(layer3_chunk2, v_mask), _mm_and_si128(layer3_chunk3, v_mask));
    __m128i layer2_chunk4 = _mm_packus_epi16(_mm_srli_epi16(layer3_chunk2, 8), _mm_srli_epi16(layer3_chunk3, 8));
    __m128i layer2_chunk2 = _mm_packus_epi16(_mm_and_si128(layer3_chunk4, v_mask), _mm_and_si128(layer3_chunk5, v_mask));
    __m128i layer2_chunk5 = _mm_packus_epi16(_mm_srli_epi16(layer3_chunk4, 8), _mm_srli_epi16(layer3_chunk5, 8));

    __m128i layer1_chunk0 = _mm_packus_epi16(_mm_and_si128(layer2_chunk0, v_mask), _mm_and_si128(layer2_chunk1, v_mask));
    __m128i layer1_chunk3 = _mm_packus_epi16(_mm_srli_epi16(layer2_chunk0, 8), _mm_srli_epi16(layer2_chunk1, 8));
    __m128i layer1_chunk1 = _mm_packus_epi16(_mm_and_si128(layer2_chunk2, v_mask), _mm_and_si128(layer2_chunk3, v_mask));
    __m128i layer1_chunk4 = _mm_packus_epi16(_mm_srli_epi16(layer2_chunk2, 8), _mm_srli_epi16(layer2_chunk3, 8));
    __m128i layer1_chunk2 = _mm_packus_epi16(_mm_and_si128(layer2_chunk4, v_mask), _mm_and_si128(layer2_chunk5, v_mask));
    __m128i layer1_chunk5 = _mm_packus_epi16(_mm_srli_epi16(layer2_chunk4, 8), _mm_srli_epi16(layer2_chunk5, 8));

    v_r0 = _mm_packus_epi16(_mm_and_si128(layer1_chunk0, v_mask), _mm_and_si128(layer1_chunk1, v_mask));
    v_g1 = _mm_packus_epi16(_mm_srli_epi16(layer1_chunk0, 8), _mm_srli_epi16(layer1_chunk1, 8));
    v_r1 = _mm_packus_epi16(_mm_and_si128(layer1_chunk2, v_mask), _mm_and_si128(layer1_chunk3, v_mask));
    v_b0 = _mm_packus_epi16(_mm_srli_epi16(layer1_chunk2, 8), _mm_srli_epi16(layer1_chunk3, 8));
    v_g0 = _mm_packus_epi16(_mm_and_si128(layer1_chunk4, v_mask), _mm_and_si128(layer1_chunk5, v_mask));
    v_b1 = _mm_packus_epi16(_mm_srli_epi16(layer1_chunk4, 8), _mm_srli_epi16(layer1_chunk5, 8));
}

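// Four-channel interleave (e.g. BGRA): eight planar registers in, one
// interleaved 128-byte stream out across the same registers.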
inline void _mm_interleave_epi8(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, __m128i & v_g1,
                                __m128i & v_b0, __m128i & v_b1, __m128i & v_a0, __m128i & v_a1)
{
    __m128i v_mask = _mm_set1_epi16(0x00ff);

    __m128i layer4_chunk0 = _mm_packus_epi16(_mm_and_si128(v_r0, v_mask), _mm_and_si128(v_r1, v_mask));
    __m128i layer4_chunk4 = _mm_packus_epi16(_mm_srli_epi16(v_r0, 8), _mm_srli_epi16(v_r1, 8));
    __m128i layer4_chunk1 = _mm_packus_epi16(_mm_and_si128(v_g0, v_mask), _mm_and_si128(v_g1, v_mask));
    __m128i layer4_chunk5 = _mm_packus_epi16(_mm_srli_epi16(v_g0, 8), _mm_srli_epi16(v_g1, 8));
    __m128i layer4_chunk2 = _mm_packus_epi16(_mm_and_si128(v_b0, v_mask), _mm_and_si128(v_b1, v_mask));
    __m128i layer4_chunk6 = _mm_packus_epi16(_mm_srli_epi16(v_b0, 8), _mm_srli_epi16(v_b1, 8));
    __m128i layer4_chunk3 = _mm_packus_epi16(_mm_and_si128(v_a0, v_mask), _mm_and_si128(v_a1, v_mask));
    __m128i layer4_chunk7 = _mm_packus_epi16(_mm_srli_epi16(v_a0, 8), _mm_srli_epi16(v_a1, 8));

    __m128i layer3_chunk0 = _mm_packus_epi16(_mm_and_si128(layer4_chunk0, v_mask), _mm_and_si128(layer4_chunk1, v_mask));
    __m128i layer3_chunk4 = _mm_packus_epi16(_mm_srli_epi16(layer4_chunk0, 8), _mm_srli_epi16(layer4_chunk1, 8));
    __m128i layer3_chunk1 = _mm_packus_epi16(_mm_and_si128(layer4_chunk2, v_mask), _mm_and_si128(layer4_chunk3, v_mask));
    __m128i layer3_chunk5 = _mm_packus_epi16(_mm_srli_epi16(layer4_chunk2, 8), _mm_srli_epi16(layer4_chunk3, 8));
    __m128i layer3_chunk2 = _mm_packus_epi16(_mm_and_si128(layer4_chunk4, v_mask), _mm_and_si128(layer4_chunk5, v_mask));
    __m128i layer3_chunk6 = _mm_packus_epi16(_mm_srli_epi16(layer4_chunk4, 8), _mm_srli_epi16(layer4_chunk5, 8));
    __m128i layer3_chunk3 = _mm_packus_epi16(_mm_and_si128(layer4_chunk6, v_mask), _mm_and_si128(layer4_chunk7, v_mask));
    __m128i layer3_chunk7 = _mm_packus_epi16(_mm_srli_epi16(layer4_chunk6, 8), _mm_srli_epi16(layer4_chunk7, 8));

    __m128i layer2_chunk0 = _mm_packus_epi16(_mm_and_si128(layer3_chunk0, v_mask), _mm_and_si128(layer3_chunk1, v_mask));
    __m128i layer2_chunk4 = _mm_packus_epi16(_mm_srli_epi16(layer3_chunk0, 8), _mm_srli_epi16(layer3_chunk1, 8));
    __m128i layer2_chunk1 = _mm_packus_epi16(_mm_and_si128(layer3_chunk2, v_mask), _mm_and_si128(layer3_chunk3, v_mask));
    __m128i layer2_chunk5 = _mm_packus_epi16(_mm_srli_epi16(layer3_chunk2, 8), _mm_srli_epi16(layer3_chunk3, 8));
    __m128i layer2_chunk2 = _mm_packus_epi16(_mm_and_si128(layer3_chunk4, v_mask), _mm_and_si128(layer3_chunk5, v_mask));
    __m128i layer2_chunk6 = _mm_packus_epi16(_mm_srli_epi16(layer3_chunk4, 8), _mm_srli_epi16(layer3_chunk5, 8));
    __m128i layer2_chunk3 = _mm_packus_epi16(_mm_and_si128(layer3_chunk6, v_mask), _mm_and_si128(layer3_chunk7, v_mask));
    __m128i layer2_chunk7 = _mm_packus_epi16(_mm_srli_epi16(layer3_chunk6, 8), _mm_srli_epi16(layer3_chunk7, 8));

    __m128i layer1_chunk0 = _mm_packus_epi16(_mm_and_si128(layer2_chunk0, v_mask), _mm_and_si128(layer2_chunk1, v_mask));
    __m128i layer1_chunk4 = _mm_packus_epi16(_mm_srli_epi16(layer2_chunk0, 8), _mm_srli_epi16(layer2_chunk1, 8));
    __m128i layer1_chunk1 = _mm_packus_epi16(_mm_and_si128(layer2_chunk2, v_mask), _mm_and_si128(layer2_chunk3, v_mask));
    __m128i layer1_chunk5 = _mm_packus_epi16(_mm_srli_epi16(layer2_chunk2, 8), _mm_srli_epi16(layer2_chunk3, 8));
    __m128i layer1_chunk2 = _mm_packus_epi16(_mm_and_si128(layer2_chunk4, v_mask), _mm_and_si128(layer2_chunk5, v_mask));
    __m128i layer1_chunk6 = _mm_packus_epi16(_mm_srli_epi16(layer2_chunk4, 8), _mm_srli_epi16(layer2_chunk5, 8));
    __m128i layer1_chunk3 = _mm_packus_epi16(_mm_and_si128(layer2_chunk6, v_mask), _mm_and_si128(layer2_chunk7, v_mask));
    __m128i layer1_chunk7 = _mm_packus_epi16(_mm_srli_epi16(layer2_chunk6, 8), _mm_srli_epi16(layer2_chunk7, 8));

    v_r0 = _mm_packus_epi16(_mm_and_si128(layer1_chunk0, v_mask), _mm_and_si128(layer1_chunk1, v_mask));
    v_b0 = _mm_packus_epi16(_mm_srli_epi16(layer1_chunk0, 8), _mm_srli_epi16(layer1_chunk1, 8));
    v_r1 = _mm_packus_epi16(_mm_and_si128(layer1_chunk2, v_mask), _mm_and_si128(layer1_chunk3, v_mask));
    v_b1 = _mm_packus_epi16(_mm_srli_epi16(layer1_chunk2, 8), _mm_srli_epi16(layer1_chunk3, 8));
    v_g0 = _mm_packus_epi16(_mm_and_si128(layer1_chunk4, v_mask), _mm_and_si128(layer1_chunk5, v_mask));
    v_a0 = _mm_packus_epi16(_mm_srli_epi16(layer1_chunk4, 8), _mm_srli_epi16(layer1_chunk5, 8));
    v_g1 = _mm_packus_epi16(_mm_and_si128(layer1_chunk6, v_mask), _mm_and_si128(layer1_chunk7, v_mask));
    v_a1 = _mm_packus_epi16(_mm_srli_epi16(layer1_chunk6, 8), _mm_srli_epi16(layer1_chunk7, 8));
}

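// De-interleave 16-bit elements. Same perfect-shuffle scheme as the 8-bit
// versions, but with 16-bit unpacks and one pass fewer, since each register
// holds eight elements instead of sixteen.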
inline void _mm_deinterleave_epi16(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, __m128i & v_g1)
{
    __m128i layer1_chunk0 = _mm_unpacklo_epi16(v_r0, v_g0);
    __m128i layer1_chunk1 = _mm_unpackhi_epi16(v_r0, v_g0);
    __m128i layer1_chunk2 = _mm_unpacklo_epi16(v_r1, v_g1);
    __m128i layer1_chunk3 = _mm_unpackhi_epi16(v_r1, v_g1);

    __m128i layer2_chunk0 = _mm_unpacklo_epi16(layer1_chunk0, layer1_chunk2);
    __m128i layer2_chunk1 = _mm_unpackhi_epi16(layer1_chunk0, layer1_chunk2);
    __m128i layer2_chunk2 = _mm_unpacklo_epi16(layer1_chunk1, layer1_chunk3);
    __m128i layer2_chunk3 = _mm_unpackhi_epi16(layer1_chunk1, layer1_chunk3);

    __m128i layer3_chunk0 = _mm_unpacklo_epi16(layer2_chunk0, layer2_chunk2);
    __m128i layer3_chunk1 = _mm_unpackhi_epi16(layer2_chunk0, layer2_chunk2);
    __m128i layer3_chunk2 = _mm_unpacklo_epi16(layer2_chunk1, layer2_chunk3);
    __m128i layer3_chunk3 = _mm_unpackhi_epi16(layer2_chunk1, layer2_chunk3);

    v_r0 = _mm_unpacklo_epi16(layer3_chunk0, layer3_chunk2);
    v_r1 = _mm_unpackhi_epi16(layer3_chunk0, layer3_chunk2);
    v_g0 = _mm_unpacklo_epi16(layer3_chunk1, layer3_chunk3);
    v_g1 = _mm_unpackhi_epi16(layer3_chunk1, layer3_chunk3);
}

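// Three-channel 16-bit variant: six registers, 48 interleaved elements.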
inline void _mm_deinterleave_epi16(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0,
                                   __m128i & v_g1, __m128i & v_b0, __m128i & v_b1)
{
    __m128i layer1_chunk0 = _mm_unpacklo_epi16(v_r0, v_g1);
    __m128i layer1_chunk1 = _mm_unpackhi_epi16(v_r0, v_g1);
    __m128i layer1_chunk2 = _mm_unpacklo_epi16(v_r1, v_b0);
    __m128i layer1_chunk3 = _mm_unpackhi_epi16(v_r1, v_b0);
    __m128i layer1_chunk4 = _mm_unpacklo_epi16(v_g0, v_b1);
    __m128i layer1_chunk5 = _mm_unpackhi_epi16(v_g0, v_b1);

    __m128i layer2_chunk0 = _mm_unpacklo_epi16(layer1_chunk0, layer1_chunk3);
    __m128i layer2_chunk1 = _mm_unpackhi_epi16(layer1_chunk0, layer1_chunk3);
    __m128i layer2_chunk2 = _mm_unpacklo_epi16(layer1_chunk1, layer1_chunk4);
    __m128i layer2_chunk3 = _mm_unpackhi_epi16(layer1_chunk1, layer1_chunk4);
    __m128i layer2_chunk4 = _mm_unpacklo_epi16(layer1_chunk2, layer1_chunk5);
    __m128i layer2_chunk5 = _mm_unpackhi_epi16(layer1_chunk2, layer1_chunk5);

    __m128i layer3_chunk0 = _mm_unpacklo_epi16(layer2_chunk0, layer2_chunk3);
    __m128i layer3_chunk1 = _mm_unpackhi_epi16(layer2_chunk0, layer2_chunk3);
    __m128i layer3_chunk2 = _mm_unpacklo_epi16(layer2_chunk1, layer2_chunk4);
    __m128i layer3_chunk3 = _mm_unpackhi_epi16(layer2_chunk1, layer2_chunk4);
    __m128i layer3_chunk4 = _mm_unpacklo_epi16(layer2_chunk2, layer2_chunk5);
    __m128i layer3_chunk5 = _mm_unpackhi_epi16(layer2_chunk2, layer2_chunk5);

    v_r0 = _mm_unpacklo_epi16(layer3_chunk0, layer3_chunk3);
    v_r1 = _mm_unpackhi_epi16(layer3_chunk0, layer3_chunk3);
    v_g0 = _mm_unpacklo_epi16(layer3_chunk1, layer3_chunk4);
    v_g1 = _mm_unpackhi_epi16(layer3_chunk1, layer3_chunk4);
    v_b0 = _mm_unpacklo_epi16(layer3_chunk2, layer3_chunk5);
    v_b1 = _mm_unpackhi_epi16(layer3_chunk2, layer3_chunk5);
}

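// Four-channel 16-bit variant: eight registers, 64 interleaved elements.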
inline void _mm_deinterleave_epi16(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, __m128i & v_g1,
                                   __m128i & v_b0, __m128i & v_b1, __m128i & v_a0, __m128i & v_a1)
{
    __m128i layer1_chunk0 = _mm_unpacklo_epi16(v_r0, v_b0);
    __m128i layer1_chunk1 = _mm_unpackhi_epi16(v_r0, v_b0);
    __m128i layer1_chunk2 = _mm_unpacklo_epi16(v_r1, v_b1);
    __m128i layer1_chunk3 = _mm_unpackhi_epi16(v_r1, v_b1);
    __m128i layer1_chunk4 = _mm_unpacklo_epi16(v_g0, v_a0);
    __m128i layer1_chunk5 = _mm_unpackhi_epi16(v_g0, v_a0);
    __m128i layer1_chunk6 = _mm_unpacklo_epi16(v_g1, v_a1);
    __m128i layer1_chunk7 = _mm_unpackhi_epi16(v_g1, v_a1);

    __m128i layer2_chunk0 = _mm_unpacklo_epi16(layer1_chunk0, layer1_chunk4);
    __m128i layer2_chunk1 = _mm_unpackhi_epi16(layer1_chunk0, layer1_chunk4);
    __m128i layer2_chunk2 = _mm_unpacklo_epi16(layer1_chunk1, layer1_chunk5);
    __m128i layer2_chunk3 = _mm_unpackhi_epi16(layer1_chunk1, layer1_chunk5);
    __m128i layer2_chunk4 = _mm_unpacklo_epi16(layer1_chunk2, layer1_chunk6);
    __m128i layer2_chunk5 = _mm_unpackhi_epi16(layer1_chunk2, layer1_chunk6);
    __m128i layer2_chunk6 = _mm_unpacklo_epi16(layer1_chunk3, layer1_chunk7);
    __m128i layer2_chunk7 = _mm_unpackhi_epi16(layer1_chunk3, layer1_chunk7);

    __m128i layer3_chunk0 = _mm_unpacklo_epi16(layer2_chunk0, layer2_chunk4);
    __m128i layer3_chunk1 = _mm_unpackhi_epi16(layer2_chunk0, layer2_chunk4);
    __m128i layer3_chunk2 = _mm_unpacklo_epi16(layer2_chunk1, layer2_chunk5);
    __m128i layer3_chunk3 = _mm_unpackhi_epi16(layer2_chunk1, layer2_chunk5);
    __m128i layer3_chunk4 = _mm_unpacklo_epi16(layer2_chunk2, layer2_chunk6);
    __m128i layer3_chunk5 = _mm_unpackhi_epi16(layer2_chunk2, layer2_chunk6);
    __m128i layer3_chunk6 = _mm_unpacklo_epi16(layer2_chunk3, layer2_chunk7);
    __m128i layer3_chunk7 = _mm_unpackhi_epi16(layer2_chunk3, layer2_chunk7);

    v_r0 = _mm_unpacklo_epi16(layer3_chunk0, layer3_chunk4);
    v_r1 = _mm_unpackhi_epi16(layer3_chunk0, layer3_chunk4);
    v_g0 = _mm_unpacklo_epi16(layer3_chunk1, layer3_chunk5);
    v_g1 = _mm_unpackhi_epi16(layer3_chunk1, layer3_chunk5);
    v_b0 = _mm_unpacklo_epi16(layer3_chunk2, layer3_chunk6);
    v_b1 = _mm_unpackhi_epi16(layer3_chunk2, layer3_chunk6);
    v_a0 = _mm_unpacklo_epi16(layer3_chunk3, layer3_chunk7);
    v_a1 = _mm_unpackhi_epi16(layer3_chunk3, layer3_chunk7);
}

#if CV_SSE4_1

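// Interleave 16-bit elements: inverse of _mm_deinterleave_epi16 above. These
// variants are guarded by CV_SSE4_1 because _mm_packus_epi32 (PACKUSDW) is an
// SSE4.1 instruction; everything else here is plain SSE2.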
inline void _mm_interleave_epi16(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, __m128i & v_g1)
{
    __m128i v_mask = _mm_set1_epi32(0x0000ffff);

    __m128i layer3_chunk0 = _mm_packus_epi32(_mm_and_si128(v_r0, v_mask), _mm_and_si128(v_r1, v_mask));
    __m128i layer3_chunk2 = _mm_packus_epi32(_mm_srli_epi32(v_r0, 16), _mm_srli_epi32(v_r1, 16));
    __m128i layer3_chunk1 = _mm_packus_epi32(_mm_and_si128(v_g0, v_mask), _mm_and_si128(v_g1, v_mask));
    __m128i layer3_chunk3 = _mm_packus_epi32(_mm_srli_epi32(v_g0, 16), _mm_srli_epi32(v_g1, 16));

    __m128i layer2_chunk0 = _mm_packus_epi32(_mm_and_si128(layer3_chunk0, v_mask), _mm_and_si128(layer3_chunk1, v_mask));
    __m128i layer2_chunk2 = _mm_packus_epi32(_mm_srli_epi32(layer3_chunk0, 16), _mm_srli_epi32(layer3_chunk1, 16));
    __m128i layer2_chunk1 = _mm_packus_epi32(_mm_and_si128(layer3_chunk2, v_mask), _mm_and_si128(layer3_chunk3, v_mask));
    __m128i layer2_chunk3 = _mm_packus_epi32(_mm_srli_epi32(layer3_chunk2, 16), _mm_srli_epi32(layer3_chunk3, 16));

    __m128i layer1_chunk0 = _mm_packus_epi32(_mm_and_si128(layer2_chunk0, v_mask), _mm_and_si128(layer2_chunk1, v_mask));
    __m128i layer1_chunk2 = _mm_packus_epi32(_mm_srli_epi32(layer2_chunk0, 16), _mm_srli_epi32(layer2_chunk1, 16));
    __m128i layer1_chunk1 = _mm_packus_epi32(_mm_and_si128(layer2_chunk2, v_mask), _mm_and_si128(layer2_chunk3, v_mask));
    __m128i layer1_chunk3 = _mm_packus_epi32(_mm_srli_epi32(layer2_chunk2, 16), _mm_srli_epi32(layer2_chunk3, 16));

    v_r0 = _mm_packus_epi32(_mm_and_si128(layer1_chunk0, v_mask), _mm_and_si128(layer1_chunk1, v_mask));
    v_g0 = _mm_packus_epi32(_mm_srli_epi32(layer1_chunk0, 16), _mm_srli_epi32(layer1_chunk1, 16));
    v_r1 = _mm_packus_epi32(_mm_and_si128(layer1_chunk2, v_mask), _mm_and_si128(layer1_chunk3, v_mask));
    v_g1 = _mm_packus_epi32(_mm_srli_epi32(layer1_chunk2, 16), _mm_srli_epi32(layer1_chunk3, 16));
}

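// Three-channel 16-bit interleave.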
inline void _mm_interleave_epi16(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0,
                                 __m128i & v_g1, __m128i & v_b0, __m128i & v_b1)
{
    __m128i v_mask = _mm_set1_epi32(0x0000ffff);

    __m128i layer3_chunk0 = _mm_packus_epi32(_mm_and_si128(v_r0, v_mask), _mm_and_si128(v_r1, v_mask));
    __m128i layer3_chunk3 = _mm_packus_epi32(_mm_srli_epi32(v_r0, 16), _mm_srli_epi32(v_r1, 16));
    __m128i layer3_chunk1 = _mm_packus_epi32(_mm_and_si128(v_g0, v_mask), _mm_and_si128(v_g1, v_mask));
    __m128i layer3_chunk4 = _mm_packus_epi32(_mm_srli_epi32(v_g0, 16), _mm_srli_epi32(v_g1, 16));
    __m128i layer3_chunk2 = _mm_packus_epi32(_mm_and_si128(v_b0, v_mask), _mm_and_si128(v_b1, v_mask));
    __m128i layer3_chunk5 = _mm_packus_epi32(_mm_srli_epi32(v_b0, 16), _mm_srli_epi32(v_b1, 16));

    __m128i layer2_chunk0 = _mm_packus_epi32(_mm_and_si128(layer3_chunk0, v_mask), _mm_and_si128(layer3_chunk1, v_mask));
    __m128i layer2_chunk3 = _mm_packus_epi32(_mm_srli_epi32(layer3_chunk0, 16), _mm_srli_epi32(layer3_chunk1, 16));
    __m128i layer2_chunk1 = _mm_packus_epi32(_mm_and_si128(layer3_chunk2, v_mask), _mm_and_si128(layer3_chunk3, v_mask));
    __m128i layer2_chunk4 = _mm_packus_epi32(_mm_srli_epi32(layer3_chunk2, 16), _mm_srli_epi32(layer3_chunk3, 16));
    __m128i layer2_chunk2 = _mm_packus_epi32(_mm_and_si128(layer3_chunk4, v_mask), _mm_and_si128(layer3_chunk5, v_mask));
    __m128i layer2_chunk5 = _mm_packus_epi32(_mm_srli_epi32(layer3_chunk4, 16), _mm_srli_epi32(layer3_chunk5, 16));

    __m128i layer1_chunk0 = _mm_packus_epi32(_mm_and_si128(layer2_chunk0, v_mask), _mm_and_si128(layer2_chunk1, v_mask));
    __m128i layer1_chunk3 = _mm_packus_epi32(_mm_srli_epi32(layer2_chunk0, 16), _mm_srli_epi32(layer2_chunk1, 16));
    __m128i layer1_chunk1 = _mm_packus_epi32(_mm_and_si128(layer2_chunk2, v_mask), _mm_and_si128(layer2_chunk3, v_mask));
    __m128i layer1_chunk4 = _mm_packus_epi32(_mm_srli_epi32(layer2_chunk2, 16), _mm_srli_epi32(layer2_chunk3, 16));
    __m128i layer1_chunk2 = _mm_packus_epi32(_mm_and_si128(layer2_chunk4, v_mask), _mm_and_si128(layer2_chunk5, v_mask));
    __m128i layer1_chunk5 = _mm_packus_epi32(_mm_srli_epi32(layer2_chunk4, 16), _mm_srli_epi32(layer2_chunk5, 16));

    v_r0 = _mm_packus_epi32(_mm_and_si128(layer1_chunk0, v_mask), _mm_and_si128(layer1_chunk1, v_mask));
    v_g1 = _mm_packus_epi32(_mm_srli_epi32(layer1_chunk0, 16), _mm_srli_epi32(layer1_chunk1, 16));
    v_r1 = _mm_packus_epi32(_mm_and_si128(layer1_chunk2, v_mask), _mm_and_si128(layer1_chunk3, v_mask));
    v_b0 = _mm_packus_epi32(_mm_srli_epi32(layer1_chunk2, 16), _mm_srli_epi32(layer1_chunk3, 16));
    v_g0 = _mm_packus_epi32(_mm_and_si128(layer1_chunk4, v_mask), _mm_and_si128(layer1_chunk5, v_mask));
    v_b1 = _mm_packus_epi32(_mm_srli_epi32(layer1_chunk4, 16), _mm_srli_epi32(layer1_chunk5, 16));
}

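// Four-channel 16-bit interleave.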
inline void _mm_interleave_epi16(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, __m128i & v_g1,
                                 __m128i & v_b0, __m128i & v_b1, __m128i & v_a0, __m128i & v_a1)
{
    __m128i v_mask = _mm_set1_epi32(0x0000ffff);

    __m128i layer3_chunk0 = _mm_packus_epi32(_mm_and_si128(v_r0, v_mask), _mm_and_si128(v_r1, v_mask));
    __m128i layer3_chunk4 = _mm_packus_epi32(_mm_srli_epi32(v_r0, 16), _mm_srli_epi32(v_r1, 16));
    __m128i layer3_chunk1 = _mm_packus_epi32(_mm_and_si128(v_g0, v_mask), _mm_and_si128(v_g1, v_mask));
    __m128i layer3_chunk5 = _mm_packus_epi32(_mm_srli_epi32(v_g0, 16), _mm_srli_epi32(v_g1, 16));
    __m128i layer3_chunk2 = _mm_packus_epi32(_mm_and_si128(v_b0, v_mask), _mm_and_si128(v_b1, v_mask));
    __m128i layer3_chunk6 = _mm_packus_epi32(_mm_srli_epi32(v_b0, 16), _mm_srli_epi32(v_b1, 16));
    __m128i layer3_chunk3 = _mm_packus_epi32(_mm_and_si128(v_a0, v_mask), _mm_and_si128(v_a1, v_mask));
    __m128i layer3_chunk7 = _mm_packus_epi32(_mm_srli_epi32(v_a0, 16), _mm_srli_epi32(v_a1, 16));

    __m128i layer2_chunk0 = _mm_packus_epi32(_mm_and_si128(layer3_chunk0, v_mask), _mm_and_si128(layer3_chunk1, v_mask));
    __m128i layer2_chunk4 = _mm_packus_epi32(_mm_srli_epi32(layer3_chunk0, 16), _mm_srli_epi32(layer3_chunk1, 16));
    __m128i layer2_chunk1 = _mm_packus_epi32(_mm_and_si128(layer3_chunk2, v_mask), _mm_and_si128(layer3_chunk3, v_mask));
    __m128i layer2_chunk5 = _mm_packus_epi32(_mm_srli_epi32(layer3_chunk2, 16), _mm_srli_epi32(layer3_chunk3, 16));
    __m128i layer2_chunk2 = _mm_packus_epi32(_mm_and_si128(layer3_chunk4, v_mask), _mm_and_si128(layer3_chunk5, v_mask));
    __m128i layer2_chunk6 = _mm_packus_epi32(_mm_srli_epi32(layer3_chunk4, 16), _mm_srli_epi32(layer3_chunk5, 16));
    __m128i layer2_chunk3 = _mm_packus_epi32(_mm_and_si128(layer3_chunk6, v_mask), _mm_and_si128(layer3_chunk7, v_mask));
    __m128i layer2_chunk7 = _mm_packus_epi32(_mm_srli_epi32(layer3_chunk6, 16), _mm_srli_epi32(layer3_chunk7, 16));

    __m128i layer1_chunk0 = _mm_packus_epi32(_mm_and_si128(layer2_chunk0, v_mask), _mm_and_si128(layer2_chunk1, v_mask));
    __m128i layer1_chunk4 = _mm_packus_epi32(_mm_srli_epi32(layer2_chunk0, 16), _mm_srli_epi32(layer2_chunk1, 16));
    __m128i layer1_chunk1 = _mm_packus_epi32(_mm_and_si128(layer2_chunk2, v_mask), _mm_and_si128(layer2_chunk3, v_mask));
    __m128i layer1_chunk5 = _mm_packus_epi32(_mm_srli_epi32(layer2_chunk2, 16), _mm_srli_epi32(layer2_chunk3, 16));
    __m128i layer1_chunk2 = _mm_packus_epi32(_mm_and_si128(layer2_chunk4, v_mask), _mm_and_si128(layer2_chunk5, v_mask));
    __m128i layer1_chunk6 = _mm_packus_epi32(_mm_srli_epi32(layer2_chunk4, 16), _mm_srli_epi32(layer2_chunk5, 16));
    __m128i layer1_chunk3 = _mm_packus_epi32(_mm_and_si128(layer2_chunk6, v_mask), _mm_and_si128(layer2_chunk7, v_mask));
    __m128i layer1_chunk7 = _mm_packus_epi32(_mm_srli_epi32(layer2_chunk6, 16), _mm_srli_epi32(layer2_chunk7, 16));

    v_r0 = _mm_packus_epi32(_mm_and_si128(layer1_chunk0, v_mask), _mm_and_si128(layer1_chunk1, v_mask));
    v_b0 = _mm_packus_epi32(_mm_srli_epi32(layer1_chunk0, 16), _mm_srli_epi32(layer1_chunk1, 16));
    v_r1 = _mm_packus_epi32(_mm_and_si128(layer1_chunk2, v_mask), _mm_and_si128(layer1_chunk3, v_mask));
    v_b1 = _mm_packus_epi32(_mm_srli_epi32(layer1_chunk2, 16), _mm_srli_epi32(layer1_chunk3, 16));
    v_g0 = _mm_packus_epi32(_mm_and_si128(layer1_chunk4, v_mask), _mm_and_si128(layer1_chunk5, v_mask));
    v_a0 = _mm_packus_epi32(_mm_srli_epi32(layer1_chunk4, 16), _mm_srli_epi32(layer1_chunk5, 16));
    v_g1 = _mm_packus_epi32(_mm_and_si128(layer1_chunk6, v_mask), _mm_and_si128(layer1_chunk7, v_mask));
    v_a1 = _mm_packus_epi32(_mm_srli_epi32(layer1_chunk6, 16), _mm_srli_epi32(layer1_chunk7, 16));
}

#endif // CV_SSE4_1

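// De-interleave single-precision floats. Only four elements per register, so
// the unpacklo_ps/unpackhi_ps network needs just three passes for two
// channels.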
inline void _mm_deinterleave_ps(__m128 & v_r0, __m128 & v_r1, __m128 & v_g0, __m128 & v_g1)
{
    __m128 layer1_chunk0 = _mm_unpacklo_ps(v_r0, v_g0);
    __m128 layer1_chunk1 = _mm_unpackhi_ps(v_r0, v_g0);
    __m128 layer1_chunk2 = _mm_unpacklo_ps(v_r1, v_g1);
    __m128 layer1_chunk3 = _mm_unpackhi_ps(v_r1, v_g1);

    __m128 layer2_chunk0 = _mm_unpacklo_ps(layer1_chunk0, layer1_chunk2);
    __m128 layer2_chunk1 = _mm_unpackhi_ps(layer1_chunk0, layer1_chunk2);
    __m128 layer2_chunk2 = _mm_unpacklo_ps(layer1_chunk1, layer1_chunk3);
    __m128 layer2_chunk3 = _mm_unpackhi_ps(layer1_chunk1, layer1_chunk3);

    v_r0 = _mm_unpacklo_ps(layer2_chunk0, layer2_chunk2);
    v_r1 = _mm_unpackhi_ps(layer2_chunk0, layer2_chunk2);
    v_g0 = _mm_unpacklo_ps(layer2_chunk1, layer2_chunk3);
    v_g1 = _mm_unpackhi_ps(layer2_chunk1, layer2_chunk3);
}

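// Three-channel float variant.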
inline void _mm_deinterleave_ps(__m128 & v_r0, __m128 & v_r1, __m128 & v_g0,
                                __m128 & v_g1, __m128 & v_b0, __m128 & v_b1)
{
    __m128 layer1_chunk0 = _mm_unpacklo_ps(v_r0, v_g1);
    __m128 layer1_chunk1 = _mm_unpackhi_ps(v_r0, v_g1);
    __m128 layer1_chunk2 = _mm_unpacklo_ps(v_r1, v_b0);
    __m128 layer1_chunk3 = _mm_unpackhi_ps(v_r1, v_b0);
    __m128 layer1_chunk4 = _mm_unpacklo_ps(v_g0, v_b1);
    __m128 layer1_chunk5 = _mm_unpackhi_ps(v_g0, v_b1);

    __m128 layer2_chunk0 = _mm_unpacklo_ps(layer1_chunk0, layer1_chunk3);
    __m128 layer2_chunk1 = _mm_unpackhi_ps(layer1_chunk0, layer1_chunk3);
    __m128 layer2_chunk2 = _mm_unpacklo_ps(layer1_chunk1, layer1_chunk4);
    __m128 layer2_chunk3 = _mm_unpackhi_ps(layer1_chunk1, layer1_chunk4);
    __m128 layer2_chunk4 = _mm_unpacklo_ps(layer1_chunk2, layer1_chunk5);
    __m128 layer2_chunk5 = _mm_unpackhi_ps(layer1_chunk2, layer1_chunk5);

    v_r0 = _mm_unpacklo_ps(layer2_chunk0, layer2_chunk3);
    v_r1 = _mm_unpackhi_ps(layer2_chunk0, layer2_chunk3);
    v_g0 = _mm_unpacklo_ps(layer2_chunk1, layer2_chunk4);
    v_g1 = _mm_unpackhi_ps(layer2_chunk1, layer2_chunk4);
    v_b0 = _mm_unpacklo_ps(layer2_chunk2, layer2_chunk5);
    v_b1 = _mm_unpackhi_ps(layer2_chunk2, layer2_chunk5);
}

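// Four-channel float variant.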
inline void _mm_deinterleave_ps(__m128 & v_r0, __m128 & v_r1, __m128 & v_g0, __m128 & v_g1,
                                __m128 & v_b0, __m128 & v_b1, __m128 & v_a0, __m128 & v_a1)
{
    __m128 layer1_chunk0 = _mm_unpacklo_ps(v_r0, v_b0);
    __m128 layer1_chunk1 = _mm_unpackhi_ps(v_r0, v_b0);
    __m128 layer1_chunk2 = _mm_unpacklo_ps(v_r1, v_b1);
    __m128 layer1_chunk3 = _mm_unpackhi_ps(v_r1, v_b1);
    __m128 layer1_chunk4 = _mm_unpacklo_ps(v_g0, v_a0);
    __m128 layer1_chunk5 = _mm_unpackhi_ps(v_g0, v_a0);
    __m128 layer1_chunk6 = _mm_unpacklo_ps(v_g1, v_a1);
    __m128 layer1_chunk7 = _mm_unpackhi_ps(v_g1, v_a1);

    __m128 layer2_chunk0 = _mm_unpacklo_ps(layer1_chunk0, layer1_chunk4);
    __m128 layer2_chunk1 = _mm_unpackhi_ps(layer1_chunk0, layer1_chunk4);
    __m128 layer2_chunk2 = _mm_unpacklo_ps(layer1_chunk1, layer1_chunk5);
    __m128 layer2_chunk3 = _mm_unpackhi_ps(layer1_chunk1, layer1_chunk5);
    __m128 layer2_chunk4 = _mm_unpacklo_ps(layer1_chunk2, layer1_chunk6);
    __m128 layer2_chunk5 = _mm_unpackhi_ps(layer1_chunk2, layer1_chunk6);
    __m128 layer2_chunk6 = _mm_unpacklo_ps(layer1_chunk3, layer1_chunk7);
    __m128 layer2_chunk7 = _mm_unpackhi_ps(layer1_chunk3, layer1_chunk7);

    v_r0 = _mm_unpacklo_ps(layer2_chunk0, layer2_chunk4);
    v_r1 = _mm_unpackhi_ps(layer2_chunk0, layer2_chunk4);
    v_g0 = _mm_unpacklo_ps(layer2_chunk1, layer2_chunk5);
    v_g1 = _mm_unpackhi_ps(layer2_chunk1, layer2_chunk5);
    v_b0 = _mm_unpacklo_ps(layer2_chunk2, layer2_chunk6);
    v_b1 = _mm_unpackhi_ps(layer2_chunk2, layer2_chunk6);
    v_a0 = _mm_unpacklo_ps(layer2_chunk3, layer2_chunk7);
    v_a1 = _mm_unpackhi_ps(layer2_chunk3, layer2_chunk7);
}

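// Interleave single-precision floats. _mm_shuffle_ps with mask_lo picks the
// even elements of each source pair and mask_hi the odd ones, which is
// exactly the inverse of the unpack network above.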
inline void _mm_interleave_ps(__m128 & v_r0, __m128 & v_r1, __m128 & v_g0, __m128 & v_g1)
{
    const int mask_lo = _MM_SHUFFLE(2, 0, 2, 0), mask_hi = _MM_SHUFFLE(3, 1, 3, 1);

    __m128 layer2_chunk0 = _mm_shuffle_ps(v_r0, v_r1, mask_lo);
    __m128 layer2_chunk2 = _mm_shuffle_ps(v_r0, v_r1, mask_hi);
    __m128 layer2_chunk1 = _mm_shuffle_ps(v_g0, v_g1, mask_lo);
    __m128 layer2_chunk3 = _mm_shuffle_ps(v_g0, v_g1, mask_hi);

    __m128 layer1_chunk0 = _mm_shuffle_ps(layer2_chunk0, layer2_chunk1, mask_lo);
    __m128 layer1_chunk2 = _mm_shuffle_ps(layer2_chunk0, layer2_chunk1, mask_hi);
    __m128 layer1_chunk1 = _mm_shuffle_ps(layer2_chunk2, layer2_chunk3, mask_lo);
    __m128 layer1_chunk3 = _mm_shuffle_ps(layer2_chunk2, layer2_chunk3, mask_hi);

    v_r0 = _mm_shuffle_ps(layer1_chunk0, layer1_chunk1, mask_lo);
    v_g0 = _mm_shuffle_ps(layer1_chunk0, layer1_chunk1, mask_hi);
    v_r1 = _mm_shuffle_ps(layer1_chunk2, layer1_chunk3, mask_lo);
    v_g1 = _mm_shuffle_ps(layer1_chunk2, layer1_chunk3, mask_hi);
}

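// Three-channel float interleave.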
inline void _mm_interleave_ps(__m128 & v_r0, __m128 & v_r1, __m128 & v_g0,
                              __m128 & v_g1, __m128 & v_b0, __m128 & v_b1)
{
    const int mask_lo = _MM_SHUFFLE(2, 0, 2, 0), mask_hi = _MM_SHUFFLE(3, 1, 3, 1);

    __m128 layer2_chunk0 = _mm_shuffle_ps(v_r0, v_r1, mask_lo);
    __m128 layer2_chunk3 = _mm_shuffle_ps(v_r0, v_r1, mask_hi);
    __m128 layer2_chunk1 = _mm_shuffle_ps(v_g0, v_g1, mask_lo);
    __m128 layer2_chunk4 = _mm_shuffle_ps(v_g0, v_g1, mask_hi);
    __m128 layer2_chunk2 = _mm_shuffle_ps(v_b0, v_b1, mask_lo);
    __m128 layer2_chunk5 = _mm_shuffle_ps(v_b0, v_b1, mask_hi);

    __m128 layer1_chunk0 = _mm_shuffle_ps(layer2_chunk0, layer2_chunk1, mask_lo);
    __m128 layer1_chunk3 = _mm_shuffle_ps(layer2_chunk0, layer2_chunk1, mask_hi);
    __m128 layer1_chunk1 = _mm_shuffle_ps(layer2_chunk2, layer2_chunk3, mask_lo);
    __m128 layer1_chunk4 = _mm_shuffle_ps(layer2_chunk2, layer2_chunk3, mask_hi);
    __m128 layer1_chunk2 = _mm_shuffle_ps(layer2_chunk4, layer2_chunk5, mask_lo);
    __m128 layer1_chunk5 = _mm_shuffle_ps(layer2_chunk4, layer2_chunk5, mask_hi);

    v_r0 = _mm_shuffle_ps(layer1_chunk0, layer1_chunk1, mask_lo);
    v_g1 = _mm_shuffle_ps(layer1_chunk0, layer1_chunk1, mask_hi);
    v_r1 = _mm_shuffle_ps(layer1_chunk2, layer1_chunk3, mask_lo);
    v_b0 = _mm_shuffle_ps(layer1_chunk2, layer1_chunk3, mask_hi);
    v_g0 = _mm_shuffle_ps(layer1_chunk4, layer1_chunk5, mask_lo);
    v_b1 = _mm_shuffle_ps(layer1_chunk4, layer1_chunk5, mask_hi);
}

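// Four-channel float interleave.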
inline void _mm_interleave_ps(__m128 & v_r0, __m128 & v_r1, __m128 & v_g0, __m128 & v_g1,
                              __m128 & v_b0, __m128 & v_b1, __m128 & v_a0, __m128 & v_a1)
{
    const int mask_lo = _MM_SHUFFLE(2, 0, 2, 0), mask_hi = _MM_SHUFFLE(3, 1, 3, 1);

    __m128 layer2_chunk0 = _mm_shuffle_ps(v_r0, v_r1, mask_lo);
    __m128 layer2_chunk4 = _mm_shuffle_ps(v_r0, v_r1, mask_hi);
    __m128 layer2_chunk1 = _mm_shuffle_ps(v_g0, v_g1, mask_lo);
    __m128 layer2_chunk5 = _mm_shuffle_ps(v_g0, v_g1, mask_hi);
    __m128 layer2_chunk2 = _mm_shuffle_ps(v_b0, v_b1, mask_lo);
    __m128 layer2_chunk6 = _mm_shuffle_ps(v_b0, v_b1, mask_hi);
    __m128 layer2_chunk3 = _mm_shuffle_ps(v_a0, v_a1, mask_lo);
    __m128 layer2_chunk7 = _mm_shuffle_ps(v_a0, v_a1, mask_hi);

    __m128 layer1_chunk0 = _mm_shuffle_ps(layer2_chunk0, layer2_chunk1, mask_lo);
    __m128 layer1_chunk4 = _mm_shuffle_ps(layer2_chunk0, layer2_chunk1, mask_hi);
    __m128 layer1_chunk1 = _mm_shuffle_ps(layer2_chunk2, layer2_chunk3, mask_lo);
    __m128 layer1_chunk5 = _mm_shuffle_ps(layer2_chunk2, layer2_chunk3, mask_hi);
    __m128 layer1_chunk2 = _mm_shuffle_ps(layer2_chunk4, layer2_chunk5, mask_lo);
    __m128 layer1_chunk6 = _mm_shuffle_ps(layer2_chunk4, layer2_chunk5, mask_hi);
    __m128 layer1_chunk3 = _mm_shuffle_ps(layer2_chunk6, layer2_chunk7, mask_lo);
    __m128 layer1_chunk7 = _mm_shuffle_ps(layer2_chunk6, layer2_chunk7, mask_hi);

    v_r0 = _mm_shuffle_ps(layer1_chunk0, layer1_chunk1, mask_lo);
    v_b0 = _mm_shuffle_ps(layer1_chunk0, layer1_chunk1, mask_hi);
    v_r1 = _mm_shuffle_ps(layer1_chunk2, layer1_chunk3, mask_lo);
    v_b1 = _mm_shuffle_ps(layer1_chunk2, layer1_chunk3, mask_hi);
    v_g0 = _mm_shuffle_ps(layer1_chunk4, layer1_chunk5, mask_lo);
    v_a0 = _mm_shuffle_ps(layer1_chunk4, layer1_chunk5, mask_hi);
    v_g1 = _mm_shuffle_ps(layer1_chunk6, layer1_chunk7, mask_lo);
    v_a1 = _mm_shuffle_ps(layer1_chunk6, layer1_chunk7, mask_hi);
}

#endif // CV_SSE2

//! @}

#endif //__OPENCV_CORE_SSE_UTILS_HPP__
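
A de-interleave followed by the matching interleave is an identity, so the natural pattern is: load consecutive vectors, de-interleave, process each plane, interleave, store. Below is a minimal sketch of that round trip for two-channel 8-bit data; it assumes an SSE2-capable x86 build, and roundtrip_2ch, src and dst are hypothetical names for illustration (buffers of at least 64 bytes each).

#include <emmintrin.h>                 // SSE2 intrinsics (__m128i, loads/stores)
#include "opencv2/core/sse_utils.hpp"

// Hypothetical helper: split 64 bytes of two-channel interleaved data
// (c0 c1 c0 c1 ...) into planes, then merge them back unchanged.
void roundtrip_2ch(const unsigned char * src, unsigned char * dst)
{
    // Four consecutive vectors of the interleaved stream, in parameter order.
    __m128i v_r0 = _mm_loadu_si128((const __m128i *)(src));
    __m128i v_r1 = _mm_loadu_si128((const __m128i *)(src + 16));
    __m128i v_g0 = _mm_loadu_si128((const __m128i *)(src + 32));
    __m128i v_g1 = _mm_loadu_si128((const __m128i *)(src + 48));

    // After this call v_r0:v_r1 hold channel 0 (even bytes) and
    // v_g0:v_g1 channel 1 (odd bytes).
    _mm_deinterleave_epi8(v_r0, v_r1, v_g0, v_g1);

    // ... per-plane processing would go here ...

    // Inverse transform: back to the interleaved layout across the registers.
    _mm_interleave_epi8(v_r0, v_r1, v_g0, v_g1);

    _mm_storeu_si128((__m128i *)(dst), v_r0);
    _mm_storeu_si128((__m128i *)(dst + 16), v_r1);
    _mm_storeu_si128((__m128i *)(dst + 32), v_g0);
    _mm_storeu_si128((__m128i *)(dst + 48), v_g1);
}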