sse_utils.hpp
/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                          License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2015, Itseez Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

#ifndef __OPENCV_CORE_SSE_UTILS_HPP__
#define __OPENCV_CORE_SSE_UTILS_HPP__

#ifndef __cplusplus
#  error sse_utils.hpp header must be compiled as C++
#endif

#include "opencv2/core/cvdef.h"

//! @addtogroup core_utils_sse
//! @{

#if CV_SSE2
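// 8-bit de-interleaving helpers. Each overload takes two registers per channel,
// loaded back-to-back from an interleaved buffer (RG, RGB or RGBA byte order),
// and uses successive unpacklo/unpackhi passes so that, on return, the same
// registers hold contiguous per-channel planes.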
inline void _mm_deinterleave_epi8(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, __m128i & v_g1)
{
    __m128i layer1_chunk0 = _mm_unpacklo_epi8(v_r0, v_g0);
    __m128i layer1_chunk1 = _mm_unpackhi_epi8(v_r0, v_g0);
    __m128i layer1_chunk2 = _mm_unpacklo_epi8(v_r1, v_g1);
    __m128i layer1_chunk3 = _mm_unpackhi_epi8(v_r1, v_g1);

    __m128i layer2_chunk0 = _mm_unpacklo_epi8(layer1_chunk0, layer1_chunk2);
    __m128i layer2_chunk1 = _mm_unpackhi_epi8(layer1_chunk0, layer1_chunk2);
    __m128i layer2_chunk2 = _mm_unpacklo_epi8(layer1_chunk1, layer1_chunk3);
    __m128i layer2_chunk3 = _mm_unpackhi_epi8(layer1_chunk1, layer1_chunk3);

    __m128i layer3_chunk0 = _mm_unpacklo_epi8(layer2_chunk0, layer2_chunk2);
    __m128i layer3_chunk1 = _mm_unpackhi_epi8(layer2_chunk0, layer2_chunk2);
    __m128i layer3_chunk2 = _mm_unpacklo_epi8(layer2_chunk1, layer2_chunk3);
    __m128i layer3_chunk3 = _mm_unpackhi_epi8(layer2_chunk1, layer2_chunk3);

    __m128i layer4_chunk0 = _mm_unpacklo_epi8(layer3_chunk0, layer3_chunk2);
    __m128i layer4_chunk1 = _mm_unpackhi_epi8(layer3_chunk0, layer3_chunk2);
    __m128i layer4_chunk2 = _mm_unpacklo_epi8(layer3_chunk1, layer3_chunk3);
    __m128i layer4_chunk3 = _mm_unpackhi_epi8(layer3_chunk1, layer3_chunk3);

    v_r0 = _mm_unpacklo_epi8(layer4_chunk0, layer4_chunk2);
    v_r1 = _mm_unpackhi_epi8(layer4_chunk0, layer4_chunk2);
    v_g0 = _mm_unpacklo_epi8(layer4_chunk1, layer4_chunk3);
    v_g1 = _mm_unpackhi_epi8(layer4_chunk1, layer4_chunk3);
}

inline void _mm_deinterleave_epi8(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0,
                                  __m128i & v_g1, __m128i & v_b0, __m128i & v_b1)
{
    __m128i layer1_chunk0 = _mm_unpacklo_epi8(v_r0, v_g1);
    __m128i layer1_chunk1 = _mm_unpackhi_epi8(v_r0, v_g1);
    __m128i layer1_chunk2 = _mm_unpacklo_epi8(v_r1, v_b0);
    __m128i layer1_chunk3 = _mm_unpackhi_epi8(v_r1, v_b0);
    __m128i layer1_chunk4 = _mm_unpacklo_epi8(v_g0, v_b1);
    __m128i layer1_chunk5 = _mm_unpackhi_epi8(v_g0, v_b1);

    __m128i layer2_chunk0 = _mm_unpacklo_epi8(layer1_chunk0, layer1_chunk3);
    __m128i layer2_chunk1 = _mm_unpackhi_epi8(layer1_chunk0, layer1_chunk3);
    __m128i layer2_chunk2 = _mm_unpacklo_epi8(layer1_chunk1, layer1_chunk4);
    __m128i layer2_chunk3 = _mm_unpackhi_epi8(layer1_chunk1, layer1_chunk4);
    __m128i layer2_chunk4 = _mm_unpacklo_epi8(layer1_chunk2, layer1_chunk5);
    __m128i layer2_chunk5 = _mm_unpackhi_epi8(layer1_chunk2, layer1_chunk5);

    __m128i layer3_chunk0 = _mm_unpacklo_epi8(layer2_chunk0, layer2_chunk3);
    __m128i layer3_chunk1 = _mm_unpackhi_epi8(layer2_chunk0, layer2_chunk3);
    __m128i layer3_chunk2 = _mm_unpacklo_epi8(layer2_chunk1, layer2_chunk4);
    __m128i layer3_chunk3 = _mm_unpackhi_epi8(layer2_chunk1, layer2_chunk4);
    __m128i layer3_chunk4 = _mm_unpacklo_epi8(layer2_chunk2, layer2_chunk5);
    __m128i layer3_chunk5 = _mm_unpackhi_epi8(layer2_chunk2, layer2_chunk5);

    __m128i layer4_chunk0 = _mm_unpacklo_epi8(layer3_chunk0, layer3_chunk3);
    __m128i layer4_chunk1 = _mm_unpackhi_epi8(layer3_chunk0, layer3_chunk3);
    __m128i layer4_chunk2 = _mm_unpacklo_epi8(layer3_chunk1, layer3_chunk4);
    __m128i layer4_chunk3 = _mm_unpackhi_epi8(layer3_chunk1, layer3_chunk4);
    __m128i layer4_chunk4 = _mm_unpacklo_epi8(layer3_chunk2, layer3_chunk5);
    __m128i layer4_chunk5 = _mm_unpackhi_epi8(layer3_chunk2, layer3_chunk5);

    v_r0 = _mm_unpacklo_epi8(layer4_chunk0, layer4_chunk3);
    v_r1 = _mm_unpackhi_epi8(layer4_chunk0, layer4_chunk3);
    v_g0 = _mm_unpacklo_epi8(layer4_chunk1, layer4_chunk4);
    v_g1 = _mm_unpackhi_epi8(layer4_chunk1, layer4_chunk4);
    v_b0 = _mm_unpacklo_epi8(layer4_chunk2, layer4_chunk5);
    v_b1 = _mm_unpackhi_epi8(layer4_chunk2, layer4_chunk5);
}

inline void _mm_deinterleave_epi8(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, __m128i & v_g1,
                                  __m128i & v_b0, __m128i & v_b1, __m128i & v_a0, __m128i & v_a1)
{
    __m128i layer1_chunk0 = _mm_unpacklo_epi8(v_r0, v_b0);
    __m128i layer1_chunk1 = _mm_unpackhi_epi8(v_r0, v_b0);
    __m128i layer1_chunk2 = _mm_unpacklo_epi8(v_r1, v_b1);
    __m128i layer1_chunk3 = _mm_unpackhi_epi8(v_r1, v_b1);
    __m128i layer1_chunk4 = _mm_unpacklo_epi8(v_g0, v_a0);
    __m128i layer1_chunk5 = _mm_unpackhi_epi8(v_g0, v_a0);
    __m128i layer1_chunk6 = _mm_unpacklo_epi8(v_g1, v_a1);
    __m128i layer1_chunk7 = _mm_unpackhi_epi8(v_g1, v_a1);

    __m128i layer2_chunk0 = _mm_unpacklo_epi8(layer1_chunk0, layer1_chunk4);
    __m128i layer2_chunk1 = _mm_unpackhi_epi8(layer1_chunk0, layer1_chunk4);
    __m128i layer2_chunk2 = _mm_unpacklo_epi8(layer1_chunk1, layer1_chunk5);
    __m128i layer2_chunk3 = _mm_unpackhi_epi8(layer1_chunk1, layer1_chunk5);
    __m128i layer2_chunk4 = _mm_unpacklo_epi8(layer1_chunk2, layer1_chunk6);
    __m128i layer2_chunk5 = _mm_unpackhi_epi8(layer1_chunk2, layer1_chunk6);
    __m128i layer2_chunk6 = _mm_unpacklo_epi8(layer1_chunk3, layer1_chunk7);
    __m128i layer2_chunk7 = _mm_unpackhi_epi8(layer1_chunk3, layer1_chunk7);

    __m128i layer3_chunk0 = _mm_unpacklo_epi8(layer2_chunk0, layer2_chunk4);
    __m128i layer3_chunk1 = _mm_unpackhi_epi8(layer2_chunk0, layer2_chunk4);
    __m128i layer3_chunk2 = _mm_unpacklo_epi8(layer2_chunk1, layer2_chunk5);
    __m128i layer3_chunk3 = _mm_unpackhi_epi8(layer2_chunk1, layer2_chunk5);
    __m128i layer3_chunk4 = _mm_unpacklo_epi8(layer2_chunk2, layer2_chunk6);
    __m128i layer3_chunk5 = _mm_unpackhi_epi8(layer2_chunk2, layer2_chunk6);
    __m128i layer3_chunk6 = _mm_unpacklo_epi8(layer2_chunk3, layer2_chunk7);
    __m128i layer3_chunk7 = _mm_unpackhi_epi8(layer2_chunk3, layer2_chunk7);

    __m128i layer4_chunk0 = _mm_unpacklo_epi8(layer3_chunk0, layer3_chunk4);
    __m128i layer4_chunk1 = _mm_unpackhi_epi8(layer3_chunk0, layer3_chunk4);
    __m128i layer4_chunk2 = _mm_unpacklo_epi8(layer3_chunk1, layer3_chunk5);
    __m128i layer4_chunk3 = _mm_unpackhi_epi8(layer3_chunk1, layer3_chunk5);
    __m128i layer4_chunk4 = _mm_unpacklo_epi8(layer3_chunk2, layer3_chunk6);
    __m128i layer4_chunk5 = _mm_unpackhi_epi8(layer3_chunk2, layer3_chunk6);
    __m128i layer4_chunk6 = _mm_unpacklo_epi8(layer3_chunk3, layer3_chunk7);
    __m128i layer4_chunk7 = _mm_unpackhi_epi8(layer3_chunk3, layer3_chunk7);

    v_r0 = _mm_unpacklo_epi8(layer4_chunk0, layer4_chunk4);
    v_r1 = _mm_unpackhi_epi8(layer4_chunk0, layer4_chunk4);
    v_g0 = _mm_unpacklo_epi8(layer4_chunk1, layer4_chunk5);
    v_g1 = _mm_unpackhi_epi8(layer4_chunk1, layer4_chunk5);
    v_b0 = _mm_unpacklo_epi8(layer4_chunk2, layer4_chunk6);
    v_b1 = _mm_unpackhi_epi8(layer4_chunk2, layer4_chunk6);
    v_a0 = _mm_unpacklo_epi8(layer4_chunk3, layer4_chunk7);
    v_a1 = _mm_unpackhi_epi8(layer4_chunk3, layer4_chunk7);
}
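// 8-bit interleaving helpers: the inverse operation. Planar per-channel
// registers are re-packed into interleaved order with mask/shift plus
// _mm_packus_epi16 passes.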
inline void _mm_interleave_epi8(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, __m128i & v_g1)
{
    __m128i v_mask = _mm_set1_epi16(0x00ff);

    __m128i layer4_chunk0 = _mm_packus_epi16(_mm_and_si128(v_r0, v_mask), _mm_and_si128(v_r1, v_mask));
    __m128i layer4_chunk2 = _mm_packus_epi16(_mm_srli_epi16(v_r0, 8), _mm_srli_epi16(v_r1, 8));
    __m128i layer4_chunk1 = _mm_packus_epi16(_mm_and_si128(v_g0, v_mask), _mm_and_si128(v_g1, v_mask));
    __m128i layer4_chunk3 = _mm_packus_epi16(_mm_srli_epi16(v_g0, 8), _mm_srli_epi16(v_g1, 8));

    __m128i layer3_chunk0 = _mm_packus_epi16(_mm_and_si128(layer4_chunk0, v_mask), _mm_and_si128(layer4_chunk1, v_mask));
    __m128i layer3_chunk2 = _mm_packus_epi16(_mm_srli_epi16(layer4_chunk0, 8), _mm_srli_epi16(layer4_chunk1, 8));
    __m128i layer3_chunk1 = _mm_packus_epi16(_mm_and_si128(layer4_chunk2, v_mask), _mm_and_si128(layer4_chunk3, v_mask));
    __m128i layer3_chunk3 = _mm_packus_epi16(_mm_srli_epi16(layer4_chunk2, 8), _mm_srli_epi16(layer4_chunk3, 8));

    __m128i layer2_chunk0 = _mm_packus_epi16(_mm_and_si128(layer3_chunk0, v_mask), _mm_and_si128(layer3_chunk1, v_mask));
    __m128i layer2_chunk2 = _mm_packus_epi16(_mm_srli_epi16(layer3_chunk0, 8), _mm_srli_epi16(layer3_chunk1, 8));
    __m128i layer2_chunk1 = _mm_packus_epi16(_mm_and_si128(layer3_chunk2, v_mask), _mm_and_si128(layer3_chunk3, v_mask));
    __m128i layer2_chunk3 = _mm_packus_epi16(_mm_srli_epi16(layer3_chunk2, 8), _mm_srli_epi16(layer3_chunk3, 8));

    __m128i layer1_chunk0 = _mm_packus_epi16(_mm_and_si128(layer2_chunk0, v_mask), _mm_and_si128(layer2_chunk1, v_mask));
    __m128i layer1_chunk2 = _mm_packus_epi16(_mm_srli_epi16(layer2_chunk0, 8), _mm_srli_epi16(layer2_chunk1, 8));
    __m128i layer1_chunk1 = _mm_packus_epi16(_mm_and_si128(layer2_chunk2, v_mask), _mm_and_si128(layer2_chunk3, v_mask));
    __m128i layer1_chunk3 = _mm_packus_epi16(_mm_srli_epi16(layer2_chunk2, 8), _mm_srli_epi16(layer2_chunk3, 8));

    v_r0 = _mm_packus_epi16(_mm_and_si128(layer1_chunk0, v_mask), _mm_and_si128(layer1_chunk1, v_mask));
    v_g0 = _mm_packus_epi16(_mm_srli_epi16(layer1_chunk0, 8), _mm_srli_epi16(layer1_chunk1, 8));
    v_r1 = _mm_packus_epi16(_mm_and_si128(layer1_chunk2, v_mask), _mm_and_si128(layer1_chunk3, v_mask));
    v_g1 = _mm_packus_epi16(_mm_srli_epi16(layer1_chunk2, 8), _mm_srli_epi16(layer1_chunk3, 8));
}

inline void _mm_interleave_epi8(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0,
                                __m128i & v_g1, __m128i & v_b0, __m128i & v_b1)
{
    __m128i v_mask = _mm_set1_epi16(0x00ff);

    __m128i layer4_chunk0 = _mm_packus_epi16(_mm_and_si128(v_r0, v_mask), _mm_and_si128(v_r1, v_mask));
    __m128i layer4_chunk3 = _mm_packus_epi16(_mm_srli_epi16(v_r0, 8), _mm_srli_epi16(v_r1, 8));
    __m128i layer4_chunk1 = _mm_packus_epi16(_mm_and_si128(v_g0, v_mask), _mm_and_si128(v_g1, v_mask));
    __m128i layer4_chunk4 = _mm_packus_epi16(_mm_srli_epi16(v_g0, 8), _mm_srli_epi16(v_g1, 8));
    __m128i layer4_chunk2 = _mm_packus_epi16(_mm_and_si128(v_b0, v_mask), _mm_and_si128(v_b1, v_mask));
    __m128i layer4_chunk5 = _mm_packus_epi16(_mm_srli_epi16(v_b0, 8), _mm_srli_epi16(v_b1, 8));

    __m128i layer3_chunk0 = _mm_packus_epi16(_mm_and_si128(layer4_chunk0, v_mask), _mm_and_si128(layer4_chunk1, v_mask));
    __m128i layer3_chunk3 = _mm_packus_epi16(_mm_srli_epi16(layer4_chunk0, 8), _mm_srli_epi16(layer4_chunk1, 8));
    __m128i layer3_chunk1 = _mm_packus_epi16(_mm_and_si128(layer4_chunk2, v_mask), _mm_and_si128(layer4_chunk3, v_mask));
    __m128i layer3_chunk4 = _mm_packus_epi16(_mm_srli_epi16(layer4_chunk2, 8), _mm_srli_epi16(layer4_chunk3, 8));
    __m128i layer3_chunk2 = _mm_packus_epi16(_mm_and_si128(layer4_chunk4, v_mask), _mm_and_si128(layer4_chunk5, v_mask));
    __m128i layer3_chunk5 = _mm_packus_epi16(_mm_srli_epi16(layer4_chunk4, 8), _mm_srli_epi16(layer4_chunk5, 8));

    __m128i layer2_chunk0 = _mm_packus_epi16(_mm_and_si128(layer3_chunk0, v_mask), _mm_and_si128(layer3_chunk1, v_mask));
    __m128i layer2_chunk3 = _mm_packus_epi16(_mm_srli_epi16(layer3_chunk0, 8), _mm_srli_epi16(layer3_chunk1, 8));
    __m128i layer2_chunk1 = _mm_packus_epi16(_mm_and_si128(layer3_chunk2, v_mask), _mm_and_si128(layer3_chunk3, v_mask));
    __m128i layer2_chunk4 = _mm_packus_epi16(_mm_srli_epi16(layer3_chunk2, 8), _mm_srli_epi16(layer3_chunk3, 8));
    __m128i layer2_chunk2 = _mm_packus_epi16(_mm_and_si128(layer3_chunk4, v_mask), _mm_and_si128(layer3_chunk5, v_mask));
    __m128i layer2_chunk5 = _mm_packus_epi16(_mm_srli_epi16(layer3_chunk4, 8), _mm_srli_epi16(layer3_chunk5, 8));

    __m128i layer1_chunk0 = _mm_packus_epi16(_mm_and_si128(layer2_chunk0, v_mask), _mm_and_si128(layer2_chunk1, v_mask));
    __m128i layer1_chunk3 = _mm_packus_epi16(_mm_srli_epi16(layer2_chunk0, 8), _mm_srli_epi16(layer2_chunk1, 8));
    __m128i layer1_chunk1 = _mm_packus_epi16(_mm_and_si128(layer2_chunk2, v_mask), _mm_and_si128(layer2_chunk3, v_mask));
    __m128i layer1_chunk4 = _mm_packus_epi16(_mm_srli_epi16(layer2_chunk2, 8), _mm_srli_epi16(layer2_chunk3, 8));
    __m128i layer1_chunk2 = _mm_packus_epi16(_mm_and_si128(layer2_chunk4, v_mask), _mm_and_si128(layer2_chunk5, v_mask));
    __m128i layer1_chunk5 = _mm_packus_epi16(_mm_srli_epi16(layer2_chunk4, 8), _mm_srli_epi16(layer2_chunk5, 8));

    v_r0 = _mm_packus_epi16(_mm_and_si128(layer1_chunk0, v_mask), _mm_and_si128(layer1_chunk1, v_mask));
    v_g1 = _mm_packus_epi16(_mm_srli_epi16(layer1_chunk0, 8), _mm_srli_epi16(layer1_chunk1, 8));
    v_r1 = _mm_packus_epi16(_mm_and_si128(layer1_chunk2, v_mask), _mm_and_si128(layer1_chunk3, v_mask));
    v_b0 = _mm_packus_epi16(_mm_srli_epi16(layer1_chunk2, 8), _mm_srli_epi16(layer1_chunk3, 8));
    v_g0 = _mm_packus_epi16(_mm_and_si128(layer1_chunk4, v_mask), _mm_and_si128(layer1_chunk5, v_mask));
    v_b1 = _mm_packus_epi16(_mm_srli_epi16(layer1_chunk4, 8), _mm_srli_epi16(layer1_chunk5, 8));
}

inline void _mm_interleave_epi8(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, __m128i & v_g1,
                                __m128i & v_b0, __m128i & v_b1, __m128i & v_a0, __m128i & v_a1)
{
    __m128i v_mask = _mm_set1_epi16(0x00ff);

    __m128i layer4_chunk0 = _mm_packus_epi16(_mm_and_si128(v_r0, v_mask), _mm_and_si128(v_r1, v_mask));
    __m128i layer4_chunk4 = _mm_packus_epi16(_mm_srli_epi16(v_r0, 8), _mm_srli_epi16(v_r1, 8));
    __m128i layer4_chunk1 = _mm_packus_epi16(_mm_and_si128(v_g0, v_mask), _mm_and_si128(v_g1, v_mask));
    __m128i layer4_chunk5 = _mm_packus_epi16(_mm_srli_epi16(v_g0, 8), _mm_srli_epi16(v_g1, 8));
    __m128i layer4_chunk2 = _mm_packus_epi16(_mm_and_si128(v_b0, v_mask), _mm_and_si128(v_b1, v_mask));
    __m128i layer4_chunk6 = _mm_packus_epi16(_mm_srli_epi16(v_b0, 8), _mm_srli_epi16(v_b1, 8));
    __m128i layer4_chunk3 = _mm_packus_epi16(_mm_and_si128(v_a0, v_mask), _mm_and_si128(v_a1, v_mask));
    __m128i layer4_chunk7 = _mm_packus_epi16(_mm_srli_epi16(v_a0, 8), _mm_srli_epi16(v_a1, 8));

    __m128i layer3_chunk0 = _mm_packus_epi16(_mm_and_si128(layer4_chunk0, v_mask), _mm_and_si128(layer4_chunk1, v_mask));
    __m128i layer3_chunk4 = _mm_packus_epi16(_mm_srli_epi16(layer4_chunk0, 8), _mm_srli_epi16(layer4_chunk1, 8));
    __m128i layer3_chunk1 = _mm_packus_epi16(_mm_and_si128(layer4_chunk2, v_mask), _mm_and_si128(layer4_chunk3, v_mask));
    __m128i layer3_chunk5 = _mm_packus_epi16(_mm_srli_epi16(layer4_chunk2, 8), _mm_srli_epi16(layer4_chunk3, 8));
    __m128i layer3_chunk2 = _mm_packus_epi16(_mm_and_si128(layer4_chunk4, v_mask), _mm_and_si128(layer4_chunk5, v_mask));
    __m128i layer3_chunk6 = _mm_packus_epi16(_mm_srli_epi16(layer4_chunk4, 8), _mm_srli_epi16(layer4_chunk5, 8));
    __m128i layer3_chunk3 = _mm_packus_epi16(_mm_and_si128(layer4_chunk6, v_mask), _mm_and_si128(layer4_chunk7, v_mask));
    __m128i layer3_chunk7 = _mm_packus_epi16(_mm_srli_epi16(layer4_chunk6, 8), _mm_srli_epi16(layer4_chunk7, 8));

    __m128i layer2_chunk0 = _mm_packus_epi16(_mm_and_si128(layer3_chunk0, v_mask), _mm_and_si128(layer3_chunk1, v_mask));
    __m128i layer2_chunk4 = _mm_packus_epi16(_mm_srli_epi16(layer3_chunk0, 8), _mm_srli_epi16(layer3_chunk1, 8));
    __m128i layer2_chunk1 = _mm_packus_epi16(_mm_and_si128(layer3_chunk2, v_mask), _mm_and_si128(layer3_chunk3, v_mask));
    __m128i layer2_chunk5 = _mm_packus_epi16(_mm_srli_epi16(layer3_chunk2, 8), _mm_srli_epi16(layer3_chunk3, 8));
    __m128i layer2_chunk2 = _mm_packus_epi16(_mm_and_si128(layer3_chunk4, v_mask), _mm_and_si128(layer3_chunk5, v_mask));
    __m128i layer2_chunk6 = _mm_packus_epi16(_mm_srli_epi16(layer3_chunk4, 8), _mm_srli_epi16(layer3_chunk5, 8));
    __m128i layer2_chunk3 = _mm_packus_epi16(_mm_and_si128(layer3_chunk6, v_mask), _mm_and_si128(layer3_chunk7, v_mask));
    __m128i layer2_chunk7 = _mm_packus_epi16(_mm_srli_epi16(layer3_chunk6, 8), _mm_srli_epi16(layer3_chunk7, 8));

    __m128i layer1_chunk0 = _mm_packus_epi16(_mm_and_si128(layer2_chunk0, v_mask), _mm_and_si128(layer2_chunk1, v_mask));
    __m128i layer1_chunk4 = _mm_packus_epi16(_mm_srli_epi16(layer2_chunk0, 8), _mm_srli_epi16(layer2_chunk1, 8));
    __m128i layer1_chunk1 = _mm_packus_epi16(_mm_and_si128(layer2_chunk2, v_mask), _mm_and_si128(layer2_chunk3, v_mask));
    __m128i layer1_chunk5 = _mm_packus_epi16(_mm_srli_epi16(layer2_chunk2, 8), _mm_srli_epi16(layer2_chunk3, 8));
    __m128i layer1_chunk2 = _mm_packus_epi16(_mm_and_si128(layer2_chunk4, v_mask), _mm_and_si128(layer2_chunk5, v_mask));
    __m128i layer1_chunk6 = _mm_packus_epi16(_mm_srli_epi16(layer2_chunk4, 8), _mm_srli_epi16(layer2_chunk5, 8));
    __m128i layer1_chunk3 = _mm_packus_epi16(_mm_and_si128(layer2_chunk6, v_mask), _mm_and_si128(layer2_chunk7, v_mask));
    __m128i layer1_chunk7 = _mm_packus_epi16(_mm_srli_epi16(layer2_chunk6, 8), _mm_srli_epi16(layer2_chunk7, 8));

    v_r0 = _mm_packus_epi16(_mm_and_si128(layer1_chunk0, v_mask), _mm_and_si128(layer1_chunk1, v_mask));
    v_b0 = _mm_packus_epi16(_mm_srli_epi16(layer1_chunk0, 8), _mm_srli_epi16(layer1_chunk1, 8));
    v_r1 = _mm_packus_epi16(_mm_and_si128(layer1_chunk2, v_mask), _mm_and_si128(layer1_chunk3, v_mask));
    v_b1 = _mm_packus_epi16(_mm_srli_epi16(layer1_chunk2, 8), _mm_srli_epi16(layer1_chunk3, 8));
    v_g0 = _mm_packus_epi16(_mm_and_si128(layer1_chunk4, v_mask), _mm_and_si128(layer1_chunk5, v_mask));
    v_a0 = _mm_packus_epi16(_mm_srli_epi16(layer1_chunk4, 8), _mm_srli_epi16(layer1_chunk5, 8));
    v_g1 = _mm_packus_epi16(_mm_and_si128(layer1_chunk6, v_mask), _mm_and_si128(layer1_chunk7, v_mask));
    v_a1 = _mm_packus_epi16(_mm_srli_epi16(layer1_chunk6, 8), _mm_srli_epi16(layer1_chunk7, 8));
}
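// 16-bit de-interleaving helpers. Same scheme as the 8-bit versions, with
// eight elements per register and therefore one fewer unpack pass.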
inline void _mm_deinterleave_epi16(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, __m128i & v_g1)
{
    __m128i layer1_chunk0 = _mm_unpacklo_epi16(v_r0, v_g0);
    __m128i layer1_chunk1 = _mm_unpackhi_epi16(v_r0, v_g0);
    __m128i layer1_chunk2 = _mm_unpacklo_epi16(v_r1, v_g1);
    __m128i layer1_chunk3 = _mm_unpackhi_epi16(v_r1, v_g1);

    __m128i layer2_chunk0 = _mm_unpacklo_epi16(layer1_chunk0, layer1_chunk2);
    __m128i layer2_chunk1 = _mm_unpackhi_epi16(layer1_chunk0, layer1_chunk2);
    __m128i layer2_chunk2 = _mm_unpacklo_epi16(layer1_chunk1, layer1_chunk3);
    __m128i layer2_chunk3 = _mm_unpackhi_epi16(layer1_chunk1, layer1_chunk3);

    __m128i layer3_chunk0 = _mm_unpacklo_epi16(layer2_chunk0, layer2_chunk2);
    __m128i layer3_chunk1 = _mm_unpackhi_epi16(layer2_chunk0, layer2_chunk2);
    __m128i layer3_chunk2 = _mm_unpacklo_epi16(layer2_chunk1, layer2_chunk3);
    __m128i layer3_chunk3 = _mm_unpackhi_epi16(layer2_chunk1, layer2_chunk3);

    v_r0 = _mm_unpacklo_epi16(layer3_chunk0, layer3_chunk2);
    v_r1 = _mm_unpackhi_epi16(layer3_chunk0, layer3_chunk2);
    v_g0 = _mm_unpacklo_epi16(layer3_chunk1, layer3_chunk3);
    v_g1 = _mm_unpackhi_epi16(layer3_chunk1, layer3_chunk3);
}

inline void _mm_deinterleave_epi16(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0,
                                   __m128i & v_g1, __m128i & v_b0, __m128i & v_b1)
{
    __m128i layer1_chunk0 = _mm_unpacklo_epi16(v_r0, v_g1);
    __m128i layer1_chunk1 = _mm_unpackhi_epi16(v_r0, v_g1);
    __m128i layer1_chunk2 = _mm_unpacklo_epi16(v_r1, v_b0);
    __m128i layer1_chunk3 = _mm_unpackhi_epi16(v_r1, v_b0);
    __m128i layer1_chunk4 = _mm_unpacklo_epi16(v_g0, v_b1);
    __m128i layer1_chunk5 = _mm_unpackhi_epi16(v_g0, v_b1);

    __m128i layer2_chunk0 = _mm_unpacklo_epi16(layer1_chunk0, layer1_chunk3);
    __m128i layer2_chunk1 = _mm_unpackhi_epi16(layer1_chunk0, layer1_chunk3);
    __m128i layer2_chunk2 = _mm_unpacklo_epi16(layer1_chunk1, layer1_chunk4);
    __m128i layer2_chunk3 = _mm_unpackhi_epi16(layer1_chunk1, layer1_chunk4);
    __m128i layer2_chunk4 = _mm_unpacklo_epi16(layer1_chunk2, layer1_chunk5);
    __m128i layer2_chunk5 = _mm_unpackhi_epi16(layer1_chunk2, layer1_chunk5);

    __m128i layer3_chunk0 = _mm_unpacklo_epi16(layer2_chunk0, layer2_chunk3);
    __m128i layer3_chunk1 = _mm_unpackhi_epi16(layer2_chunk0, layer2_chunk3);
    __m128i layer3_chunk2 = _mm_unpacklo_epi16(layer2_chunk1, layer2_chunk4);
    __m128i layer3_chunk3 = _mm_unpackhi_epi16(layer2_chunk1, layer2_chunk4);
    __m128i layer3_chunk4 = _mm_unpacklo_epi16(layer2_chunk2, layer2_chunk5);
    __m128i layer3_chunk5 = _mm_unpackhi_epi16(layer2_chunk2, layer2_chunk5);

    v_r0 = _mm_unpacklo_epi16(layer3_chunk0, layer3_chunk3);
    v_r1 = _mm_unpackhi_epi16(layer3_chunk0, layer3_chunk3);
    v_g0 = _mm_unpacklo_epi16(layer3_chunk1, layer3_chunk4);
    v_g1 = _mm_unpackhi_epi16(layer3_chunk1, layer3_chunk4);
    v_b0 = _mm_unpacklo_epi16(layer3_chunk2, layer3_chunk5);
    v_b1 = _mm_unpackhi_epi16(layer3_chunk2, layer3_chunk5);
}

inline void _mm_deinterleave_epi16(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, __m128i & v_g1,
                                   __m128i & v_b0, __m128i & v_b1, __m128i & v_a0, __m128i & v_a1)
{
    __m128i layer1_chunk0 = _mm_unpacklo_epi16(v_r0, v_b0);
    __m128i layer1_chunk1 = _mm_unpackhi_epi16(v_r0, v_b0);
    __m128i layer1_chunk2 = _mm_unpacklo_epi16(v_r1, v_b1);
    __m128i layer1_chunk3 = _mm_unpackhi_epi16(v_r1, v_b1);
    __m128i layer1_chunk4 = _mm_unpacklo_epi16(v_g0, v_a0);
    __m128i layer1_chunk5 = _mm_unpackhi_epi16(v_g0, v_a0);
    __m128i layer1_chunk6 = _mm_unpacklo_epi16(v_g1, v_a1);
    __m128i layer1_chunk7 = _mm_unpackhi_epi16(v_g1, v_a1);

    __m128i layer2_chunk0 = _mm_unpacklo_epi16(layer1_chunk0, layer1_chunk4);
    __m128i layer2_chunk1 = _mm_unpackhi_epi16(layer1_chunk0, layer1_chunk4);
    __m128i layer2_chunk2 = _mm_unpacklo_epi16(layer1_chunk1, layer1_chunk5);
    __m128i layer2_chunk3 = _mm_unpackhi_epi16(layer1_chunk1, layer1_chunk5);
    __m128i layer2_chunk4 = _mm_unpacklo_epi16(layer1_chunk2, layer1_chunk6);
    __m128i layer2_chunk5 = _mm_unpackhi_epi16(layer1_chunk2, layer1_chunk6);
    __m128i layer2_chunk6 = _mm_unpacklo_epi16(layer1_chunk3, layer1_chunk7);
    __m128i layer2_chunk7 = _mm_unpackhi_epi16(layer1_chunk3, layer1_chunk7);

    __m128i layer3_chunk0 = _mm_unpacklo_epi16(layer2_chunk0, layer2_chunk4);
    __m128i layer3_chunk1 = _mm_unpackhi_epi16(layer2_chunk0, layer2_chunk4);
    __m128i layer3_chunk2 = _mm_unpacklo_epi16(layer2_chunk1, layer2_chunk5);
    __m128i layer3_chunk3 = _mm_unpackhi_epi16(layer2_chunk1, layer2_chunk5);
    __m128i layer3_chunk4 = _mm_unpacklo_epi16(layer2_chunk2, layer2_chunk6);
    __m128i layer3_chunk5 = _mm_unpackhi_epi16(layer2_chunk2, layer2_chunk6);
    __m128i layer3_chunk6 = _mm_unpacklo_epi16(layer2_chunk3, layer2_chunk7);
    __m128i layer3_chunk7 = _mm_unpackhi_epi16(layer2_chunk3, layer2_chunk7);

    v_r0 = _mm_unpacklo_epi16(layer3_chunk0, layer3_chunk4);
    v_r1 = _mm_unpackhi_epi16(layer3_chunk0, layer3_chunk4);
    v_g0 = _mm_unpacklo_epi16(layer3_chunk1, layer3_chunk5);
    v_g1 = _mm_unpackhi_epi16(layer3_chunk1, layer3_chunk5);
    v_b0 = _mm_unpacklo_epi16(layer3_chunk2, layer3_chunk6);
    v_b1 = _mm_unpackhi_epi16(layer3_chunk2, layer3_chunk6);
    v_a0 = _mm_unpacklo_epi16(layer3_chunk3, layer3_chunk7);
    v_a1 = _mm_unpackhi_epi16(layer3_chunk3, layer3_chunk7);
}
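// 16-bit interleaving helpers. These rely on _mm_packus_epi32, which is an
// SSE4.1 instruction, hence the separate guard.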
#if CV_SSE4_1

inline void _mm_interleave_epi16(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, __m128i & v_g1)
{
    __m128i v_mask = _mm_set1_epi32(0x0000ffff);

    __m128i layer3_chunk0 = _mm_packus_epi32(_mm_and_si128(v_r0, v_mask), _mm_and_si128(v_r1, v_mask));
    __m128i layer3_chunk2 = _mm_packus_epi32(_mm_srli_epi32(v_r0, 16), _mm_srli_epi32(v_r1, 16));
    __m128i layer3_chunk1 = _mm_packus_epi32(_mm_and_si128(v_g0, v_mask), _mm_and_si128(v_g1, v_mask));
    __m128i layer3_chunk3 = _mm_packus_epi32(_mm_srli_epi32(v_g0, 16), _mm_srli_epi32(v_g1, 16));

    __m128i layer2_chunk0 = _mm_packus_epi32(_mm_and_si128(layer3_chunk0, v_mask), _mm_and_si128(layer3_chunk1, v_mask));
    __m128i layer2_chunk2 = _mm_packus_epi32(_mm_srli_epi32(layer3_chunk0, 16), _mm_srli_epi32(layer3_chunk1, 16));
    __m128i layer2_chunk1 = _mm_packus_epi32(_mm_and_si128(layer3_chunk2, v_mask), _mm_and_si128(layer3_chunk3, v_mask));
    __m128i layer2_chunk3 = _mm_packus_epi32(_mm_srli_epi32(layer3_chunk2, 16), _mm_srli_epi32(layer3_chunk3, 16));

    __m128i layer1_chunk0 = _mm_packus_epi32(_mm_and_si128(layer2_chunk0, v_mask), _mm_and_si128(layer2_chunk1, v_mask));
    __m128i layer1_chunk2 = _mm_packus_epi32(_mm_srli_epi32(layer2_chunk0, 16), _mm_srli_epi32(layer2_chunk1, 16));
    __m128i layer1_chunk1 = _mm_packus_epi32(_mm_and_si128(layer2_chunk2, v_mask), _mm_and_si128(layer2_chunk3, v_mask));
    __m128i layer1_chunk3 = _mm_packus_epi32(_mm_srli_epi32(layer2_chunk2, 16), _mm_srli_epi32(layer2_chunk3, 16));

    v_r0 = _mm_packus_epi32(_mm_and_si128(layer1_chunk0, v_mask), _mm_and_si128(layer1_chunk1, v_mask));
    v_g0 = _mm_packus_epi32(_mm_srli_epi32(layer1_chunk0, 16), _mm_srli_epi32(layer1_chunk1, 16));
    v_r1 = _mm_packus_epi32(_mm_and_si128(layer1_chunk2, v_mask), _mm_and_si128(layer1_chunk3, v_mask));
    v_g1 = _mm_packus_epi32(_mm_srli_epi32(layer1_chunk2, 16), _mm_srli_epi32(layer1_chunk3, 16));
}

inline void _mm_interleave_epi16(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0,
                                 __m128i & v_g1, __m128i & v_b0, __m128i & v_b1)
{
    __m128i v_mask = _mm_set1_epi32(0x0000ffff);

    __m128i layer3_chunk0 = _mm_packus_epi32(_mm_and_si128(v_r0, v_mask), _mm_and_si128(v_r1, v_mask));
    __m128i layer3_chunk3 = _mm_packus_epi32(_mm_srli_epi32(v_r0, 16), _mm_srli_epi32(v_r1, 16));
    __m128i layer3_chunk1 = _mm_packus_epi32(_mm_and_si128(v_g0, v_mask), _mm_and_si128(v_g1, v_mask));
    __m128i layer3_chunk4 = _mm_packus_epi32(_mm_srli_epi32(v_g0, 16), _mm_srli_epi32(v_g1, 16));
    __m128i layer3_chunk2 = _mm_packus_epi32(_mm_and_si128(v_b0, v_mask), _mm_and_si128(v_b1, v_mask));
    __m128i layer3_chunk5 = _mm_packus_epi32(_mm_srli_epi32(v_b0, 16), _mm_srli_epi32(v_b1, 16));

    __m128i layer2_chunk0 = _mm_packus_epi32(_mm_and_si128(layer3_chunk0, v_mask), _mm_and_si128(layer3_chunk1, v_mask));
    __m128i layer2_chunk3 = _mm_packus_epi32(_mm_srli_epi32(layer3_chunk0, 16), _mm_srli_epi32(layer3_chunk1, 16));
    __m128i layer2_chunk1 = _mm_packus_epi32(_mm_and_si128(layer3_chunk2, v_mask), _mm_and_si128(layer3_chunk3, v_mask));
    __m128i layer2_chunk4 = _mm_packus_epi32(_mm_srli_epi32(layer3_chunk2, 16), _mm_srli_epi32(layer3_chunk3, 16));
    __m128i layer2_chunk2 = _mm_packus_epi32(_mm_and_si128(layer3_chunk4, v_mask), _mm_and_si128(layer3_chunk5, v_mask));
    __m128i layer2_chunk5 = _mm_packus_epi32(_mm_srli_epi32(layer3_chunk4, 16), _mm_srli_epi32(layer3_chunk5, 16));

    __m128i layer1_chunk0 = _mm_packus_epi32(_mm_and_si128(layer2_chunk0, v_mask), _mm_and_si128(layer2_chunk1, v_mask));
    __m128i layer1_chunk3 = _mm_packus_epi32(_mm_srli_epi32(layer2_chunk0, 16), _mm_srli_epi32(layer2_chunk1, 16));
    __m128i layer1_chunk1 = _mm_packus_epi32(_mm_and_si128(layer2_chunk2, v_mask), _mm_and_si128(layer2_chunk3, v_mask));
    __m128i layer1_chunk4 = _mm_packus_epi32(_mm_srli_epi32(layer2_chunk2, 16), _mm_srli_epi32(layer2_chunk3, 16));
    __m128i layer1_chunk2 = _mm_packus_epi32(_mm_and_si128(layer2_chunk4, v_mask), _mm_and_si128(layer2_chunk5, v_mask));
    __m128i layer1_chunk5 = _mm_packus_epi32(_mm_srli_epi32(layer2_chunk4, 16), _mm_srli_epi32(layer2_chunk5, 16));

    v_r0 = _mm_packus_epi32(_mm_and_si128(layer1_chunk0, v_mask), _mm_and_si128(layer1_chunk1, v_mask));
    v_g1 = _mm_packus_epi32(_mm_srli_epi32(layer1_chunk0, 16), _mm_srli_epi32(layer1_chunk1, 16));
    v_r1 = _mm_packus_epi32(_mm_and_si128(layer1_chunk2, v_mask), _mm_and_si128(layer1_chunk3, v_mask));
    v_b0 = _mm_packus_epi32(_mm_srli_epi32(layer1_chunk2, 16), _mm_srli_epi32(layer1_chunk3, 16));
    v_g0 = _mm_packus_epi32(_mm_and_si128(layer1_chunk4, v_mask), _mm_and_si128(layer1_chunk5, v_mask));
    v_b1 = _mm_packus_epi32(_mm_srli_epi32(layer1_chunk4, 16), _mm_srli_epi32(layer1_chunk5, 16));
}

inline void _mm_interleave_epi16(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, __m128i & v_g1,
                                 __m128i & v_b0, __m128i & v_b1, __m128i & v_a0, __m128i & v_a1)
{
    __m128i v_mask = _mm_set1_epi32(0x0000ffff);

    __m128i layer3_chunk0 = _mm_packus_epi32(_mm_and_si128(v_r0, v_mask), _mm_and_si128(v_r1, v_mask));
    __m128i layer3_chunk4 = _mm_packus_epi32(_mm_srli_epi32(v_r0, 16), _mm_srli_epi32(v_r1, 16));
    __m128i layer3_chunk1 = _mm_packus_epi32(_mm_and_si128(v_g0, v_mask), _mm_and_si128(v_g1, v_mask));
    __m128i layer3_chunk5 = _mm_packus_epi32(_mm_srli_epi32(v_g0, 16), _mm_srli_epi32(v_g1, 16));
    __m128i layer3_chunk2 = _mm_packus_epi32(_mm_and_si128(v_b0, v_mask), _mm_and_si128(v_b1, v_mask));
    __m128i layer3_chunk6 = _mm_packus_epi32(_mm_srli_epi32(v_b0, 16), _mm_srli_epi32(v_b1, 16));
    __m128i layer3_chunk3 = _mm_packus_epi32(_mm_and_si128(v_a0, v_mask), _mm_and_si128(v_a1, v_mask));
    __m128i layer3_chunk7 = _mm_packus_epi32(_mm_srli_epi32(v_a0, 16), _mm_srli_epi32(v_a1, 16));

    __m128i layer2_chunk0 = _mm_packus_epi32(_mm_and_si128(layer3_chunk0, v_mask), _mm_and_si128(layer3_chunk1, v_mask));
    __m128i layer2_chunk4 = _mm_packus_epi32(_mm_srli_epi32(layer3_chunk0, 16), _mm_srli_epi32(layer3_chunk1, 16));
    __m128i layer2_chunk1 = _mm_packus_epi32(_mm_and_si128(layer3_chunk2, v_mask), _mm_and_si128(layer3_chunk3, v_mask));
    __m128i layer2_chunk5 = _mm_packus_epi32(_mm_srli_epi32(layer3_chunk2, 16), _mm_srli_epi32(layer3_chunk3, 16));
    __m128i layer2_chunk2 = _mm_packus_epi32(_mm_and_si128(layer3_chunk4, v_mask), _mm_and_si128(layer3_chunk5, v_mask));
    __m128i layer2_chunk6 = _mm_packus_epi32(_mm_srli_epi32(layer3_chunk4, 16), _mm_srli_epi32(layer3_chunk5, 16));
    __m128i layer2_chunk3 = _mm_packus_epi32(_mm_and_si128(layer3_chunk6, v_mask), _mm_and_si128(layer3_chunk7, v_mask));
    __m128i layer2_chunk7 = _mm_packus_epi32(_mm_srli_epi32(layer3_chunk6, 16), _mm_srli_epi32(layer3_chunk7, 16));

    __m128i layer1_chunk0 = _mm_packus_epi32(_mm_and_si128(layer2_chunk0, v_mask), _mm_and_si128(layer2_chunk1, v_mask));
    __m128i layer1_chunk4 = _mm_packus_epi32(_mm_srli_epi32(layer2_chunk0, 16), _mm_srli_epi32(layer2_chunk1, 16));
    __m128i layer1_chunk1 = _mm_packus_epi32(_mm_and_si128(layer2_chunk2, v_mask), _mm_and_si128(layer2_chunk3, v_mask));
    __m128i layer1_chunk5 = _mm_packus_epi32(_mm_srli_epi32(layer2_chunk2, 16), _mm_srli_epi32(layer2_chunk3, 16));
    __m128i layer1_chunk2 = _mm_packus_epi32(_mm_and_si128(layer2_chunk4, v_mask), _mm_and_si128(layer2_chunk5, v_mask));
    __m128i layer1_chunk6 = _mm_packus_epi32(_mm_srli_epi32(layer2_chunk4, 16), _mm_srli_epi32(layer2_chunk5, 16));
    __m128i layer1_chunk3 = _mm_packus_epi32(_mm_and_si128(layer2_chunk6, v_mask), _mm_and_si128(layer2_chunk7, v_mask));
    __m128i layer1_chunk7 = _mm_packus_epi32(_mm_srli_epi32(layer2_chunk6, 16), _mm_srli_epi32(layer2_chunk7, 16));

    v_r0 = _mm_packus_epi32(_mm_and_si128(layer1_chunk0, v_mask), _mm_and_si128(layer1_chunk1, v_mask));
    v_b0 = _mm_packus_epi32(_mm_srli_epi32(layer1_chunk0, 16), _mm_srli_epi32(layer1_chunk1, 16));
    v_r1 = _mm_packus_epi32(_mm_and_si128(layer1_chunk2, v_mask), _mm_and_si128(layer1_chunk3, v_mask));
    v_b1 = _mm_packus_epi32(_mm_srli_epi32(layer1_chunk2, 16), _mm_srli_epi32(layer1_chunk3, 16));
    v_g0 = _mm_packus_epi32(_mm_and_si128(layer1_chunk4, v_mask), _mm_and_si128(layer1_chunk5, v_mask));
    v_a0 = _mm_packus_epi32(_mm_srli_epi32(layer1_chunk4, 16), _mm_srli_epi32(layer1_chunk5, 16));
    v_g1 = _mm_packus_epi32(_mm_and_si128(layer1_chunk6, v_mask), _mm_and_si128(layer1_chunk7, v_mask));
    v_a1 = _mm_packus_epi32(_mm_srli_epi32(layer1_chunk6, 16), _mm_srli_epi32(layer1_chunk7, 16));
}

#endif // CV_SSE4_1
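// Single-precision float de-interleaving helpers, using the same unpack
// scheme with four elements per register.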
inline void _mm_deinterleave_ps(__m128 & v_r0, __m128 & v_r1, __m128 & v_g0, __m128 & v_g1)
{
    __m128 layer1_chunk0 = _mm_unpacklo_ps(v_r0, v_g0);
    __m128 layer1_chunk1 = _mm_unpackhi_ps(v_r0, v_g0);
    __m128 layer1_chunk2 = _mm_unpacklo_ps(v_r1, v_g1);
    __m128 layer1_chunk3 = _mm_unpackhi_ps(v_r1, v_g1);

    __m128 layer2_chunk0 = _mm_unpacklo_ps(layer1_chunk0, layer1_chunk2);
    __m128 layer2_chunk1 = _mm_unpackhi_ps(layer1_chunk0, layer1_chunk2);
    __m128 layer2_chunk2 = _mm_unpacklo_ps(layer1_chunk1, layer1_chunk3);
    __m128 layer2_chunk3 = _mm_unpackhi_ps(layer1_chunk1, layer1_chunk3);

    v_r0 = _mm_unpacklo_ps(layer2_chunk0, layer2_chunk2);
    v_r1 = _mm_unpackhi_ps(layer2_chunk0, layer2_chunk2);
    v_g0 = _mm_unpacklo_ps(layer2_chunk1, layer2_chunk3);
    v_g1 = _mm_unpackhi_ps(layer2_chunk1, layer2_chunk3);
}

inline void _mm_deinterleave_ps(__m128 & v_r0, __m128 & v_r1, __m128 & v_g0,
                                __m128 & v_g1, __m128 & v_b0, __m128 & v_b1)
{
    __m128 layer1_chunk0 = _mm_unpacklo_ps(v_r0, v_g1);
    __m128 layer1_chunk1 = _mm_unpackhi_ps(v_r0, v_g1);
    __m128 layer1_chunk2 = _mm_unpacklo_ps(v_r1, v_b0);
    __m128 layer1_chunk3 = _mm_unpackhi_ps(v_r1, v_b0);
    __m128 layer1_chunk4 = _mm_unpacklo_ps(v_g0, v_b1);
    __m128 layer1_chunk5 = _mm_unpackhi_ps(v_g0, v_b1);

    __m128 layer2_chunk0 = _mm_unpacklo_ps(layer1_chunk0, layer1_chunk3);
    __m128 layer2_chunk1 = _mm_unpackhi_ps(layer1_chunk0, layer1_chunk3);
    __m128 layer2_chunk2 = _mm_unpacklo_ps(layer1_chunk1, layer1_chunk4);
    __m128 layer2_chunk3 = _mm_unpackhi_ps(layer1_chunk1, layer1_chunk4);
    __m128 layer2_chunk4 = _mm_unpacklo_ps(layer1_chunk2, layer1_chunk5);
    __m128 layer2_chunk5 = _mm_unpackhi_ps(layer1_chunk2, layer1_chunk5);

    v_r0 = _mm_unpacklo_ps(layer2_chunk0, layer2_chunk3);
    v_r1 = _mm_unpackhi_ps(layer2_chunk0, layer2_chunk3);
    v_g0 = _mm_unpacklo_ps(layer2_chunk1, layer2_chunk4);
    v_g1 = _mm_unpackhi_ps(layer2_chunk1, layer2_chunk4);
    v_b0 = _mm_unpacklo_ps(layer2_chunk2, layer2_chunk5);
    v_b1 = _mm_unpackhi_ps(layer2_chunk2, layer2_chunk5);
}

inline void _mm_deinterleave_ps(__m128 & v_r0, __m128 & v_r1, __m128 & v_g0, __m128 & v_g1,
                                __m128 & v_b0, __m128 & v_b1, __m128 & v_a0, __m128 & v_a1)
{
    __m128 layer1_chunk0 = _mm_unpacklo_ps(v_r0, v_b0);
    __m128 layer1_chunk1 = _mm_unpackhi_ps(v_r0, v_b0);
    __m128 layer1_chunk2 = _mm_unpacklo_ps(v_r1, v_b1);
    __m128 layer1_chunk3 = _mm_unpackhi_ps(v_r1, v_b1);
    __m128 layer1_chunk4 = _mm_unpacklo_ps(v_g0, v_a0);
    __m128 layer1_chunk5 = _mm_unpackhi_ps(v_g0, v_a0);
    __m128 layer1_chunk6 = _mm_unpacklo_ps(v_g1, v_a1);
    __m128 layer1_chunk7 = _mm_unpackhi_ps(v_g1, v_a1);

    __m128 layer2_chunk0 = _mm_unpacklo_ps(layer1_chunk0, layer1_chunk4);
    __m128 layer2_chunk1 = _mm_unpackhi_ps(layer1_chunk0, layer1_chunk4);
    __m128 layer2_chunk2 = _mm_unpacklo_ps(layer1_chunk1, layer1_chunk5);
    __m128 layer2_chunk3 = _mm_unpackhi_ps(layer1_chunk1, layer1_chunk5);
    __m128 layer2_chunk4 = _mm_unpacklo_ps(layer1_chunk2, layer1_chunk6);
    __m128 layer2_chunk5 = _mm_unpackhi_ps(layer1_chunk2, layer1_chunk6);
    __m128 layer2_chunk6 = _mm_unpacklo_ps(layer1_chunk3, layer1_chunk7);
    __m128 layer2_chunk7 = _mm_unpackhi_ps(layer1_chunk3, layer1_chunk7);

    v_r0 = _mm_unpacklo_ps(layer2_chunk0, layer2_chunk4);
    v_r1 = _mm_unpackhi_ps(layer2_chunk0, layer2_chunk4);
    v_g0 = _mm_unpacklo_ps(layer2_chunk1, layer2_chunk5);
    v_g1 = _mm_unpackhi_ps(layer2_chunk1, layer2_chunk5);
    v_b0 = _mm_unpacklo_ps(layer2_chunk2, layer2_chunk6);
    v_b1 = _mm_unpackhi_ps(layer2_chunk2, layer2_chunk6);
    v_a0 = _mm_unpacklo_ps(layer2_chunk3, layer2_chunk7);
    v_a1 = _mm_unpackhi_ps(layer2_chunk3, layer2_chunk7);
}
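// Single-precision float interleaving helpers, built on _mm_shuffle_ps with
// even/odd element selection masks instead of pack instructions.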
inline void _mm_interleave_ps(__m128 & v_r0, __m128 & v_r1, __m128 & v_g0, __m128 & v_g1)
{
    const int mask_lo = _MM_SHUFFLE(2, 0, 2, 0), mask_hi = _MM_SHUFFLE(3, 1, 3, 1);

    __m128 layer2_chunk0 = _mm_shuffle_ps(v_r0, v_r1, mask_lo);
    __m128 layer2_chunk2 = _mm_shuffle_ps(v_r0, v_r1, mask_hi);
    __m128 layer2_chunk1 = _mm_shuffle_ps(v_g0, v_g1, mask_lo);
    __m128 layer2_chunk3 = _mm_shuffle_ps(v_g0, v_g1, mask_hi);

    __m128 layer1_chunk0 = _mm_shuffle_ps(layer2_chunk0, layer2_chunk1, mask_lo);
    __m128 layer1_chunk2 = _mm_shuffle_ps(layer2_chunk0, layer2_chunk1, mask_hi);
    __m128 layer1_chunk1 = _mm_shuffle_ps(layer2_chunk2, layer2_chunk3, mask_lo);
    __m128 layer1_chunk3 = _mm_shuffle_ps(layer2_chunk2, layer2_chunk3, mask_hi);

    v_r0 = _mm_shuffle_ps(layer1_chunk0, layer1_chunk1, mask_lo);
    v_g0 = _mm_shuffle_ps(layer1_chunk0, layer1_chunk1, mask_hi);
    v_r1 = _mm_shuffle_ps(layer1_chunk2, layer1_chunk3, mask_lo);
    v_g1 = _mm_shuffle_ps(layer1_chunk2, layer1_chunk3, mask_hi);
}

inline void _mm_interleave_ps(__m128 & v_r0, __m128 & v_r1, __m128 & v_g0,
                              __m128 & v_g1, __m128 & v_b0, __m128 & v_b1)
{
    const int mask_lo = _MM_SHUFFLE(2, 0, 2, 0), mask_hi = _MM_SHUFFLE(3, 1, 3, 1);

    __m128 layer2_chunk0 = _mm_shuffle_ps(v_r0, v_r1, mask_lo);
    __m128 layer2_chunk3 = _mm_shuffle_ps(v_r0, v_r1, mask_hi);
    __m128 layer2_chunk1 = _mm_shuffle_ps(v_g0, v_g1, mask_lo);
    __m128 layer2_chunk4 = _mm_shuffle_ps(v_g0, v_g1, mask_hi);
    __m128 layer2_chunk2 = _mm_shuffle_ps(v_b0, v_b1, mask_lo);
    __m128 layer2_chunk5 = _mm_shuffle_ps(v_b0, v_b1, mask_hi);

    __m128 layer1_chunk0 = _mm_shuffle_ps(layer2_chunk0, layer2_chunk1, mask_lo);
    __m128 layer1_chunk3 = _mm_shuffle_ps(layer2_chunk0, layer2_chunk1, mask_hi);
    __m128 layer1_chunk1 = _mm_shuffle_ps(layer2_chunk2, layer2_chunk3, mask_lo);
    __m128 layer1_chunk4 = _mm_shuffle_ps(layer2_chunk2, layer2_chunk3, mask_hi);
    __m128 layer1_chunk2 = _mm_shuffle_ps(layer2_chunk4, layer2_chunk5, mask_lo);
    __m128 layer1_chunk5 = _mm_shuffle_ps(layer2_chunk4, layer2_chunk5, mask_hi);

    v_r0 = _mm_shuffle_ps(layer1_chunk0, layer1_chunk1, mask_lo);
    v_g1 = _mm_shuffle_ps(layer1_chunk0, layer1_chunk1, mask_hi);
    v_r1 = _mm_shuffle_ps(layer1_chunk2, layer1_chunk3, mask_lo);
    v_b0 = _mm_shuffle_ps(layer1_chunk2, layer1_chunk3, mask_hi);
    v_g0 = _mm_shuffle_ps(layer1_chunk4, layer1_chunk5, mask_lo);
    v_b1 = _mm_shuffle_ps(layer1_chunk4, layer1_chunk5, mask_hi);
}

inline void _mm_interleave_ps(__m128 & v_r0, __m128 & v_r1, __m128 & v_g0, __m128 & v_g1,
                              __m128 & v_b0, __m128 & v_b1, __m128 & v_a0, __m128 & v_a1)
{
    const int mask_lo = _MM_SHUFFLE(2, 0, 2, 0), mask_hi = _MM_SHUFFLE(3, 1, 3, 1);

    __m128 layer2_chunk0 = _mm_shuffle_ps(v_r0, v_r1, mask_lo);
    __m128 layer2_chunk4 = _mm_shuffle_ps(v_r0, v_r1, mask_hi);
    __m128 layer2_chunk1 = _mm_shuffle_ps(v_g0, v_g1, mask_lo);
    __m128 layer2_chunk5 = _mm_shuffle_ps(v_g0, v_g1, mask_hi);
    __m128 layer2_chunk2 = _mm_shuffle_ps(v_b0, v_b1, mask_lo);
    __m128 layer2_chunk6 = _mm_shuffle_ps(v_b0, v_b1, mask_hi);
    __m128 layer2_chunk3 = _mm_shuffle_ps(v_a0, v_a1, mask_lo);
    __m128 layer2_chunk7 = _mm_shuffle_ps(v_a0, v_a1, mask_hi);

    __m128 layer1_chunk0 = _mm_shuffle_ps(layer2_chunk0, layer2_chunk1, mask_lo);
    __m128 layer1_chunk4 = _mm_shuffle_ps(layer2_chunk0, layer2_chunk1, mask_hi);
    __m128 layer1_chunk1 = _mm_shuffle_ps(layer2_chunk2, layer2_chunk3, mask_lo);
    __m128 layer1_chunk5 = _mm_shuffle_ps(layer2_chunk2, layer2_chunk3, mask_hi);
    __m128 layer1_chunk2 = _mm_shuffle_ps(layer2_chunk4, layer2_chunk5, mask_lo);
    __m128 layer1_chunk6 = _mm_shuffle_ps(layer2_chunk4, layer2_chunk5, mask_hi);
    __m128 layer1_chunk3 = _mm_shuffle_ps(layer2_chunk6, layer2_chunk7, mask_lo);
    __m128 layer1_chunk7 = _mm_shuffle_ps(layer2_chunk6, layer2_chunk7, mask_hi);

    v_r0 = _mm_shuffle_ps(layer1_chunk0, layer1_chunk1, mask_lo);
    v_b0 = _mm_shuffle_ps(layer1_chunk0, layer1_chunk1, mask_hi);
    v_r1 = _mm_shuffle_ps(layer1_chunk2, layer1_chunk3, mask_lo);
    v_b1 = _mm_shuffle_ps(layer1_chunk2, layer1_chunk3, mask_hi);
    v_g0 = _mm_shuffle_ps(layer1_chunk4, layer1_chunk5, mask_lo);
    v_a0 = _mm_shuffle_ps(layer1_chunk4, layer1_chunk5, mask_hi);
    v_g1 = _mm_shuffle_ps(layer1_chunk6, layer1_chunk7, mask_lo);
    v_a1 = _mm_shuffle_ps(layer1_chunk6, layer1_chunk7, mask_hi);
}

#endif // CV_SSE2

//! @}

#endif //__OPENCV_CORE_SSE_UTILS_HPP__
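A minimal usage sketch follows. It is not part of the header, the function and buffer names are illustrative only, and it assumes the code is built with SSE2 enabled so that CV_SSE2 is set; the load/call/store pattern roughly mirrors how OpenCV's own split loops use these helpers.

#include <emmintrin.h>
#include "opencv2/core/sse_utils.hpp"

// Split 32 interleaved RG byte pairs (64 bytes) into separate R and G planes.
static void split_rg_32(const unsigned char * src, unsigned char * dst_r, unsigned char * dst_g)
{
    // Four registers cover bytes 0..63 of the interleaved buffer: r0 g0 r1 g1 ...
    __m128i v_src0 = _mm_loadu_si128((const __m128i *)(src));
    __m128i v_src1 = _mm_loadu_si128((const __m128i *)(src + 16));
    __m128i v_src2 = _mm_loadu_si128((const __m128i *)(src + 32));
    __m128i v_src3 = _mm_loadu_si128((const __m128i *)(src + 48));

    // On return v_src0/v_src1 hold the 32 R bytes, v_src2/v_src3 the 32 G bytes.
    _mm_deinterleave_epi8(v_src0, v_src1, v_src2, v_src3);

    _mm_storeu_si128((__m128i *)(dst_r),      v_src0);
    _mm_storeu_si128((__m128i *)(dst_r + 16), v_src1);
    _mm_storeu_si128((__m128i *)(dst_g),      v_src2);
    _mm_storeu_si128((__m128i *)(dst_g + 16), v_src3);
}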