Joe Verbout
/
main
opencv on mbed
Embed:
(wiki syntax)
Show/hide line numbers
intrin_cpp.hpp
00001 /*M/////////////////////////////////////////////////////////////////////////////////////// 00002 // 00003 // IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. 00004 // 00005 // By downloading, copying, installing or using the software you agree to this license. 00006 // If you do not agree to this license, do not download, install, 00007 // copy or use the software. 00008 // 00009 // 00010 // License Agreement 00011 // For Open Source Computer Vision Library 00012 // 00013 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved. 00014 // Copyright (C) 2009, Willow Garage Inc., all rights reserved. 00015 // Copyright (C) 2013, OpenCV Foundation, all rights reserved. 00016 // Copyright (C) 2015, Itseez Inc., all rights reserved. 00017 // Third party copyrights are property of their respective owners. 00018 // 00019 // Redistribution and use in source and binary forms, with or without modification, 00020 // are permitted provided that the following conditions are met: 00021 // 00022 // * Redistribution's of source code must retain the above copyright notice, 00023 // this list of conditions and the following disclaimer. 00024 // 00025 // * Redistribution's in binary form must reproduce the above copyright notice, 00026 // this list of conditions and the following disclaimer in the documentation 00027 // and/or other materials provided with the distribution. 00028 // 00029 // * The name of the copyright holders may not be used to endorse or promote products 00030 // derived from this software without specific prior written permission. 00031 // 00032 // This software is provided by the copyright holders and contributors "as is" and 00033 // any express or implied warranties, including, but not limited to, the implied 00034 // warranties of merchantability and fitness for a particular purpose are disclaimed. 
00035 // In no event shall the Intel Corporation or contributors be liable for any direct, 00036 // indirect, incidental, special, exemplary, or consequential damages 00037 // (including, but not limited to, procurement of substitute goods or services; 00038 // loss of use, data, or profits; or business interruption) however caused 00039 // and on any theory of liability, whether in contract, strict liability, 00040 // or tort (including negligence or otherwise) arising in any way out of 00041 // the use of this software, even if advised of the possibility of such damage. 00042 // 00043 //M*/ 00044 00045 #ifndef __OPENCV_HAL_INTRIN_CPP_HPP__ 00046 #define __OPENCV_HAL_INTRIN_CPP_HPP__ 00047 00048 #include <limits> 00049 #include <cstring> 00050 #include <algorithm> 00051 #include "opencv2/core/saturate.hpp" 00052 00053 namespace cv 00054 { 00055 00056 /** @addtogroup core_hal_intrin 00057 00058 "Universal intrinsics" is a types and functions set intended to simplify vectorization of code on 00059 different platforms. Currently there are two supported SIMD extensions: __SSE/SSE2__ on x86 00060 architectures and __NEON__ on ARM architectures, both allow working with 128 bit registers 00061 containing packed values of different types. In case when there is no SIMD extension available 00062 during compilation, fallback C++ implementation of intrinsics will be chosen and code will work as 00063 expected although it could be slower. 00064 00065 ### Types 00066 00067 There are several types representing 128-bit register as a vector of packed values, each type is 00068 implemented as a structure based on a one SIMD register. 
00069 00070 - cv::v_uint8x16 and cv::v_int8x16: sixteen 8-bit integer values (unsigned/signed) - char 00071 - cv::v_uint16x8 and cv::v_int16x8: eight 16-bit integer values (unsigned/signed) - short 00072 - cv::v_uint32x4 and cv::v_int32x4: four 32-bit integer values (unsigned/signed) - int 00073 - cv::v_uint64x2 and cv::v_int64x2: two 64-bit integer values (unsigned/signed) - int64 00074 - cv::v_float32x4: four 32-bit floating point values (signed) - float 00075 - cv::v_float64x2: two 64-bit floating point values (signed) - double 00076 00077 @note 00078 cv::v_float64x2 is not implemented in NEON variant, if you want to use this type, don't forget to 00079 check the CV_SIMD128_64F preprocessor definition: 00080 @code 00081 #if CV_SIMD128_64F 00082 //... 00083 #endif 00084 @endcode 00085 00086 ### Load and store operations 00087 00088 These operations allow to set contents of the register explicitly or by loading it from some memory 00089 block and to save contents of the register to memory block. 00090 00091 - Constructors: 00092 @ref v_reg::v_reg(const _Tp *ptr) "from memory", 00093 @ref v_reg::v_reg(_Tp s0, _Tp s1) "from two values", ... 00094 - Other create methods: 00095 @ref v_setall_s8, @ref v_setall_u8, ..., 00096 @ref v_setzero_u8, @ref v_setzero_s8, ... 00097 - Memory operations: 00098 @ref v_load, @ref v_load_aligned, @ref v_load_halves, 00099 @ref v_store, @ref v_store_aligned, 00100 @ref v_store_high, @ref v_store_low 00101 00102 ### Value reordering 00103 00104 These operations allow to reorder or recombine elements in one or multiple vectors. 
00105 00106 - Interleave, deinterleave (3 and 4 channels): @ref v_load_deinterleave, @ref v_store_interleave 00107 - Expand: @ref v_load_expand, @ref v_load_expand_q, @ref v_expand 00108 - Pack: @ref v_pack, @ref v_pack_u, @ref v_rshr_pack, @ref v_rshr_pack_u, 00109 @ref v_pack_store, @ref v_pack_u_store, @ref v_rshr_pack_store, @ref v_rshr_pack_u_store 00110 - Recombine: @ref v_zip, @ref v_recombine, @ref v_combine_low, @ref v_combine_high 00111 - Extract: @ref v_extract 00112 00113 00114 ### Arithmetic, bitwise and comparison operations 00115 00116 Element-wise binary and unary operations. 00117 00118 - Arithmetics: 00119 @ref operator+(const v_reg &a, const v_reg &b) "+", 00120 @ref operator-(const v_reg &a, const v_reg &b) "-", 00121 @ref operator*(const v_reg &a, const v_reg &b) "*", 00122 @ref operator/(const v_reg &a, const v_reg &b) "/", 00123 @ref v_mul_expand 00124 00125 - Non-saturating arithmetics: @ref v_add_wrap, @ref v_sub_wrap 00126 00127 - Bitwise shifts: 00128 @ref operator<<(const v_reg &a, int s) "<<", 00129 @ref operator>>(const v_reg &a, int s) ">>", 00130 @ref v_shl, @ref v_shr 00131 00132 - Bitwise logic: 00133 @ref operator&(const v_reg &a, const v_reg &b) "&", 00134 @ref operator|(const v_reg &a, const v_reg &b) "|", 00135 @ref operator^(const v_reg &a, const v_reg &b) "^", 00136 @ref operator~(const v_reg &a) "~" 00137 00138 - Comparison: 00139 @ref operator>(const v_reg &a, const v_reg &b) ">", 00140 @ref operator>=(const v_reg &a, const v_reg &b) ">=", 00141 @ref operator<(const v_reg &a, const v_reg &b) "<", 00142 @ref operator<=(const v_reg &a, const v_reg &b) "<=", 00143 @ref operator==(const v_reg &a, const v_reg &b) "==", 00144 @ref operator!=(const v_reg &a, const v_reg &b) "!=" 00145 00146 - min/max: @ref v_min, @ref v_max 00147 00148 ### Reduce and mask 00149 00150 Most of these operations return only one value. 
00151 00152 - Reduce: @ref v_reduce_min, @ref v_reduce_max, @ref v_reduce_sum 00153 - Mask: @ref v_signmask, @ref v_check_all, @ref v_check_any, @ref v_select 00154 00155 ### Other math 00156 00157 - Some frequent operations: @ref v_sqrt, @ref v_invsqrt, @ref v_magnitude, @ref v_sqr_magnitude 00158 - Absolute values: @ref v_abs, @ref v_absdiff 00159 00160 ### Conversions 00161 00162 Different type conversions and casts: 00163 00164 - Rounding: @ref v_round, @ref v_floor, @ref v_ceil, @ref v_trunc, 00165 - To float: @ref v_cvt_f32, @ref v_cvt_f64 00166 - Reinterpret: @ref v_reinterpret_as_u8, @ref v_reinterpret_as_s8, ... 00167 00168 ### Matrix operations 00169 00170 In these operations vectors represent matrix rows/columns: @ref v_dotprod, @ref v_matmul, @ref v_transpose4x4 00171 00172 ### Usability 00173 00174 Most operations are implemented only for some subset of the available types, following matrices 00175 shows the applicability of different operations to the types. 00176 00177 Regular integers: 00178 00179 | Operations\\Types | uint 8x16 | int 8x16 | uint 16x8 | int 16x8 | uint 32x4 | int 32x4 | 00180 |-------------------|:-:|:-:|:-:|:-:|:-:|:-:| 00181 |load, store | x | x | x | x | x | x | 00182 |interleave | x | x | x | x | x | x | 00183 |expand | x | x | x | x | x | x | 00184 |expand_q | x | x | | | | | 00185 |add, sub | x | x | x | x | x | x | 00186 |add_wrap, sub_wrap | x | x | x | x | | | 00187 |mul | | | x | x | x | x | 00188 |mul_expand | | | x | x | x | | 00189 |compare | x | x | x | x | x | x | 00190 |shift | | | x | x | x | x | 00191 |dotprod | | | | x | | | 00192 |logical | x | x | x | x | x | x | 00193 |min, max | x | x | x | x | x | x | 00194 |absdiff | x | x | x | x | x | x | 00195 |reduce | | | | | x | x | 00196 |mask | x | x | x | x | x | x | 00197 |pack | x | x | x | x | x | x | 00198 |pack_u | x | | x | | | | 00199 |unpack | x | x | x | x | x | x | 00200 |extract | x | x | x | x | x | x | 00201 |cvt_flt32 | | | | | | x | 00202 |cvt_flt64 | 
| | | | | x | 00203 |transpose4x4 | | | | | x | x | 00204 00205 Big integers: 00206 00207 | Operations\\Types | uint 64x2 | int 64x2 | 00208 |-------------------|:-:|:-:| 00209 |load, store | x | x | 00210 |add, sub | x | x | 00211 |shift | x | x | 00212 |logical | x | x | 00213 |extract | x | x | 00214 00215 Floating point: 00216 00217 | Operations\\Types | float 32x4 | float 64x2 | 00218 |-------------------|:-:|:-:| 00219 |load, store | x | x | 00220 |interleave | x | | 00221 |add, sub | x | x | 00222 |mul | x | x | 00223 |div | x | x | 00224 |compare | x | x | 00225 |min, max | x | x | 00226 |absdiff | x | x | 00227 |reduce | x | | 00228 |mask | x | x | 00229 |unpack | x | x | 00230 |cvt_flt32 | | x | 00231 |cvt_flt64 | x | | 00232 |sqrt, abs | x | x | 00233 |float math | x | x | 00234 |transpose4x4 | x | | 00235 00236 00237 @{ */ 00238 00239 template<typename _Tp, int n> struct v_reg 00240 { 00241 //! @cond IGNORED 00242 typedef _Tp lane_type; 00243 typedef v_reg<typename V_TypeTraits<_Tp>::int_type, n> int_vec; 00244 typedef v_reg<typename V_TypeTraits<_Tp>::abs_type, n> abs_vec; 00245 enum { nlanes = n }; 00246 // !@endcond 00247 00248 /** @brief Constructor 00249 00250 Initializes register with data from memory 00251 @param ptr pointer to memory block with data for register */ 00252 explicit v_reg(const _Tp* ptr) { for( int i = 0; i < n; i++ ) s[i] = ptr[i]; } 00253 00254 /** @brief Constructor 00255 00256 Initializes register with two 64-bit values */ 00257 v_reg(_Tp s0, _Tp s1) { s[0] = s0; s[1] = s1; } 00258 00259 /** @brief Constructor 00260 00261 Initializes register with four 32-bit values */ 00262 v_reg(_Tp s0, _Tp s1, _Tp s2, _Tp s3) { s[0] = s0; s[1] = s1; s[2] = s2; s[3] = s3; } 00263 00264 /** @brief Constructor 00265 00266 Initializes register with eight 16-bit values */ 00267 v_reg(_Tp s0, _Tp s1, _Tp s2, _Tp s3, 00268 _Tp s4, _Tp s5, _Tp s6, _Tp s7) 00269 { 00270 s[0] = s0; s[1] = s1; s[2] = s2; s[3] = s3; 00271 s[4] = s4; s[5] = s5; s[6] = 
s6; s[7] = s7; 00272 } 00273 00274 /** @brief Constructor 00275 00276 Initializes register with sixteen 8-bit values */ 00277 v_reg(_Tp s0, _Tp s1, _Tp s2, _Tp s3, 00278 _Tp s4, _Tp s5, _Tp s6, _Tp s7, 00279 _Tp s8, _Tp s9, _Tp s10, _Tp s11, 00280 _Tp s12, _Tp s13, _Tp s14, _Tp s15) 00281 { 00282 s[0] = s0; s[1] = s1; s[2] = s2; s[3] = s3; 00283 s[4] = s4; s[5] = s5; s[6] = s6; s[7] = s7; 00284 s[8] = s8; s[9] = s9; s[10] = s10; s[11] = s11; 00285 s[12] = s12; s[13] = s13; s[14] = s14; s[15] = s15; 00286 } 00287 00288 /** @brief Default constructor 00289 00290 Does not initialize anything*/ 00291 v_reg() {} 00292 00293 /** @brief Copy constructor */ 00294 v_reg(const v_reg<_Tp, n> & r) 00295 { 00296 for( int i = 0; i < n; i++ ) 00297 s[i] = r.s[i]; 00298 } 00299 /** @brief Access first value 00300 00301 Returns value of the first lane according to register type, for example: 00302 @code{.cpp} 00303 v_int32x4 r(1, 2, 3, 4); 00304 int v = r.get0(); // returns 1 00305 v_uint64x2 r(1, 2); 00306 uint64_t v = r.get0(); // returns 1 00307 @endcode 00308 */ 00309 _Tp get0() const { return s[0]; } 00310 00311 //! @cond IGNORED 00312 _Tp get(const int i) const { return s[i]; } 00313 v_reg<_Tp, n> high() const 00314 { 00315 v_reg<_Tp, n> c; 00316 int i; 00317 for( i = 0; i < n/2; i++ ) 00318 { 00319 c.s[i] = s[i+(n/2)]; 00320 c.s[i+(n/2)] = 0; 00321 } 00322 return c; 00323 } 00324 00325 static v_reg<_Tp, n> zero() 00326 { 00327 v_reg<_Tp, n> c; 00328 for( int i = 0; i < n; i++ ) 00329 c.s[i] = (_Tp)0; 00330 return c; 00331 } 00332 00333 static v_reg<_Tp, n> all(_Tp s) 00334 { 00335 v_reg<_Tp, n> c; 00336 for( int i = 0; i < n; i++ ) 00337 c.s[i] = s; 00338 return c; 00339 } 00340 00341 template<typename _Tp2, int n2> v_reg<_Tp2, n2> reinterpret_as() const 00342 { 00343 size_t bytes = std::min(sizeof(_Tp2)*n2, sizeof(_Tp)*n); 00344 v_reg<_Tp2, n2> c; 00345 std::memcpy(&c.s[0], &s[0], bytes); 00346 return c; 00347 } 00348 00349 _Tp s[n]; 00350 //! 
@endcond 00351 }; 00352 00353 /** @brief Sixteen 8-bit unsigned integer values */ 00354 typedef v_reg<uchar, 16> v_uint8x16; 00355 /** @brief Sixteen 8-bit signed integer values */ 00356 typedef v_reg<schar, 16> v_int8x16; 00357 /** @brief Eight 16-bit unsigned integer values */ 00358 typedef v_reg<ushort, 8> v_uint16x8; 00359 /** @brief Eight 16-bit signed integer values */ 00360 typedef v_reg<short, 8> v_int16x8; 00361 /** @brief Four 32-bit unsigned integer values */ 00362 typedef v_reg<unsigned, 4> v_uint32x4; 00363 /** @brief Four 32-bit signed integer values */ 00364 typedef v_reg<int, 4> v_int32x4; 00365 /** @brief Four 32-bit floating point values (single precision) */ 00366 typedef v_reg<float, 4> v_float32x4; 00367 /** @brief Two 64-bit floating point values (double precision) */ 00368 typedef v_reg<double, 2> v_float64x2; 00369 /** @brief Two 64-bit unsigned integer values */ 00370 typedef v_reg<uint64, 2> v_uint64x2; 00371 /** @brief Two 64-bit signed integer values */ 00372 typedef v_reg<int64, 2> v_int64x2; 00373 00374 //! @brief Helper macro 00375 //! @ingroup core_hal_intrin_impl 00376 #define OPENCV_HAL_IMPL_BIN_OP(bin_op) \ 00377 template<typename _Tp, int n> inline v_reg<_Tp, n> \ 00378 operator bin_op (const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \ 00379 { \ 00380 v_reg<_Tp, n> c; \ 00381 for( int i = 0; i < n; i++ ) \ 00382 c.s[i] = saturate_cast<_Tp>(a.s[i] bin_op b.s[i]); \ 00383 return c; \ 00384 } \ 00385 template<typename _Tp, int n> inline v_reg<_Tp, n>& \ 00386 operator bin_op##= (v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \ 00387 { \ 00388 for( int i = 0; i < n; i++ ) \ 00389 a.s[i] = saturate_cast<_Tp>(a.s[i] bin_op b.s[i]); \ 00390 return a; \ 00391 } 00392 00393 /** @brief Add values 00394 00395 For all types. */ 00396 OPENCV_HAL_IMPL_BIN_OP(+) 00397 00398 /** @brief Subtract values 00399 00400 For all types. 
*/ 00401 OPENCV_HAL_IMPL_BIN_OP(-) 00402 00403 /** @brief Multiply values 00404 00405 For 16- and 32-bit integer types and floating types. */ 00406 OPENCV_HAL_IMPL_BIN_OP(*) 00407 00408 /** @brief Divide values 00409 00410 For floating types only. */ 00411 OPENCV_HAL_IMPL_BIN_OP(/) 00412 00413 //! @brief Helper macro 00414 //! @ingroup core_hal_intrin_impl 00415 #define OPENCV_HAL_IMPL_BIT_OP(bit_op) \ 00416 template<typename _Tp, int n> inline v_reg<_Tp, n> operator bit_op \ 00417 (const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \ 00418 { \ 00419 v_reg<_Tp, n> c; \ 00420 typedef typename V_TypeTraits<_Tp>::int_type itype; \ 00421 for( int i = 0; i < n; i++ ) \ 00422 c.s[i] = V_TypeTraits<_Tp>::reinterpret_from_int((itype)(V_TypeTraits<_Tp>::reinterpret_int(a.s[i]) bit_op \ 00423 V_TypeTraits<_Tp>::reinterpret_int(b.s[i]))); \ 00424 return c; \ 00425 } \ 00426 template<typename _Tp, int n> inline v_reg<_Tp, n>& operator \ 00427 bit_op##= (v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \ 00428 { \ 00429 typedef typename V_TypeTraits<_Tp>::int_type itype; \ 00430 for( int i = 0; i < n; i++ ) \ 00431 a.s[i] = V_TypeTraits<_Tp>::reinterpret_from_int((itype)(V_TypeTraits<_Tp>::reinterpret_int(a.s[i]) bit_op \ 00432 V_TypeTraits<_Tp>::reinterpret_int(b.s[i]))); \ 00433 return a; \ 00434 } 00435 00436 /** @brief Bitwise AND 00437 00438 Only for integer types. */ 00439 OPENCV_HAL_IMPL_BIT_OP(&) 00440 00441 /** @brief Bitwise OR 00442 00443 Only for integer types. */ 00444 OPENCV_HAL_IMPL_BIT_OP(|) 00445 00446 /** @brief Bitwise XOR 00447 00448 Only for integer types.*/ 00449 OPENCV_HAL_IMPL_BIT_OP(^) 00450 00451 /** @brief Bitwise NOT 00452 00453 Only for integer types.*/ 00454 template<typename _Tp, int n> inline v_reg<_Tp, n> operator ~ (const v_reg<_Tp, n>& a) 00455 { 00456 v_reg<_Tp, n> c; 00457 for( int i = 0; i < n; i++ ) 00458 c.s[i] = V_TypeTraits<_Tp>::reinterpret_from_int(~V_TypeTraits<_Tp>::reinterpret_int(a.s[i])); 00459 return c; 00460 } 00461 00462 //! 
@brief Helper macro 00463 //! @ingroup core_hal_intrin_impl 00464 #define OPENCV_HAL_IMPL_MATH_FUNC(func, cfunc, _Tp2) \ 00465 template<typename _Tp, int n> inline v_reg<_Tp2, n> func(const v_reg<_Tp, n>& a) \ 00466 { \ 00467 v_reg<_Tp2, n> c; \ 00468 for( int i = 0; i < n; i++ ) \ 00469 c.s[i] = cfunc(a.s[i]); \ 00470 return c; \ 00471 } 00472 00473 /** @brief Square root of elements 00474 00475 Only for floating point types.*/ 00476 OPENCV_HAL_IMPL_MATH_FUNC(v_sqrt, std::sqrt, _Tp) 00477 00478 //! @cond IGNORED 00479 OPENCV_HAL_IMPL_MATH_FUNC(v_sin, std::sin, _Tp) 00480 OPENCV_HAL_IMPL_MATH_FUNC(v_cos, std::cos, _Tp) 00481 OPENCV_HAL_IMPL_MATH_FUNC(v_exp, std::exp, _Tp) 00482 OPENCV_HAL_IMPL_MATH_FUNC(v_log, std::log, _Tp) 00483 //! @endcond 00484 00485 /** @brief Absolute value of elements 00486 00487 Only for floating point types.*/ 00488 OPENCV_HAL_IMPL_MATH_FUNC(v_abs, (typename V_TypeTraits<_Tp>::abs_type)std::abs, 00489 typename V_TypeTraits<_Tp>::abs_type) 00490 00491 /** @brief Round elements 00492 00493 Only for floating point types.*/ 00494 OPENCV_HAL_IMPL_MATH_FUNC(v_round, cvRound, int) 00495 00496 /** @brief Floor elements 00497 00498 Only for floating point types.*/ 00499 OPENCV_HAL_IMPL_MATH_FUNC(v_floor, cvFloor, int) 00500 00501 /** @brief Ceil elements 00502 00503 Only for floating point types.*/ 00504 OPENCV_HAL_IMPL_MATH_FUNC(v_ceil, cvCeil, int) 00505 00506 /** @brief Truncate elements 00507 00508 Only for floating point types.*/ 00509 OPENCV_HAL_IMPL_MATH_FUNC(v_trunc, int, int) 00510 00511 //! @brief Helper macro 00512 //! @ingroup core_hal_intrin_impl 00513 #define OPENCV_HAL_IMPL_MINMAX_FUNC(func, cfunc) \ 00514 template<typename _Tp, int n> inline v_reg<_Tp, n> func(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \ 00515 { \ 00516 v_reg<_Tp, n> c; \ 00517 for( int i = 0; i < n; i++ ) \ 00518 c.s[i] = cfunc(a.s[i], b.s[i]); \ 00519 return c; \ 00520 } 00521 00522 //! @brief Helper macro 00523 //! 
@ingroup core_hal_intrin_impl 00524 #define OPENCV_HAL_IMPL_REDUCE_MINMAX_FUNC(func, cfunc) \ 00525 template<typename _Tp, int n> inline _Tp func(const v_reg<_Tp, n>& a) \ 00526 { \ 00527 _Tp c = a.s[0]; \ 00528 for( int i = 1; i < n; i++ ) \ 00529 c = cfunc(c, a.s[i]); \ 00530 return c; \ 00531 } 00532 00533 /** @brief Choose min values for each pair 00534 00535 Scheme: 00536 @code 00537 {A1 A2 ...} 00538 {B1 B2 ...} 00539 -------------- 00540 {min(A1,B1) min(A2,B2) ...} 00541 @endcode 00542 For all types except 64-bit integer. */ 00543 OPENCV_HAL_IMPL_MINMAX_FUNC(v_min, std::min) 00544 00545 /** @brief Choose max values for each pair 00546 00547 Scheme: 00548 @code 00549 {A1 A2 ...} 00550 {B1 B2 ...} 00551 -------------- 00552 {max(A1,B1) max(A2,B2) ...} 00553 @endcode 00554 For all types except 64-bit integer. */ 00555 OPENCV_HAL_IMPL_MINMAX_FUNC(v_max, std::max) 00556 00557 /** @brief Find one min value 00558 00559 Scheme: 00560 @code 00561 {A1 A2 A3 ...} => min(A1,A2,A3,...) 00562 @endcode 00563 For 32-bit integer and 32-bit floating point types. */ 00564 OPENCV_HAL_IMPL_REDUCE_MINMAX_FUNC(v_reduce_min, std::min) 00565 00566 /** @brief Find one max value 00567 00568 Scheme: 00569 @code 00570 {A1 A2 A3 ...} => max(A1,A2,A3,...) 00571 @endcode 00572 For 32-bit integer and 32-bit floating point types. */ 00573 OPENCV_HAL_IMPL_REDUCE_MINMAX_FUNC(v_reduce_max, std::max) 00574 00575 //! @cond IGNORED 00576 template<typename _Tp, int n> 00577 inline void v_minmax( const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b, 00578 v_reg<_Tp, n>& minval, v_reg<_Tp, n>& maxval ) 00579 { 00580 for( int i = 0; i < n; i++ ) 00581 { 00582 minval.s[i] = std::min(a.s[i], b.s[i]); 00583 maxval.s[i] = std::max(a.s[i], b.s[i]); 00584 } 00585 } 00586 //! @endcond 00587 00588 //! @brief Helper macro 00589 //! 
@ingroup core_hal_intrin_impl 00590 #define OPENCV_HAL_IMPL_CMP_OP(cmp_op) \ 00591 template<typename _Tp, int n> \ 00592 inline v_reg<_Tp, n> operator cmp_op(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \ 00593 { \ 00594 typedef typename V_TypeTraits<_Tp>::int_type itype; \ 00595 v_reg<_Tp, n> c; \ 00596 for( int i = 0; i < n; i++ ) \ 00597 c.s[i] = V_TypeTraits<_Tp>::reinterpret_from_int((itype)-(int)(a.s[i] cmp_op b.s[i])); \ 00598 return c; \ 00599 } 00600 00601 /** @brief Less-than comparison 00602 00603 For all types except 64-bit integer values. */ 00604 OPENCV_HAL_IMPL_CMP_OP(<) 00605 00606 /** @brief Greater-than comparison 00607 00608 For all types except 64-bit integer values. */ 00609 OPENCV_HAL_IMPL_CMP_OP(>) 00610 00611 /** @brief Less-than or equal comparison 00612 00613 For all types except 64-bit integer values. */ 00614 OPENCV_HAL_IMPL_CMP_OP(<=) 00615 00616 /** @brief Greater-than or equal comparison 00617 00618 For all types except 64-bit integer values. */ 00619 OPENCV_HAL_IMPL_CMP_OP(>=) 00620 00621 /** @brief Equal comparison 00622 00623 For all types except 64-bit integer values. */ 00624 OPENCV_HAL_IMPL_CMP_OP(==) 00625 00626 /** @brief Not equal comparison 00627 00628 For all types except 64-bit integer values. */ 00629 OPENCV_HAL_IMPL_CMP_OP(!=) 00630 00631 //! @brief Helper macro 00632 //! @ingroup core_hal_intrin_impl 00633 #define OPENCV_HAL_IMPL_ADD_SUB_OP(func, bin_op, cast_op, _Tp2) \ 00634 template<typename _Tp, int n> \ 00635 inline v_reg<_Tp2, n> func(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \ 00636 { \ 00637 typedef _Tp2 rtype; \ 00638 v_reg<rtype, n> c; \ 00639 for( int i = 0; i < n; i++ ) \ 00640 c.s[i] = cast_op(a.s[i] bin_op b.s[i]); \ 00641 return c; \ 00642 } 00643 00644 /** @brief Add values without saturation 00645 00646 For 8- and 16-bit integer values. 
*/ 00647 OPENCV_HAL_IMPL_ADD_SUB_OP(v_add_wrap, +, (_Tp), _Tp) 00648 00649 /** @brief Subtract values without saturation 00650 00651 For 8- and 16-bit integer values. */ 00652 OPENCV_HAL_IMPL_ADD_SUB_OP(v_sub_wrap, -, (_Tp), _Tp) 00653 00654 //! @cond IGNORED 00655 template<typename T> inline T _absdiff(T a, T b) 00656 { 00657 return a > b ? a - b : b - a; 00658 } 00659 //! @endcond 00660 00661 /** @brief Absolute difference 00662 00663 Returns \f$ |a - b| \f$ converted to corresponding unsigned type. 00664 Example: 00665 @code{.cpp} 00666 v_int32x4 a, b; // {1, 2, 3, 4} and {4, 3, 2, 1} 00667 v_uint32x4 c = v_absdiff(a, b); // result is {3, 1, 1, 3} 00668 @endcode 00669 For 8-, 16-, 32-bit integer source types. */ 00670 template<typename _Tp, int n> 00671 inline v_reg<typename V_TypeTraits<_Tp>::abs_type, n> v_absdiff(const v_reg<_Tp, n>& a, const v_reg<_Tp, n> & b) 00672 { 00673 typedef typename V_TypeTraits<_Tp>::abs_type rtype; 00674 v_reg<rtype, n> c; 00675 const rtype mask = std::numeric_limits<_Tp>::is_signed ? (1 << (sizeof(rtype)*8 - 1)) : 0; 00676 for( int i = 0; i < n; i++ ) 00677 { 00678 rtype ua = a.s[i] ^ mask; 00679 rtype ub = b.s[i] ^ mask; 00680 c.s[i] = _absdiff(ua, ub); 00681 } 00682 return c; 00683 } 00684 00685 /** @overload 00686 00687 For 32-bit floating point values */ 00688 inline v_float32x4 v_absdiff(const v_float32x4& a, const v_float32x4& b) 00689 { 00690 v_float32x4 c; 00691 for( int i = 0; i < c.nlanes; i++ ) 00692 c.s[i] = _absdiff(a.s[i], b.s[i]); 00693 return c; 00694 } 00695 00696 /** @overload 00697 00698 For 64-bit floating point values */ 00699 inline v_float64x2 v_absdiff(const v_float64x2& a, const v_float64x2& b) 00700 { 00701 v_float64x2 c; 00702 for( int i = 0; i < c.nlanes; i++ ) 00703 c.s[i] = _absdiff(a.s[i], b.s[i]); 00704 return c; 00705 } 00706 00707 /** @brief Inversed square root 00708 00709 Returns \f$ 1/sqrt(a) \f$ 00710 For floating point types only. 
*/ 00711 template<typename _Tp, int n> 00712 inline v_reg<_Tp, n> v_invsqrt(const v_reg<_Tp, n>& a) 00713 { 00714 v_reg<_Tp, n> c; 00715 for( int i = 0; i < n; i++ ) 00716 c.s[i] = 1.f/std::sqrt(a.s[i]); 00717 return c; 00718 } 00719 00720 /** @brief Magnitude 00721 00722 Returns \f$ sqrt(a^2 + b^2) \f$ 00723 For floating point types only. */ 00724 template<typename _Tp, int n> 00725 inline v_reg<_Tp, n> v_magnitude(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) 00726 { 00727 v_reg<_Tp, n> c; 00728 for( int i = 0; i < n; i++ ) 00729 c.s[i] = std::sqrt(a.s[i]*a.s[i] + b.s[i]*b.s[i]); 00730 return c; 00731 } 00732 00733 /** @brief Square of the magnitude 00734 00735 Returns \f$ a^2 + b^2 \f$ 00736 For floating point types only. */ 00737 template<typename _Tp, int n> 00738 inline v_reg<_Tp, n> v_sqr_magnitude(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) 00739 { 00740 v_reg<_Tp, n> c; 00741 for( int i = 0; i < n; i++ ) 00742 c.s[i] = a.s[i]*a.s[i] + b.s[i]*b.s[i]; 00743 return c; 00744 } 00745 00746 /** @brief Multiply and add 00747 00748 Returns \f$ a*b + c \f$ 00749 For floating point types only. */ 00750 template<typename _Tp, int n> 00751 inline v_reg<_Tp, n> v_muladd(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b, 00752 const v_reg<_Tp, n>& c) 00753 { 00754 v_reg<_Tp, n> d; 00755 for( int i = 0; i < n; i++ ) 00756 d.s[i] = a.s[i]*b.s[i] + c.s[i]; 00757 return d; 00758 } 00759 00760 /** @brief Dot product of elements 00761 00762 Multiply values in two registers and sum adjacent result pairs. 00763 Scheme: 00764 @code 00765 {A1 A2 ...} // 16-bit 00766 x {B1 B2 ...} // 16-bit 00767 ------------- 00768 {A1B1+A2B2 ...} // 32-bit 00769 @endcode 00770 Implemented only for 16-bit signed source type (v_int16x8). 
00771 */ 00772 template<typename _Tp, int n> inline v_reg<typename V_TypeTraits<_Tp>::w_type, n/2> 00773 v_dotprod(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) 00774 { 00775 typedef typename V_TypeTraits<_Tp>::w_type w_type; 00776 v_reg<w_type, n/2> c; 00777 for( int i = 0; i < (n/2); i++ ) 00778 c.s[i] = (w_type)a.s[i*2]*b.s[i*2] + (w_type)a.s[i*2+1]*b.s[i*2+1]; 00779 return c; 00780 } 00781 00782 /** @brief Multiply and expand 00783 00784 Multiply values two registers and store results in two registers with wider pack type. 00785 Scheme: 00786 @code 00787 {A B C D} // 32-bit 00788 x {E F G H} // 32-bit 00789 --------------- 00790 {AE BF} // 64-bit 00791 {CG DH} // 64-bit 00792 @endcode 00793 Example: 00794 @code{.cpp} 00795 v_uint32x4 a, b; // {1,2,3,4} and {2,2,2,2} 00796 v_uint64x2 c, d; // results 00797 v_mul_expand(a, b, c, d); // c, d = {2,4}, {6, 8} 00798 @endcode 00799 Implemented only for 16- and unsigned 32-bit source types (v_int16x8, v_uint16x8, v_uint32x4). 00800 */ 00801 template<typename _Tp, int n> inline void v_mul_expand(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b, 00802 v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>& c, 00803 v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>& d) 00804 { 00805 typedef typename V_TypeTraits<_Tp>::w_type w_type; 00806 for( int i = 0; i < (n/2); i++ ) 00807 { 00808 c.s[i] = (w_type)a.s[i]*b.s[i]; 00809 d.s[i] = (w_type)a.s[i+(n/2)]*b.s[i+(n/2)]; 00810 } 00811 } 00812 00813 //! @cond IGNORED 00814 template<typename _Tp, int n> inline void v_hsum(const v_reg<_Tp, n>& a, 00815 v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>& c) 00816 { 00817 typedef typename V_TypeTraits<_Tp>::w_type w_type; 00818 for( int i = 0; i < (n/2); i++ ) 00819 { 00820 c.s[i] = (w_type)a.s[i*2] + a.s[i*2+1]; 00821 } 00822 } 00823 //! @endcond 00824 00825 //! @brief Helper macro 00826 //! 
@ingroup core_hal_intrin_impl 00827 #define OPENCV_HAL_IMPL_SHIFT_OP(shift_op) \ 00828 template<typename _Tp, int n> inline v_reg<_Tp, n> operator shift_op(const v_reg<_Tp, n>& a, int imm) \ 00829 { \ 00830 v_reg<_Tp, n> c; \ 00831 for( int i = 0; i < n; i++ ) \ 00832 c.s[i] = (_Tp)(a.s[i] shift_op imm); \ 00833 return c; \ 00834 } 00835 00836 /** @brief Bitwise shift left 00837 00838 For 16-, 32- and 64-bit integer values. */ 00839 OPENCV_HAL_IMPL_SHIFT_OP(<<) 00840 00841 /** @brief Bitwise shift right 00842 00843 For 16-, 32- and 64-bit integer values. */ 00844 OPENCV_HAL_IMPL_SHIFT_OP(>>) 00845 00846 /** @brief Sum packed values 00847 00848 Scheme: 00849 @code 00850 {A1 A2 A3 ...} => sum{A1,A2,A3,...} 00851 @endcode 00852 For 32-bit integer and 32-bit floating point types.*/ 00853 template<typename _Tp, int n> inline typename V_TypeTraits<_Tp>::sum_type v_reduce_sum(const v_reg<_Tp, n>& a) 00854 { 00855 typename V_TypeTraits<_Tp>::sum_type c = a.s[0]; 00856 for( int i = 1; i < n; i++ ) 00857 c += a.s[i]; 00858 return c; 00859 } 00860 00861 /** @brief Get negative values mask 00862 00863 Returned value is a bit mask with bits set to 1 on places corresponding to negative packed values indexes. 00864 Example: 00865 @code{.cpp} 00866 v_int32x4 r; // set to {-1, -1, 1, 1} 00867 int mask = v_signmask(r); // mask = 3 <== 00000000 00000000 00000000 00000011 00868 @endcode 00869 For all types except 64-bit. */ 00870 template<typename _Tp, int n> inline int v_signmask(const v_reg<_Tp, n>& a) 00871 { 00872 int mask = 0; 00873 for( int i = 0; i < n; i++ ) 00874 mask |= (V_TypeTraits<_Tp>::reinterpret_int(a.s[i]) < 0) << i; 00875 return mask; 00876 } 00877 00878 /** @brief Check if all packed values are less than zero 00879 00880 Unsigned values will be casted to signed: `uchar 254 => char -2`. 00881 For all types except 64-bit. 
*/ 00882 template<typename _Tp, int n> inline bool v_check_all(const v_reg<_Tp, n>& a) 00883 { 00884 for( int i = 0; i < n; i++ ) 00885 if( V_TypeTraits<_Tp>::reinterpret_int(a.s[i]) >= 0 ) 00886 return false; 00887 return true; 00888 } 00889 00890 /** @brief Check if any of packed values is less than zero 00891 00892 Unsigned values will be casted to signed: `uchar 254 => char -2`. 00893 For all types except 64-bit. */ 00894 template<typename _Tp, int n> inline bool v_check_any(const v_reg<_Tp, n>& a) 00895 { 00896 for( int i = 0; i < n; i++ ) 00897 if( V_TypeTraits<_Tp>::reinterpret_int(a.s[i]) < 0 ) 00898 return true; 00899 return false; 00900 } 00901 00902 /** @brief Bitwise select 00903 00904 Return value will be built by combining values a and b using the following scheme: 00905 If the i-th bit in _mask_ is 1 00906 select i-th bit from _a_ 00907 else 00908 select i-th bit from _b_ */ 00909 template<typename _Tp, int n> inline v_reg<_Tp, n> v_select(const v_reg<_Tp, n>& mask, 00910 const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) 00911 { 00912 typedef V_TypeTraits<_Tp> Traits; 00913 typedef typename Traits::int_type int_type; 00914 v_reg<_Tp, n> c; 00915 for( int i = 0; i < n; i++ ) 00916 { 00917 int_type m = Traits::reinterpret_int(mask.s[i]); 00918 c.s[i] = Traits::reinterpret_from_int((Traits::reinterpret_int(a.s[i]) & m) 00919 | (Traits::reinterpret_int(b.s[i]) & ~m)); 00920 } 00921 return c; 00922 } 00923 00924 /** @brief Expand values to the wider pack type 00925 00926 Copy contents of register to two registers with 2x wider pack type. 
00927 Scheme: 00928 @code 00929 int32x4 int64x2 int64x2 00930 {A B C D} ==> {A B} , {C D} 00931 @endcode */ 00932 template<typename _Tp, int n> inline void v_expand(const v_reg<_Tp, n>& a, 00933 v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>& b0, 00934 v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>& b1) 00935 { 00936 for( int i = 0; i < (n/2); i++ ) 00937 { 00938 b0.s[i] = a.s[i]; 00939 b1.s[i] = a.s[i+(n/2)]; 00940 } 00941 } 00942 00943 //! @cond IGNORED 00944 template<typename _Tp, int n> inline v_reg<typename V_TypeTraits<_Tp>::int_type, n> 00945 v_reinterpret_as_int(const v_reg<_Tp, n>& a) 00946 { 00947 v_reg<typename V_TypeTraits<_Tp>::int_type, n> c; 00948 for( int i = 0; i < n; i++ ) 00949 c.s[i] = V_TypeTraits<_Tp>::reinterpret_int(a.s[i]); 00950 return c; 00951 } 00952 00953 template<typename _Tp, int n> inline v_reg<typename V_TypeTraits<_Tp>::uint_type, n> 00954 v_reinterpret_as_uint(const v_reg<_Tp, n>& a) 00955 { 00956 v_reg<typename V_TypeTraits<_Tp>::uint_type, n> c; 00957 for( int i = 0; i < n; i++ ) 00958 c.s[i] = V_TypeTraits<_Tp>::reinterpret_uint(a.s[i]); 00959 return c; 00960 } 00961 //! @endcond 00962 00963 /** @brief Interleave two vectors 00964 00965 Scheme: 00966 @code 00967 {A1 A2 A3 A4} 00968 {B1 B2 B3 B4} 00969 --------------- 00970 {A1 B1 A2 B2} and {A3 B3 A4 B4} 00971 @endcode 00972 For all types except 64-bit. 
00973 */ 00974 template<typename _Tp, int n> inline void v_zip( const v_reg<_Tp, n>& a0, const v_reg<_Tp, n>& a1, 00975 v_reg<_Tp, n>& b0, v_reg<_Tp, n>& b1 ) 00976 { 00977 int i; 00978 for( i = 0; i < n/2; i++ ) 00979 { 00980 b0.s[i*2] = a0.s[i]; 00981 b0.s[i*2+1] = a1.s[i]; 00982 } 00983 for( ; i < n; i++ ) 00984 { 00985 b1.s[i*2-n] = a0.s[i]; 00986 b1.s[i*2-n+1] = a1.s[i]; 00987 } 00988 } 00989 00990 /** @brief Load register contents from memory 00991 00992 @param ptr pointer to memory block with data 00993 @return register object 00994 00995 @note Returned type will be detected from passed pointer type, for example uchar ==> cv::v_uint8x16, int ==> cv::v_int32x4, etc. 00996 */ 00997 template<typename _Tp> 00998 inline v_reg<_Tp, V_SIMD128Traits<_Tp>::nlanes> v_load(const _Tp* ptr) 00999 { 01000 return v_reg<_Tp, V_SIMD128Traits<_Tp>::nlanes>(ptr); 01001 } 01002 01003 /** @brief Load register contents from memory (aligned) 01004 01005 similar to cv::v_load, but source memory block should be aligned (to 16-byte boundary) 01006 */ 01007 template<typename _Tp> 01008 inline v_reg<_Tp, V_SIMD128Traits<_Tp>::nlanes> v_load_aligned(const _Tp* ptr) 01009 { 01010 return v_reg<_Tp, V_SIMD128Traits<_Tp>::nlanes>(ptr); 01011 } 01012 01013 /** @brief Load register contents from two memory blocks 01014 01015 @param loptr memory block containing data for first half (0..n/2) 01016 @param hiptr memory block containing data for second half (n/2..n) 01017 01018 @code{.cpp} 01019 int lo[2] = { 1, 2 }, hi[2] = { 3, 4 }; 01020 v_int32x4 r = v_load_halves(lo, hi); 01021 @endcode 01022 */ 01023 template<typename _Tp> 01024 inline v_reg<_Tp, V_SIMD128Traits<_Tp>::nlanes> v_load_halves(const _Tp* loptr, const _Tp* hiptr) 01025 { 01026 v_reg<_Tp, V_SIMD128Traits<_Tp>::nlanes> c; 01027 for( int i = 0; i < c.nlanes/2; i++ ) 01028 { 01029 c.s[i] = loptr[i]; 01030 c.s[i+c.nlanes/2] = hiptr[i]; 01031 } 01032 return c; 01033 } 01034 01035 /** @brief Load register contents from memory with 
double expand 01036 01037 Same as cv::v_load, but result pack type will be 2x wider than memory type. 01038 01039 @code{.cpp} 01040 short buf[4] = {1, 2, 3, 4}; // type is int16 01041 v_int32x4 r = v_load_expand(buf); // r = {1, 2, 3, 4} - type is int32 01042 @endcode 01043 For 8-, 16-, 32-bit integer source types. */ 01044 template<typename _Tp> 01045 inline v_reg<typename V_TypeTraits<_Tp>::w_type, V_SIMD128Traits<_Tp>::nlanes / 2> 01046 v_load_expand(const _Tp* ptr) 01047 { 01048 typedef typename V_TypeTraits<_Tp>::w_type w_type; 01049 v_reg<w_type, V_SIMD128Traits<w_type>::nlanes> c; 01050 for( int i = 0; i < c.nlanes; i++ ) 01051 { 01052 c.s[i] = ptr[i]; 01053 } 01054 return c; 01055 } 01056 01057 /** @brief Load register contents from memory with quad expand 01058 01059 Same as cv::v_load_expand, but result type is 4 times wider than source. 01060 @code{.cpp} 01061 char buf[4] = {1, 2, 3, 4}; // type is int8 01062 v_int32x4 r = v_load_q(buf); // r = {1, 2, 3, 4} - type is int32 01063 @endcode 01064 For 8-bit integer source types. */ 01065 template<typename _Tp> 01066 inline v_reg<typename V_TypeTraits<_Tp>::q_type, V_SIMD128Traits<_Tp>::nlanes / 4> 01067 v_load_expand_q(const _Tp* ptr) 01068 { 01069 typedef typename V_TypeTraits<_Tp>::q_type q_type; 01070 v_reg<q_type, V_SIMD128Traits<q_type>::nlanes> c; 01071 for( int i = 0; i < c.nlanes; i++ ) 01072 { 01073 c.s[i] = ptr[i]; 01074 } 01075 return c; 01076 } 01077 01078 /** @brief Load and deinterleave (4 channels) 01079 01080 Load data from memory deinterleave and store to 4 registers. 01081 Scheme: 01082 @code 01083 {A1 B1 C1 D1 A2 B2 C2 D2 ...} ==> {A1 A2 ...}, {B1 B2 ...}, {C1 C2 ...}, {D1 D2 ...} 01084 @endcode 01085 For all types except 64-bit. 
*/ 01086 template<typename _Tp, int n> inline void v_load_deinterleave(const _Tp* ptr, v_reg<_Tp, n>& a, 01087 v_reg<_Tp, n>& b, v_reg<_Tp, n>& c) 01088 { 01089 int i, i3; 01090 for( i = i3 = 0; i < n; i++, i3 += 3 ) 01091 { 01092 a.s[i] = ptr[i3]; 01093 b.s[i] = ptr[i3+1]; 01094 c.s[i] = ptr[i3+2]; 01095 } 01096 } 01097 01098 /** @brief Load and deinterleave (3 channels) 01099 01100 Load data from memory deinterleave and store to 3 registers. 01101 Scheme: 01102 @code 01103 {A1 B1 C1 A2 B2 C2 ...} ==> {A1 A2 ...}, {B1 B2 ...}, {C1 C2 ...} 01104 @endcode 01105 For all types except 64-bit. */ 01106 template<typename _Tp, int n> 01107 inline void v_load_deinterleave(const _Tp* ptr, v_reg<_Tp, n>& a, 01108 v_reg<_Tp, n>& b, v_reg<_Tp, n>& c, 01109 v_reg<_Tp, n>& d) 01110 { 01111 int i, i4; 01112 for( i = i4 = 0; i < n; i++, i4 += 4 ) 01113 { 01114 a.s[i] = ptr[i4]; 01115 b.s[i] = ptr[i4+1]; 01116 c.s[i] = ptr[i4+2]; 01117 d.s[i] = ptr[i4+3]; 01118 } 01119 } 01120 01121 /** @brief Interleave and store (3 channels) 01122 01123 Interleave and store data from 3 registers to memory. 01124 Scheme: 01125 @code 01126 {A1 A2 ...}, {B1 B2 ...}, {C1 C2 ...}, {D1 D2 ...} ==> {A1 B1 C1 D1 A2 B2 C2 D2 ...} 01127 @endcode 01128 For all types except 64-bit. */ 01129 template<typename _Tp, int n> 01130 inline void v_store_interleave( _Tp* ptr, const v_reg<_Tp, n>& a, 01131 const v_reg<_Tp, n>& b, const v_reg<_Tp, n>& c) 01132 { 01133 int i, i3; 01134 for( i = i3 = 0; i < n; i++, i3 += 3 ) 01135 { 01136 ptr[i3] = a.s[i]; 01137 ptr[i3+1] = b.s[i]; 01138 ptr[i3+2] = c.s[i]; 01139 } 01140 } 01141 01142 /** @brief Interleave and store (4 channels) 01143 01144 Interleave and store data from 4 registers to memory. 01145 Scheme: 01146 @code 01147 {A1 A2 ...}, {B1 B2 ...}, {C1 C2 ...}, {D1 D2 ...} ==> {A1 B1 C1 D1 A2 B2 C2 D2 ...} 01148 @endcode 01149 For all types except 64-bit. 
*/ 01150 template<typename _Tp, int n> inline void v_store_interleave( _Tp* ptr, const v_reg<_Tp, n>& a, 01151 const v_reg<_Tp, n>& b, const v_reg<_Tp, n>& c, 01152 const v_reg<_Tp, n>& d) 01153 { 01154 int i, i4; 01155 for( i = i4 = 0; i < n; i++, i4 += 4 ) 01156 { 01157 ptr[i4] = a.s[i]; 01158 ptr[i4+1] = b.s[i]; 01159 ptr[i4+2] = c.s[i]; 01160 ptr[i4+3] = d.s[i]; 01161 } 01162 } 01163 01164 /** @brief Store data to memory 01165 01166 Store register contents to memory. 01167 Scheme: 01168 @code 01169 REG {A B C D} ==> MEM {A B C D} 01170 @endcode 01171 Pointer can be unaligned. */ 01172 template<typename _Tp, int n> 01173 inline void v_store(_Tp* ptr, const v_reg<_Tp, n>& a) 01174 { 01175 for( int i = 0; i < n; i++ ) 01176 ptr[i] = a.s[i]; 01177 } 01178 01179 /** @brief Store data to memory (lower half) 01180 01181 Store lower half of register contents to memory. 01182 Scheme: 01183 @code 01184 REG {A B C D} ==> MEM {A B} 01185 @endcode */ 01186 template<typename _Tp, int n> 01187 inline void v_store_low(_Tp* ptr, const v_reg<_Tp, n>& a) 01188 { 01189 for( int i = 0; i < (n/2); i++ ) 01190 ptr[i] = a.s[i]; 01191 } 01192 01193 /** @brief Store data to memory (higher half) 01194 01195 Store higher half of register contents to memory. 01196 Scheme: 01197 @code 01198 REG {A B C D} ==> MEM {C D} 01199 @endcode */ 01200 template<typename _Tp, int n> 01201 inline void v_store_high(_Tp* ptr, const v_reg<_Tp, n>& a) 01202 { 01203 for( int i = 0; i < (n/2); i++ ) 01204 ptr[i] = a.s[i+(n/2)]; 01205 } 01206 01207 /** @brief Store data to memory (aligned) 01208 01209 Store register contents to memory. 01210 Scheme: 01211 @code 01212 REG {A B C D} ==> MEM {A B C D} 01213 @endcode 01214 Pointer __should__ be aligned by 16-byte boundary. 
*/ 01215 template<typename _Tp, int n> 01216 inline void v_store_aligned(_Tp* ptr, const v_reg<_Tp, n>& a) 01217 { 01218 for( int i = 0; i < n; i++ ) 01219 ptr[i] = a.s[i]; 01220 } 01221 01222 /** @brief Combine vector from first elements of two vectors 01223 01224 Scheme: 01225 @code 01226 {A1 A2 A3 A4} 01227 {B1 B2 B3 B4} 01228 --------------- 01229 {A1 A2 B1 B2} 01230 @endcode 01231 For all types except 64-bit. */ 01232 template<typename _Tp, int n> 01233 inline v_reg<_Tp, n> v_combine_low(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) 01234 { 01235 v_reg<_Tp, n> c; 01236 for( int i = 0; i < (n/2); i++ ) 01237 { 01238 c.s[i] = a.s[i]; 01239 c.s[i+(n/2)] = b.s[i]; 01240 } 01241 return c; 01242 } 01243 01244 /** @brief Combine vector from last elements of two vectors 01245 01246 Scheme: 01247 @code 01248 {A1 A2 A3 A4} 01249 {B1 B2 B3 B4} 01250 --------------- 01251 {A3 A4 B3 B4} 01252 @endcode 01253 For all types except 64-bit. */ 01254 template<typename _Tp, int n> 01255 inline v_reg<_Tp, n> v_combine_high(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) 01256 { 01257 v_reg<_Tp, n> c; 01258 for( int i = 0; i < (n/2); i++ ) 01259 { 01260 c.s[i] = a.s[i+(n/2)]; 01261 c.s[i+(n/2)] = b.s[i+(n/2)]; 01262 } 01263 return c; 01264 } 01265 01266 /** @brief Combine two vectors from lower and higher parts of two other vectors 01267 01268 @code{.cpp} 01269 low = cv::v_combine_low(a, b); 01270 high = cv::v_combine_high(a, b); 01271 @endcode */ 01272 template<typename _Tp, int n> 01273 inline void v_recombine(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b, 01274 v_reg<_Tp, n>& low, v_reg<_Tp, n>& high) 01275 { 01276 for( int i = 0; i < (n/2); i++ ) 01277 { 01278 low.s[i] = a.s[i]; 01279 low.s[i+(n/2)] = b.s[i]; 01280 high.s[i] = a.s[i+(n/2)]; 01281 high.s[i+(n/2)] = b.s[i+(n/2)]; 01282 } 01283 } 01284 01285 /** @brief Vector extract 01286 01287 Scheme: 01288 @code 01289 {A1 A2 A3 A4} 01290 {B1 B2 B3 B4} 01291 ======================== 01292 shift = 1 {A2 A3 A4 B1} 01293 shift = 
2 {A3 A4 B1 B2} 01294 shift = 3 {A4 B1 B2 B3} 01295 @endcode 01296 Restriction: 0 <= shift < nlanes 01297 01298 Usage: 01299 @code 01300 v_int32x4 a, b, c; 01301 c = v_extract<2>(a, b); 01302 @endcode 01303 For integer types only. */ 01304 template<int s, typename _Tp, int n> 01305 inline v_reg<_Tp, n> v_extract(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) 01306 { 01307 v_reg<_Tp, n> r; 01308 const int shift = n - s; 01309 int i = 0; 01310 for (; i < shift; ++i) 01311 r.s[i] = a.s[i+s]; 01312 for (; i < n; ++i) 01313 r.s[i] = b.s[i-shift]; 01314 return r; 01315 } 01316 01317 /** @brief Round 01318 01319 Rounds each value. Input type is float vector ==> output type is int vector.*/ 01320 template<int n> inline v_reg<int, n> v_round(const v_reg<float, n>& a) 01321 { 01322 v_reg<int, n> c; 01323 for( int i = 0; i < n; i++ ) 01324 c.s[i] = cvRound(a.s[i]); 01325 return c; 01326 } 01327 01328 /** @brief Floor 01329 01330 Floor each value. Input type is float vector ==> output type is int vector.*/ 01331 template<int n> inline v_reg<int, n> v_floor(const v_reg<float, n>& a) 01332 { 01333 v_reg<int, n> c; 01334 for( int i = 0; i < n; i++ ) 01335 c.s[i] = cvFloor(a.s[i]); 01336 return c; 01337 } 01338 01339 /** @brief Ceil 01340 01341 Ceil each value. Input type is float vector ==> output type is int vector.*/ 01342 template<int n> inline v_reg<int, n> v_ceil(const v_reg<float, n>& a) 01343 { 01344 v_reg<int, n> c; 01345 for( int i = 0; i < n; i++ ) 01346 c.s[i] = cvCeil(a.s[i]); 01347 return c; 01348 } 01349 01350 /** @brief Trunc 01351 01352 Truncate each value. 
Input type is float vector ==> output type is int vector.*/ 01353 template<int n> inline v_reg<int, n> v_trunc(const v_reg<float, n>& a) 01354 { 01355 v_reg<int, n> c; 01356 for( int i = 0; i < n; i++ ) 01357 c.s[i] = (int)(a.s[i]); 01358 return c; 01359 } 01360 01361 /** @overload */ 01362 template<int n> inline v_reg<int, n*2> v_round(const v_reg<double, n>& a) 01363 { 01364 v_reg<int, n*2> c; 01365 for( int i = 0; i < n; i++ ) 01366 { 01367 c.s[i] = cvRound(a.s[i]); 01368 c.s[i+n] = 0; 01369 } 01370 return c; 01371 } 01372 01373 /** @overload */ 01374 template<int n> inline v_reg<int, n*2> v_floor(const v_reg<double, n>& a) 01375 { 01376 v_reg<int, n> c; 01377 for( int i = 0; i < n; i++ ) 01378 { 01379 c.s[i] = cvFloor(a.s[i]); 01380 c.s[i+n] = 0; 01381 } 01382 return c; 01383 } 01384 01385 /** @overload */ 01386 template<int n> inline v_reg<int, n*2> v_ceil(const v_reg<double, n>& a) 01387 { 01388 v_reg<int, n> c; 01389 for( int i = 0; i < n; i++ ) 01390 { 01391 c.s[i] = cvCeil(a.s[i]); 01392 c.s[i+n] = 0; 01393 } 01394 return c; 01395 } 01396 01397 /** @overload */ 01398 template<int n> inline v_reg<int, n*2> v_trunc(const v_reg<double, n>& a) 01399 { 01400 v_reg<int, n> c; 01401 for( int i = 0; i < n; i++ ) 01402 { 01403 c.s[i] = cvCeil(a.s[i]); 01404 c.s[i+n] = 0; 01405 } 01406 return c; 01407 } 01408 01409 /** @brief Convert to float 01410 01411 Supported input type is cv::v_int32x4. */ 01412 template<int n> inline v_reg<float, n> v_cvt_f32(const v_reg<int, n>& a) 01413 { 01414 v_reg<float, n> c; 01415 for( int i = 0; i < n; i++ ) 01416 c.s[i] = (float)a.s[i]; 01417 return c; 01418 } 01419 01420 /** @brief Convert to double 01421 01422 Supported input type is cv::v_int32x4. 
*/ 01423 template<int n> inline v_reg<double, n> v_cvt_f64(const v_reg<int, n*2>& a) 01424 { 01425 v_reg<double, n> c; 01426 for( int i = 0; i < n; i++ ) 01427 c.s[i] = (double)a.s[i]; 01428 return c; 01429 } 01430 01431 /** @brief Convert to double 01432 01433 Supported input type is cv::v_float32x4. */ 01434 template<int n> inline v_reg<double, n> v_cvt_f64(const v_reg<float, n*2>& a) 01435 { 01436 v_reg<double, n> c; 01437 for( int i = 0; i < n; i++ ) 01438 c.s[i] = (double)a.s[i]; 01439 return c; 01440 } 01441 01442 /** @brief Transpose 4x4 matrix 01443 01444 Scheme: 01445 @code 01446 a0 {A1 A2 A3 A4} 01447 a1 {B1 B2 B3 B4} 01448 a2 {C1 C2 C3 C4} 01449 a3 {D1 D2 D3 D4} 01450 =============== 01451 b0 {A1 B1 C1 D1} 01452 b1 {A2 B2 C2 D2} 01453 b2 {A3 B3 C3 D3} 01454 b3 {A4 B4 C4 D4} 01455 @endcode 01456 */ 01457 template<typename _Tp> 01458 inline void v_transpose4x4( v_reg<_Tp, 4>& a0, const v_reg<_Tp, 4>& a1, 01459 const v_reg<_Tp, 4>& a2, const v_reg<_Tp, 4>& a3, 01460 v_reg<_Tp, 4>& b0, v_reg<_Tp, 4>& b1, 01461 v_reg<_Tp, 4>& b2, v_reg<_Tp, 4>& b3 ) 01462 { 01463 b0 = v_reg<_Tp, 4>(a0.s[0], a1.s[0], a2.s[0], a3.s[0]); 01464 b1 = v_reg<_Tp, 4>(a0.s[1], a1.s[1], a2.s[1], a3.s[1]); 01465 b2 = v_reg<_Tp, 4>(a0.s[2], a1.s[2], a2.s[2], a3.s[2]); 01466 b3 = v_reg<_Tp, 4>(a0.s[3], a1.s[3], a2.s[3], a3.s[3]); 01467 } 01468 01469 //! @brief Helper macro 01470 //! @ingroup core_hal_intrin_impl 01471 #define OPENCV_HAL_IMPL_C_INIT_ZERO(_Tpvec, _Tp, suffix) \ 01472 inline _Tpvec v_setzero_##suffix() { return _Tpvec::zero(); } 01473 01474 //! @name Init with zero 01475 //! @{ 01476 //! 
@brief Create new vector with zero elements 01477 OPENCV_HAL_IMPL_C_INIT_ZERO(v_uint8x16, uchar, u8) 01478 OPENCV_HAL_IMPL_C_INIT_ZERO(v_int8x16, schar, s8) 01479 OPENCV_HAL_IMPL_C_INIT_ZERO(v_uint16x8, ushort, u16) 01480 OPENCV_HAL_IMPL_C_INIT_ZERO(v_int16x8, short, s16) 01481 OPENCV_HAL_IMPL_C_INIT_ZERO(v_uint32x4, unsigned, u32) 01482 OPENCV_HAL_IMPL_C_INIT_ZERO(v_int32x4, int, s32) 01483 OPENCV_HAL_IMPL_C_INIT_ZERO(v_float32x4, float, f32) 01484 OPENCV_HAL_IMPL_C_INIT_ZERO(v_float64x2, double, f64) 01485 OPENCV_HAL_IMPL_C_INIT_ZERO(v_uint64x2, uint64, u64) 01486 OPENCV_HAL_IMPL_C_INIT_ZERO(v_int64x2, int64, s64) 01487 //! @} 01488 01489 //! @brief Helper macro 01490 //! @ingroup core_hal_intrin_impl 01491 #define OPENCV_HAL_IMPL_C_INIT_VAL(_Tpvec, _Tp, suffix) \ 01492 inline _Tpvec v_setall_##suffix(_Tp val) { return _Tpvec::all(val); } 01493 01494 //! @name Init with value 01495 //! @{ 01496 //! @brief Create new vector with elements set to a specific value 01497 OPENCV_HAL_IMPL_C_INIT_VAL(v_uint8x16, uchar, u8) 01498 OPENCV_HAL_IMPL_C_INIT_VAL(v_int8x16, schar, s8) 01499 OPENCV_HAL_IMPL_C_INIT_VAL(v_uint16x8, ushort, u16) 01500 OPENCV_HAL_IMPL_C_INIT_VAL(v_int16x8, short, s16) 01501 OPENCV_HAL_IMPL_C_INIT_VAL(v_uint32x4, unsigned, u32) 01502 OPENCV_HAL_IMPL_C_INIT_VAL(v_int32x4, int, s32) 01503 OPENCV_HAL_IMPL_C_INIT_VAL(v_float32x4, float, f32) 01504 OPENCV_HAL_IMPL_C_INIT_VAL(v_float64x2, double, f64) 01505 OPENCV_HAL_IMPL_C_INIT_VAL(v_uint64x2, uint64, u64) 01506 OPENCV_HAL_IMPL_C_INIT_VAL(v_int64x2, int64, s64) 01507 //! @} 01508 01509 //! @brief Helper macro 01510 //! @ingroup core_hal_intrin_impl 01511 #define OPENCV_HAL_IMPL_C_REINTERPRET(_Tpvec, _Tp, suffix) \ 01512 template<typename _Tp0, int n0> inline _Tpvec \ 01513 v_reinterpret_as_##suffix(const v_reg<_Tp0, n0>& a) \ 01514 { return a.template reinterpret_as<_Tp, _Tpvec::nlanes>(); } 01515 01516 //! @name Reinterpret 01517 //! @{ 01518 //! 
@brief Convert vector to different type without modifying underlying data. 01519 OPENCV_HAL_IMPL_C_REINTERPRET(v_uint8x16, uchar, u8) 01520 OPENCV_HAL_IMPL_C_REINTERPRET(v_int8x16, schar, s8) 01521 OPENCV_HAL_IMPL_C_REINTERPRET(v_uint16x8, ushort, u16) 01522 OPENCV_HAL_IMPL_C_REINTERPRET(v_int16x8, short, s16) 01523 OPENCV_HAL_IMPL_C_REINTERPRET(v_uint32x4, unsigned, u32) 01524 OPENCV_HAL_IMPL_C_REINTERPRET(v_int32x4, int, s32) 01525 OPENCV_HAL_IMPL_C_REINTERPRET(v_float32x4, float, f32) 01526 OPENCV_HAL_IMPL_C_REINTERPRET(v_float64x2, double, f64) 01527 OPENCV_HAL_IMPL_C_REINTERPRET(v_uint64x2, uint64, u64) 01528 OPENCV_HAL_IMPL_C_REINTERPRET(v_int64x2, int64, s64) 01529 //! @} 01530 01531 //! @brief Helper macro 01532 //! @ingroup core_hal_intrin_impl 01533 #define OPENCV_HAL_IMPL_C_SHIFTL(_Tpvec, _Tp) \ 01534 template<int n> inline _Tpvec v_shl(const _Tpvec& a) \ 01535 { return a << n; } 01536 01537 //! @name Left shift 01538 //! @{ 01539 //! @brief Shift left 01540 OPENCV_HAL_IMPL_C_SHIFTL(v_uint16x8, ushort) 01541 OPENCV_HAL_IMPL_C_SHIFTL(v_int16x8, short) 01542 OPENCV_HAL_IMPL_C_SHIFTL(v_uint32x4, unsigned) 01543 OPENCV_HAL_IMPL_C_SHIFTL(v_int32x4, int) 01544 OPENCV_HAL_IMPL_C_SHIFTL(v_uint64x2, uint64) 01545 OPENCV_HAL_IMPL_C_SHIFTL(v_int64x2, int64) 01546 //! @} 01547 01548 //! @brief Helper macro 01549 //! @ingroup core_hal_intrin_impl 01550 #define OPENCV_HAL_IMPL_C_SHIFTR(_Tpvec, _Tp) \ 01551 template<int n> inline _Tpvec v_shr(const _Tpvec& a) \ 01552 { return a >> n; } 01553 01554 //! @name Right shift 01555 //! @{ 01556 //! @brief Shift right 01557 OPENCV_HAL_IMPL_C_SHIFTR(v_uint16x8, ushort) 01558 OPENCV_HAL_IMPL_C_SHIFTR(v_int16x8, short) 01559 OPENCV_HAL_IMPL_C_SHIFTR(v_uint32x4, unsigned) 01560 OPENCV_HAL_IMPL_C_SHIFTR(v_int32x4, int) 01561 OPENCV_HAL_IMPL_C_SHIFTR(v_uint64x2, uint64) 01562 OPENCV_HAL_IMPL_C_SHIFTR(v_int64x2, int64) 01563 //! @} 01564 01565 //! @brief Helper macro 01566 //! 
@ingroup core_hal_intrin_impl 01567 #define OPENCV_HAL_IMPL_C_RSHIFTR(_Tpvec, _Tp) \ 01568 template<int n> inline _Tpvec v_rshr(const _Tpvec& a) \ 01569 { \ 01570 _Tpvec c; \ 01571 for( int i = 0; i < _Tpvec::nlanes; i++ ) \ 01572 c.s[i] = (_Tp)((a.s[i] + ((_Tp)1 << (n - 1))) >> n); \ 01573 return c; \ 01574 } 01575 01576 //! @name Rounding shift 01577 //! @{ 01578 //! @brief Rounding shift right 01579 OPENCV_HAL_IMPL_C_RSHIFTR(v_uint16x8, ushort) 01580 OPENCV_HAL_IMPL_C_RSHIFTR(v_int16x8, short) 01581 OPENCV_HAL_IMPL_C_RSHIFTR(v_uint32x4, unsigned) 01582 OPENCV_HAL_IMPL_C_RSHIFTR(v_int32x4, int) 01583 OPENCV_HAL_IMPL_C_RSHIFTR(v_uint64x2, uint64) 01584 OPENCV_HAL_IMPL_C_RSHIFTR(v_int64x2, int64) 01585 //! @} 01586 01587 //! @brief Helper macro 01588 //! @ingroup core_hal_intrin_impl 01589 #define OPENCV_HAL_IMPL_C_PACK(_Tpvec, _Tpnvec, _Tpn, pack_suffix) \ 01590 inline _Tpnvec v_##pack_suffix(const _Tpvec& a, const _Tpvec& b) \ 01591 { \ 01592 _Tpnvec c; \ 01593 for( int i = 0; i < _Tpvec::nlanes; i++ ) \ 01594 { \ 01595 c.s[i] = saturate_cast<_Tpn>(a.s[i]); \ 01596 c.s[i+_Tpvec::nlanes] = saturate_cast<_Tpn>(b.s[i]); \ 01597 } \ 01598 return c; \ 01599 } 01600 01601 //! @name Pack 01602 //! @{ 01603 //! @brief Pack values from two vectors to one 01604 //! 01605 //! Return vector type have twice more elements than input vector types. Variant with _u_ suffix also 01606 //! converts to corresponding unsigned type. 01607 //! 01608 //! - pack: for 16-, 32- and 64-bit integer input types 01609 //! 
- pack_u: for 16- and 32-bit signed integer input types 01610 OPENCV_HAL_IMPL_C_PACK(v_uint16x8, v_uint8x16, uchar, pack) 01611 OPENCV_HAL_IMPL_C_PACK(v_int16x8, v_int8x16, schar, pack) 01612 OPENCV_HAL_IMPL_C_PACK(v_uint32x4, v_uint16x8, ushort, pack) 01613 OPENCV_HAL_IMPL_C_PACK(v_int32x4, v_int16x8, short, pack) 01614 OPENCV_HAL_IMPL_C_PACK(v_uint64x2, v_uint32x4, unsigned, pack) 01615 OPENCV_HAL_IMPL_C_PACK(v_int64x2, v_int32x4, int, pack) 01616 OPENCV_HAL_IMPL_C_PACK(v_int16x8, v_uint8x16, uchar, pack_u) 01617 OPENCV_HAL_IMPL_C_PACK(v_int32x4, v_uint16x8, ushort, pack_u) 01618 //! @} 01619 01620 //! @brief Helper macro 01621 //! @ingroup core_hal_intrin_impl 01622 #define OPENCV_HAL_IMPL_C_RSHR_PACK(_Tpvec, _Tp, _Tpnvec, _Tpn, pack_suffix) \ 01623 template<int n> inline _Tpnvec v_rshr_##pack_suffix(const _Tpvec& a, const _Tpvec& b) \ 01624 { \ 01625 _Tpnvec c; \ 01626 for( int i = 0; i < _Tpvec::nlanes; i++ ) \ 01627 { \ 01628 c.s[i] = saturate_cast<_Tpn>((a.s[i] + ((_Tp)1 << (n - 1))) >> n); \ 01629 c.s[i+_Tpvec::nlanes] = saturate_cast<_Tpn>((b.s[i] + ((_Tp)1 << (n - 1))) >> n); \ 01630 } \ 01631 return c; \ 01632 } 01633 01634 //! @name Pack with rounding shift 01635 //! @{ 01636 //! @brief Pack values from two vectors to one with rounding shift 01637 //! 01638 //! Values from the input vectors will be shifted right by _n_ bits with rounding, converted to narrower 01639 //! type and returned in the result vector. Variant with _u_ suffix converts to unsigned type. 01640 //! 01641 //! - pack: for 16-, 32- and 64-bit integer input types 01642 //! 
- pack_u: for 16- and 32-bit signed integer input types 01643 OPENCV_HAL_IMPL_C_RSHR_PACK(v_uint16x8, ushort, v_uint8x16, uchar, pack) 01644 OPENCV_HAL_IMPL_C_RSHR_PACK(v_int16x8, short, v_int8x16, schar, pack) 01645 OPENCV_HAL_IMPL_C_RSHR_PACK(v_uint32x4, unsigned, v_uint16x8, ushort, pack) 01646 OPENCV_HAL_IMPL_C_RSHR_PACK(v_int32x4, int, v_int16x8, short, pack) 01647 OPENCV_HAL_IMPL_C_RSHR_PACK(v_uint64x2, uint64, v_uint32x4, unsigned, pack) 01648 OPENCV_HAL_IMPL_C_RSHR_PACK(v_int64x2, int64, v_int32x4, int, pack) 01649 OPENCV_HAL_IMPL_C_RSHR_PACK(v_int16x8, short, v_uint8x16, uchar, pack_u) 01650 OPENCV_HAL_IMPL_C_RSHR_PACK(v_int32x4, int, v_uint16x8, ushort, pack_u) 01651 //! @} 01652 01653 //! @brief Helper macro 01654 //! @ingroup core_hal_intrin_impl 01655 #define OPENCV_HAL_IMPL_C_PACK_STORE(_Tpvec, _Tp, _Tpnvec, _Tpn, pack_suffix) \ 01656 inline void v_##pack_suffix##_store(_Tpn* ptr, const _Tpvec& a) \ 01657 { \ 01658 for( int i = 0; i < _Tpvec::nlanes; i++ ) \ 01659 ptr[i] = saturate_cast<_Tpn>(a.s[i]); \ 01660 } 01661 01662 //! @name Pack and store 01663 //! @{ 01664 //! @brief Store values from the input vector into memory with pack 01665 //! 01666 //! Values will be stored into memory with saturating conversion to narrower type. 01667 //! Variant with _u_ suffix converts to corresponding unsigned type. 01668 //! 01669 //! - pack: for 16-, 32- and 64-bit integer input types 01670 //! 
- pack_u: for 16- and 32-bit signed integer input types 01671 OPENCV_HAL_IMPL_C_PACK_STORE(v_uint16x8, ushort, v_uint8x16, uchar, pack) 01672 OPENCV_HAL_IMPL_C_PACK_STORE(v_int16x8, short, v_int8x16, schar, pack) 01673 OPENCV_HAL_IMPL_C_PACK_STORE(v_uint32x4, unsigned, v_uint16x8, ushort, pack) 01674 OPENCV_HAL_IMPL_C_PACK_STORE(v_int32x4, int, v_int16x8, short, pack) 01675 OPENCV_HAL_IMPL_C_PACK_STORE(v_uint64x2, uint64, v_uint32x4, unsigned, pack) 01676 OPENCV_HAL_IMPL_C_PACK_STORE(v_int64x2, int64, v_int32x4, int, pack) 01677 OPENCV_HAL_IMPL_C_PACK_STORE(v_int16x8, short, v_uint8x16, uchar, pack_u) 01678 OPENCV_HAL_IMPL_C_PACK_STORE(v_int32x4, int, v_uint16x8, ushort, pack_u) 01679 //! @} 01680 01681 //! @brief Helper macro 01682 //! @ingroup core_hal_intrin_impl 01683 #define OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(_Tpvec, _Tp, _Tpnvec, _Tpn, pack_suffix) \ 01684 template<int n> inline void v_rshr_##pack_suffix##_store(_Tpn* ptr, const _Tpvec& a) \ 01685 { \ 01686 for( int i = 0; i < _Tpvec::nlanes; i++ ) \ 01687 ptr[i] = saturate_cast<_Tpn>((a.s[i] + ((_Tp)1 << (n - 1))) >> n); \ 01688 } 01689 01690 //! @name Pack and store with rounding shift 01691 //! @{ 01692 //! @brief Store values from the input vector into memory with pack 01693 //! 01694 //! Values will be shifted _n_ bits right with rounding, converted to narrower type and stored into 01695 //! memory. Variant with _u_ suffix converts to unsigned type. 01696 //! 01697 //! - pack: for 16-, 32- and 64-bit integer input types 01698 //! 
- pack_u: for 16- and 32-bit signed integer input types 01699 OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(v_uint16x8, ushort, v_uint8x16, uchar, pack) 01700 OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(v_int16x8, short, v_int8x16, schar, pack) 01701 OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(v_uint32x4, unsigned, v_uint16x8, ushort, pack) 01702 OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(v_int32x4, int, v_int16x8, short, pack) 01703 OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(v_uint64x2, uint64, v_uint32x4, unsigned, pack) 01704 OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(v_int64x2, int64, v_int32x4, int, pack) 01705 OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(v_int16x8, short, v_uint8x16, uchar, pack_u) 01706 OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(v_int32x4, int, v_uint16x8, ushort, pack_u) 01707 //! @} 01708 01709 /** @brief Matrix multiplication 01710 01711 Scheme: 01712 @code 01713 {A0 A1 A2 A3} |V0| 01714 {B0 B1 B2 B3} |V1| 01715 {C0 C1 C2 C3} |V2| 01716 {D0 D1 D2 D3} x |V3| 01717 ==================== 01718 {R0 R1 R2 R3}, where: 01719 R0 = A0V0 + A1V1 + A2V2 + A3V3, 01720 R1 = B0V0 + B1V1 + B2V2 + B3V3 01721 ... 01722 @endcode 01723 */ 01724 inline v_float32x4 v_matmul(const v_float32x4& v, const v_float32x4& m0, 01725 const v_float32x4& m1, const v_float32x4& m2, 01726 const v_float32x4& m3) 01727 { 01728 return v_float32x4(v.s[0]*m0.s[0] + v.s[1]*m1.s[0] + v.s[2]*m2.s[0] + v.s[3]*m3.s[0], 01729 v.s[0]*m0.s[1] + v.s[1]*m1.s[1] + v.s[2]*m2.s[1] + v.s[3]*m3.s[1], 01730 v.s[0]*m0.s[2] + v.s[1]*m1.s[2] + v.s[2]*m2.s[2] + v.s[3]*m3.s[2], 01731 v.s[0]*m0.s[3] + v.s[1]*m1.s[3] + v.s[2]*m2.s[3] + v.s[3]*m3.s[3]); 01732 } 01733 01734 //! @} 01735 01736 } 01737 01738 #endif 01739
Generated on Tue Jul 12 2022 16:42:38 by 1.7.2