opencv on mbed

Dependencies:   mbed

Embed: (wiki syntax)

« Back to documentation index

Show/hide line numbers intrin_cpp.hpp Source File

intrin_cpp.hpp

00001 /*M///////////////////////////////////////////////////////////////////////////////////////
00002 //
00003 //  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
00004 //
00005 //  By downloading, copying, installing or using the software you agree to this license.
00006 //  If you do not agree to this license, do not download, install,
00007 //  copy or use the software.
00008 //
00009 //
00010 //                          License Agreement
00011 //                For Open Source Computer Vision Library
00012 //
00013 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
00014 // Copyright (C) 2009, Willow Garage Inc., all rights reserved.
00015 // Copyright (C) 2013, OpenCV Foundation, all rights reserved.
00016 // Copyright (C) 2015, Itseez Inc., all rights reserved.
00017 // Third party copyrights are property of their respective owners.
00018 //
00019 // Redistribution and use in source and binary forms, with or without modification,
00020 // are permitted provided that the following conditions are met:
00021 //
00022 //   * Redistribution's of source code must retain the above copyright notice,
00023 //     this list of conditions and the following disclaimer.
00024 //
00025 //   * Redistribution's in binary form must reproduce the above copyright notice,
00026 //     this list of conditions and the following disclaimer in the documentation
00027 //     and/or other materials provided with the distribution.
00028 //
00029 //   * The name of the copyright holders may not be used to endorse or promote products
00030 //     derived from this software without specific prior written permission.
00031 //
00032 // This software is provided by the copyright holders and contributors "as is" and
00033 // any express or implied warranties, including, but not limited to, the implied
00034 // warranties of merchantability and fitness for a particular purpose are disclaimed.
00035 // In no event shall the Intel Corporation or contributors be liable for any direct,
00036 // indirect, incidental, special, exemplary, or consequential damages
00037 // (including, but not limited to, procurement of substitute goods or services;
00038 // loss of use, data, or profits; or business interruption) however caused
00039 // and on any theory of liability, whether in contract, strict liability,
00040 // or tort (including negligence or otherwise) arising in any way out of
00041 // the use of this software, even if advised of the possibility of such damage.
00042 //
00043 //M*/
00044 
00045 #ifndef __OPENCV_HAL_INTRIN_CPP_HPP__
00046 #define __OPENCV_HAL_INTRIN_CPP_HPP__
00047 
00048 #include <limits>
00049 #include <cstring>
00050 #include <algorithm>
00051 #include "opencv2/core/saturate.hpp"
00052 
00053 namespace cv
00054 {
00055 
00056 /** @addtogroup core_hal_intrin
00057 
00058 "Universal intrinsics" is a set of types and functions intended to simplify vectorization of code on
00059 different platforms. Currently there are two supported SIMD extensions: __SSE/SSE2__ on x86
00060 architectures and __NEON__ on ARM architectures, both allow working with 128 bit registers
00061 containing packed values of different types. In case when there is no SIMD extension available
00062 during compilation, fallback C++ implementation of intrinsics will be chosen and code will work as
00063 expected although it could be slower.
00064 
00065 ### Types
00066 
00067 There are several types representing a 128-bit register as a vector of packed values; each type is
00068 implemented as a structure based on one SIMD register.
00069 
00070 - cv::v_uint8x16 and cv::v_int8x16: sixteen 8-bit integer values (unsigned/signed) - char
00071 - cv::v_uint16x8 and cv::v_int16x8: eight 16-bit integer values (unsigned/signed) - short
00072 - cv::v_uint32x4 and cv::v_int32x4: four 32-bit integer values (unsigned/signed) - int
00073 - cv::v_uint64x2 and cv::v_int64x2: two 64-bit integer values (unsigned/signed) - int64
00074 - cv::v_float32x4: four 32-bit floating point values (signed) - float
00075 - cv::v_float64x2: two 64-bit floating point values (signed) - double
00076 
00077 @note
00078 cv::v_float64x2 is not implemented in NEON variant, if you want to use this type, don't forget to
00079 check the CV_SIMD128_64F preprocessor definition:
00080 @code
00081 #if CV_SIMD128_64F
00082 //...
00083 #endif
00084 @endcode
00085 
00086 ### Load and store operations
00087 
00088 These operations allow to set contents of the register explicitly or by loading it from some memory
00089 block and to save contents of the register to memory block.
00090 
00091 - Constructors:
00092 @ref v_reg::v_reg(const _Tp *ptr) "from memory",
00093 @ref v_reg::v_reg(_Tp s0, _Tp s1) "from two values", ...
00094 - Other create methods:
00095 @ref v_setall_s8, @ref v_setall_u8, ...,
00096 @ref v_setzero_u8, @ref v_setzero_s8, ...
00097 - Memory operations:
00098 @ref v_load, @ref v_load_aligned, @ref v_load_halves,
00099 @ref v_store, @ref v_store_aligned,
00100 @ref v_store_high, @ref v_store_low
00101 
00102 ### Value reordering
00103 
00104 These operations allow to reorder or recombine elements in one or multiple vectors.
00105 
00106 - Interleave, deinterleave (3 and 4 channels): @ref v_load_deinterleave, @ref v_store_interleave
00107 - Expand: @ref v_load_expand, @ref v_load_expand_q, @ref v_expand
00108 - Pack: @ref v_pack, @ref v_pack_u, @ref v_rshr_pack, @ref v_rshr_pack_u,
00109 @ref v_pack_store, @ref v_pack_u_store, @ref v_rshr_pack_store, @ref v_rshr_pack_u_store
00110 - Recombine: @ref v_zip, @ref v_recombine, @ref v_combine_low, @ref v_combine_high
00111 - Extract: @ref v_extract
00112 
00113 
00114 ### Arithmetic, bitwise and comparison operations
00115 
00116 Element-wise binary and unary operations.
00117 
00118 - Arithmetics:
00119 @ref operator+(const v_reg &a, const v_reg &b) "+",
00120 @ref operator-(const v_reg &a, const v_reg &b) "-",
00121 @ref operator*(const v_reg &a, const v_reg &b) "*",
00122 @ref operator/(const v_reg &a, const v_reg &b) "/",
00123 @ref v_mul_expand
00124 
00125 - Non-saturating arithmetics: @ref v_add_wrap, @ref v_sub_wrap
00126 
00127 - Bitwise shifts:
00128 @ref operator<<(const v_reg &a, int s) "<<",
00129 @ref operator>>(const v_reg &a, int s) ">>",
00130 @ref v_shl, @ref v_shr
00131 
00132 - Bitwise logic:
00133 @ref operator&(const v_reg &a, const v_reg &b) "&",
00134 @ref operator|(const v_reg &a, const v_reg &b) "|",
00135 @ref operator^(const v_reg &a, const v_reg &b) "^",
00136 @ref operator~(const v_reg &a) "~"
00137 
00138 - Comparison:
00139 @ref operator>(const v_reg &a, const v_reg &b) ">",
00140 @ref operator>=(const v_reg &a, const v_reg &b) ">=",
00141 @ref operator<(const v_reg &a, const v_reg &b) "<",
00142 @ref operator<=(const v_reg &a, const v_reg &b) "<=",
00143 @ref operator==(const v_reg &a, const v_reg &b) "==",
00144 @ref operator!=(const v_reg &a, const v_reg &b) "!="
00145 
00146 - min/max: @ref v_min, @ref v_max
00147 
00148 ### Reduce and mask
00149 
00150 Most of these operations return only one value.
00151 
00152 - Reduce: @ref v_reduce_min, @ref v_reduce_max, @ref v_reduce_sum
00153 - Mask: @ref v_signmask, @ref v_check_all, @ref v_check_any, @ref v_select
00154 
00155 ### Other math
00156 
00157 - Some frequent operations: @ref v_sqrt, @ref v_invsqrt, @ref v_magnitude, @ref v_sqr_magnitude
00158 - Absolute values: @ref v_abs, @ref v_absdiff
00159 
00160 ### Conversions
00161 
00162 Different type conversions and casts:
00163 
00164 - Rounding: @ref v_round, @ref v_floor, @ref v_ceil, @ref v_trunc,
00165 - To float: @ref v_cvt_f32, @ref v_cvt_f64
00166 - Reinterpret: @ref v_reinterpret_as_u8, @ref v_reinterpret_as_s8, ...
00167 
00168 ### Matrix operations
00169 
00170 In these operations vectors represent matrix rows/columns: @ref v_dotprod, @ref v_matmul, @ref v_transpose4x4
00171 
00172 ### Usability
00173 
00174 Most operations are implemented only for some subset of the available types; the following tables
00175 show the applicability of different operations to the types.
00176 
00177 Regular integers:
00178 
00179 | Operations\\Types | uint 8x16 | int 8x16 | uint 16x8 | int 16x8 | uint 32x4 | int 32x4 |
00180 |-------------------|:-:|:-:|:-:|:-:|:-:|:-:|
00181 |load, store        | x | x | x | x | x | x |
00182 |interleave         | x | x | x | x | x | x |
00183 |expand             | x | x | x | x | x | x |
00184 |expand_q           | x | x |   |   |   |   |
00185 |add, sub           | x | x | x | x | x | x |
00186 |add_wrap, sub_wrap | x | x | x | x |   |   |
00187 |mul                |   |   | x | x | x | x |
00188 |mul_expand         |   |   | x | x | x |   |
00189 |compare            | x | x | x | x | x | x |
00190 |shift              |   |   | x | x | x | x |
00191 |dotprod            |   |   |   | x |   |   |
00192 |logical            | x | x | x | x | x | x |
00193 |min, max           | x | x | x | x | x | x |
00194 |absdiff            | x | x | x | x | x | x |
00195 |reduce             |   |   |   |   | x | x |
00196 |mask               | x | x | x | x | x | x |
00197 |pack               | x | x | x | x | x | x |
00198 |pack_u             | x |   | x |   |   |   |
00199 |unpack             | x | x | x | x | x | x |
00200 |extract            | x | x | x | x | x | x |
00201 |cvt_flt32          |   |   |   |   |   | x |
00202 |cvt_flt64          |   |   |   |   |   | x |
00203 |transpose4x4       |   |   |   |   | x | x |
00204 
00205 Big integers:
00206 
00207 | Operations\\Types | uint 64x2 | int 64x2 |
00208 |-------------------|:-:|:-:|
00209 |load, store        | x | x |
00210 |add, sub           | x | x |
00211 |shift              | x | x |
00212 |logical            | x | x |
00213 |extract            | x | x |
00214 
00215 Floating point:
00216 
00217 | Operations\\Types | float 32x4 | float 64x2 |
00218 |-------------------|:-:|:-:|
00219 |load, store        | x | x |
00220 |interleave         | x |   |
00221 |add, sub           | x | x |
00222 |mul                | x | x |
00223 |div                | x | x |
00224 |compare            | x | x |
00225 |min, max           | x | x |
00226 |absdiff            | x | x |
00227 |reduce             | x |   |
00228 |mask               | x | x |
00229 |unpack             | x | x |
00230 |cvt_flt32          |   | x |
00231 |cvt_flt64          | x |   |
00232 |sqrt, abs          | x | x |
00233 |float math         | x | x |
00234 |transpose4x4       | x |   |
00235 
00236 
00237  @{ */
00238 
00239 template<typename _Tp, int n> struct v_reg
00240 {
00241 //! @cond IGNORED
00242     typedef _Tp lane_type;
00243     typedef v_reg<typename V_TypeTraits<_Tp>::int_type, n> int_vec;
00244     typedef v_reg<typename V_TypeTraits<_Tp>::abs_type, n> abs_vec;
00245     enum { nlanes = n };
00246 // !@endcond
00247 
00248     /** @brief Constructor
00249 
00250     Initializes register with data from memory
00251     @param ptr pointer to memory block with data for register */
00252     explicit v_reg(const _Tp* ptr) { for( int i = 0; i < n; i++ ) s[i] = ptr[i]; }
00253 
00254     /** @brief Constructor
00255 
00256     Initializes register with two 64-bit values */
00257     v_reg(_Tp s0, _Tp s1) { s[0] = s0; s[1] = s1; }
00258 
00259     /** @brief Constructor
00260 
00261     Initializes register with four 32-bit values */
00262     v_reg(_Tp s0, _Tp s1, _Tp s2, _Tp s3) { s[0] = s0; s[1] = s1; s[2] = s2; s[3] = s3; }
00263 
00264     /** @brief Constructor
00265 
00266     Initializes register with eight 16-bit values */
00267     v_reg(_Tp s0, _Tp s1, _Tp s2, _Tp s3,
00268            _Tp s4, _Tp s5, _Tp s6, _Tp s7)
00269     {
00270         s[0] = s0; s[1] = s1; s[2] = s2; s[3] = s3;
00271         s[4] = s4; s[5] = s5; s[6] = s6; s[7] = s7;
00272     }
00273 
00274     /** @brief Constructor
00275 
00276     Initializes register with sixteen 8-bit values */
00277     v_reg(_Tp s0, _Tp s1, _Tp s2, _Tp s3,
00278            _Tp s4, _Tp s5, _Tp s6, _Tp s7,
00279            _Tp s8, _Tp s9, _Tp s10, _Tp s11,
00280            _Tp s12, _Tp s13, _Tp s14, _Tp s15)
00281     {
00282         s[0] = s0; s[1] = s1; s[2] = s2; s[3] = s3;
00283         s[4] = s4; s[5] = s5; s[6] = s6; s[7] = s7;
00284         s[8] = s8; s[9] = s9; s[10] = s10; s[11] = s11;
00285         s[12] = s12; s[13] = s13; s[14] = s14; s[15] = s15;
00286     }
00287 
00288     /** @brief Default constructor
00289 
00290     Does not initialize anything*/
00291     v_reg() {}
00292 
00293     /** @brief Copy constructor */
00294     v_reg(const v_reg<_Tp, n> & r)
00295     {
00296         for( int i = 0; i < n; i++ )
00297             s[i] = r.s[i];
00298     }
00299     /** @brief Access first value
00300 
00301     Returns value of the first lane according to register type, for example:
00302     @code{.cpp}
00303     v_int32x4 r(1, 2, 3, 4);
00304     int v = r.get0(); // returns 1
00305     v_uint64x2 r(1, 2);
00306     uint64_t v = r.get0(); // returns 1
00307     @endcode
00308     */
00309     _Tp get0() const { return s[0]; }
00310 
00311 //! @cond IGNORED
00312     _Tp get(const int i) const { return s[i]; }
00313     v_reg<_Tp, n> high() const
00314     {
00315         v_reg<_Tp, n> c;
00316         int i;
00317         for( i = 0; i < n/2; i++ )
00318         {
00319             c.s[i] = s[i+(n/2)];
00320             c.s[i+(n/2)] = 0;
00321         }
00322         return c;
00323     }
00324 
00325     static v_reg<_Tp, n> zero()
00326     {
00327         v_reg<_Tp, n> c;
00328         for( int i = 0; i < n; i++ )
00329             c.s[i] = (_Tp)0;
00330         return c;
00331     }
00332 
00333     static v_reg<_Tp, n> all(_Tp s)
00334     {
00335         v_reg<_Tp, n> c;
00336         for( int i = 0; i < n; i++ )
00337             c.s[i] = s;
00338         return c;
00339     }
00340 
00341     template<typename _Tp2, int n2> v_reg<_Tp2, n2> reinterpret_as() const
00342     {
00343         size_t bytes = std::min(sizeof(_Tp2)*n2, sizeof(_Tp)*n);
00344         v_reg<_Tp2, n2> c;
00345         std::memcpy(&c.s[0], &s[0], bytes);
00346         return c;
00347     }
00348 
00349     _Tp s[n];
00350 //! @endcond
00351 };
00352 
00353 /** @brief Sixteen 8-bit unsigned integer values */
00354 typedef v_reg<uchar, 16> v_uint8x16;
00355 /** @brief Sixteen 8-bit signed integer values */
00356 typedef v_reg<schar, 16> v_int8x16;
00357 /** @brief Eight 16-bit unsigned integer values */
00358 typedef v_reg<ushort, 8> v_uint16x8;
00359 /** @brief Eight 16-bit signed integer values */
00360 typedef v_reg<short, 8> v_int16x8;
00361 /** @brief Four 32-bit unsigned integer values */
00362 typedef v_reg<unsigned, 4> v_uint32x4;
00363 /** @brief Four 32-bit signed integer values */
00364 typedef v_reg<int, 4> v_int32x4;
00365 /** @brief Four 32-bit floating point values (single precision) */
00366 typedef v_reg<float, 4> v_float32x4;
00367 /** @brief Two 64-bit floating point values (double precision) */
00368 typedef v_reg<double, 2> v_float64x2;
00369 /** @brief Two 64-bit unsigned integer values */
00370 typedef v_reg<uint64, 2> v_uint64x2;
00371 /** @brief Two 64-bit signed integer values */
00372 typedef v_reg<int64, 2> v_int64x2;
00373 
00374 //! @brief Helper macro
00375 //! @ingroup core_hal_intrin_impl
00376 #define OPENCV_HAL_IMPL_BIN_OP(bin_op) \
00377 template<typename _Tp, int n> inline v_reg<_Tp, n> \
00378     operator bin_op (const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
00379 { \
00380     v_reg<_Tp, n> c; \
00381     for( int i = 0; i < n; i++ ) \
00382         c.s[i] = saturate_cast<_Tp>(a.s[i] bin_op b.s[i]); \
00383     return c; \
00384 } \
00385 template<typename _Tp, int n> inline v_reg<_Tp, n>& \
00386     operator bin_op##= (v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
00387 { \
00388     for( int i = 0; i < n; i++ ) \
00389         a.s[i] = saturate_cast<_Tp>(a.s[i] bin_op b.s[i]); \
00390     return a; \
00391 }
00392 
00393 /** @brief Add values
00394 
00395 For all types. */
00396 OPENCV_HAL_IMPL_BIN_OP(+)
00397 
00398 /** @brief Subtract values
00399 
00400 For all types. */
00401 OPENCV_HAL_IMPL_BIN_OP(-)
00402 
00403 /** @brief Multiply values
00404 
00405 For 16- and 32-bit integer types and floating types. */
00406 OPENCV_HAL_IMPL_BIN_OP(*)
00407 
00408 /** @brief Divide values
00409 
00410 For floating types only. */
00411 OPENCV_HAL_IMPL_BIN_OP(/)
00412 
00413 //! @brief Helper macro
00414 //! @ingroup core_hal_intrin_impl
00415 #define OPENCV_HAL_IMPL_BIT_OP(bit_op) \
00416 template<typename _Tp, int n> inline v_reg<_Tp, n> operator bit_op \
00417     (const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
00418 { \
00419     v_reg<_Tp, n> c; \
00420     typedef typename V_TypeTraits<_Tp>::int_type itype; \
00421     for( int i = 0; i < n; i++ ) \
00422         c.s[i] = V_TypeTraits<_Tp>::reinterpret_from_int((itype)(V_TypeTraits<_Tp>::reinterpret_int(a.s[i]) bit_op \
00423                                                         V_TypeTraits<_Tp>::reinterpret_int(b.s[i]))); \
00424     return c; \
00425 } \
00426 template<typename _Tp, int n> inline v_reg<_Tp, n>& operator \
00427     bit_op##= (v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
00428 { \
00429     typedef typename V_TypeTraits<_Tp>::int_type itype; \
00430     for( int i = 0; i < n; i++ ) \
00431         a.s[i] = V_TypeTraits<_Tp>::reinterpret_from_int((itype)(V_TypeTraits<_Tp>::reinterpret_int(a.s[i]) bit_op \
00432                                                         V_TypeTraits<_Tp>::reinterpret_int(b.s[i]))); \
00433     return a; \
00434 }
00435 
00436 /** @brief Bitwise AND
00437 
00438 Only for integer types. */
00439 OPENCV_HAL_IMPL_BIT_OP(&)
00440 
00441 /** @brief Bitwise OR
00442 
00443 Only for integer types. */
00444 OPENCV_HAL_IMPL_BIT_OP(|)
00445 
00446 /** @brief Bitwise XOR
00447 
00448 Only for integer types.*/
00449 OPENCV_HAL_IMPL_BIT_OP(^)
00450 
00451 /** @brief Bitwise NOT
00452 
00453 Only for integer types.*/
00454 template<typename _Tp, int n> inline v_reg<_Tp, n> operator ~ (const v_reg<_Tp, n>& a)
00455 {
00456     v_reg<_Tp, n> c;
00457     for( int i = 0; i < n; i++ )
00458         c.s[i] = V_TypeTraits<_Tp>::reinterpret_from_int(~V_TypeTraits<_Tp>::reinterpret_int(a.s[i]));
00459         return c;
00460 }
00461 
00462 //! @brief Helper macro
00463 //! @ingroup core_hal_intrin_impl
00464 #define OPENCV_HAL_IMPL_MATH_FUNC(func, cfunc, _Tp2) \
00465 template<typename _Tp, int n> inline v_reg<_Tp2, n> func(const v_reg<_Tp, n>& a) \
00466 { \
00467     v_reg<_Tp2, n> c; \
00468     for( int i = 0; i < n; i++ ) \
00469         c.s[i] = cfunc(a.s[i]); \
00470     return c; \
00471 }
00472 
00473 /** @brief Square root of elements
00474 
00475 Only for floating point types.*/
00476 OPENCV_HAL_IMPL_MATH_FUNC(v_sqrt, std::sqrt, _Tp)
00477 
00478 //! @cond IGNORED
00479 OPENCV_HAL_IMPL_MATH_FUNC(v_sin, std::sin, _Tp)
00480 OPENCV_HAL_IMPL_MATH_FUNC(v_cos, std::cos, _Tp)
00481 OPENCV_HAL_IMPL_MATH_FUNC(v_exp, std::exp, _Tp)
00482 OPENCV_HAL_IMPL_MATH_FUNC(v_log, std::log, _Tp)
00483 //! @endcond
00484 
00485 /** @brief Absolute value of elements
00486 
00487 Only for floating point types.*/
00488 OPENCV_HAL_IMPL_MATH_FUNC(v_abs, (typename V_TypeTraits<_Tp>::abs_type)std::abs,
00489                           typename V_TypeTraits<_Tp>::abs_type)
00490 
00491 /** @brief Round elements
00492 
00493 Only for floating point types.*/
00494 OPENCV_HAL_IMPL_MATH_FUNC(v_round, cvRound, int)
00495 
00496 /** @brief Floor elements
00497 
00498 Only for floating point types.*/
00499 OPENCV_HAL_IMPL_MATH_FUNC(v_floor, cvFloor, int)
00500 
00501 /** @brief Ceil elements
00502 
00503 Only for floating point types.*/
00504 OPENCV_HAL_IMPL_MATH_FUNC(v_ceil, cvCeil, int)
00505 
00506 /** @brief Truncate elements
00507 
00508 Only for floating point types.*/
00509 OPENCV_HAL_IMPL_MATH_FUNC(v_trunc, int, int)
00510 
00511 //! @brief Helper macro
00512 //! @ingroup core_hal_intrin_impl
00513 #define OPENCV_HAL_IMPL_MINMAX_FUNC(func, cfunc) \
00514 template<typename _Tp, int n> inline v_reg<_Tp, n> func(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
00515 { \
00516     v_reg<_Tp, n> c; \
00517     for( int i = 0; i < n; i++ ) \
00518         c.s[i] = cfunc(a.s[i], b.s[i]); \
00519     return c; \
00520 }
00521 
00522 //! @brief Helper macro
00523 //! @ingroup core_hal_intrin_impl
00524 #define OPENCV_HAL_IMPL_REDUCE_MINMAX_FUNC(func, cfunc) \
00525 template<typename _Tp, int n> inline _Tp func(const v_reg<_Tp, n>& a) \
00526 { \
00527     _Tp c = a.s[0]; \
00528     for( int i = 1; i < n; i++ ) \
00529         c = cfunc(c, a.s[i]); \
00530     return c; \
00531 }
00532 
00533 /** @brief Choose min values for each pair
00534 
00535 Scheme:
00536 @code
00537 {A1 A2 ...}
00538 {B1 B2 ...}
00539 --------------
00540 {min(A1,B1) min(A2,B2) ...}
00541 @endcode
00542 For all types except 64-bit integer. */
00543 OPENCV_HAL_IMPL_MINMAX_FUNC(v_min, std::min)
00544 
00545 /** @brief Choose max values for each pair
00546 
00547 Scheme:
00548 @code
00549 {A1 A2 ...}
00550 {B1 B2 ...}
00551 --------------
00552 {max(A1,B1) max(A2,B2) ...}
00553 @endcode
00554 For all types except 64-bit integer. */
00555 OPENCV_HAL_IMPL_MINMAX_FUNC(v_max, std::max)
00556 
00557 /** @brief Find one min value
00558 
00559 Scheme:
00560 @code
00561 {A1 A2 A3 ...} => min(A1,A2,A3,...)
00562 @endcode
00563 For 32-bit integer and 32-bit floating point types. */
00564 OPENCV_HAL_IMPL_REDUCE_MINMAX_FUNC(v_reduce_min, std::min)
00565 
00566 /** @brief Find one max value
00567 
00568 Scheme:
00569 @code
00570 {A1 A2 A3 ...} => max(A1,A2,A3,...)
00571 @endcode
00572 For 32-bit integer and 32-bit floating point types. */
00573 OPENCV_HAL_IMPL_REDUCE_MINMAX_FUNC(v_reduce_max, std::max)
00574 
00575 //! @cond IGNORED
00576 template<typename _Tp, int n>
00577 inline void v_minmax( const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b,
00578                       v_reg<_Tp, n>& minval, v_reg<_Tp, n>& maxval )
00579 {
00580     for( int i = 0; i < n; i++ )
00581     {
00582         minval.s[i] = std::min(a.s[i], b.s[i]);
00583         maxval.s[i] = std::max(a.s[i], b.s[i]);
00584     }
00585 }
00586 //! @endcond
00587 
00588 //! @brief Helper macro
00589 //! @ingroup core_hal_intrin_impl
00590 #define OPENCV_HAL_IMPL_CMP_OP(cmp_op) \
00591 template<typename _Tp, int n> \
00592 inline v_reg<_Tp, n> operator cmp_op(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
00593 { \
00594     typedef typename V_TypeTraits<_Tp>::int_type itype; \
00595     v_reg<_Tp, n> c; \
00596     for( int i = 0; i < n; i++ ) \
00597         c.s[i] = V_TypeTraits<_Tp>::reinterpret_from_int((itype)-(int)(a.s[i] cmp_op b.s[i])); \
00598     return c; \
00599 }
00600 
00601 /** @brief Less-than comparison
00602 
00603 For all types except 64-bit integer values. */
00604 OPENCV_HAL_IMPL_CMP_OP(<)
00605 
00606 /** @brief Greater-than comparison
00607 
00608 For all types except 64-bit integer values. */
00609 OPENCV_HAL_IMPL_CMP_OP(>)
00610 
00611 /** @brief Less-than or equal comparison
00612 
00613 For all types except 64-bit integer values. */
00614 OPENCV_HAL_IMPL_CMP_OP(<=)
00615 
00616 /** @brief Greater-than or equal comparison
00617 
00618 For all types except 64-bit integer values. */
00619 OPENCV_HAL_IMPL_CMP_OP(>=)
00620 
00621 /** @brief Equal comparison
00622 
00623 For all types except 64-bit integer values. */
00624 OPENCV_HAL_IMPL_CMP_OP(==)
00625 
00626 /** @brief Not equal comparison
00627 
00628 For all types except 64-bit integer values. */
00629 OPENCV_HAL_IMPL_CMP_OP(!=)
00630 
00631 //! @brief Helper macro
00632 //! @ingroup core_hal_intrin_impl
00633 #define OPENCV_HAL_IMPL_ADD_SUB_OP(func, bin_op, cast_op, _Tp2) \
00634 template<typename _Tp, int n> \
00635 inline v_reg<_Tp2, n> func(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
00636 { \
00637     typedef _Tp2 rtype; \
00638     v_reg<rtype, n> c; \
00639     for( int i = 0; i < n; i++ ) \
00640         c.s[i] = cast_op(a.s[i] bin_op b.s[i]); \
00641     return c; \
00642 }
00643 
00644 /** @brief Add values without saturation
00645 
00646 For 8- and 16-bit integer values. */
00647 OPENCV_HAL_IMPL_ADD_SUB_OP(v_add_wrap, +, (_Tp), _Tp)
00648 
00649 /** @brief Subtract values without saturation
00650 
00651 For 8- and 16-bit integer values. */
00652 OPENCV_HAL_IMPL_ADD_SUB_OP(v_sub_wrap, -, (_Tp), _Tp)
00653 
//! @cond IGNORED
// Returns the larger argument minus the smaller one, so the subtraction
// never goes below zero (safe for unsigned types).
template<typename T> inline T _absdiff(T a, T b)
{
    if( a > b )
        return a - b;
    return b - a;
}
//! @endcond
00660 
00661 /** @brief Absolute difference
00662 
00663 Returns \f$ |a - b| \f$ converted to corresponding unsigned type.
00664 Example:
00665 @code{.cpp}
00666 v_int32x4 a, b; // {1, 2, 3, 4} and {4, 3, 2, 1}
00667 v_uint32x4 c = v_absdiff(a, b); // result is {3, 1, 1, 3}
00668 @endcode
00669 For 8-, 16-, 32-bit integer source types. */
00670 template<typename _Tp, int n>
00671 inline v_reg<typename V_TypeTraits<_Tp>::abs_type, n> v_absdiff(const v_reg<_Tp, n>& a, const v_reg<_Tp, n> & b)
00672 {
00673     typedef typename V_TypeTraits<_Tp>::abs_type rtype;
00674     v_reg<rtype, n> c;
00675     const rtype mask = std::numeric_limits<_Tp>::is_signed ? (1 << (sizeof(rtype)*8 - 1)) : 0;
00676     for( int i = 0; i < n; i++ )
00677     {
00678         rtype ua = a.s[i] ^ mask;
00679         rtype ub = b.s[i] ^ mask;
00680         c.s[i] = _absdiff(ua, ub);
00681     }
00682     return c;
00683 }
00684 
00685 /** @overload
00686 
00687 For 32-bit floating point values */
00688 inline v_float32x4 v_absdiff(const v_float32x4& a, const v_float32x4& b)
00689 {
00690     v_float32x4 c;
00691     for( int i = 0; i < c.nlanes; i++ )
00692         c.s[i] = _absdiff(a.s[i], b.s[i]);
00693     return c;
00694 }
00695 
00696 /** @overload
00697 
00698 For 64-bit floating point values */
00699 inline v_float64x2 v_absdiff(const v_float64x2& a, const v_float64x2& b)
00700 {
00701     v_float64x2 c;
00702     for( int i = 0; i < c.nlanes; i++ )
00703         c.s[i] = _absdiff(a.s[i], b.s[i]);
00704     return c;
00705 }
00706 
00707 /** @brief Inversed square root
00708 
00709 Returns \f$ 1/sqrt(a) \f$
00710 For floating point types only. */
00711 template<typename _Tp, int n>
00712 inline v_reg<_Tp, n> v_invsqrt(const v_reg<_Tp, n>& a)
00713 {
00714     v_reg<_Tp, n> c;
00715     for( int i = 0; i < n; i++ )
00716         c.s[i] = 1.f/std::sqrt(a.s[i]);
00717     return c;
00718 }
00719 
00720 /** @brief Magnitude
00721 
00722 Returns \f$ sqrt(a^2 + b^2) \f$
00723 For floating point types only. */
00724 template<typename _Tp, int n>
00725 inline v_reg<_Tp, n> v_magnitude(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
00726 {
00727     v_reg<_Tp, n> c;
00728     for( int i = 0; i < n; i++ )
00729         c.s[i] = std::sqrt(a.s[i]*a.s[i] + b.s[i]*b.s[i]);
00730     return c;
00731 }
00732 
00733 /** @brief Square of the magnitude
00734 
00735 Returns \f$ a^2 + b^2 \f$
00736 For floating point types only. */
00737 template<typename _Tp, int n>
00738 inline v_reg<_Tp, n> v_sqr_magnitude(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
00739 {
00740     v_reg<_Tp, n> c;
00741     for( int i = 0; i < n; i++ )
00742         c.s[i] = a.s[i]*a.s[i] + b.s[i]*b.s[i];
00743     return c;
00744 }
00745 
00746 /** @brief Multiply and add
00747 
00748 Returns \f$ a*b + c \f$
00749 For floating point types only. */
00750 template<typename _Tp, int n>
00751 inline v_reg<_Tp, n> v_muladd(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b,
00752                               const v_reg<_Tp, n>& c)
00753 {
00754     v_reg<_Tp, n> d;
00755     for( int i = 0; i < n; i++ )
00756         d.s[i] = a.s[i]*b.s[i] + c.s[i];
00757     return d;
00758 }
00759 
00760 /** @brief Dot product of elements
00761 
00762 Multiply values in two registers and sum adjacent result pairs.
00763 Scheme:
00764 @code
00765   {A1 A2 ...} // 16-bit
00766 x {B1 B2 ...} // 16-bit
00767 -------------
00768 {A1B1+A2B2 ...} // 32-bit
00769 @endcode
00770 Implemented only for 16-bit signed source type (v_int16x8).
00771 */
00772 template<typename _Tp, int n> inline v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>
00773     v_dotprod(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
00774 {
00775     typedef typename V_TypeTraits<_Tp>::w_type w_type;
00776     v_reg<w_type, n/2> c;
00777     for( int i = 0; i < (n/2); i++ )
00778         c.s[i] = (w_type)a.s[i*2]*b.s[i*2] + (w_type)a.s[i*2+1]*b.s[i*2+1];
00779     return c;
00780 }
00781 
00782 /** @brief Multiply and expand
00783 
00784 Multiply values two registers and store results in two registers with wider pack type.
00785 Scheme:
00786 @code
00787   {A B C D} // 32-bit
00788 x {E F G H} // 32-bit
00789 ---------------
00790 {AE BF}         // 64-bit
00791         {CG DH} // 64-bit
00792 @endcode
00793 Example:
00794 @code{.cpp}
00795 v_uint32x4 a, b; // {1,2,3,4} and {2,2,2,2}
00796 v_uint64x2 c, d; // results
00797 v_mul_expand(a, b, c, d); // c, d = {2,4}, {6, 8}
00798 @endcode
00799 Implemented only for 16- and unsigned 32-bit source types (v_int16x8, v_uint16x8, v_uint32x4).
00800 */
00801 template<typename _Tp, int n> inline void v_mul_expand(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b,
00802                                                        v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>& c,
00803                                                        v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>& d)
00804 {
00805     typedef typename V_TypeTraits<_Tp>::w_type w_type;
00806     for( int i = 0; i < (n/2); i++ )
00807     {
00808         c.s[i] = (w_type)a.s[i]*b.s[i];
00809         d.s[i] = (w_type)a.s[i+(n/2)]*b.s[i+(n/2)];
00810     }
00811 }
00812 
00813 //! @cond IGNORED
00814 template<typename _Tp, int n> inline void v_hsum(const v_reg<_Tp, n>& a,
00815                                                  v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>& c)
00816 {
00817     typedef typename V_TypeTraits<_Tp>::w_type w_type;
00818     for( int i = 0; i < (n/2); i++ )
00819     {
00820         c.s[i] = (w_type)a.s[i*2] + a.s[i*2+1];
00821     }
00822 }
00823 //! @endcond
00824 
00825 //! @brief Helper macro
00826 //! @ingroup core_hal_intrin_impl
00827 #define OPENCV_HAL_IMPL_SHIFT_OP(shift_op) \
00828 template<typename _Tp, int n> inline v_reg<_Tp, n> operator shift_op(const v_reg<_Tp, n>& a, int imm) \
00829 { \
00830     v_reg<_Tp, n> c; \
00831     for( int i = 0; i < n; i++ ) \
00832         c.s[i] = (_Tp)(a.s[i] shift_op imm); \
00833     return c; \
00834 }
00835 
00836 /** @brief Bitwise shift left
00837 
00838 For 16-, 32- and 64-bit integer values. */
00839 OPENCV_HAL_IMPL_SHIFT_OP(<<)
00840 
00841 /** @brief Bitwise shift right
00842 
00843 For 16-, 32- and 64-bit integer values. */
00844 OPENCV_HAL_IMPL_SHIFT_OP(>>)
00845 
00846 /** @brief Sum packed values
00847 
00848 Scheme:
00849 @code
00850 {A1 A2 A3 ...} => sum{A1,A2,A3,...}
00851 @endcode
00852 For 32-bit integer and 32-bit floating point types.*/
00853 template<typename _Tp, int n> inline typename V_TypeTraits<_Tp>::sum_type v_reduce_sum(const v_reg<_Tp, n>& a)
00854 {
00855     typename V_TypeTraits<_Tp>::sum_type c = a.s[0];
00856     for( int i = 1; i < n; i++ )
00857         c += a.s[i];
00858     return c;
00859 }
00860 
00861 /** @brief Get negative values mask
00862 
00863 Returned value is a bit mask with bits set to 1 on places corresponding to negative packed values indexes.
00864 Example:
00865 @code{.cpp}
00866 v_int32x4 r; // set to {-1, -1, 1, 1}
00867 int mask = v_signmask(r); // mask = 3 <== 00000000 00000000 00000000 00000011
00868 @endcode
00869 For all types except 64-bit. */
00870 template<typename _Tp, int n> inline int v_signmask(const v_reg<_Tp, n>& a)
00871 {
00872     int mask = 0;
00873     for( int i = 0; i < n; i++ )
00874         mask |= (V_TypeTraits<_Tp>::reinterpret_int(a.s[i]) < 0) << i;
00875     return mask;
00876 }
00877 
00878 /** @brief Check if all packed values are less than zero
00879 
00880 Unsigned values will be cast to signed: `uchar 254 => char -2`.
00881 For all types except 64-bit. */
00882 template<typename _Tp, int n> inline bool v_check_all(const v_reg<_Tp, n>& a)
00883 {
00884     for( int i = 0; i < n; i++ )
00885         if( V_TypeTraits<_Tp>::reinterpret_int(a.s[i]) >= 0 )
00886             return false;
00887     return true;
00888 }
00889 
00890 /** @brief Check if any of packed values is less than zero
00891 
00892 Unsigned values will be cast to signed: `uchar 254 => char -2`.
00893 For all types except 64-bit. */
00894 template<typename _Tp, int n> inline bool v_check_any(const v_reg<_Tp, n>& a)
00895 {
00896     for( int i = 0; i < n; i++ )
00897         if( V_TypeTraits<_Tp>::reinterpret_int(a.s[i]) < 0 )
00898             return true;
00899     return false;
00900 }
00901 
00902 /** @brief Bitwise select
00903 
00904 Return value will be built by combining values a and b using the following scheme:
00905 If the i-th bit in _mask_ is 1
00906     select i-th bit from _a_
00907 else
00908     select i-th bit from _b_ */
00909 template<typename _Tp, int n> inline v_reg<_Tp, n> v_select(const v_reg<_Tp, n>& mask,
00910                                                            const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
00911 {
00912     typedef V_TypeTraits<_Tp> Traits;
00913     typedef typename Traits::int_type int_type;
00914     v_reg<_Tp, n> c;
00915     for( int i = 0; i < n; i++ )
00916     {
00917         int_type m = Traits::reinterpret_int(mask.s[i]);
00918         c.s[i] =  Traits::reinterpret_from_int((Traits::reinterpret_int(a.s[i]) & m)
00919                                              | (Traits::reinterpret_int(b.s[i]) & ~m));
00920     }
00921     return c;
00922 }
00923 
00924 /** @brief Expand values to the wider pack type
00925 
00926 Copy contents of register to two registers with 2x wider pack type.
00927 Scheme:
00928 @code
00929  int32x4     int64x2 int64x2
00930 {A B C D} ==> {A B} , {C D}
00931 @endcode */
00932 template<typename _Tp, int n> inline void v_expand(const v_reg<_Tp, n>& a,
00933                             v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>& b0,
00934                             v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>& b1)
00935 {
00936     for( int i = 0; i < (n/2); i++ )
00937     {
00938         b0.s[i] = a.s[i];
00939         b1.s[i] = a.s[i+(n/2)];
00940     }
00941 }
00942 
00943 //! @cond IGNORED
00944 template<typename _Tp, int n> inline v_reg<typename V_TypeTraits<_Tp>::int_type, n>
00945     v_reinterpret_as_int(const v_reg<_Tp, n>& a)
00946 {
00947     v_reg<typename V_TypeTraits<_Tp>::int_type, n> c;
00948     for( int i = 0; i < n; i++ )
00949         c.s[i] = V_TypeTraits<_Tp>::reinterpret_int(a.s[i]);
00950     return c;
00951 }
00952 
00953 template<typename _Tp, int n> inline v_reg<typename V_TypeTraits<_Tp>::uint_type, n>
00954     v_reinterpret_as_uint(const v_reg<_Tp, n>& a)
00955 {
00956     v_reg<typename V_TypeTraits<_Tp>::uint_type, n> c;
00957     for( int i = 0; i < n; i++ )
00958         c.s[i] = V_TypeTraits<_Tp>::reinterpret_uint(a.s[i]);
00959     return c;
00960 }
00961 //! @endcond
00962 
00963 /** @brief Interleave two vectors
00964 
00965 Scheme:
00966 @code
00967   {A1 A2 A3 A4}
00968   {B1 B2 B3 B4}
00969 ---------------
00970   {A1 B1 A2 B2} and {A3 B3 A4 B4}
00971 @endcode
00972 For all types except 64-bit.
00973 */
00974 template<typename _Tp, int n> inline void v_zip( const v_reg<_Tp, n>& a0, const v_reg<_Tp, n>& a1,
00975                                                v_reg<_Tp, n>& b0, v_reg<_Tp, n>& b1 )
00976 {
00977     int i;
00978     for( i = 0; i < n/2; i++ )
00979     {
00980         b0.s[i*2] = a0.s[i];
00981         b0.s[i*2+1] = a1.s[i];
00982     }
00983     for( ; i < n; i++ )
00984     {
00985         b1.s[i*2-n] = a0.s[i];
00986         b1.s[i*2-n+1] = a1.s[i];
00987     }
00988 }
00989 
00990 /** @brief Load register contents from memory
00991 
00992 @param ptr pointer to memory block with data
00993 @return register object
00994 
00995 @note Returned type will be detected from passed pointer type, for example uchar ==> cv::v_uint8x16, int ==> cv::v_int32x4, etc.
00996  */
00997 template<typename _Tp>
00998 inline v_reg<_Tp, V_SIMD128Traits<_Tp>::nlanes> v_load(const _Tp* ptr)
00999 {
01000     return v_reg<_Tp, V_SIMD128Traits<_Tp>::nlanes>(ptr);
01001 }
01002 
01003 /** @brief Load register contents from memory (aligned)
01004 
01005 similar to cv::v_load, but source memory block should be aligned (to 16-byte boundary)
01006  */
01007 template<typename _Tp>
01008 inline v_reg<_Tp, V_SIMD128Traits<_Tp>::nlanes> v_load_aligned(const _Tp* ptr)
01009 {
01010     return v_reg<_Tp, V_SIMD128Traits<_Tp>::nlanes>(ptr);
01011 }
01012 
01013 /** @brief Load register contents from two memory blocks
01014 
01015 @param loptr memory block containing data for first half (0..n/2)
01016 @param hiptr memory block containing data for second half (n/2..n)
01017 
01018 @code{.cpp}
01019 int lo[2] = { 1, 2 }, hi[2] = { 3, 4 };
01020 v_int32x4 r = v_load_halves(lo, hi);
01021 @endcode
01022  */
01023 template<typename _Tp>
01024 inline v_reg<_Tp, V_SIMD128Traits<_Tp>::nlanes> v_load_halves(const _Tp* loptr, const _Tp* hiptr)
01025 {
01026     v_reg<_Tp, V_SIMD128Traits<_Tp>::nlanes> c;
01027     for( int i = 0; i < c.nlanes/2; i++ )
01028     {
01029         c.s[i] = loptr[i];
01030         c.s[i+c.nlanes/2] = hiptr[i];
01031     }
01032     return c;
01033 }
01034 
01035 /** @brief Load register contents from memory with double expand
01036 
01037 Same as cv::v_load, but result pack type will be 2x wider than memory type.
01038 
01039 @code{.cpp}
01040 short buf[4] = {1, 2, 3, 4}; // type is int16
01041 v_int32x4 r = v_load_expand(buf); // r = {1, 2, 3, 4} - type is int32
01042 @endcode
01043 For 8-, 16-, 32-bit integer source types. */
01044 template<typename _Tp>
01045 inline v_reg<typename V_TypeTraits<_Tp>::w_type, V_SIMD128Traits<_Tp>::nlanes / 2>
01046 v_load_expand(const _Tp* ptr)
01047 {
01048     typedef typename V_TypeTraits<_Tp>::w_type w_type;
01049     v_reg<w_type, V_SIMD128Traits<w_type>::nlanes> c;
01050     for( int i = 0; i < c.nlanes; i++ )
01051     {
01052         c.s[i] = ptr[i];
01053     }
01054     return c;
01055 }
01056 
01057 /** @brief Load register contents from memory with quad expand
01058 
01059 Same as cv::v_load_expand, but result type is 4 times wider than source.
01060 @code{.cpp}
01061 char buf[4] = {1, 2, 3, 4}; // type is int8
01062 v_int32x4 r = v_load_expand_q(buf); // r = {1, 2, 3, 4} - type is int32
01063 @endcode
01064 For 8-bit integer source types. */
01065 template<typename _Tp>
01066 inline v_reg<typename V_TypeTraits<_Tp>::q_type, V_SIMD128Traits<_Tp>::nlanes / 4>
01067 v_load_expand_q(const _Tp* ptr)
01068 {
01069     typedef typename V_TypeTraits<_Tp>::q_type q_type;
01070     v_reg<q_type, V_SIMD128Traits<q_type>::nlanes> c;
01071     for( int i = 0; i < c.nlanes; i++ )
01072     {
01073         c.s[i] = ptr[i];
01074     }
01075     return c;
01076 }
01077 
01078 /** @brief Load and deinterleave (3 channels)
01079 
01080 Load data from memory, deinterleave and store to 3 registers.
01081 Scheme:
01082 @code
01083 {A1 B1 C1 A2 B2 C2 ...} ==> {A1 A2 ...}, {B1 B2 ...}, {C1 C2 ...}
01084 @endcode
01085 For all types except 64-bit. */
01086 template<typename _Tp, int n> inline void v_load_deinterleave(const _Tp* ptr, v_reg<_Tp, n>& a,
01087                                                             v_reg<_Tp, n>& b, v_reg<_Tp, n>& c)
01088 {
01089     int i, i3;
01090     for( i = i3 = 0; i < n; i++, i3 += 3 )
01091     {
01092         a.s[i] = ptr[i3];
01093         b.s[i] = ptr[i3+1];
01094         c.s[i] = ptr[i3+2];
01095     }
01096 }
01097 
01098 /** @brief Load and deinterleave (4 channels)
01099 
01100 Load data from memory, deinterleave and store to 4 registers.
01101 Scheme:
01102 @code
01103 {A1 B1 C1 D1 A2 B2 C2 D2 ...} ==> {A1 A2 ...}, {B1 B2 ...}, {C1 C2 ...}, {D1 D2 ...}
01104 @endcode
01105 For all types except 64-bit. */
01106 template<typename _Tp, int n>
01107 inline void v_load_deinterleave(const _Tp* ptr, v_reg<_Tp, n>& a,
01108                                 v_reg<_Tp, n>& b, v_reg<_Tp, n>& c,
01109                                 v_reg<_Tp, n>& d)
01110 {
01111     int i, i4;
01112     for( i = i4 = 0; i < n; i++, i4 += 4 )
01113     {
01114         a.s[i] = ptr[i4];
01115         b.s[i] = ptr[i4+1];
01116         c.s[i] = ptr[i4+2];
01117         d.s[i] = ptr[i4+3];
01118     }
01119 }
01120 
01121 /** @brief Interleave and store (3 channels)
01122 
01123 Interleave and store data from 3 registers to memory.
01124 Scheme:
01125 @code
01126 {A1 A2 ...}, {B1 B2 ...}, {C1 C2 ...} ==> {A1 B1 C1 A2 B2 C2 ...}
01127 @endcode
01128 For all types except 64-bit. */
01129 template<typename _Tp, int n>
01130 inline void v_store_interleave( _Tp* ptr, const v_reg<_Tp, n>& a,
01131                                 const v_reg<_Tp, n>& b, const v_reg<_Tp, n>& c)
01132 {
01133     int i, i3;
01134     for( i = i3 = 0; i < n; i++, i3 += 3 )
01135     {
01136         ptr[i3] = a.s[i];
01137         ptr[i3+1] = b.s[i];
01138         ptr[i3+2] = c.s[i];
01139     }
01140 }
01141 
01142 /** @brief Interleave and store (4 channels)
01143 
01144 Interleave and store data from 4 registers to memory.
01145 Scheme:
01146 @code
01147 {A1 A2 ...}, {B1 B2 ...}, {C1 C2 ...}, {D1 D2 ...} ==> {A1 B1 C1 D1 A2 B2 C2 D2 ...}
01148 @endcode
01149 For all types except 64-bit. */
01150 template<typename _Tp, int n> inline void v_store_interleave( _Tp* ptr, const v_reg<_Tp, n>& a,
01151                                                             const v_reg<_Tp, n>& b, const v_reg<_Tp, n>& c,
01152                                                             const v_reg<_Tp, n>& d)
01153 {
01154     int i, i4;
01155     for( i = i4 = 0; i < n; i++, i4 += 4 )
01156     {
01157         ptr[i4] = a.s[i];
01158         ptr[i4+1] = b.s[i];
01159         ptr[i4+2] = c.s[i];
01160         ptr[i4+3] = d.s[i];
01161     }
01162 }
01163 
01164 /** @brief Store data to memory
01165 
01166 Store register contents to memory.
01167 Scheme:
01168 @code
01169   REG {A B C D} ==> MEM {A B C D}
01170 @endcode
01171 Pointer can be unaligned. */
01172 template<typename _Tp, int n>
01173 inline void v_store(_Tp* ptr, const v_reg<_Tp, n>& a)
01174 {
01175     for( int i = 0; i < n; i++ )
01176         ptr[i] = a.s[i];
01177 }
01178 
01179 /** @brief Store data to memory (lower half)
01180 
01181 Store lower half of register contents to memory.
01182 Scheme:
01183 @code
01184   REG {A B C D} ==> MEM {A B}
01185 @endcode */
01186 template<typename _Tp, int n>
01187 inline void v_store_low(_Tp* ptr, const v_reg<_Tp, n>& a)
01188 {
01189     for( int i = 0; i < (n/2); i++ )
01190         ptr[i] = a.s[i];
01191 }
01192 
01193 /** @brief Store data to memory (higher half)
01194 
01195 Store higher half of register contents to memory.
01196 Scheme:
01197 @code
01198   REG {A B C D} ==> MEM {C D}
01199 @endcode */
01200 template<typename _Tp, int n>
01201 inline void v_store_high(_Tp* ptr, const v_reg<_Tp, n>& a)
01202 {
01203     for( int i = 0; i < (n/2); i++ )
01204         ptr[i] = a.s[i+(n/2)];
01205 }
01206 
01207 /** @brief Store data to memory (aligned)
01208 
01209 Store register contents to memory.
01210 Scheme:
01211 @code
01212   REG {A B C D} ==> MEM {A B C D}
01213 @endcode
01214 Pointer __should__ be aligned by 16-byte boundary. */
01215 template<typename _Tp, int n>
01216 inline void v_store_aligned(_Tp* ptr, const v_reg<_Tp, n>& a)
01217 {
01218     for( int i = 0; i < n; i++ )
01219         ptr[i] = a.s[i];
01220 }
01221 
01222 /** @brief Combine vector from first elements of two vectors
01223 
01224 Scheme:
01225 @code
01226   {A1 A2 A3 A4}
01227   {B1 B2 B3 B4}
01228 ---------------
01229   {A1 A2 B1 B2}
01230 @endcode
01231 For all types except 64-bit. */
01232 template<typename _Tp, int n>
01233 inline v_reg<_Tp, n> v_combine_low(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
01234 {
01235     v_reg<_Tp, n> c;
01236     for( int i = 0; i < (n/2); i++ )
01237     {
01238         c.s[i] = a.s[i];
01239         c.s[i+(n/2)] = b.s[i];
01240     }
01241     return c;
01242 }
01243 
01244 /** @brief Combine vector from last elements of two vectors
01245 
01246 Scheme:
01247 @code
01248   {A1 A2 A3 A4}
01249   {B1 B2 B3 B4}
01250 ---------------
01251   {A3 A4 B3 B4}
01252 @endcode
01253 For all types except 64-bit. */
01254 template<typename _Tp, int n>
01255 inline v_reg<_Tp, n> v_combine_high(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
01256 {
01257     v_reg<_Tp, n> c;
01258     for( int i = 0; i < (n/2); i++ )
01259     {
01260         c.s[i] = a.s[i+(n/2)];
01261         c.s[i+(n/2)] = b.s[i+(n/2)];
01262     }
01263     return c;
01264 }
01265 
01266 /** @brief Combine two vectors from lower and higher parts of two other vectors
01267 
01268 @code{.cpp}
01269 low = cv::v_combine_low(a, b);
01270 high = cv::v_combine_high(a, b);
01271 @endcode */
01272 template<typename _Tp, int n>
01273 inline void v_recombine(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b,
01274                         v_reg<_Tp, n>& low, v_reg<_Tp, n>& high)
01275 {
01276     for( int i = 0; i < (n/2); i++ )
01277     {
01278         low.s[i] = a.s[i];
01279         low.s[i+(n/2)] = b.s[i];
01280         high.s[i] = a.s[i+(n/2)];
01281         high.s[i+(n/2)] = b.s[i+(n/2)];
01282     }
01283 }
01284 
01285 /** @brief Vector extract
01286 
01287 Scheme:
01288 @code
01289   {A1 A2 A3 A4}
01290   {B1 B2 B3 B4}
01291 ========================
01292 shift = 1  {A2 A3 A4 B1}
01293 shift = 2  {A3 A4 B1 B2}
01294 shift = 3  {A4 B1 B2 B3}
01295 @endcode
01296 Restriction: 0 <= shift < nlanes
01297 
01298 Usage:
01299 @code
01300 v_int32x4 a, b, c;
01301 c = v_extract<2>(a, b);
01302 @endcode
01303 For integer types only. */
01304 template<int s, typename _Tp, int n>
01305 inline v_reg<_Tp, n> v_extract(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
01306 {
01307     v_reg<_Tp, n> r;
01308     const int shift = n - s;
01309     int i = 0;
01310     for (; i < shift; ++i)
01311         r.s[i] = a.s[i+s];
01312     for (; i < n; ++i)
01313         r.s[i] = b.s[i-shift];
01314     return r;
01315 }
01316 
01317 /** @brief Round
01318 
01319 Rounds each value. Input type is float vector ==> output type is int vector.*/
01320 template<int n> inline v_reg<int, n> v_round(const v_reg<float, n>& a)
01321 {
01322     v_reg<int, n> c;
01323     for( int i = 0; i < n; i++ )
01324         c.s[i] = cvRound(a.s[i]);
01325     return c;
01326 }
01327 
01328 /** @brief Floor
01329 
01330 Floor each value. Input type is float vector ==> output type is int vector.*/
01331 template<int n> inline v_reg<int, n> v_floor(const v_reg<float, n>& a)
01332 {
01333     v_reg<int, n> c;
01334     for( int i = 0; i < n; i++ )
01335         c.s[i] = cvFloor(a.s[i]);
01336     return c;
01337 }
01338 
01339 /** @brief Ceil
01340 
01341 Ceil each value. Input type is float vector ==> output type is int vector.*/
01342 template<int n> inline v_reg<int, n> v_ceil(const v_reg<float, n>& a)
01343 {
01344     v_reg<int, n> c;
01345     for( int i = 0; i < n; i++ )
01346         c.s[i] = cvCeil(a.s[i]);
01347     return c;
01348 }
01349 
01350 /** @brief Trunc
01351 
01352 Truncate each value. Input type is float vector ==> output type is int vector.*/
01353 template<int n> inline v_reg<int, n> v_trunc(const v_reg<float, n>& a)
01354 {
01355     v_reg<int, n> c;
01356     for( int i = 0; i < n; i++ )
01357         c.s[i] = (int)(a.s[i]);
01358     return c;
01359 }
01360 
01361 /** @overload */
01362 template<int n> inline v_reg<int, n*2> v_round(const v_reg<double, n>& a)
01363 {
01364     v_reg<int, n*2> c;
01365     for( int i = 0; i < n; i++ )
01366     {
01367         c.s[i] = cvRound(a.s[i]);
01368         c.s[i+n] = 0;
01369     }
01370     return c;
01371 }
01372 
01373 /** @overload */
01374 template<int n> inline v_reg<int, n*2> v_floor(const v_reg<double, n>& a)
01375 {
01376     v_reg<int, n> c;
01377     for( int i = 0; i < n; i++ )
01378     {
01379         c.s[i] = cvFloor(a.s[i]);
01380         c.s[i+n] = 0;
01381     }
01382     return c;
01383 }
01384 
01385 /** @overload */
01386 template<int n> inline v_reg<int, n*2> v_ceil(const v_reg<double, n>& a)
01387 {
01388     v_reg<int, n> c;
01389     for( int i = 0; i < n; i++ )
01390     {
01391         c.s[i] = cvCeil(a.s[i]);
01392         c.s[i+n] = 0;
01393     }
01394     return c;
01395 }
01396 
01397 /** @overload */
01398 template<int n> inline v_reg<int, n*2> v_trunc(const v_reg<double, n>& a)
01399 {
01400     v_reg<int, n> c;
01401     for( int i = 0; i < n; i++ )
01402     {
01403         c.s[i] = cvCeil(a.s[i]);
01404         c.s[i+n] = 0;
01405     }
01406     return c;
01407 }
01408 
01409 /** @brief Convert to float
01410 
01411 Supported input type is cv::v_int32x4. */
01412 template<int n> inline v_reg<float, n> v_cvt_f32(const v_reg<int, n>& a)
01413 {
01414     v_reg<float, n> c;
01415     for( int i = 0; i < n; i++ )
01416         c.s[i] = (float)a.s[i];
01417     return c;
01418 }
01419 
01420 /** @brief Convert to double
01421 
01422 Supported input type is cv::v_int32x4. */
01423 template<int n> inline v_reg<double, n> v_cvt_f64(const v_reg<int, n*2>& a)
01424 {
01425     v_reg<double, n> c;
01426     for( int i = 0; i < n; i++ )
01427         c.s[i] = (double)a.s[i];
01428     return c;
01429 }
01430 
01431 /** @brief Convert to double
01432 
01433 Supported input type is cv::v_float32x4. */
01434 template<int n> inline v_reg<double, n> v_cvt_f64(const v_reg<float, n*2>& a)
01435 {
01436     v_reg<double, n> c;
01437     for( int i = 0; i < n; i++ )
01438         c.s[i] = (double)a.s[i];
01439     return c;
01440 }
01441 
01442 /** @brief Transpose 4x4 matrix
01443 
01444 Scheme:
01445 @code
01446 a0  {A1 A2 A3 A4}
01447 a1  {B1 B2 B3 B4}
01448 a2  {C1 C2 C3 C4}
01449 a3  {D1 D2 D3 D4}
01450 ===============
01451 b0  {A1 B1 C1 D1}
01452 b1  {A2 B2 C2 D2}
01453 b2  {A3 B3 C3 D3}
01454 b3  {A4 B4 C4 D4}
01455 @endcode
01456 */
01457 template<typename _Tp>
01458 inline void v_transpose4x4( v_reg<_Tp, 4>& a0, const v_reg<_Tp, 4>& a1,
01459                             const v_reg<_Tp, 4>& a2, const v_reg<_Tp, 4>& a3,
01460                             v_reg<_Tp, 4>& b0, v_reg<_Tp, 4>& b1,
01461                             v_reg<_Tp, 4>& b2, v_reg<_Tp, 4>& b3 )
01462 {
01463     b0 = v_reg<_Tp, 4>(a0.s[0], a1.s[0], a2.s[0], a3.s[0]);
01464     b1 = v_reg<_Tp, 4>(a0.s[1], a1.s[1], a2.s[1], a3.s[1]);
01465     b2 = v_reg<_Tp, 4>(a0.s[2], a1.s[2], a2.s[2], a3.s[2]);
01466     b3 = v_reg<_Tp, 4>(a0.s[3], a1.s[3], a2.s[3], a3.s[3]);
01467 }
01468 
01469 //! @brief Helper macro
01470 //! @ingroup core_hal_intrin_impl
01471 #define OPENCV_HAL_IMPL_C_INIT_ZERO(_Tpvec, _Tp, suffix) \
01472 inline _Tpvec v_setzero_##suffix() { return _Tpvec::zero(); }
01473 
01474 //! @name Init with zero
01475 //! @{
01476 //! @brief Create new vector with zero elements
01477 OPENCV_HAL_IMPL_C_INIT_ZERO(v_uint8x16, uchar, u8)
01478 OPENCV_HAL_IMPL_C_INIT_ZERO(v_int8x16, schar, s8)
01479 OPENCV_HAL_IMPL_C_INIT_ZERO(v_uint16x8, ushort, u16)
01480 OPENCV_HAL_IMPL_C_INIT_ZERO(v_int16x8, short, s16)
01481 OPENCV_HAL_IMPL_C_INIT_ZERO(v_uint32x4, unsigned, u32)
01482 OPENCV_HAL_IMPL_C_INIT_ZERO(v_int32x4, int, s32)
01483 OPENCV_HAL_IMPL_C_INIT_ZERO(v_float32x4, float, f32)
01484 OPENCV_HAL_IMPL_C_INIT_ZERO(v_float64x2, double, f64)
01485 OPENCV_HAL_IMPL_C_INIT_ZERO(v_uint64x2, uint64, u64)
01486 OPENCV_HAL_IMPL_C_INIT_ZERO(v_int64x2, int64, s64)
01487 //! @}
01488 
01489 //! @brief Helper macro
01490 //! @ingroup core_hal_intrin_impl
01491 #define OPENCV_HAL_IMPL_C_INIT_VAL(_Tpvec, _Tp, suffix) \
01492 inline _Tpvec v_setall_##suffix(_Tp val) { return _Tpvec::all(val); }
01493 
01494 //! @name Init with value
01495 //! @{
01496 //! @brief Create new vector with elements set to a specific value
01497 OPENCV_HAL_IMPL_C_INIT_VAL(v_uint8x16, uchar, u8)
01498 OPENCV_HAL_IMPL_C_INIT_VAL(v_int8x16, schar, s8)
01499 OPENCV_HAL_IMPL_C_INIT_VAL(v_uint16x8, ushort, u16)
01500 OPENCV_HAL_IMPL_C_INIT_VAL(v_int16x8, short, s16)
01501 OPENCV_HAL_IMPL_C_INIT_VAL(v_uint32x4, unsigned, u32)
01502 OPENCV_HAL_IMPL_C_INIT_VAL(v_int32x4, int, s32)
01503 OPENCV_HAL_IMPL_C_INIT_VAL(v_float32x4, float, f32)
01504 OPENCV_HAL_IMPL_C_INIT_VAL(v_float64x2, double, f64)
01505 OPENCV_HAL_IMPL_C_INIT_VAL(v_uint64x2, uint64, u64)
01506 OPENCV_HAL_IMPL_C_INIT_VAL(v_int64x2, int64, s64)
01507 //! @}
01508 
01509 //! @brief Helper macro
01510 //! @ingroup core_hal_intrin_impl
01511 #define OPENCV_HAL_IMPL_C_REINTERPRET(_Tpvec, _Tp, suffix) \
01512 template<typename _Tp0, int n0> inline _Tpvec \
01513     v_reinterpret_as_##suffix(const v_reg<_Tp0, n0>& a) \
01514 { return a.template reinterpret_as<_Tp, _Tpvec::nlanes>(); }
01515 
01516 //! @name Reinterpret
01517 //! @{
01518 //! @brief Convert vector to different type without modifying underlying data.
01519 OPENCV_HAL_IMPL_C_REINTERPRET(v_uint8x16, uchar, u8)
01520 OPENCV_HAL_IMPL_C_REINTERPRET(v_int8x16, schar, s8)
01521 OPENCV_HAL_IMPL_C_REINTERPRET(v_uint16x8, ushort, u16)
01522 OPENCV_HAL_IMPL_C_REINTERPRET(v_int16x8, short, s16)
01523 OPENCV_HAL_IMPL_C_REINTERPRET(v_uint32x4, unsigned, u32)
01524 OPENCV_HAL_IMPL_C_REINTERPRET(v_int32x4, int, s32)
01525 OPENCV_HAL_IMPL_C_REINTERPRET(v_float32x4, float, f32)
01526 OPENCV_HAL_IMPL_C_REINTERPRET(v_float64x2, double, f64)
01527 OPENCV_HAL_IMPL_C_REINTERPRET(v_uint64x2, uint64, u64)
01528 OPENCV_HAL_IMPL_C_REINTERPRET(v_int64x2, int64, s64)
01529 //! @}
01530 
01531 //! @brief Helper macro
01532 //! @ingroup core_hal_intrin_impl
01533 #define OPENCV_HAL_IMPL_C_SHIFTL(_Tpvec, _Tp) \
01534 template<int n> inline _Tpvec v_shl(const _Tpvec& a) \
01535 { return a << n; }
01536 
01537 //! @name Left shift
01538 //! @{
01539 //! @brief Shift left
01540 OPENCV_HAL_IMPL_C_SHIFTL(v_uint16x8, ushort)
01541 OPENCV_HAL_IMPL_C_SHIFTL(v_int16x8, short)
01542 OPENCV_HAL_IMPL_C_SHIFTL(v_uint32x4, unsigned)
01543 OPENCV_HAL_IMPL_C_SHIFTL(v_int32x4, int)
01544 OPENCV_HAL_IMPL_C_SHIFTL(v_uint64x2, uint64)
01545 OPENCV_HAL_IMPL_C_SHIFTL(v_int64x2, int64)
01546 //! @}
01547 
01548 //! @brief Helper macro
01549 //! @ingroup core_hal_intrin_impl
01550 #define OPENCV_HAL_IMPL_C_SHIFTR(_Tpvec, _Tp) \
01551 template<int n> inline _Tpvec v_shr(const _Tpvec& a) \
01552 { return a >> n; }
01553 
01554 //! @name Right shift
01555 //! @{
01556 //! @brief Shift right
01557 OPENCV_HAL_IMPL_C_SHIFTR(v_uint16x8, ushort)
01558 OPENCV_HAL_IMPL_C_SHIFTR(v_int16x8, short)
01559 OPENCV_HAL_IMPL_C_SHIFTR(v_uint32x4, unsigned)
01560 OPENCV_HAL_IMPL_C_SHIFTR(v_int32x4, int)
01561 OPENCV_HAL_IMPL_C_SHIFTR(v_uint64x2, uint64)
01562 OPENCV_HAL_IMPL_C_SHIFTR(v_int64x2, int64)
01563 //! @}
01564 
01565 //! @brief Helper macro
01566 //! @ingroup core_hal_intrin_impl
01567 #define OPENCV_HAL_IMPL_C_RSHIFTR(_Tpvec, _Tp) \
01568 template<int n> inline _Tpvec v_rshr(const _Tpvec& a) \
01569 { \
01570     _Tpvec c; \
01571     for( int i = 0; i < _Tpvec::nlanes; i++ ) \
01572         c.s[i] = (_Tp)((a.s[i] + ((_Tp)1 << (n - 1))) >> n); \
01573     return c; \
01574 }
01575 
01576 //! @name Rounding shift
01577 //! @{
01578 //! @brief Rounding shift right
01579 OPENCV_HAL_IMPL_C_RSHIFTR(v_uint16x8, ushort)
01580 OPENCV_HAL_IMPL_C_RSHIFTR(v_int16x8, short)
01581 OPENCV_HAL_IMPL_C_RSHIFTR(v_uint32x4, unsigned)
01582 OPENCV_HAL_IMPL_C_RSHIFTR(v_int32x4, int)
01583 OPENCV_HAL_IMPL_C_RSHIFTR(v_uint64x2, uint64)
01584 OPENCV_HAL_IMPL_C_RSHIFTR(v_int64x2, int64)
01585 //! @}
01586 
01587 //! @brief Helper macro
01588 //! @ingroup core_hal_intrin_impl
01589 #define OPENCV_HAL_IMPL_C_PACK(_Tpvec, _Tpnvec, _Tpn, pack_suffix) \
01590 inline _Tpnvec v_##pack_suffix(const _Tpvec& a, const _Tpvec& b) \
01591 { \
01592     _Tpnvec c; \
01593     for( int i = 0; i < _Tpvec::nlanes; i++ ) \
01594     { \
01595         c.s[i] = saturate_cast<_Tpn>(a.s[i]); \
01596         c.s[i+_Tpvec::nlanes] = saturate_cast<_Tpn>(b.s[i]); \
01597     } \
01598     return c; \
01599 }
01600 
01601 //! @name Pack
01602 //! @{
01603 //! @brief Pack values from two vectors to one
01604 //!
01605 //! Return vector type have twice more elements than input vector types. Variant with _u_ suffix also
01606 //! converts to corresponding unsigned type.
01607 //!
01608 //! - pack: for 16-, 32- and 64-bit integer input types
01609 //! - pack_u: for 16- and 32-bit signed integer input types
01610 OPENCV_HAL_IMPL_C_PACK(v_uint16x8, v_uint8x16, uchar, pack)
01611 OPENCV_HAL_IMPL_C_PACK(v_int16x8, v_int8x16, schar, pack)
01612 OPENCV_HAL_IMPL_C_PACK(v_uint32x4, v_uint16x8, ushort, pack)
01613 OPENCV_HAL_IMPL_C_PACK(v_int32x4, v_int16x8, short, pack)
01614 OPENCV_HAL_IMPL_C_PACK(v_uint64x2, v_uint32x4, unsigned, pack)
01615 OPENCV_HAL_IMPL_C_PACK(v_int64x2, v_int32x4, int, pack)
01616 OPENCV_HAL_IMPL_C_PACK(v_int16x8, v_uint8x16, uchar, pack_u)
01617 OPENCV_HAL_IMPL_C_PACK(v_int32x4, v_uint16x8, ushort, pack_u)
01618 //! @}
01619 
01620 //! @brief Helper macro
01621 //! @ingroup core_hal_intrin_impl
01622 #define OPENCV_HAL_IMPL_C_RSHR_PACK(_Tpvec, _Tp, _Tpnvec, _Tpn, pack_suffix) \
01623 template<int n> inline _Tpnvec v_rshr_##pack_suffix(const _Tpvec& a, const _Tpvec& b) \
01624 { \
01625     _Tpnvec c; \
01626     for( int i = 0; i < _Tpvec::nlanes; i++ ) \
01627     { \
01628         c.s[i] = saturate_cast<_Tpn>((a.s[i] + ((_Tp)1 << (n - 1))) >> n); \
01629         c.s[i+_Tpvec::nlanes] = saturate_cast<_Tpn>((b.s[i] + ((_Tp)1 << (n - 1))) >> n); \
01630     } \
01631     return c; \
01632 }
01633 
01634 //! @name Pack with rounding shift
01635 //! @{
01636 //! @brief Pack values from two vectors to one with rounding shift
01637 //!
01638 //! Values from the input vectors will be shifted right by _n_ bits with rounding, converted to narrower
01639 //! type and returned in the result vector. Variant with _u_ suffix converts to unsigned type.
01640 //!
01641 //! - pack: for 16-, 32- and 64-bit integer input types
01642 //! - pack_u: for 16- and 32-bit signed integer input types
01643 OPENCV_HAL_IMPL_C_RSHR_PACK(v_uint16x8, ushort, v_uint8x16, uchar, pack)
01644 OPENCV_HAL_IMPL_C_RSHR_PACK(v_int16x8, short, v_int8x16, schar, pack)
01645 OPENCV_HAL_IMPL_C_RSHR_PACK(v_uint32x4, unsigned, v_uint16x8, ushort, pack)
01646 OPENCV_HAL_IMPL_C_RSHR_PACK(v_int32x4, int, v_int16x8, short, pack)
01647 OPENCV_HAL_IMPL_C_RSHR_PACK(v_uint64x2, uint64, v_uint32x4, unsigned, pack)
01648 OPENCV_HAL_IMPL_C_RSHR_PACK(v_int64x2, int64, v_int32x4, int, pack)
01649 OPENCV_HAL_IMPL_C_RSHR_PACK(v_int16x8, short, v_uint8x16, uchar, pack_u)
01650 OPENCV_HAL_IMPL_C_RSHR_PACK(v_int32x4, int, v_uint16x8, ushort, pack_u)
01651 //! @}
01652 
01653 //! @brief Helper macro
01654 //! @ingroup core_hal_intrin_impl
01655 #define OPENCV_HAL_IMPL_C_PACK_STORE(_Tpvec, _Tp, _Tpnvec, _Tpn, pack_suffix) \
01656 inline void v_##pack_suffix##_store(_Tpn* ptr, const _Tpvec& a) \
01657 { \
01658     for( int i = 0; i < _Tpvec::nlanes; i++ ) \
01659         ptr[i] = saturate_cast<_Tpn>(a.s[i]); \
01660 }
01661 
01662 //! @name Pack and store
01663 //! @{
01664 //! @brief Store values from the input vector into memory with pack
01665 //!
01666 //! Values will be stored into memory with saturating conversion to narrower type.
01667 //! Variant with _u_ suffix converts to corresponding unsigned type.
01668 //!
01669 //! - pack: for 16-, 32- and 64-bit integer input types
01670 //! - pack_u: for 16- and 32-bit signed integer input types
01671 OPENCV_HAL_IMPL_C_PACK_STORE(v_uint16x8, ushort, v_uint8x16, uchar, pack)
01672 OPENCV_HAL_IMPL_C_PACK_STORE(v_int16x8, short, v_int8x16, schar, pack)
01673 OPENCV_HAL_IMPL_C_PACK_STORE(v_uint32x4, unsigned, v_uint16x8, ushort, pack)
01674 OPENCV_HAL_IMPL_C_PACK_STORE(v_int32x4, int, v_int16x8, short, pack)
01675 OPENCV_HAL_IMPL_C_PACK_STORE(v_uint64x2, uint64, v_uint32x4, unsigned, pack)
01676 OPENCV_HAL_IMPL_C_PACK_STORE(v_int64x2, int64, v_int32x4, int, pack)
01677 OPENCV_HAL_IMPL_C_PACK_STORE(v_int16x8, short, v_uint8x16, uchar, pack_u)
01678 OPENCV_HAL_IMPL_C_PACK_STORE(v_int32x4, int, v_uint16x8, ushort, pack_u)
01679 //! @}
01680 
01681 //! @brief Helper macro
01682 //! @ingroup core_hal_intrin_impl
01683 #define OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(_Tpvec, _Tp, _Tpnvec, _Tpn, pack_suffix) \
01684 template<int n> inline void v_rshr_##pack_suffix##_store(_Tpn* ptr, const _Tpvec& a) \
01685 { \
01686     for( int i = 0; i < _Tpvec::nlanes; i++ ) \
01687         ptr[i] = saturate_cast<_Tpn>((a.s[i] + ((_Tp)1 << (n - 1))) >> n); \
01688 }
01689 
01690 //! @name Pack and store with rounding shift
01691 //! @{
01692 //! @brief Store values from the input vector into memory with pack
01693 //!
01694 //! Values will be shifted _n_ bits right with rounding, converted to narrower type and stored into
01695 //! memory. Variant with _u_ suffix converts to unsigned type.
01696 //!
01697 //! - pack: for 16-, 32- and 64-bit integer input types
01698 //! - pack_u: for 16- and 32-bit signed integer input types
01699 OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(v_uint16x8, ushort, v_uint8x16, uchar, pack)
01700 OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(v_int16x8, short, v_int8x16, schar, pack)
01701 OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(v_uint32x4, unsigned, v_uint16x8, ushort, pack)
01702 OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(v_int32x4, int, v_int16x8, short, pack)
01703 OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(v_uint64x2, uint64, v_uint32x4, unsigned, pack)
01704 OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(v_int64x2, int64, v_int32x4, int, pack)
01705 OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(v_int16x8, short, v_uint8x16, uchar, pack_u)
01706 OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(v_int32x4, int, v_uint16x8, ushort, pack_u)
01707 //! @}
01708 
01709 /** @brief Matrix multiplication
01710 
01711 Scheme:
01712 @code
01713 {A0 A1 A2 A3}   |V0|
01714 {B0 B1 B2 B3}   |V1|
01715 {C0 C1 C2 C3}   |V2|
01716 {D0 D1 D2 D3} x |V3|
01717 ====================
01718 {R0 R1 R2 R3}, where:
01719 R0 = A0V0 + B0V1 + C0V2 + D0V3,
01720 R1 = A1V0 + B1V1 + C1V2 + D1V3
01721 ...
01722 @endcode
01723 */
01724 inline v_float32x4 v_matmul(const v_float32x4& v, const v_float32x4& m0,
01725                             const v_float32x4& m1, const v_float32x4& m2,
01726                             const v_float32x4& m3)
01727 {
01728     return v_float32x4(v.s[0]*m0.s[0] + v.s[1]*m1.s[0] + v.s[2]*m2.s[0] + v.s[3]*m3.s[0],
01729                        v.s[0]*m0.s[1] + v.s[1]*m1.s[1] + v.s[2]*m2.s[1] + v.s[3]*m3.s[1],
01730                        v.s[0]*m0.s[2] + v.s[1]*m1.s[2] + v.s[2]*m2.s[2] + v.s[3]*m3.s[2],
01731                        v.s[0]*m0.s[3] + v.s[1]*m1.s[3] + v.s[2]*m2.s[3] + v.s[3]*m3.s[3]);
01732 }
01733 
01734 //! @}
01735 
01736 }
01737 
01738 #endif
01739