15 #ifndef AVIR_FLOAT8_AVX_INCLUDED
16 #define AVIR_FLOAT8_AVX_INCLUDED
18 #include <immintrin.h>
50 :
value( _mm256_set1_ps( s ))
60 float8& operator = (
const __m256 s )
66 float8& operator = (
const float s )
68 value = _mm256_set1_ps( s );
72 operator float ()
const
74 return( _mm_cvtss_f32( _mm256_extractf128_ps(
value, 0 )));
85 return( _mm256_load_ps( p ));
96 return( _mm256_loadu_ps( p ));
114 lo = _mm_loadu_ps( p );
115 hi = loadu4( p + 4, lim - 4 );
119 lo = loadu4( p, lim );
120 hi = _mm_setzero_ps();
123 return( _mm256_insertf128_ps( _mm256_castps128_ps256( lo ), hi, 1 ));
134 _mm256_store_ps( p,
value );
145 _mm256_storeu_ps( p,
value );
162 _mm_storeu_ps( p, _mm256_extractf128_ps(
value, 0 ));
163 v = _mm256_extractf128_ps(
value, 1 );
169 v = _mm256_extractf128_ps(
value, 0 );
176 _mm_storeu_ps( p, v );
180 _mm_storel_pi( (__m64*) p, v );
181 _mm_store_ss( p + 2, _mm_movehl_ps( v, v ));
188 _mm_storel_pi( (__m64*) p, v );
192 _mm_store_ss( p, v );
203 float8& operator -= (
const float8& s )
209 float8& operator *= (
const float8& s )
215 float8& operator /= (
const float8& s )
221 float8 operator + (
const float8& s )
const
223 return( _mm256_add_ps(
value, s.value ));
226 float8 operator - (
const float8& s )
const
228 return( _mm256_sub_ps(
value, s.value ));
231 float8 operator * (
const float8& s )
const
233 return( _mm256_mul_ps(
value, s.value ));
236 float8 operator / (
const float8& s )
const
238 return( _mm256_div_ps(
value, s.value ));
247 __m128 v = _mm_add_ps( _mm256_extractf128_ps(
value, 0 ),
248 _mm256_extractf128_ps(
value, 1 ));
250 v = _mm_hadd_ps( v, v );
251 v = _mm_hadd_ps( v, v );
252 return( _mm_cvtss_f32( v ));
277 static void addu(
float*
const p,
const float8& v,
const int lim )
294 static __m128 loadu4(
const float*
const p,
const int lim )
300 return( _mm_loadu_ps( p ));
304 return( _mm_set_ps( 0.0f, p[ 2 ], p[ 1 ], p[ 0 ]));
311 return( _mm_set_ps( 0.0f, 0.0f, p[ 1 ], p[ 0 ]));
315 return( _mm_load_ss( p ));
328 inline float8 round(
const float8& v )
330 return( _mm256_round_ps( v.value,
331 ( _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC )));
344 inline float8 clamp(
const float8& Value,
const float8& minv,
347 return( _mm256_min_ps( _mm256_max_ps( Value.value, minv.value ),
351 typedef fpclass_def_dil< float, avir :: float8 > fpclass_float8_dil;
359 #endif // AVIR_FLOAT8_AVX_INCLUDED
__m256 value
Definition: avir_float8_avx.h:282
float hadd() const
Definition: avir_float8_avx.h:245
static float8 loadu(const float *const p, const int lim)
Definition: avir_float8_avx.h:107
void store(float *const p) const
Definition: avir_float8_avx.h:132
static void addu(float *const p, const float8 &v)
Definition: avir_float8_avx.h:263
void storeu(float *p, int lim) const
Definition: avir_float8_avx.h:156
static float8 loadu(const float *const p)
Definition: avir_float8_avx.h:94
Inclusion file for de-interleaved image resizing functions.
static void addu(float *const p, const float8 &v, const int lim)
Definition: avir_float8_avx.h:277
void storeu(float *const p) const
Definition: avir_float8_avx.h:143
static float8 load(const float *const p)
Definition: avir_float8_avx.h:83
SIMD packed 8-float type.
Definition: avir_float8_avx.h:32