AVIR
High-quality pro image resizing library
 
Loading...
Searching...
No Matches
avir_float8_avx.h
Go to the documentation of this file.
1//$ nobt
2//$ nocpp
3
14
15#ifndef AVIR_FLOAT8_AVX_INCLUDED
16#define AVIR_FLOAT8_AVX_INCLUDED
17
18#include <immintrin.h>
19#include "avir_dil.h"
20
21namespace avir {
22
31
32class float8
33{
34public:
35 float8()
36 {
37 }
38
39 float8( const float8& s )
40 : value( s.value )
41 {
42 }
43
44 float8( const __m256 s )
45 : value( s )
46 {
47 }
48
49 float8( const float s )
50 : value( _mm256_set1_ps( s ))
51 {
52 }
53
54 float8& operator = ( const float8& s )
55 {
56 value = s.value;
57 return( *this );
58 }
59
60 float8& operator = ( const __m256 s )
61 {
62 value = s;
63 return( *this );
64 }
65
66 float8& operator = ( const float s )
67 {
68 value = _mm256_set1_ps( s );
69 return( *this );
70 }
71
72 operator float () const
73 {
74 return( _mm_cvtss_f32( _mm256_extractf128_ps( value, 0 )));
75 }
76
82
83 static float8 load( const float* const p )
84 {
85 return( _mm256_load_ps( p ));
86 }
87
93
94 static float8 loadu( const float* const p )
95 {
96 return( _mm256_loadu_ps( p ));
97 }
98
106
107 static float8 loadu( const float* const p, const int lim )
108 {
109 __m128 lo;
110 __m128 hi;
111
112 if( lim > 4 )
113 {
114 lo = _mm_loadu_ps( p );
115 hi = loadu4( p + 4, lim - 4 );
116 }
117 else
118 {
119 lo = loadu4( p, lim );
120 hi = _mm_setzero_ps();
121 }
122
123 return( _mm256_insertf128_ps( _mm256_castps128_ps256( lo ), hi, 1 ));
124 }
125
131
132 void store( float* const p ) const
133 {
134 _mm256_store_ps( p, value );
135 }
136
142
143 void storeu( float* const p ) const
144 {
145 _mm256_storeu_ps( p, value );
146 }
147
155
156 void storeu( float* p, int lim ) const
157 {
158 __m128 v;
159
160 if( lim > 4 )
161 {
162 _mm_storeu_ps( p, _mm256_extractf128_ps( value, 0 ));
163 v = _mm256_extractf128_ps( value, 1 );
164 p += 4;
165 lim -= 4;
166 }
167 else
168 {
169 v = _mm256_extractf128_ps( value, 0 );
170 }
171
172 if( lim > 2 )
173 {
174 if( lim > 3 )
175 {
176 _mm_storeu_ps( p, v );
177 }
178 else
179 {
180 _mm_storel_pi( (__m64*) p, v );
181 _mm_store_ss( p + 2, _mm_movehl_ps( v, v ));
182 }
183 }
184 else
185 {
186 if( lim == 2 )
187 {
188 _mm_storel_pi( (__m64*) p, v );
189 }
190 else
191 {
192 _mm_store_ss( p, v );
193 }
194 }
195 }
196
197 float8& operator += ( const float8& s )
198 {
199 value = _mm256_add_ps( value, s.value );
200 return( *this );
201 }
202
203 float8& operator -= ( const float8& s )
204 {
205 value = _mm256_sub_ps( value, s.value );
206 return( *this );
207 }
208
209 float8& operator *= ( const float8& s )
210 {
211 value = _mm256_mul_ps( value, s.value );
212 return( *this );
213 }
214
215 float8& operator /= ( const float8& s )
216 {
217 value = _mm256_div_ps( value, s.value );
218 return( *this );
219 }
220
221 float8 operator + ( const float8& s ) const
222 {
223 return( _mm256_add_ps( value, s.value ));
224 }
225
226 float8 operator - ( const float8& s ) const
227 {
228 return( _mm256_sub_ps( value, s.value ));
229 }
230
231 float8 operator * ( const float8& s ) const
232 {
233 return( _mm256_mul_ps( value, s.value ));
234 }
235
236 float8 operator / ( const float8& s ) const
237 {
238 return( _mm256_div_ps( value, s.value ));
239 }
240
244
245 float hadd() const
246 {
247 __m128 v = _mm_add_ps( _mm256_extractf128_ps( value, 0 ),
248 _mm256_extractf128_ps( value, 1 ));
249
250 v = _mm_hadd_ps( v, v );
251 v = _mm_hadd_ps( v, v );
252 return( _mm_cvtss_f32( v ));
253 }
254
262
263 static void addu( float* const p, const float8& v )
264 {
265 ( loadu( p ) + v ).storeu( p );
266 }
267
276
277 static void addu( float* const p, const float8& v, const int lim )
278 {
279 ( loadu( p, lim ) + v ).storeu( p, lim );
280 }
281
282 __m256 value;
284
285private:
293
294 static __m128 loadu4( const float* const p, const int lim )
295 {
296 if( lim > 2 )
297 {
298 if( lim > 3 )
299 {
300 return( _mm_loadu_ps( p ));
301 }
302 else
303 {
304 return( _mm_set_ps( 0.0f, p[ 2 ], p[ 1 ], p[ 0 ]));
305 }
306 }
307 else
308 {
309 if( lim == 2 )
310 {
311 return( _mm_set_ps( 0.0f, 0.0f, p[ 1 ], p[ 0 ]));
312 }
313 else
314 {
315 return( _mm_load_ss( p ));
316 }
317 }
318 }
319};
320
327
328inline float8 round( const float8& v )
329{
330 return( _mm256_round_ps( v.value,
331 ( _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC )));
332}
333
343
344inline float8 clamp( const float8& Value, const float8& minv,
345 const float8& maxv )
346{
347 return( _mm256_min_ps( _mm256_max_ps( Value.value, minv.value ),
348 maxv.value ));
349}
350
356
357} // namespace avir
358
359#endif // AVIR_FLOAT8_AVX_INCLUDED
T clamp(const T &Value, const T minv, const T maxv)
Definition avir.h:121
T round(const T d)
Definition avir.h:104
Inclusion file for de-interleaved image resizing functions.
fpclass_def_dil< float, avir ::float8 > fpclass_float8_dil
Class that can be used as the "fpclass" template parameter of the avir::CImageResizer class to perform de-interleaved processing with the SIMD packed 8-float type (avir::float8).
Definition avir_float8_avx.h:351
Floating-point processing definition and abstraction class for de-interleaved processing.
Definition avir_dil.h:1059
SIMD packed 8-float type.
Definition avir_float8_avx.h:33
void store(float *const p) const
Definition avir_float8_avx.h:132
static float8 loadu(const float *const p, const int lim)
Definition avir_float8_avx.h:107
float hadd() const
Definition avir_float8_avx.h:245
static float8 loadu(const float *const p)
Definition avir_float8_avx.h:94
static void addu(float *const p, const float8 &v)
Definition avir_float8_avx.h:263
static void addu(float *const p, const float8 &v, const int lim)
Definition avir_float8_avx.h:277
__m256 value
Packed value of 8 floats.
Definition avir_float8_avx.h:282
void storeu(float *p, int lim) const
Definition avir_float8_avx.h:156
static float8 load(const float *const p)
Definition avir_float8_avx.h:83
void storeu(float *const p) const
Definition avir_float8_avx.h:143