CLancIR( const CLancIR& ); // Copy construction is not supported.

CLancIR& operator = ( const CLancIR& ); // Assignment is not supported.
template< typename Tin, typename Tout >
int resizeImage( const Tin* const SrcBuf, const int SrcWidth,
    const int SrcHeight, Tout* const NewBuf, const int NewWidth,
    const int NewHeight, const int ElCount,
    const CLancIRParams* const aParams = NULL )
if(( SrcWidth < 0 ) | ( SrcHeight < 0 ) | ( NewWidth <= 0 ) |
    ( NewHeight <= 0 ) | ( SrcBuf == NULL ) | ( NewBuf == NULL ) |
    ( (const void*) SrcBuf == (const void*) NewBuf ))
const CLancIRParams& Params = ( aParams != NULL ? *aParams : DefParams );

if( Params.la < 2.0 ) // The Lanczos "a" parameter must be at least 2.

const int OutSLen = NewWidth * ElCount;
const size_t NewScanlineSize = ( Params.NewSSize < 1 ?
    OutSLen : Params.NewSSize );
if(( SrcWidth == 0 ) | ( SrcHeight == 0 ))
{
    for( i = 0; i < NewHeight; i++ )
    {
        memset( op, 0, OutSLen * sizeof( Tout ));
        op += NewScanlineSize;
    }
}

const size_t SrcScanlineSize = ( Params.SrcSSize < 1 ?
    SrcWidth * ElCount : Params.SrcSSize );

double ox = Params.ox;
double oy = Params.oy;
if( Params.kx >= 0.0 )
{
    kx = ( Params.kx == 0.0 ?
        (double) SrcWidth / NewWidth : Params.kx );

    ox += ( kx - 1.0 ) * 0.5;
}

if( Params.ky >= 0.0 )
{
    ky = ( Params.ky == 0.0 ?
        (double) SrcHeight / NewHeight : Params.ky );

    oy += ( ky - 1.0 ) * 0.5;
}
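// Illustrative sketch (not part of lancir.h): with the default Params.kx ==
// Params.ky == 0.0, the scale steps default to the plain size ratios, and the
// 0.5 * ( k - 1 ) term centers the filter on the source grid. The sizes below
// are assumptions chosen only for this standalone check of the arithmetic.
#include <cstdio>

int main()
{
    const int SrcWidth = 1000, NewWidth = 400;      // hypothetical sizes
    const double kx = (double) SrcWidth / NewWidth; // the Params.kx == 0.0 path
    const double ox = 0.0 + ( kx - 1.0 ) * 0.5;     // filter centering offset

    printf( "kx = %.3f, ox = %.3f\n", kx, ox );     // kx = 2.500, ox = 0.750
    return 0;
}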
if( rfv.update( Params.la, ky, ElCount ))

if( rfh0.update( Params.la, kx, ElCount ))

rsv.update( SrcHeight, NewHeight, oy, rfv, spv );
rsh.update( SrcWidth, NewWidth, ox, *rfh );
const size_t FltWidthE = ( rsh.padl + SrcWidth + rsh.padr ) * ElCount;

const double CacheSize = 5500000.0;
const double OpSize = (double) SrcScanlineSize * SrcHeight *
    sizeof( Tin ) + (double) FltWidthE * NewHeight * sizeof( float );

int BatchSize = (int) ( NewHeight * CacheSize / ( OpSize + 1.0 ));

if( BatchSize > NewHeight )
{
    BatchSize = NewHeight;
}
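// Illustrative sketch (not part of lancir.h): the batch-size heuristic keeps
// roughly CacheSize bytes of source data plus intermediate float rows in
// flight per batch. The image sizes and the padding guess below are
// assumptions for the example only.
#include <cstdio>

int main()
{
    const int SrcWidth = 1920, SrcHeight = 1080;    // hypothetical 8-bit RGB
    const int NewWidth = 960, NewHeight = 540, ElCount = 3;

    const double SrcScanlineSize = (double) SrcWidth * ElCount;
    const double FltWidthE = (double) ( SrcWidth + 8 ) * ElCount; // assumed padding
    const double CacheSize = 5500000.0;
    const double OpSize = SrcScanlineSize * SrcHeight * sizeof( unsigned char ) +
        FltWidthE * NewHeight * sizeof( float );

    int BatchSize = (int) ( NewHeight * CacheSize / ( OpSize + 1.0 ));

    if( BatchSize > NewHeight )
    {
        BatchSize = NewHeight;
    }

    printf( "BatchSize = %d of %d rows\n", BatchSize, NewHeight ); // ~158 rows
    return 0;
}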
const int svs = ( rsv.padl + SrcHeight + rsv.padr ) * ElCount;

float* const pspv0 = spv0;

static const bool IsInFloat = ( (Tin) 0.25f != 0 );
static const bool IsOutFloat = ( (Tout) 0.25f != 0 );
static const bool IsUnityMul = ( IsInFloat && IsOutFloat ) ||
    ( IsInFloat == IsOutFloat && sizeof( Tin ) == sizeof( Tout ));

const int Clamp = ( sizeof( Tout ) == 1 ? 255 : 65535 );
const float OutMul = ( IsOutFloat ? 1.0f : (float) Clamp ) /
    ( IsInFloat ? 1 : ( sizeof( Tin ) == 1 ? 255 : 65535 ));
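// Illustrative sketch (not part of lancir.h): what OutMul evaluates to for a
// few pixel-type combinations, following the expression above. The helper
// below is hypothetical and uses plain arithmetic only.
#include <cstdio>

static float outMulFor( bool IsInFloat, bool IsOutFloat, int InClamp, int OutClamp )
{
    return ( IsOutFloat ? 1.0f : (float) OutClamp ) /
        ( IsInFloat ? 1 : InClamp );
}

int main()
{
    printf( "uint8  -> uint8 : %g\n", outMulFor( false, false, 255, 255 ));   // 1
    printf( "uint8  -> float : %g\n", outMulFor( false, true, 255, 0 ));      // ~0.00392
    printf( "float  -> uint8 : %g\n", outMulFor( true, false, 0, 255 ));      // 255
    printf( "uint8  -> uint16: %g\n", outMulFor( false, false, 255, 65535 )); // 257
    return 0;
}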
const int bc = ( bl > BatchSize ? BatchSize : bl );

int kl = rfv.KernelLen;
const Tin* ip = SrcBuf;

const int so = (int) rpv[ 0 ].so;
float* const sp = spv + so * ElCount;

int cc = (int) rpv[ bc - 1 ].so - so + kl;

const int socc = so + cc;
const int spe = rsv.padl + SrcHeight;

if( socc <= rsv.padl )

ip += SrcHeight * SrcScanlineSize;

ip += ( so - rsv.padl ) * SrcScanlineSize;
// Vertical pass: copy source columns and resize them, dispatched by ElCount.

for( i = 0; i < SrcWidth; i++ ) // ElCount == 1

for( i = 0; i < SrcWidth; i++ ) // ElCount == 2
{
    copyScanline2v( ip, SrcScanlineSize, sp, cc, rl, rr );
    resize2< false >( NULL, op, FltWidthE, rpv, kl, bc );
}

for( i = 0; i < SrcWidth; i++ ) // ElCount == 3
{
    copyScanline3v( ip, SrcScanlineSize, sp, cc, rl, rr );
    resize3< false >( NULL, op, FltWidthE, rpv, kl, bc );
}

for( i = 0; i < SrcWidth; i++ ) // ElCount == 4
{
    copyScanline4v( ip, SrcScanlineSize, sp, cc, rl, rr );
    resize4< false >( NULL, op, FltWidthE, rpv, kl, bc );
}
// Horizontal pass over the current batch of vertically-resized rows.

kl = rfh -> KernelLen;

for( i = 0; i < bc; i++ ) // ElCount == 1
{
    // ... output conversion call, continued:
        OutSLen, Clamp, OutMul );

    opn += NewScanlineSize;
}

for( i = 0; i < bc; i++ ) // ElCount == 2
{
    padScanline2h( ipf, rsh, SrcWidth );
    resize2< true >( ipf, spv, 2, rsh.pos, kl, NewWidth );
    // ... output conversion call, continued:
        OutSLen, Clamp, OutMul );

    opn += NewScanlineSize;
}

for( i = 0; i < bc; i++ ) // ElCount == 3
{
    padScanline3h( ipf, rsh, SrcWidth );
    resize3< true >( ipf, spv, 3, rsh.pos, kl, NewWidth );
    // ... output conversion call, continued:
        OutSLen, Clamp, OutMul );

    opn += NewScanlineSize;
}

for( i = 0; i < bc; i++ ) // ElCount == 4
{
    padScanline4h( ipf, rsh, SrcWidth );
    resize4< true >( ipf, spv, 4, rsh.pos, kl, NewWidth );
    // ... output conversion call, continued:
        OutSLen, Clamp, OutMul );

    opn += NewScanlineSize;
}
template< typename Tin, typename Tout >
int resizeImage( const Tin* const SrcBuf, const int SrcWidth,
    const int SrcHeight, const int SrcSSize, Tout* const NewBuf,
    const int NewWidth, const int NewHeight, const int NewSSize,
    const int ElCount, const double kx0 = 0.0, const double ky0 = 0.0,
    double ox = 0.0, double oy = 0.0 )
{
    const CLancIRParams Params( SrcSSize, NewSSize, kx0, ky0, ox, oy );

    return( resizeImage( SrcBuf, SrcWidth, SrcHeight, NewBuf, NewWidth,
        NewHeight, ElCount, &Params ));
}
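// Usage sketch (not part of lancir.h): resizing an interleaved 8-bit RGB
// buffer with the convenience overload above. The header name, the
// avir::CLancIR spelling and the buffer sizes are assumptions; only the
// parameter order is taken from the overload shown.
#include "lancir.h" // assumed header name
#include <vector>

void exampleResize()
{
    const int sw = 640, sh = 480;   // hypothetical source size
    const int dw = 320, dh = 240;   // hypothetical destination size

    std::vector< unsigned char > src( (size_t) sw * sh * 3 );
    std::vector< unsigned char > dst( (size_t) dw * dh * 3 );

    avir::CLancIR ir;

    // SrcSSize == 0 and NewSSize == 0 select tightly-packed scanlines.
    ir.resizeImage( src.data(), sw, sh, 0, dst.data(), dw, dh, 0, 3 );
}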
template< typename Tb, typename Tl >
static void reallocBuf( Tb*& buf0, Tb*& buf, Tl& len, Tl newlen )
{
    // ...
    buf0 = new Tb[ newlen ];
    // ...
}

template< typename Tb, typename Tl >
static void reallocBuf( Tb*& buf, Tl& len, Tl newlen )
{
    // ...
    buf = new Tb[ newlen ];
    // ...
}

class CResizeScanline;

friend class CResizeScanline;
bool update( const double la0, const double k0, const int ElCount0 )

if( la0 == la && k0 == k && ElCount0 == ElCount )

const double NormFreq = ( k0 <= 1.0 ? 1.0 : 1.0 / k0 );
Freq = 3.1415926535897932 * NormFreq;
Len2 = la0 / NormFreq;

const int Frac = (int) ( x * FracCount + 0.5 );
// Recurrence-based sine generator: produces sin( ph ), sin( ph + si ),
// sin( ph + 2 * si ), ... via s[ n + 1 ] = 2 * cos( si ) * s[ n ] - s[ n - 1 ].
: svalue1( sin( ph ))
, svalue2( sin( ph - si ))
, sincr( 2.0 * cos( si ))

const double res = svalue1;

svalue1 = sincr * res - svalue2;
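// Illustrative sketch (not part of lancir.h): the oscillator above follows
// from sin( a + si ) + sin( a - si ) == 2 * cos( si ) * sin( a ). A quick
// standalone check of the recurrence against sin() directly; the start phase
// and step are arbitrary example values.
#include <math.h>
#include <stdio.h>

int main()
{
    const double ph = 0.3, si = 0.125;
    double svalue1 = sin( ph );
    double svalue2 = sin( ph - si );
    const double sincr = 2.0 * cos( si );

    double maxerr = 0.0;

    for( int n = 0; n < 100; n++ )
    {
        const double res = svalue1;         // equals sin( ph + n * si )
        maxerr = fmax( maxerr, fabs( res - sin( ph + n * si )));

        svalue1 = sincr * res - svalue2;    // advance the recurrence
        svalue2 = res;
    }

    printf( "max abs error = %g\n", maxerr ); // on the order of 1e-15
    return 0;
}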
if( t + FracDelay < -Len2 )

int IsZeroX = ( fabs( FracDelay - 1.0 ) < 2.3e-13 );
int mt = 0 - IsZeroX;
IsZeroX |= ( fabs( FracDelay ) < 2.3e-13 );

ut = t + 1 + FracDelay;

t = (int) ( op - op0 + 1 );

*op0 = (float) ( *op0 * s );

const float* ip = p + kl - 1;
float* op = p + ( kl - 1 ) * erp;

// The same coefficient fetch appears in each channel-count branch:
const float v = *ip;

const float v = *ip;

const float v = *ip;
class CResizeScanline

void update( const int SrcLen0, const int DstLen0, const double o0,
    CResizeFilters& rf, float* const sp )

const int fl2m1 = rf.fl2 - 1;
padl = fl2m1 - (int) floor( o0 );

const double k = rf.k;

const int DstLen_m1 = DstLen0 - 1;
const double oe = o0 + k * DstLen_m1;
const int ie = (int) floor( oe );

padr = ie + rf.fl2 + 1 - SrcLen0;

const intptr_t ElCountF = rf.ElCount * sizeof( float );
const int so = padl - fl2m1;

for( i = 0; i < DstLen_m1; i++ )
{
    const double ox = o0 + k * i;
    const int ix = (int) floor( ox );

    rp -> spo = (intptr_t) sp + rpso * ElCountF;
}

rp -> spo = (intptr_t) sp + rpso * ElCountF;
const intptr_t ElCountF = rf.ElCount * sizeof( float );

for( i = 0; i < DstLen; i++ )
{
    rp[ i ].spo = (intptr_t) sp + rp[ i ].so * ElCountF;
}
template< typename T >
static void copyScanline1v( const T* ip, const size_t ipinc, float* op,
    int cc, int repl, int repr )

v0 = (float) ip[ 0 ];

} while( --repl != 0 );

op[ 0 ] = (float) ip[ 0 ];

const T* const ipe = ip - ipinc;
v0 = (float) ipe[ 0 ];

} while( --repr != 0 );
template< typename T >
static void copyScanline2v( const T* ip, const size_t ipinc, float* op,
    int cc, int repl, int repr )

v0 = (float) ip[ 0 ];
v1 = (float) ip[ 1 ];

} while( --repl != 0 );

op[ 0 ] = (float) ip[ 0 ];
op[ 1 ] = (float) ip[ 1 ];

const T* const ipe = ip - ipinc;
v0 = (float) ipe[ 0 ];
v1 = (float) ipe[ 1 ];

} while( --repr != 0 );

template< typename T >
static void copyScanline3v( const T* ip, const size_t ipinc, float* op,
    int cc, int repl, int repr )

v0 = (float) ip[ 0 ];
v1 = (float) ip[ 1 ];
v2 = (float) ip[ 2 ];

} while( --repl != 0 );

op[ 0 ] = (float) ip[ 0 ];
op[ 1 ] = (float) ip[ 1 ];
op[ 2 ] = (float) ip[ 2 ];

const T* const ipe = ip - ipinc;
v0 = (float) ipe[ 0 ];
v1 = (float) ipe[ 1 ];
v2 = (float) ipe[ 2 ];

} while( --repr != 0 );

template< typename T >
static void copyScanline4v( const T* ip, const size_t ipinc, float* op,
    int cc, int repl, int repr )

float v0, v1, v2, v3;

v0 = (float) ip[ 0 ];
v1 = (float) ip[ 1 ];
v2 = (float) ip[ 2 ];
v3 = (float) ip[ 3 ];

} while( --repl != 0 );

op[ 0 ] = (float) ip[ 0 ];
op[ 1 ] = (float) ip[ 1 ];
op[ 2 ] = (float) ip[ 2 ];
op[ 3 ] = (float) ip[ 3 ];

const T* const ipe = ip - ipinc;
v0 = (float) ipe[ 0 ];
v1 = (float) ipe[ 1 ];
v2 = (float) ipe[ 2 ];
v3 = (float) ipe[ 3 ];

} while( --repr != 0 );
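// Illustrative sketch (not part of lancir.h): the copyScanline*v helpers read
// one source column and replicate its first and last pixels "repl" and "repr"
// times, so the vertical filter never reads outside the image. A minimal
// single-channel version of that idea, written independently of the real
// helpers:
#include <cstddef>
#include <vector>

static void padColumn( const unsigned char* ip, size_t ipinc, int rows,
    int repl, int repr, std::vector< float >& col )
{
    col.clear();

    for( int i = 0; i < repl; i++ )
    {
        col.push_back( (float) ip[ 0 ]);                    // replicate top pixel
    }

    for( int i = 0; i < rows; i++ )
    {
        col.push_back( (float) ip[ (size_t) i * ipinc ]);   // copy the column
    }

    const unsigned char last = ip[ (size_t) ( rows - 1 ) * ipinc ];

    for( int i = 0; i < repr; i++ )
    {
        col.push_back( (float) last );                      // replicate bottom pixel
    }
}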
const float* ip = op + rs.padl;

for( i = 0; i < rs.padl; i++ )

for( i = 0; i < rs.padr; i++ )

static void padScanline2h( float* op, CResizeScanline& rs, const int l )

const float* ip = op + rs.padl * 2;

for( i = 0; i < rs.padl; i++ )

const int lc = l * 2;

for( i = 0; i < rs.padr; i++ )

static void padScanline3h( float* op, CResizeScanline& rs, const int l )

const float* ip = op + rs.padl * 3;

for( i = 0; i < rs.padl; i++ )

const int lc = l * 3;

for( i = 0; i < rs.padr; i++ )

static void padScanline4h( float* op, CResizeScanline& rs, const int l )

const float* ip = op + rs.padl * 4;

for( i = 0; i < rs.padl; i++ )

const int lc = l * 4;

for( i = 0; i < rs.padr; i++ )
static int roundclamp( const float v, const int Clamp )
{
    const int vr = (int) ( v + 0.5f );

    return( vr > Clamp ? Clamp : vr );
}
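// Illustrative check (not part of lancir.h): the round-and-clamp used by the
// scalar output path. Note that this helper does not clamp negative inputs to
// zero; the SIMD output paths below handle the lower bound with an explicit
// min/max pair. The demo function is hypothetical and mirrors the code above.
#include <cstdio>

static int roundclampDemo( const float v, const int Clamp )
{
    const int vr = (int) ( v + 0.5f );
    return( vr > Clamp ? Clamp : vr );
}

int main()
{
    printf( "%d %d %d\n",
        roundclampDemo( 254.4f, 255 ),   // 254
        roundclampDemo( 254.6f, 255 ),   // 255
        roundclampDemo( 300.0f, 255 ));  // 255
    return 0;
}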
template< bool IsOutFloat, bool IsUnityMul, typename T >
static void copyOutput( const float* ip, T* op, const int l,
    const int Clamp, const float OutMul )
if( sizeof( op[ 0 ]) == sizeof( ip[ 0 ]))
{
    memcpy( op, ip, l * sizeof( op[ 0 ]));
}

op[ 0 ] = (T) ip[ 0 ];
op[ 1 ] = (T) ip[ 1 ];
op[ 2 ] = (T) ip[ 2 ];
op[ 3 ] = (T) ip[ 3 ];

bool DoScalar = true;

if( sizeof( op[ 0 ]) == sizeof( ip[ 0 ]))

#if defined( LANCIR_SSE2 )

const __m128 om = _mm_set1_ps( OutMul );

_mm_storeu_ps( (float*) op, _mm_mul_ps( _mm_load_ps( ip ), om ));

#elif defined( LANCIR_NEON )

const float32x4_t om = vdupq_n_f32( OutMul );

vst1q_f32( (float*) op, vmulq_f32( vld1q_f32( ip ), om ));

#endif

op[ 0 ] = (T) ( ip[ 0 ] * OutMul );
op[ 1 ] = (T) ( ip[ 1 ] * OutMul );
op[ 2 ] = (T) ( ip[ 2 ] * OutMul );
op[ 3 ] = (T) ( ip[ 3 ] * OutMul );

*op = (T) ( *ip * OutMul );
#if defined( LANCIR_SSE2 )

const __m128 minv = _mm_setzero_ps();
const __m128 maxv = _mm_set1_ps( (float) Clamp );
const __m128 om = _mm_set1_ps( OutMul );

unsigned int prevrm = _MM_GET_ROUNDING_MODE();
_MM_SET_ROUNDING_MODE( _MM_ROUND_NEAREST );

if( sizeof( op[ 0 ]) == 4 )
{
    const __m128 v = _mm_load_ps( ip );
    const __m128 cv = _mm_max_ps( _mm_min_ps(
        ( IsUnityMul ? v : _mm_mul_ps( v, om )), maxv ), minv );

    _mm_storeu_si128( (__m128i*) op, _mm_cvtps_epi32( cv ));
}

if( sizeof( op[ 0 ]) == 2 )
{
    const __m128 v = _mm_load_ps( ip );
    const __m128 cv = _mm_max_ps( _mm_min_ps(
        ( IsUnityMul ? v : _mm_mul_ps( v, om )), maxv ), minv );

    const __m128i v32 = _mm_cvtps_epi32( cv );

    // Shuffle index 0 | 2 << 2 picks elements 0 and 2: this gathers the low
    // 16 bits of each 32-bit lane into the low 64 bits of the register.
    const __m128i v16s = _mm_shufflehi_epi16(
        _mm_shufflelo_epi16( v32, 0 | 2 << 2 ), 0 | 2 << 2 );

    const __m128i v16 = _mm_shuffle_epi32( v16s, 0 | 2 << 2 );

    _mm_store_si128( &tmp, v16 );
    memcpy( op, &tmp, 8 );
}

// 8-bit output:
{
    const __m128 v = _mm_load_ps( ip );
    const __m128 cv = _mm_max_ps( _mm_min_ps(
        ( IsUnityMul ? v : _mm_mul_ps( v, om )), maxv ), minv );

    const __m128i v32 = _mm_cvtps_epi32( cv );
    const __m128i v16s = _mm_shufflehi_epi16(
        _mm_shufflelo_epi16( v32, 0 | 2 << 2 ), 0 | 2 << 2 );

    const __m128i v16 = _mm_shuffle_epi32( v16s, 0 | 2 << 2 );
    const __m128i v8 = _mm_packus_epi16( v16, v16 );

    *(int*) op = _mm_cvtsi128_si32( v8 );
}

_MM_SET_ROUNDING_MODE( prevrm );
#elif defined( LANCIR_NEON )

const float32x4_t minv = vdupq_n_f32( 0.0f );
const float32x4_t maxv = vdupq_n_f32( (float) Clamp );
const float32x4_t om = vdupq_n_f32( OutMul );
const float32x4_t v05 = vdupq_n_f32( 0.5f );

if( sizeof( op[ 0 ]) == 4 )
{
    const float32x4_t v = vld1q_f32( ip );
    const float32x4_t cv = vmaxq_f32( vminq_f32(
        ( IsUnityMul ? v : vmulq_f32( v, om )), maxv ), minv );

    vst1q_u32( (unsigned int*) op, vcvtq_u32_f32( vaddq_f32(
        cv, v05 )));
}

if( sizeof( op[ 0 ]) == 2 )
{
    const float32x4_t v = vld1q_f32( ip );
    const float32x4_t cv = vmaxq_f32( vminq_f32(
        ( IsUnityMul ? v : vmulq_f32( v, om )), maxv ), minv );

    const uint32x4_t v32 = vcvtq_u32_f32( vaddq_f32( cv, v05 ));
    const uint16x4_t v16 = vmovn_u32( v32 );

    vst1_u16( (unsigned short*) op, v16 );
}

// 8-bit output:
{
    const float32x4_t v = vld1q_f32( ip );
    const float32x4_t cv = vmaxq_f32( vminq_f32(
        ( IsUnityMul ? v : vmulq_f32( v, om )), maxv ), minv );

    const uint32x4_t v32 = vcvtq_u32_f32( vaddq_f32( cv, v05 ));
    const uint16x4_t v16 = vmovn_u32( v32 );
    const uint8x8_t v8 = vmovn_u16( vcombine_u16( v16, v16 ));

    *(unsigned int*) op = vget_lane_u32( (uint32x2_t) v8, 0 );
}

// Scalar conversion path:

op[ 0 ] = (T) roundclamp( ip[ 0 ] * OutMul, Clamp );
op[ 1 ] = (T) roundclamp( ip[ 1 ] * OutMul, Clamp );
op[ 2 ] = (T) roundclamp( ip[ 2 ] * OutMul, Clamp );
op[ 3 ] = (T) roundclamp( ip[ 3 ] * OutMul, Clamp );
#define LANCIR_LF_PRE \
    const CResizePos* const rpe = rp + DstLen; \
    while( rp != rpe ) \
    const float* flt = rp -> flt; \
    ip = (const float*) ( (intptr_t) sp + rp -> spo ); \
    ip = (const float*) rp -> spo; \

#define LANCIR_LF_POST \
template< bool UseSP >
static void resize1( const float* const sp, float* op, const size_t opinc,
    const CResizePos* rp, const int kl, const int DstLen )
{
    const int ci = kl >> 2;

    if(( kl & 3 ) == 0 )

#if defined( LANCIR_SSE2 )

    __m128 sum = _mm_mul_ps( _mm_load_ps( flt ), _mm_loadu_ps( ip ));

    sum = _mm_add_ps( sum, _mm_mul_ps( _mm_load_ps( flt ),
        _mm_loadu_ps( ip )));

    sum = _mm_add_ps( sum, _mm_movehl_ps( sum, sum ));

    _mm_store_ss( op, _mm_add_ss( sum,
        _mm_shuffle_ps( sum, sum, 1 )));

#elif defined( LANCIR_NEON )

    float32x4_t sum = vmulq_f32( vld1q_f32( flt ), vld1q_f32( ip ));

    sum = vmlaq_f32( sum, vld1q_f32( flt ), vld1q_f32( ip ));

#if defined( LANCIR_ARM32 )
    const float32x2_t sum2 = vadd_f32( vget_high_f32( sum ),
        vget_low_f32( sum ));

    op[ 0 ] = vget_lane_f32( sum2, 0 ) + vget_lane_f32( sum2, 1 );

    op[ 0 ] = vaddvq_f32( sum ); // non-ARM32 (AArch64) path

    // Scalar fallback:

    float sum0 = flt[ 0 ] * ip[ 0 ];
    float sum1 = flt[ 1 ] * ip[ 1 ];
    float sum2 = flt[ 2 ] * ip[ 2 ];
    float sum3 = flt[ 3 ] * ip[ 3 ];

    sum0 += flt[ 0 ] * ip[ 0 ];
    sum1 += flt[ 1 ] * ip[ 1 ];
    sum2 += flt[ 2 ] * ip[ 2 ];
    sum3 += flt[ 3 ] * ip[ 3 ];

    op[ 0 ] = ( sum0 + sum1 ) + ( sum2 + sum3 );

    // Variant for kernel lengths that are not a multiple of 4 (remainder 2):

#if defined( LANCIR_SSE2 )

    __m128 sum = _mm_mul_ps( _mm_load_ps( flt ), _mm_loadu_ps( ip ));

    sum = _mm_add_ps( sum, _mm_mul_ps( _mm_load_ps( flt ),
        _mm_loadu_ps( ip )));

    sum = _mm_add_ps( sum, _mm_movehl_ps( sum, sum ));

    const __m128 sum2 = _mm_mul_ps( _mm_loadu_ps( flt + 2 ),
        _mm_loadu_ps( ip + 2 ));

    sum = _mm_add_ps( sum, _mm_movehl_ps( sum2, sum2 ));

    _mm_store_ss( op, _mm_add_ss( sum,
        _mm_shuffle_ps( sum, sum, 1 )));

#elif defined( LANCIR_NEON )

    float32x4_t sum = vmulq_f32( vld1q_f32( flt ), vld1q_f32( ip ));

    sum = vmlaq_f32( sum, vld1q_f32( flt ), vld1q_f32( ip ));

    float32x2_t sum2 = vadd_f32( vget_high_f32( sum ),
        vget_low_f32( sum ));

    sum2 = vmla_f32( sum2, vld1_f32( flt + 4 ), vld1_f32( ip + 4 ));

#if defined( LANCIR_ARM32 )
    op[ 0 ] = vget_lane_f32( sum2, 0 ) + vget_lane_f32( sum2, 1 );

    op[ 0 ] = vaddv_f32( sum2 ); // non-ARM32 (AArch64) path

    // Scalar fallback:

    float sum0 = flt[ 0 ] * ip[ 0 ];
    float sum1 = flt[ 1 ] * ip[ 1 ];
    float sum2 = flt[ 2 ] * ip[ 2 ];
    float sum3 = flt[ 3 ] * ip[ 3 ];

    sum0 += flt[ 0 ] * ip[ 0 ];
    sum1 += flt[ 1 ] * ip[ 1 ];
    sum2 += flt[ 2 ] * ip[ 2 ];
    sum3 += flt[ 3 ] * ip[ 3 ];

    op[ 0 ] = ( sum0 + sum1 ) + ( sum2 + sum3 ) +
        flt[ 4 ] * ip[ 4 ] + flt[ 5 ] * ip[ 5 ];
template< bool UseSP >
static void resize2( const float* const sp, float* op, const size_t opinc,
    const CResizePos* rp, const int kl, const int DstLen )
{
#if LANCIR_ALIGN > 4
    const int ci = kl >> 2;
    const int cir = kl & 3;
#else // LANCIR_ALIGN > 4
    const int ci = kl >> 1;
#endif // LANCIR_ALIGN > 4

#if defined( LANCIR_AVX )

    __m256 sum = _mm256_mul_ps( _mm256_load_ps( flt ),
        _mm256_loadu_ps( ip ));

    sum = _mm256_add_ps( sum, _mm256_mul_ps( _mm256_load_ps( flt ),
        _mm256_loadu_ps( ip )));

    __m128 res = _mm_add_ps( _mm256_extractf128_ps( sum, 0 ),
        _mm256_extractf128_ps( sum, 1 ));

    res = _mm_add_ps( res, _mm_mul_ps( _mm_load_ps( flt + 8 ),
        _mm_loadu_ps( ip + 8 )));

    res = _mm_add_ps( res, _mm_movehl_ps( res, res ));

    _mm_store_ss( op, res );
    _mm_store_ss( op + 1, _mm_shuffle_ps( res, res, 1 ));

#elif defined( LANCIR_SSE2 )

    __m128 sumA = _mm_mul_ps( _mm_load_ps( flt ), _mm_loadu_ps( ip ));
    __m128 sumB = _mm_mul_ps( _mm_load_ps( flt + 4 ),
        _mm_loadu_ps( ip + 4 ));

    sumA = _mm_add_ps( sumA, _mm_mul_ps( _mm_load_ps( flt ),
        _mm_loadu_ps( ip )));

    sumB = _mm_add_ps( sumB, _mm_mul_ps( _mm_load_ps( flt + 4 ),
        _mm_loadu_ps( ip + 4 )));

    sumA = _mm_add_ps( sumA, sumB );

    sumA = _mm_add_ps( sumA, _mm_mul_ps( _mm_load_ps( flt + 8 ),
        _mm_loadu_ps( ip + 8 )));

    sumA = _mm_add_ps( sumA, _mm_movehl_ps( sumA, sumA ));

    _mm_store_ss( op, sumA );
    _mm_store_ss( op + 1, _mm_shuffle_ps( sumA, sumA, 1 ));

#elif defined( LANCIR_NEON )

    float32x4_t sumA = vmulq_f32( vld1q_f32( flt ), vld1q_f32( ip ));
    float32x4_t sumB = vmulq_f32( vld1q_f32( flt + 4 ),
        vld1q_f32( ip + 4 ));

    sumA = vmlaq_f32( sumA, vld1q_f32( flt ), vld1q_f32( ip ));
    sumB = vmlaq_f32( sumB, vld1q_f32( flt + 4 ),
        vld1q_f32( ip + 4 ));

    sumA = vaddq_f32( sumA, sumB );

    sumA = vmlaq_f32( sumA, vld1q_f32( flt + 8 ),
        vld1q_f32( ip + 8 ));

    vst1_f32( op, vadd_f32( vget_high_f32( sumA ), vget_low_f32( sumA )));

    // Scalar fallback:

    const float xx = flt[ 0 ];
    const float xx2 = flt[ 1 ];
    float sum0 = xx * ip[ 0 ];
    float sum1 = xx * ip[ 1 ];
    float sum2 = xx2 * ip[ 2 ];
    float sum3 = xx2 * ip[ 3 ];

    const float xx = flt[ 0 ];
    const float xx2 = flt[ 1 ];
    sum0 += xx * ip[ 0 ];
    sum1 += xx * ip[ 1 ];
    sum2 += xx2 * ip[ 2 ];
    sum3 += xx2 * ip[ 3 ];

    op[ 0 ] = sum0 + sum2;
    op[ 1 ] = sum1 + sum3;
template< bool UseSP >
static void resize3( const float* const sp, float* op, const size_t opinc,
    const CResizePos* rp, const int kl, const int DstLen )
{
#if LANCIR_ALIGN > 4

    const int ci = kl >> 2;
    const int cir = kl & 3;

#if defined( LANCIR_AVX )

    __m128 sumA = _mm_mul_ps( _mm_load_ps( flt ), _mm_loadu_ps( ip ));
    __m256 sumB = _mm256_mul_ps( _mm256_loadu_ps( flt + 4 ),
        _mm256_loadu_ps( ip + 4 ));

    sumA = _mm_add_ps( sumA, _mm_mul_ps( _mm_load_ps( flt ),
        _mm_loadu_ps( ip )));

    sumB = _mm256_add_ps( sumB, _mm256_mul_ps(
        _mm256_loadu_ps( flt + 4 ), _mm256_loadu_ps( ip + 4 )));

    sumA = _mm_add_ps( sumA, _mm_mul_ps( _mm_load_ps( flt + 12 ),
        _mm_loadu_ps( ip + 12 )));

    _mm_storeu_ps( res, sumA );

    float o0 = res[ 0 ] + res[ 3 ];
    float o1 = res[ 1 ];
    float o2 = res[ 2 ];

    _mm256_storeu_ps( res + 4, sumB );

#elif defined( LANCIR_SSE2 )

    __m128 sumA = _mm_mul_ps( _mm_load_ps( flt ), _mm_loadu_ps( ip ));
    __m128 sumB = _mm_mul_ps( _mm_load_ps( flt + 4 ),
        _mm_loadu_ps( ip + 4 ));

    __m128 sumC = _mm_mul_ps( _mm_load_ps( flt + 8 ),
        _mm_loadu_ps( ip + 8 ));

    sumA = _mm_add_ps( sumA, _mm_mul_ps( _mm_load_ps( flt ),
        _mm_loadu_ps( ip )));

    sumB = _mm_add_ps( sumB, _mm_mul_ps( _mm_load_ps( flt + 4 ),
        _mm_loadu_ps( ip + 4 )));

    sumC = _mm_add_ps( sumC, _mm_mul_ps( _mm_load_ps( flt + 8 ),
        _mm_loadu_ps( ip + 8 )));

    sumA = _mm_add_ps( sumA, _mm_mul_ps( _mm_load_ps( flt + 12 ),
        _mm_loadu_ps( ip + 12 )));

    _mm_storeu_ps( res, sumA );
    _mm_storeu_ps( res + 4, sumB );

    float o0 = res[ 0 ] + res[ 3 ];
    float o1 = res[ 1 ] + res[ 4 ];
    float o2 = res[ 2 ] + res[ 5 ];

    _mm_storeu_ps( res + 8, sumC );

#elif defined( LANCIR_NEON )

    float32x4_t sumA = vmulq_f32( vld1q_f32( flt ), vld1q_f32( ip ));
    float32x4_t sumB = vmulq_f32( vld1q_f32( flt + 4 ),
        vld1q_f32( ip + 4 ));

    float32x4_t sumC = vmulq_f32( vld1q_f32( flt + 8 ),
        vld1q_f32( ip + 8 ));

    sumA = vmlaq_f32( sumA, vld1q_f32( flt ), vld1q_f32( ip ));
    sumB = vmlaq_f32( sumB, vld1q_f32( flt + 4 ),
        vld1q_f32( ip + 4 ));

    sumC = vmlaq_f32( sumC, vld1q_f32( flt + 8 ),
        vld1q_f32( ip + 8 ));

    sumA = vmlaq_f32( sumA, vld1q_f32( flt + 12 ),
        vld1q_f32( ip + 12 ));

    vst1q_f32( res, sumA );
    vst1q_f32( res + 4, sumB );

    float o0 = res[ 0 ] + res[ 3 ];
    float o1 = res[ 1 ] + res[ 4 ];
    float o2 = res[ 2 ] + res[ 5 ];

    vst1q_f32( res + 8, sumC );

    o0 += res[ 6 ] + res[ 9 ];
    o1 += res[ 7 ] + res[ 10 ];
    o2 += res[ 8 ] + res[ 11 ];

    o1 += flt[ 16 ] * ip[ 16 ];
    o2 += flt[ 17 ] * ip[ 17 ];

    // Scalar path:

    const int ci = kl >> 1;

    const float xx = flt[ 0 ];
    float sum0 = xx * ip[ 0 ];
    float sum1 = xx * ip[ 1 ];
    float sum2 = xx * ip[ 2 ];
    const float xx2 = flt[ 1 ];
    float sum3 = xx2 * ip[ 3 ];
    float sum4 = xx2 * ip[ 4 ];
    float sum5 = xx2 * ip[ 5 ];

    const float xx = flt[ 0 ];
    sum0 += xx * ip[ 0 ];
    sum1 += xx * ip[ 1 ];
    sum2 += xx * ip[ 2 ];
    const float xx2 = flt[ 1 ];
    sum3 += xx2 * ip[ 3 ];
    sum4 += xx2 * ip[ 4 ];
    sum5 += xx2 * ip[ 5 ];

    op[ 0 ] = sum0 + sum3;
    op[ 1 ] = sum1 + sum4;
    op[ 2 ] = sum2 + sum5;
template< bool UseSP >
static void resize4( const float* const sp, float* op, const size_t opinc,
    const CResizePos* rp, const int kl, const int DstLen )
{
#if LANCIR_ALIGN > 4
    const int ci = kl >> 1;

#if defined( LANCIR_AVX )

    __m256 sum = _mm256_mul_ps( _mm256_load_ps( flt ),
        _mm256_loadu_ps( ip ));

    sum = _mm256_add_ps( sum, _mm256_mul_ps( _mm256_load_ps( flt ),
        _mm256_loadu_ps( ip )));

    _mm_store_ps( op, _mm_add_ps( _mm256_extractf128_ps( sum, 0 ),
        _mm256_extractf128_ps( sum, 1 )));

#elif defined( LANCIR_SSE2 )

    __m128 sumA = _mm_mul_ps( _mm_load_ps( flt ), _mm_load_ps( ip ));
    __m128 sumB = _mm_mul_ps( _mm_load_ps( flt + 4 ),
        _mm_load_ps( ip + 4 ));

    sumA = _mm_add_ps( sumA, _mm_mul_ps( _mm_load_ps( flt ),
        _mm_load_ps( ip )));

    sumB = _mm_add_ps( sumB, _mm_mul_ps( _mm_load_ps( flt + 4 ),
        _mm_load_ps( ip + 4 )));

    _mm_store_ps( op, _mm_add_ps( sumA, sumB ));

#elif defined( LANCIR_NEON )

    float32x4_t sumA = vmulq_f32( vld1q_f32( flt ), vld1q_f32( ip ));
    float32x4_t sumB = vmulq_f32( vld1q_f32( flt + 4 ),
        vld1q_f32( ip + 4 ));

    sumA = vmlaq_f32( sumA, vld1q_f32( flt ), vld1q_f32( ip ));
    sumB = vmlaq_f32( sumB, vld1q_f32( flt + 4 ),
        vld1q_f32( ip + 4 ));

    vst1q_f32( op, vaddq_f32( sumA, sumB ));

    // Scalar fallback:

    const float xx = flt[ 0 ];
    float sum0 = xx * ip[ 0 ];
    float sum1 = xx * ip[ 1 ];
    float sum2 = xx * ip[ 2 ];
    float sum3 = xx * ip[ 3 ];

    const float xx = flt[ 0 ];
    sum0 += xx * ip[ 0 ];
    sum1 += xx * ip[ 1 ];
    sum2 += xx * ip[ 2 ];
    sum3 += xx * ip[ 3 ];

#undef LANCIR_LF_PRE
#undef LANCIR_LF_POST