46#ifndef AVIR_CLANCIR_INCLUDED
47#define AVIR_CLANCIR_INCLUDED
52#if __cplusplus >= 201103L
76 #include <immintrin.h>
80 #define LANCIR_ALIGN 32
82#elif defined( __aarch64__ ) || defined( __arm64__ ) || \
83 defined( _M_ARM64 ) || defined( _M_ARM64EC )
85 #if defined( _MSC_VER )
86 #include <arm64_neon.h>
96 #define LANCIR_ALIGN 16
98#elif defined( __ARM_NEON ) || defined( __ARM_NEON__ ) || defined( _M_ARM )
100 #include <arm_neon.h>
104 #define LANCIR_ALIGN 16
106#elif defined( __SSE2__ ) || defined( _M_AMD64 ) || \
107 ( defined( _M_IX86_FP ) && _M_IX86_FP == 2 )
109 #if defined( _MSC_VER )
112 #include <emmintrin.h>
116 #define LANCIR_ALIGN 16
118#elif defined( __wasm_simd128__ )
120 #include <wasm_simd128.h>
123 #define LANCIR_ALIGN 16
127 #define LANCIR_ALIGN 4
131#if defined( LANCIR_SSE2 )
133 #define lancvec_t __m128
134 #define lancvec_const_splat( v ) _mm_set1_ps( v )
135 #define lancvec_load( m ) _mm_load_ps( m )
136 #define lancvec_loadu( m ) _mm_loadu_ps( m )
137 #define lancvec_store( m, v ) _mm_store_ps( m, v )
138 #define lancvec_storeu( m, v ) _mm_storeu_ps( m, v )
139 #define lancvec_add( v1, v2 ) _mm_add_ps( v1, v2 )
140 #define lancvec_mul( v1, v2 ) _mm_mul_ps( v1, v2 )
141 #define lancvec_min( v1, v2 ) _mm_min_ps( v1, v2 )
142 #define lancvec_max( v1, v2 ) _mm_max_ps( v1, v2 )
143 #define lancvec_madd( va, v1, v2 ) _mm_add_ps( va, _mm_mul_ps( v1, v2 ))
144 #define lancvec_addhl( vl, vh ) _mm_add_ps( vl, _mm_movehl_ps( vh, vh ))
145 #define lancvec_store32_addhl( m, v ) \
146 _mm_store_ss( m, _mm_add_ss( v, _mm_shuffle_ps( v, v, 1 )))
148 #define lancvec_store64_addhl( m, v ) \
149 _mm_storel_pi( (__m64*) ( m ), lancvec_addhl( v, v ))
151#elif defined( LANCIR_NEON )
153 #define lancvec_t float32x4_t
154 #define lancvec_const_splat( v ) vdupq_n_f32( v )
155 #define lancvec_load( m ) vld1q_f32( m )
156 #define lancvec_store( m, v ) vst1q_f32( m, v )
157 #define lancvec_add( v1, v2 ) vaddq_f32( v1, v2 )
158 #define lancvec_mul( v1, v2 ) vmulq_f32( v1, v2 )
159 #define lancvec_min( v1, v2 ) vminq_f32( v1, v2 )
160 #define lancvec_max( v1, v2 ) vmaxq_f32( v1, v2 )
161 #define lancvec_madd( va, v1, v2 ) vmlaq_f32( va, v1, v2 )
163 #if defined( LANCIR_ARM32 )
164 #define lancvec_store32_hadd( m, v ) { \
165 const float32x2_t v2 = vadd_f32( vget_high_f32( v ), \
166 vget_low_f32( v )); \
167 *( m ) = vget_lane_f32( v2, 0 ) + \
168 vget_lane_f32( v2, 1 ); } (void) 0
170 #define lancvec_store32_hadd( m, v ) *( m ) = vaddvq_f32( v )
173 #define lancvec_store64_addhl( m, v ) \
174 vst1_f32( m, vadd_f32( vget_high_f32( v ), vget_low_f32( v )));
176#elif defined( LANCIR_WASM )
178 #define lancvec_t v128_t
179 #define lancvec_const_splat( v ) wasm_f32x4_const_splat( v )
180 #define lancvec_load32_splat( m ) wasm_v128_load32_splat( m )
181 #define lancvec_load( m ) wasm_v128_load( m )
182 #define lancvec_store( m, v ) wasm_v128_store( m, v )
183 #define lancvec_add( v1, v2 ) wasm_f32x4_add( v1, v2 )
184 #define lancvec_mul( v1, v2 ) wasm_f32x4_mul( v1, v2 )
185 #define lancvec_min( v1, v2 ) wasm_f32x4_min( v1, v2 )
186 #define lancvec_max( v1, v2 ) wasm_f32x4_max( v1, v2 )
187 #define lancvec_madd( va, v1, v2 ) wasm_f32x4_add( va, \
188 wasm_f32x4_mul( v1, v2 ))
190 #define lancvec_addhl( vl, vh ) wasm_f32x4_add( vl, \
191 wasm_i32x4_shuffle( vh, vh, 6, 7, 2, 3 ))
193 #define lancvec_store32_addhl( m, v ) \
194 *( m ) = ( wasm_f32x4_extract_lane( v, 0 ) + \
195 wasm_f32x4_extract_lane( v, 1 ))
197 #define lancvec_store64_addhl( m, v ) \
198 wasm_v128_store64_lane( m, lancvec_addhl( v, v ), 0 )
204 #if !defined( lancvec_load32_splat )
205 #define lancvec_load32_splat( m ) lancvec_const_splat( *( m ))
208 #if !defined( lancvec_loadu )
209 #define lancvec_loadu( m ) lancvec_load( m )
212 #if !defined( lancvec_storeu )
213 #define lancvec_storeu( m, v ) lancvec_store( m, v )
216 #if !defined( lancvec_store32_hadd )
217 #define lancvec_store32_hadd( m, v ) { \
218 const lancvec_t v2 = lancvec_addhl( v, v ); \
219 lancvec_store32_addhl( m, v2 ); } (void) 0
235#if __cplusplus >= 201103L
237 using std :: intptr_t;
238 using std :: uintptr_t;
245 #if !defined( nullptr )
247 #define LANCIR_NULLPTR
296 const double akx = 0.0,
const double aky = 0.0,
297 const double aox = 0.0,
const double aoy = 0.0 )
330 CLancIR(
const CLancIR& )
335 CLancIR& operator = (
const CLancIR& )
386 template<
typename Tin,
typename Tout >
388 const int SrcHeight, Tout*
const NewBuf,
const int NewWidth,
389 const int NewHeight,
const int ElCount,
392 if(( SrcWidth < 0 ) | ( SrcHeight < 0 ) |
393 ( NewWidth <= 0 ) | ( NewHeight <= 0 ) |
394 ( SrcBuf ==
nullptr ) | ( NewBuf ==
nullptr ) |
395 ( (
const void*) SrcBuf == (
const void*) NewBuf ))
402 *aParams : DefParams );
404 if( Params.
la < 2.0 )
409 const int OutSLen = NewWidth * ElCount;
410 const size_t NewScanlineSize = (size_t) ( Params.
NewSSize < 1 ?
413 if(( SrcWidth == 0 ) | ( SrcHeight == 0 ))
418 for( i = 0; i < NewHeight; i++ )
420 memset( op, 0, (
size_t) OutSLen *
sizeof( Tout ));
421 op += NewScanlineSize;
427 const size_t SrcScanlineSize = (size_t) ( Params.
SrcSSize < 1 ?
428 SrcWidth * ElCount : Params.
SrcSSize );
430 double ox = Params.
ox;
431 double oy = Params.
oy;
435 if( Params.
kx >= 0.0 )
437 kx = ( Params.
kx == 0.0 ?
438 (double) SrcWidth / NewWidth : Params.
kx );
440 ox += ( kx - 1.0 ) * 0.5;
447 if( Params.
ky >= 0.0 )
449 ky = ( Params.
ky == 0.0 ?
450 (double) SrcHeight / NewHeight : Params.
ky );
452 oy += ( ky - 1.0 ) * 0.5;
459 if(
rfv.update( Params.
la, ky, ElCount ))
476 if(
rfh0.update( Params.
la, kx, ElCount ))
482 rsv.update( SrcHeight, NewHeight, oy,
rfv,
spv );
483 rsh.update( SrcWidth, NewWidth, ox, *rfh );
492 const size_t FltWidthE = (size_t) ((
rsh.padl + SrcWidth +
493 rsh.padr ) * ElCount );
495 const double CacheSize = 5500000.0;
496 const double OpSize = (double) SrcScanlineSize * SrcHeight *
497 sizeof( Tin ) + (double) FltWidthE * NewHeight *
sizeof(
float );
499 int BatchSize = (int) ( NewHeight * CacheSize / ( OpSize + 1.0 ));
506 if( BatchSize > NewHeight )
508 BatchSize = NewHeight;
513 const int svs = (
rsv.padl + SrcHeight +
rsv.padr ) * ElCount;
514 float*
const pspv0 =
spv0;
517 FltWidthE * (
size_t) BatchSize );
526 static const bool IsInFloat = ( (Tin) 0.25f != 0 );
527 static const bool IsOutFloat = ( (Tout) 0.25f != 0 );
528 static const bool IsUnityMul = ( IsInFloat && IsOutFloat ) ||
529 ( IsInFloat == IsOutFloat &&
sizeof( Tin ) ==
sizeof( Tout ));
531 const float Clamp = (
sizeof( Tout ) == 1 ? 255.0f : 65535.0f );
532 const float OutMul = ( IsOutFloat ? 1.0f : Clamp ) /
533 ( IsInFloat ? 1.0f : (
sizeof( Tin ) == 1 ? 255.0f : 65535.0f ));
543 const int bc = ( bl > BatchSize ? BatchSize : bl );
545 int kl =
rfv.KernelLen;
546 const Tin* ip = SrcBuf;
549 const int so = (int) rpv[ 0 ].so;
550 float*
const sp =
spv + so * ElCount;
552 int cc = (int) rpv[ bc - 1 ].so - so + kl;
556 const int socc = so + cc;
557 const int spe =
rsv.padl + SrcHeight;
564 if( socc <=
rsv.padl )
587 ip += (size_t) SrcHeight * SrcScanlineSize;
597 ip += (size_t) ( so -
rsv.padl ) * SrcScanlineSize;
607 for( i = 0; i < SrcWidth; i++ )
618 for( i = 0; i < SrcWidth; i++ )
629 for( i = 0; i < SrcWidth; i++ )
639 for( i = 0; i < SrcWidth; i++ )
651 kl = rfh -> KernelLen;
655 for( i = 0; i < bc; i++ )
660 OutSLen, Clamp, OutMul );
663 opn += NewScanlineSize;
669 for( i = 0; i < bc; i++ )
674 OutSLen, Clamp, OutMul );
677 opn += NewScanlineSize;
683 for( i = 0; i < bc; i++ )
688 OutSLen, Clamp, OutMul );
691 opn += NewScanlineSize;
696 for( i = 0; i < bc; i++ )
701 OutSLen, Clamp, OutMul );
704 opn += NewScanlineSize;
744 template<
typename Tin,
typename Tout >
746 const int SrcHeight,
const int SrcSSize, Tout*
const NewBuf,
747 const int NewWidth,
const int NewHeight,
const int NewSSize,
748 const int ElCount,
const double kx0 = 0.0,
const double ky0 = 0.0,
749 double ox = 0.0,
double oy = 0.0 )
751 const CLancIRParams Params( SrcSSize, NewSSize, kx0, ky0, ox, oy );
753 return(
resizeImage( SrcBuf, SrcWidth, SrcHeight, NewBuf, NewWidth,
754 NewHeight, ElCount, &Params ));
782 template<
typename Tb,
typename Tl >
783 static void reallocBuf( Tb*& buf0, Tb*& buf, Tl& len, Tl newlen )
789 if( buf0 !=
nullptr )
796 buf0 =
new Tb[ newlen ];
817 template<
typename Tb,
typename Tl >
829 buf =
new Tb[ newlen ];
834 class CResizeScanline;
842 friend class CResizeScanline;
882 bool update(
const double la0,
const double k0,
const int ElCount0 )
884 if( la0 ==
la && k0 ==
k && ElCount0 ==
ElCount )
889 const double NormFreq = ( k0 <= 1.0 ? 1.0 : 1.0 / k0 );
890 Freq = 3.1415926535897932 * NormFreq;
893 Len2 = la0 / NormFreq;
942 const int Frac = (int) ( x *
FracCount + 0.5 );
1039 : svalue1( sin( ph ))
1040 , svalue2( sin( ph - si ))
1041 , sincr( 2.0 * cos( si ))
1051 const double res = svalue1;
1053 svalue1 = sincr * res - svalue2;
1087 if( t + FracDelay < -
Len2 )
1096 int IsZeroX = ( fabs( FracDelay - 1.0 ) < 2.3e-13 );
1097 int mt = 0 - IsZeroX;
1098 IsZeroX |= ( fabs( FracDelay ) < 2.3e-13 );
1135 ut = t + 1 + FracDelay;
1148 t = (int) ( op - op0 + 1 );
1152 *op0 = (float) ( *op0 * s );
1174 const float* ip = p + kl - 1;
1175 float* op = p + ( kl - 1 ) * erp;
1182 const float v = *ip;
1195 const float v = *ip;
1208 const float v = *ip;
1242 class CResizeScanline
1290 void update(
const int SrcLen0,
const int DstLen0,
const double o0,
1298 const int fl2m1 = rf.
fl2 - 1;
1299 padl = fl2m1 - (int) floor( o0 );
1309 const double k = rf.
k;
1311 const int DstLen_m1 = DstLen0 - 1;
1312 const double oe = o0 + k * DstLen_m1;
1313 const int ie = (int) floor( oe );
1315 padr = ie + rf.
fl2 + 1 - SrcLen0;
1325 const intptr_t ElCountF = rf.
ElCount * (intptr_t)
sizeof(
float );
1326 const int so =
padl - fl2m1;
1331 for( i = 0; i < DstLen_m1; i++ )
1333 const double ox = o0 + k * i;
1334 const int ix = (int) floor( ox );
1338 rp -> spo = (intptr_t) sp + rpso * ElCountF;
1345 rp -> spo = (intptr_t) sp + rpso * ElCountF;
1366 const intptr_t ElCountF = rf.
ElCount * (intptr_t)
sizeof(
float );
1370 for( i = 0; i <
DstLen; i++ )
1372 rp[ i ].
spo = (intptr_t) sp + rp[ i ].so * ElCountF;
1406 template<
typename T >
1408 int cc,
int repl,
int repr )
1414 v0 = (float) ip[ 0 ];
1421 }
while( --repl != 0 );
1426 op[ 0 ] = (float) ip[ 0 ];
1434 const T*
const ipe = ip - ipinc;
1435 v0 = (float) ipe[ 0 ];
1442 }
while( --repr != 0 );
1446 template<
typename T >
1448 int cc,
int repl,
int repr )
1454 v0 = (float) ip[ 0 ];
1455 v1 = (float) ip[ 1 ];
1463 }
while( --repl != 0 );
1468 op[ 0 ] = (float) ip[ 0 ];
1469 op[ 1 ] = (float) ip[ 1 ];
1477 const T*
const ipe = ip - ipinc;
1478 v0 = (float) ipe[ 0 ];
1479 v1 = (float) ipe[ 1 ];
1487 }
while( --repr != 0 );
1491 template<
typename T >
1493 int cc,
int repl,
int repr )
1499 v0 = (float) ip[ 0 ];
1500 v1 = (float) ip[ 1 ];
1501 v2 = (float) ip[ 2 ];
1510 }
while( --repl != 0 );
1515 op[ 0 ] = (float) ip[ 0 ];
1516 op[ 1 ] = (float) ip[ 1 ];
1517 op[ 2 ] = (float) ip[ 2 ];
1525 const T*
const ipe = ip - ipinc;
1526 v0 = (float) ipe[ 0 ];
1527 v1 = (float) ipe[ 1 ];
1528 v2 = (float) ipe[ 2 ];
1537 }
while( --repr != 0 );
1541 template<
typename T >
1543 int cc,
int repl,
int repr )
1545 float v0, v1, v2, v3;
1549 v0 = (float) ip[ 0 ];
1550 v1 = (float) ip[ 1 ];
1551 v2 = (float) ip[ 2 ];
1552 v3 = (float) ip[ 3 ];
1562 }
while( --repl != 0 );
1567 op[ 0 ] = (float) ip[ 0 ];
1568 op[ 1 ] = (float) ip[ 1 ];
1569 op[ 2 ] = (float) ip[ 2 ];
1570 op[ 3 ] = (float) ip[ 3 ];
1578 const T*
const ipe = ip - ipinc;
1579 v0 = (float) ipe[ 0 ];
1580 v1 = (float) ipe[ 1 ];
1581 v2 = (float) ipe[ 2 ];
1582 v3 = (float) ipe[ 3 ];
1592 }
while( --repr != 0 );
1613 const float* ip = op + rs.
padl;
1618 for( i = 0; i < rs.
padl; i++ )
1628 for( i = 0; i < rs.
padr; i++ )
1636 const float* ip = op + rs.
padl * 2;
1642 for( i = 0; i < rs.
padl; i++ )
1649 const int lc = l * 2;
1656 for( i = 0; i < rs.
padr; i++ )
1666 const float* ip = op + rs.
padl * 3;
1673 for( i = 0; i < rs.
padl; i++ )
1681 const int lc = l * 3;
1689 for( i = 0; i < rs.
padr; i++ )
1700 const float* ip = op + rs.
padl * 4;
1708 for( i = 0; i < rs.
padl; i++ )
1717 const int lc = l * 4;
1726 for( i = 0; i < rs.
padr; i++ )
1748 return( (
int) (( v > Clamp ? Clamp : ( v < 0.0f ? 0.0f : v )) +
1772 template<
bool IsOutFloat,
bool IsUnityMul,
typename T >
1774 const float Clamp,
const float OutMul )
1780 if(
sizeof( op[ 0 ]) ==
sizeof( ip[ 0 ]))
1782 memcpy( op, ip, (
size_t) l *
sizeof( op[ 0 ]));
1791 op[ 0 ] = (T) ip[ 0 ];
1792 op[ 1 ] = (T) ip[ 1 ];
1793 op[ 2 ] = (T) ip[ 2 ];
1794 op[ 3 ] = (T) ip[ 3 ];
1813 bool DoScalar =
true;
1815 if(
sizeof( op[ 0 ]) ==
sizeof( ip[ 0 ]))
1817 #if LANCIR_ALIGN > 4
1820 const lancvec_t om = lancvec_load32_splat( &OutMul );
1824 lancvec_storeu( (
float*) op,
1825 lancvec_mul( lancvec_load( ip ), om ));
1839 op[ 0 ] = (T) ( ip[ 0 ] * OutMul );
1840 op[ 1 ] = (T) ( ip[ 1 ] * OutMul );
1841 op[ 2 ] = (T) ( ip[ 2 ] * OutMul );
1842 op[ 3 ] = (T) ( ip[ 3 ] * OutMul );
1851 *op = (T) ( *ip * OutMul );
1863 #if LANCIR_ALIGN > 4
1865 const lancvec_t minv = lancvec_const_splat( 0.0f );
1866 const lancvec_t maxv = lancvec_load32_splat( &Clamp );
1867 const lancvec_t om = lancvec_load32_splat( &OutMul );
1869 #if defined( LANCIR_SSE2 )
1870 unsigned int prevrm = _MM_GET_ROUNDING_MODE();
1871 _MM_SET_ROUNDING_MODE( _MM_ROUND_NEAREST );
1873 const lancvec_t v05 = lancvec_const_splat( 0.5f );
1876 if(
sizeof( op[ 0 ]) == 4 )
1880 const lancvec_t v = lancvec_load( ip );
1881 const lancvec_t cv = lancvec_max( lancvec_min(
1882 ( IsUnityMul ? v : lancvec_mul( v, om )),
1885 #if defined( LANCIR_SSE2 )
1887 _mm_storeu_si128( (__m128i*) op, _mm_cvtps_epi32( cv ));
1889 #elif defined( LANCIR_NEON )
1891 vst1q_u32( (
unsigned int*) op, vcvtq_u32_f32( vaddq_f32(
1894 #elif defined( LANCIR_WASM )
1896 wasm_v128_store( op, wasm_i32x4_trunc_sat_f32x4(
1897 wasm_f32x4_add( cv, v05 )));
1907 if(
sizeof( op[ 0 ]) == 2 )
1911 const lancvec_t v = lancvec_load( ip );
1912 const lancvec_t cv = lancvec_max( lancvec_min(
1913 ( IsUnityMul ? v : lancvec_mul( v, om )),
1916 #if defined( LANCIR_SSE2 )
1918 const __m128i v32 = _mm_cvtps_epi32( cv );
1919 const __m128i v16s = _mm_shufflehi_epi16(
1920 _mm_shufflelo_epi16( v32, 0 | 2 << 2 ), 0 | 2 << 2 );
1922 const __m128i v16 = _mm_shuffle_epi32( v16s, 0 | 2 << 2 );
1925 _mm_store_si128( &tmp, v16 );
1926 memcpy( op, &tmp, 8 );
1928 #elif defined( LANCIR_NEON )
1930 const uint32x4_t v32 = vcvtq_u32_f32(
1931 vaddq_f32( cv, v05 ));
1933 const uint16x4_t v16 = vmovn_u32( v32 );
1935 vst1_u16( (
unsigned short*) op, v16 );
1937 #elif defined( LANCIR_WASM )
1939 const v128_t v32 = wasm_i32x4_trunc_sat_f32x4(
1940 wasm_f32x4_add( cv, v05 ));
1942 wasm_v128_store64_lane( op,
1943 wasm_u16x8_narrow_i32x4( v32, v32 ), 0 );
1956 const lancvec_t v = lancvec_load( ip );
1957 const lancvec_t cv = lancvec_max( lancvec_min(
1958 ( IsUnityMul ? v : lancvec_mul( v, om )),
1961 #if defined( LANCIR_SSE2 )
1963 const __m128i v32 = _mm_cvtps_epi32( cv );
1964 const __m128i v16s = _mm_shufflehi_epi16(
1965 _mm_shufflelo_epi16( v32, 0 | 2 << 2 ), 0 | 2 << 2 );
1967 const __m128i v16 = _mm_shuffle_epi32( v16s, 0 | 2 << 2 );
1968 const __m128i v8 = _mm_packus_epi16( v16, v16 );
1970 *(
int*) op = _mm_cvtsi128_si32( v8 );
1972 #elif defined( LANCIR_NEON )
1974 const uint32x4_t v32 = vcvtq_u32_f32(
1975 vaddq_f32( cv, v05 ));
1977 const uint16x4_t v16 = vmovn_u32( v32 );
1978 const uint8x8_t v8 = vmovn_u16( vcombine_u16( v16, v16 ));
1980 *(
unsigned int*) op = vget_lane_u32( (uint32x2_t) v8, 0 );
1982 #elif defined( LANCIR_WASM )
1984 const v128_t v32 = wasm_i32x4_trunc_sat_f32x4(
1985 wasm_f32x4_add( cv, v05 ));
1987 const v128_t v16 = wasm_u16x8_narrow_i32x4( v32, v32 );
1989 wasm_v128_store32_lane( op,
1990 wasm_u8x16_narrow_i16x8( v16, v16 ), 0 );
2000 #if defined( LANCIR_SSE2 )
2001 _MM_SET_ROUNDING_MODE( prevrm );
2023 op[ 0 ] = (T)
roundclamp( ip[ 0 ] * OutMul, Clamp );
2024 op[ 1 ] = (T)
roundclamp( ip[ 1 ] * OutMul, Clamp );
2025 op[ 2 ] = (T)
roundclamp( ip[ 2 ] * OutMul, Clamp );
2026 op[ 3 ] = (T)
roundclamp( ip[ 3 ] * OutMul, Clamp );
2063 #define LANCIR_LF_PRE \
2064 const CResizePos* const rpe = rp + DstLen; \
2065 while( rp != rpe ) \
2067 const float* flt = rp -> flt; \
2071 ip = (const float*) ( (intptr_t) sp + rp -> spo ); \
2075 ip = (const float*) rp -> spo; \
2083 #define LANCIR_LF_POST \
2102 template<
bool UseSP >
2103 static void resize1(
const float*
const sp,
float* op,
const size_t opinc,
2104 const CResizePos* rp,
const int kl,
const int DstLen )
2106 const int ci = kl >> 2;
2108 if(( kl & 3 ) == 0 )
2114 #if LANCIR_ALIGN > 4
2116 lancvec_t sum = lancvec_mul(
2117 lancvec_load( flt ), lancvec_loadu( ip ));
2123 sum = lancvec_madd( sum, lancvec_load( flt ),
2124 lancvec_loadu( ip ));
2127 lancvec_store32_hadd( op, sum );
2131 float sum0 = flt[ 0 ] * ip[ 0 ];
2132 float sum1 = flt[ 1 ] * ip[ 1 ];
2133 float sum2 = flt[ 2 ] * ip[ 2 ];
2134 float sum3 = flt[ 3 ] * ip[ 3 ];
2140 sum0 += flt[ 0 ] * ip[ 0 ];
2141 sum1 += flt[ 1 ] * ip[ 1 ];
2142 sum2 += flt[ 2 ] * ip[ 2 ];
2143 sum3 += flt[ 3 ] * ip[ 3 ];
2146 op[ 0 ] = ( sum0 + sum1 ) + ( sum2 + sum3 );
2158 #if LANCIR_ALIGN > 4
2160 lancvec_t sum = lancvec_mul( lancvec_load( flt ),
2161 lancvec_loadu( ip ));
2167 sum = lancvec_madd( sum, lancvec_load( flt ),
2168 lancvec_loadu( ip ));
2171 #if defined( LANCIR_NEON )
2173 float32x2_t sum2 = vadd_f32( vget_high_f32( sum ),
2174 vget_low_f32( sum ));
2176 sum2 = vmla_f32( sum2, vld1_f32( flt + 4 ),
2177 vld1_f32( ip + 4 ));
2179 #if defined( LANCIR_ARM32 )
2180 op[ 0 ] = vget_lane_f32( sum2, 0 ) +
2181 vget_lane_f32( sum2, 1 );
2183 op[ 0 ] = vaddv_f32( sum2 );
2188 const lancvec_t sum2 = lancvec_mul( lancvec_loadu( flt + 2 ),
2189 lancvec_loadu( ip + 2 ));
2191 sum = lancvec_addhl( sum, sum );
2192 sum = lancvec_addhl( sum, sum2 );
2194 lancvec_store32_addhl( op, sum );
2200 float sum0 = flt[ 0 ] * ip[ 0 ];
2201 float sum1 = flt[ 1 ] * ip[ 1 ];
2202 float sum2 = flt[ 2 ] * ip[ 2 ];
2203 float sum3 = flt[ 3 ] * ip[ 3 ];
2209 sum0 += flt[ 0 ] * ip[ 0 ];
2210 sum1 += flt[ 1 ] * ip[ 1 ];
2211 sum2 += flt[ 2 ] * ip[ 2 ];
2212 sum3 += flt[ 3 ] * ip[ 3 ];
2215 op[ 0 ] = ( sum0 + sum1 ) + ( sum2 + sum3 ) +
2216 flt[ 4 ] * ip[ 4 ] + flt[ 5 ] * ip[ 5 ];
2224 template<
bool UseSP >
2225 static void resize2(
const float*
const sp,
float* op,
const size_t opinc,
2226 const CResizePos* rp,
const int kl,
const int DstLen )
2228 #if LANCIR_ALIGN > 4
2229 const int ci = kl >> 2;
2230 const int cir = kl & 3;
2232 const int ci = kl >> 1;
2239 #if defined( LANCIR_AVX )
2241 __m256 sum = _mm256_mul_ps( _mm256_load_ps( flt ),
2242 _mm256_loadu_ps( ip ));
2248 sum = _mm256_add_ps( sum, _mm256_mul_ps( _mm256_load_ps( flt ),
2249 _mm256_loadu_ps( ip )));
2252 __m128 res = _mm_add_ps( _mm256_extractf128_ps( sum, 0 ),
2253 _mm256_extractf128_ps( sum, 1 ));
2257 res = _mm_add_ps( res, _mm_mul_ps( _mm_load_ps( flt + 8 ),
2258 _mm_loadu_ps( ip + 8 )));
2261 _mm_storel_pi( (__m64*) op,
2262 _mm_add_ps( res, _mm_movehl_ps( res, res )));
2264 #elif LANCIR_ALIGN > 4
2266 lancvec_t sumA = lancvec_mul(
2267 lancvec_load( flt ), lancvec_loadu( ip ));
2269 lancvec_t sumB = lancvec_mul(
2270 lancvec_load( flt + 4 ), lancvec_loadu( ip + 4 ));
2276 sumA = lancvec_madd( sumA, lancvec_load( flt ),
2277 lancvec_loadu( ip ));
2279 sumB = lancvec_madd( sumB, lancvec_load( flt + 4 ),
2280 lancvec_loadu( ip + 4 ));
2283 sumA = lancvec_add( sumA, sumB );
2287 sumA = lancvec_madd( sumA, lancvec_load( flt + 8 ),
2288 lancvec_loadu( ip + 8 ));
2291 lancvec_store64_addhl( op, sumA );
2295 const float xx = flt[ 0 ];
2296 const float xx2 = flt[ 1 ];
2297 float sum0 = xx * ip[ 0 ];
2298 float sum1 = xx * ip[ 1 ];
2299 float sum2 = xx2 * ip[ 2 ];
2300 float sum3 = xx2 * ip[ 3 ];
2306 const float xx = flt[ 0 ];
2307 const float xx2 = flt[ 1 ];
2308 sum0 += xx * ip[ 0 ];
2309 sum1 += xx * ip[ 1 ];
2310 sum2 += xx2 * ip[ 2 ];
2311 sum3 += xx2 * ip[ 3 ];
2314 op[ 0 ] = sum0 + sum2;
2315 op[ 1 ] = sum1 + sum3;
2322 template<
bool UseSP >
2323 static void resize3(
const float*
const sp,
float* op,
const size_t opinc,
2324 const CResizePos* rp,
const int kl,
const int DstLen )
2326 #if LANCIR_ALIGN > 4
2328 const int ci = kl >> 2;
2329 const int cir = kl & 3;
2336 #if defined( LANCIR_AVX )
2338 __m128 sumA = _mm_mul_ps( _mm_load_ps( flt ), _mm_loadu_ps( ip ));
2339 __m256 sumB = _mm256_mul_ps( _mm256_loadu_ps( flt + 4 ),
2340 _mm256_loadu_ps( ip + 4 ));
2346 sumA = _mm_add_ps( sumA, _mm_mul_ps( _mm_load_ps( flt ),
2347 _mm_loadu_ps( ip )));
2349 sumB = _mm256_add_ps( sumB, _mm256_mul_ps(
2350 _mm256_loadu_ps( flt + 4 ), _mm256_loadu_ps( ip + 4 )));
2355 sumA = _mm_add_ps( sumA, _mm_mul_ps( _mm_load_ps( flt + 12 ),
2356 _mm_loadu_ps( ip + 12 )));
2359 _mm_storeu_ps( res, sumA );
2361 float o0 = res[ 0 ] + res[ 3 ];
2362 float o1 = res[ 1 ];
2363 float o2 = res[ 2 ];
2365 _mm256_storeu_ps( res + 4, sumB );
2372 lancvec_t sumA = lancvec_mul( lancvec_load( flt ),
2373 lancvec_loadu( ip ));
2375 lancvec_t sumB = lancvec_mul( lancvec_load( flt + 4 ),
2376 lancvec_loadu( ip + 4 ));
2378 lancvec_t sumC = lancvec_mul( lancvec_load( flt + 8 ),
2379 lancvec_loadu( ip + 8 ));
2385 sumA = lancvec_madd( sumA, lancvec_load( flt ),
2386 lancvec_loadu( ip ));
2388 sumB = lancvec_madd( sumB, lancvec_load( flt + 4 ),
2389 lancvec_loadu( ip + 4 ));
2391 sumC = lancvec_madd( sumC, lancvec_load( flt + 8 ),
2392 lancvec_loadu( ip + 8 ));
2397 sumA = lancvec_madd( sumA, lancvec_load( flt + 12 ),
2398 lancvec_loadu( ip + 12 ));
2401 lancvec_storeu( res, sumA );
2402 lancvec_storeu( res + 4, sumB );
2404 float o0 = res[ 0 ] + res[ 3 ];
2405 float o1 = res[ 1 ] + res[ 4 ];
2406 float o2 = res[ 2 ] + res[ 5 ];
2408 lancvec_storeu( res + 8, sumC );
2412 o0 += res[ 6 ] + res[ 9 ];
2413 o1 += res[ 7 ] + res[ 10 ];
2414 o2 += res[ 8 ] + res[ 11 ];
2418 o1 += flt[ 16 ] * ip[ 16 ];
2419 o2 += flt[ 17 ] * ip[ 17 ];
2428 const int ci = kl >> 1;
2434 const float xx = flt[ 0 ];
2435 float sum0 = xx * ip[ 0 ];
2436 float sum1 = xx * ip[ 1 ];
2437 float sum2 = xx * ip[ 2 ];
2438 const float xx2 = flt[ 1 ];
2439 float sum3 = xx2 * ip[ 3 ];
2440 float sum4 = xx2 * ip[ 4 ];
2441 float sum5 = xx2 * ip[ 5 ];
2447 const float xx = flt[ 0 ];
2448 sum0 += xx * ip[ 0 ];
2449 sum1 += xx * ip[ 1 ];
2450 sum2 += xx * ip[ 2 ];
2451 const float xx2 = flt[ 1 ];
2452 sum3 += xx2 * ip[ 3 ];
2453 sum4 += xx2 * ip[ 4 ];
2454 sum5 += xx2 * ip[ 5 ];
2457 op[ 0 ] = sum0 + sum3;
2458 op[ 1 ] = sum1 + sum4;
2459 op[ 2 ] = sum2 + sum5;
2466 template<
bool UseSP >
2467 static void resize4(
const float*
const sp,
float* op,
const size_t opinc,
2468 const CResizePos* rp,
const int kl,
const int DstLen )
2470 #if LANCIR_ALIGN > 4
2471 const int ci = kl >> 1;
2480 #if defined( LANCIR_AVX )
2482 __m256 sum = _mm256_mul_ps( _mm256_load_ps( flt ),
2483 _mm256_loadu_ps( ip ));
2489 sum = _mm256_add_ps( sum, _mm256_mul_ps( _mm256_load_ps( flt ),
2490 _mm256_loadu_ps( ip )));
2493 _mm_store_ps( op, _mm_add_ps( _mm256_extractf128_ps( sum, 0 ),
2494 _mm256_extractf128_ps( sum, 1 )));
2496 #elif LANCIR_ALIGN > 4
2498 lancvec_t sumA = lancvec_mul( lancvec_load( flt ),
2499 lancvec_load( ip ));
2501 lancvec_t sumB = lancvec_mul( lancvec_load( flt + 4 ),
2502 lancvec_load( ip + 4 ));
2508 sumA = lancvec_madd( sumA, lancvec_load( flt ),
2509 lancvec_load( ip ));
2511 sumB = lancvec_madd( sumB, lancvec_load( flt + 4 ),
2512 lancvec_load( ip + 4 ));
2515 lancvec_store( op, lancvec_add( sumA, sumB ));
2519 const float xx = flt[ 0 ];
2520 float sum0 = xx * ip[ 0 ];
2521 float sum1 = xx * ip[ 1 ];
2522 float sum2 = xx * ip[ 2 ];
2523 float sum3 = xx * ip[ 3 ];
2529 const float xx = flt[ 0 ];
2530 sum0 += xx * ip[ 0 ];
2531 sum1 += xx * ip[ 1 ];
2532 sum2 += xx * ip[ 2 ];
2533 sum3 += xx * ip[ 3 ];
2548 #undef LANCIR_LF_PRE
2549 #undef LANCIR_LF_POST
2553#undef lancvec_const_splat
2554#undef lancvec_load32_splat
2558#undef lancvec_storeu
2565#undef lancvec_store32_addhl
2566#undef lancvec_store32_hadd
2567#undef lancvec_store64_addhl
2569#if defined( LANCIR_NULLPTR )
2571 #undef LANCIR_NULLPTR
#define LANCIR_ALIGN
Address alignment (granularity) used by resizing functions, in bytes.
Definition lancir.h:127
#define LANCIR_LF_POST
Scanline resize function epilogue.
Definition lancir.h:2083
#define LANCIR_LF_PRE
Scanline resize function prologue.
Definition lancir.h:2063
LANCIR resizing parameters class.
Definition lancir.h:261
CLancIRParams(const int aSrcSSize=0, const int aNewSSize=0, const double akx=0.0, const double aky=0.0, const double aox=0.0, const double aoy=0.0)
Default constructor, with optional arguments that correspond to class variables.
Definition lancir.h:295
double oy
Start Y pixel offset within the source image, can be negative. A positive offset moves the image to t...
Definition lancir.h:278
double ky
Resizing step - vertical. Same as kx.
Definition lancir.h:275
double la
Lanczos window function's "a" parameter, greater than or equal to 2.0.
Definition lancir.h:280
int NewSSize
Physical size of the destination scanline, in elements (not bytes). If this value is below 1,...
Definition lancir.h:266
double ox
Start X pixel offset within the source image, can be negative. A positive offset moves the image to t...
Definition lancir.h:276
int SrcSSize
Physical size of the source scanline, in elements (not bytes). If this value is below 1,...
Definition lancir.h:263
double kx
Resizing step - horizontal (one output pixel corresponds to k input pixels). A downsizing factor if g...
Definition lancir.h:269
static void copyScanline1v(const T *ip, const size_t ipinc, float *op, int cc, int repl, int repr)
Scanline copying function, for vertical resizing.
Definition lancir.h:1407
float * FltBuf
Address-aligned FltBuf0.
Definition lancir.h:760
int resizeImage(const Tin *const SrcBuf, const int SrcWidth, const int SrcHeight, const int SrcSSize, Tout *const NewBuf, const int NewWidth, const int NewHeight, const int NewSSize, const int ElCount, const double kx0=0.0, const double ky0=0.0, double ox=0.0, double oy=0.0)
Legacy image resizing function.
Definition lancir.h:745
static void resize3(const float *const sp, float *op, const size_t opinc, const CResizePos *rp, const int kl, const int DstLen)
Function performs scanline resizing. Variants for 1-4-channel images.
Definition lancir.h:2323
static void resize2(const float *const sp, float *op, const size_t opinc, const CResizePos *rp, const int kl, const int DstLen)
Function performs scanline resizing. Variants for 1-4-channel images.
Definition lancir.h:2225
CResizeScanline rsv
Vertical resize scanline.
Definition lancir.h:1386
static void padScanline2h(float *op, CResizeScanline &rs, const int l)
Scanline padding function, for horizontal resizing.
Definition lancir.h:1634
static void resize4(const float *const sp, float *op, const size_t opinc, const CResizePos *rp, const int kl, const int DstLen)
Function performs scanline resizing. Variants for 1-4-channel images.
Definition lancir.h:2467
static int roundclamp(const float v, const float Clamp)
Rounds a value, and applies clamping.
Definition lancir.h:1746
CResizeFilters rfh0
Resizing filters for horizontal resizing (may not be in use).
Definition lancir.h:1384
float * spv0
Scanline buffer for vertical resizing, also used at the output stage.
Definition lancir.h:761
static void outputScanline(const float *ip, T *op, int l, const float Clamp, const float OutMul)
Scanline output function.
Definition lancir.h:1773
CResizeFilters rfv
Resizing filters for vertical resizing.
Definition lancir.h:1383
static void copyScanline4v(const T *ip, const size_t ipinc, float *op, int cc, int repl, int repr)
Scanline copying function, for vertical resizing.
Definition lancir.h:1542
static void padScanline1h(float *op, CResizeScanline &rs, const int l)
Scanline padding function, for horizontal resizing.
Definition lancir.h:1611
static void padScanline4h(float *op, CResizeScanline &rs, const int l)
Scanline padding function, for horizontal resizing.
Definition lancir.h:1698
float * FltBuf0
Intermediate resizing buffer.
Definition lancir.h:758
static void copyScanline2v(const T *ip, const size_t ipinc, float *op, int cc, int repl, int repr)
Scanline copying function, for vertical resizing.
Definition lancir.h:1447
int resizeImage(const Tin *const SrcBuf, const int SrcWidth, const int SrcHeight, Tout *const NewBuf, const int NewWidth, const int NewHeight, const int ElCount, const CLancIRParams *const aParams=nullptr)
Function resizes an image.
Definition lancir.h:387
size_t FltBuf0Len
Length of FltBuf0.
Definition lancir.h:759
int spv0len
Length of spv0.
Definition lancir.h:763
static void reallocBuf(Tb *&buf, Tl &len, const Tl newlen)
Typed buffer reallocation function.
Definition lancir.h:818
static void copyScanline3v(const T *ip, const size_t ipinc, float *op, int cc, int repl, int repr)
Scanline copying function, for vertical resizing.
Definition lancir.h:1492
CResizeScanline rsh
Horizontal resize scanline.
Definition lancir.h:1387
static void resize1(const float *const sp, float *op, const size_t opinc, const CResizePos *rp, const int kl, const int DstLen)
Function performs scanline resizing. Variants for 1-4-channel images.
Definition lancir.h:2103
static void padScanline3h(float *op, CResizeScanline &rs, const int l)
Scanline padding function, for horizontal resizing.
Definition lancir.h:1664
float * spv
Address-aligned spv0.
Definition lancir.h:764
static void reallocBuf(Tb *&buf0, Tb *&buf, Tl &len, Tl newlen)
Typed buffer reallocation function, with address alignment.
Definition lancir.h:783
Class for fractional delay filter bank storage and calculation.
Definition lancir.h:841
float * Bufs[BufCount]
Address-aligned Bufs0.
Definition lancir.h:988
int CurBufFill
The number of fractional positions filled in the current filter buffer.
Definition lancir.h:990
const float * getFilter(const double x)
Filter acquisition function.
Definition lancir.h:940
double Len2
Half resampling filter's length, unrounded.
Definition lancir.h:972
double k
Current k.
Definition lancir.h:997
int CurBuf
Filter buffer currently being filled.
Definition lancir.h:989
int ElCount
Current ElCount.
Definition lancir.h:998
static const int BufLen
The number of fractional filters a single buffer (filter batch) may contain. Both the BufLen and BufC...
Definition lancir.h:981
int ElRepl
The number of repetitions of each filter tap.
Definition lancir.h:978
bool update(const double la0, const double k0, const int ElCount0)
Function updates the filter bank.
Definition lancir.h:882
static const int BufCount
The maximal number of buffers (filter batches) that can be in use.
Definition lancir.h:979
void makeFilterNorm(float *op, const double FracDelay) const
Filter calculation function.
Definition lancir.h:1076
void setBuf(const int bi)
Current buffer (filter batch) repositioning function.
Definition lancir.h:1009
int KernelLenA
SIMD-aligned and replicated filter kernel's length.
Definition lancir.h:976
static void replicateFilter(float *const p, const int kl, const int erp)
Filter tap replication function, for SIMD operations.
Definition lancir.h:1171
double la
Current la.
Definition lancir.h:996
float ** Filters
Fractional delay filters for all positions. A particular pointer equals nullptr, if a filter for such...
Definition lancir.h:992
double FreqA
Circular frequency of the window function.
Definition lancir.h:971
int fl2
Half resampling filter's length, integer.
Definition lancir.h:973
float * Bufs0[BufCount]
Buffers that hold all filters, original.
Definition lancir.h:984
int FiltersLen
Allocated length of Filters, in elements.
Definition lancir.h:995
double Freq
Circular frequency of the filter.
Definition lancir.h:970
int FracCount
The number of fractional positions for which filters can be created.
Definition lancir.h:974
int KernelLen
Resampling filter kernel's length, taps. Available after the update() function call....
Definition lancir.h:845
int Bufs0Len[BufCount]
Allocated lengths in Bufs0, in float elements.
Definition lancir.h:986
Sine-wave signal generator class.
Definition lancir.h:1027
CSineGen(const double si, const double ph)
Constructor initializes this sine-wave signal generator.
Definition lancir.h:1038
double generate()
Generates the next sine-wave sample, without biasing.
Definition lancir.h:1049
Structure defines source scanline positions and filters for each destination pixel.
Definition lancir.h:1227
intptr_t so
Offset within the source scanline, in pixels.
Definition lancir.h:1231
intptr_t spo
Source scanline's pixel offset, in bytes, or a direct pointer to scanline buffer.
Definition lancir.h:1229
const float * flt
Fractional delay filter.
Definition lancir.h:1228
Scanline resizing positions class.
Definition lancir.h:1243
int SrcLen
Current SrcLen.
Definition lancir.h:1378
int poslen
Allocated pos buffer's length.
Definition lancir.h:1377
void update(const int SrcLen0, const int DstLen0, const double o0, CResizeFilters &rf, float *const sp=nullptr)
Scanline positions update function.
Definition lancir.h:1290
int padl
Left-padding (in pixels) required for source scanline.
Definition lancir.h:1245
CResizePos * pos
Source scanline positions (offsets) and filters for each destination pixel position.
Definition lancir.h:1247
void reset()
Object's reset function.
Definition lancir.h:1270
int padr
Right-padding (in pixels) required for source scanline.
Definition lancir.h:1246
double o
Current o.
Definition lancir.h:1380
void updateSPO(CResizeFilters &rf, float *const sp)
Scanline pixel offsets update function.
Definition lancir.h:1364
int DstLen
Current DstLen.
Definition lancir.h:1379