diff options
author | Timo Rothenpieler <timo@rothenpieler.org> | 2020-10-31 20:22:33 +0100 |
---|---|---|
committer | Timo Rothenpieler <timo@rothenpieler.org> | 2020-11-03 19:58:13 +0100 |
commit | f1d0f83712470c0fef13b8215cccbdb77ba7f3bf (patch) | |
tree | fd943ac7c7227ce12c13ede5ce7cad8220078058 /compat/cuda | |
parent | d5763edab21b08cd1388a2f41885d11d61653917 (diff) | |
download | ffmpeg-f1d0f83712470c0fef13b8215cccbdb77ba7f3bf.tar.gz |
avfilter/scale_cuda: add bicubic interpolation
Diffstat (limited to 'compat/cuda')
-rw-r--r-- | compat/cuda/cuda_runtime.h | 68 |
1 files changed, 61 insertions, 7 deletions
diff --git a/compat/cuda/cuda_runtime.h b/compat/cuda/cuda_runtime.h index 92c55ad859..353efcf5f9 100644 --- a/compat/cuda/cuda_runtime.h +++ b/compat/cuda/cuda_runtime.h @@ -49,18 +49,23 @@ typedef struct __device_builtin__ __align__(4) ushort2 unsigned short x, y; } ushort2; -typedef struct __device_builtin__ uint3 +typedef struct __device_builtin__ __align__(8) float2 { - unsigned int x, y, z; -} uint3; - -typedef struct uint3 dim3; + float x, y; +} float2; typedef struct __device_builtin__ __align__(8) int2 { int x, y; } int2; +typedef struct __device_builtin__ uint3 +{ + unsigned int x, y, z; +} uint3; + +typedef struct uint3 dim3; + typedef struct __device_builtin__ __align__(4) uchar4 { unsigned char x, y, z, w; @@ -76,6 +81,11 @@ typedef struct __device_builtin__ __align__(16) int4 int x, y, z, w; } int4; +typedef struct __device_builtin__ __align__(16) float4 +{ + float x, y, z, w; +} float4; + // Accessors for special registers #define GETCOMP(reg, comp) \ asm("mov.u32 %0, %%" #reg "." #comp ";" : "=r"(tmp)); \ @@ -100,24 +110,31 @@ GET(getThreadIdx, tid) #define threadIdx (getThreadIdx()) // Basic initializers (simple macros rather than inline functions) +#define make_int2(a, b) ((int2){.x = a, .y = b}) #define make_uchar2(a, b) ((uchar2){.x = a, .y = b}) #define make_ushort2(a, b) ((ushort2){.x = a, .y = b}) +#define make_float2(a, b) ((float2){.x = a, .y = b}) +#define make_int4(a, b, c, d) ((int4){.x = a, .y = b, .z = c, .w = d}) #define make_uchar4(a, b, c, d) ((uchar4){.x = a, .y = b, .z = c, .w = d}) #define make_ushort4(a, b, c, d) ((ushort4){.x = a, .y = b, .z = c, .w = d}) +#define make_float4(a, b, c, d) ((float4){.x = a, .y = b, .z = c, .w = d}) // Conversions from the tex instruction's 4-register output to various types #define TEX2D(type, ret) static inline __device__ void conv(type* out, unsigned a, unsigned b, unsigned c, unsigned d) {*out = (ret);} TEX2D(unsigned char, a & 0xFF) TEX2D(unsigned short, a & 0xFFFF) +TEX2D(float, a) TEX2D(uchar2, make_uchar2(a & 0xFF, b & 0xFF)) TEX2D(ushort2, make_ushort2(a & 0xFFFF, b & 0xFFFF)) +TEX2D(float2, make_float2(a, b)) TEX2D(uchar4, make_uchar4(a & 0xFF, b & 0xFF, c & 0xFF, d & 0xFF)) TEX2D(ushort4, make_ushort4(a & 0xFFFF, b & 0xFFFF, c & 0xFFFF, d & 0xFFFF)) +TEX2D(float4, make_float4(a, b, c, d)) // Template calling tex instruction and converting the output to the selected type -template <class T> -static inline __device__ T tex2D(cudaTextureObject_t texObject, float x, float y) +template<typename T> +inline __device__ T tex2D(cudaTextureObject_t texObject, float x, float y) { T ret; unsigned ret1, ret2, ret3, ret4; @@ -128,4 +145,41 @@ static inline __device__ T tex2D(cudaTextureObject_t texObject, float x, float y return ret; } +template<> +inline __device__ float4 tex2D<float4>(cudaTextureObject_t texObject, float x, float y) +{ + float4 ret; + asm("tex.2d.v4.f32.f32 {%0, %1, %2, %3}, [%4, {%5, %6}];" : + "=r"(ret.x), "=r"(ret.y), "=r"(ret.z), "=r"(ret.w) : + "l"(texObject), "f"(x), "f"(y)); + return ret; +} + +template<> +inline __device__ float tex2D<float>(cudaTextureObject_t texObject, float x, float y) +{ + return tex2D<float4>(texObject, x, y).x; +} + +template<> +inline __device__ float2 tex2D<float2>(cudaTextureObject_t texObject, float x, float y) +{ + float4 ret = tex2D<float4>(texObject, x, y); + return make_float2(ret.x, ret.y); +} + +// Math helper functions +static inline __device__ float floorf(float a) { return __builtin_floorf(a); } +static inline __device__ float floor(float a) { return __builtin_floorf(a); } +static inline __device__ double floor(double a) { return __builtin_floor(a); } +static inline __device__ float ceilf(float a) { return __builtin_ceilf(a); } +static inline __device__ float ceil(float a) { return __builtin_ceilf(a); } +static inline __device__ double ceil(double a) { return __builtin_ceil(a); } +static inline __device__ float truncf(float a) { return __builtin_truncf(a); } +static inline __device__ float trunc(float a) { return __builtin_truncf(a); } +static inline __device__ double trunc(double a) { return __builtin_trunc(a); } +static inline __device__ float fabsf(float a) { return __builtin_fabsf(a); } +static inline __device__ float fabs(float a) { return __builtin_fabsf(a); } +static inline __device__ double fabs(double a) { return __builtin_fabs(a); } + #endif /* COMPAT_CUDA_CUDA_RUNTIME_H */ |