Commit 0d407dd

Merge pull request #10 from KernelTuner/dev
Prepare 0.3 release
2 parents (8333b04 + 4d18563), commit 0d407dd

37 files changed (+2727 -1141 lines)

.github/workflows/cmake-action.yml (+2 -2)

@@ -19,7 +19,7 @@ jobs:
     runs-on: ubuntu-latest

     steps:
-    - uses: Jimver/cuda-toolkit@v0.2.11
+    - uses: Jimver/cuda-toolkit@v0.2.22
      id: cuda-toolkit
      with:
        method: network
@@ -33,7 +33,7 @@ jobs:
     - name: Configure CMake
       # Configure CMake in a 'build' subdirectory. `CMAKE_BUILD_TYPE` is only required if you are using a single-configuration generator such as make.
       # See https://cmake.org/cmake/help/latest/variable/CMAKE_BUILD_TYPE.html?highlight=cmake_build_type
-      run: cmake -B ${{github.workspace}}/build -DCMAKE_BUILD_TYPE=${{env.BUILD_TYPE}} -DKERNEL_FLOAT_BUILD_TEST=1 -DKERNEL_FLOAT_BUILD_EXAMPLE=1
+      run: CUDAARCHS=all cmake -B ${{github.workspace}}/build -DCMAKE_BUILD_TYPE=${{env.BUILD_TYPE}} -DKERNEL_FLOAT_BUILD_TEST=1 -DKERNEL_FLOAT_BUILD_EXAMPLE=1

     - name: Build
       # Build your program with the given configuration

.github/workflows/cmake.yml (+5 -5)

@@ -13,16 +13,16 @@ jobs:
   build-cuda:
     uses: ./.github/workflows/cmake-action.yml
     with:
-      cuda-version: "12.2.0"
+      cuda-version: "12.8.0"

-  build-cuda-11-7:
+  build-cuda-12-6:
     needs: build-cuda
     uses: ./.github/workflows/cmake-action.yml
     with:
-      cuda-version: "11.7.0"
+      cuda-version: "12.6.0"

-  build-cuda-12-0:
+  build-cuda-12-5:
     needs: build-cuda
     uses: ./.github/workflows/cmake-action.yml
     with:
-      cuda-version: "12.0.0"
+      cuda-version: "12.5.0"

CMakeLists.txt (+29 -2)

@@ -1,17 +1,44 @@
 cmake_minimum_required(VERSION 3.20)

 set (PROJECT_NAME kernel_float)
-project(${PROJECT_NAME} CXX CUDA)
+project(${PROJECT_NAME} LANGUAGES CXX)

-set(CMAKE_C_STANDARD 11)
+set(CMAKE_CXX_STANDARD 17)
+set(CMAKE_CXX_STANDARD_REQUIRED ON)

+# Validate and enable the appropriate language
+if (NOT DEFINED KERNEL_FLOAT_LANGUAGE)
+    set(KERNEL_FLOAT_LANGUAGE "CUDA")
+endif()
+
+if (KERNEL_FLOAT_LANGUAGE STREQUAL "CUDA")
+    enable_language(CUDA)
+    set(KERNEL_FLOAT_LANGUAGE_CUDA ON)
+elseif (KERNEL_FLOAT_LANGUAGE STREQUAL "HIP")
+    enable_language(HIP)
+    set(KERNEL_FLOAT_LANGUAGE_HIP ON)
+else()
+    message(FATAL_ERROR "KERNEL_FLOAT_LANGUAGE must be either 'HIP' or 'CUDA'")
+endif()
+
+# Create an interface library for kernel_float
 add_library(${PROJECT_NAME} INTERFACE)
 target_include_directories(${PROJECT_NAME} INTERFACE "${PROJECT_SOURCE_DIR}/include")

+# Optionally build tests and examples if the corresponding flags are set
+option(KERNEL_FLOAT_BUILD_TEST "Build kernel float tests" OFF)
+option(KERNEL_FLOAT_BUILD_EXAMPLE "Build kernel float examples" OFF)
+
 if (KERNEL_FLOAT_BUILD_TEST)
     add_subdirectory(tests)
 endif()

 if (KERNEL_FLOAT_BUILD_EXAMPLE)
     add_subdirectory(examples)
 endif()
+
+# Display configuration
+message(STATUS "=== Kernel Float ===")
+message(STATUS "Using GPU Language: ${KERNEL_FLOAT_LANGUAGE}")
+message(STATUS "Building Tests: ${KERNEL_FLOAT_BUILD_TEST}")
+message(STATUS "Building Examples: ${KERNEL_FLOAT_BUILD_EXAMPLE}")

README.md (+8 -7)

@@ -9,12 +9,12 @@
 ![GitHub Repo stars](https://img.shields.io/github/stars/KernelTuner/kernel_float?style=social)


-_Kernel Float_ is a header-only library for CUDA that simplifies working with vector types and reduced precision floating-point arithmetic in GPU code.
+_Kernel Float_ is a header-only library for CUDA/HIP that simplifies working with vector types and reduced precision floating-point arithmetic in GPU code.


 ## Summary

-CUDA natively offers several reduced precision floating-point types (`__half`, `__nv_bfloat16`, `__nv_fp8_e4m3`, `__nv_fp8_e5m2`)
+CUDA/HIP natively offers several reduced precision floating-point types (`__half`, `__nv_bfloat16`, `__nv_fp8_e4m3`, `__nv_fp8_e5m2`)
 and vector types (e.g., `__half2`, `__nv_fp8x4_e4m3`, `float3`).
 However, working with these types is cumbersome:
 mathematical operations require intrinsics (e.g., `__hadd2` performs addition for `__half2`),
@@ -24,9 +24,9 @@ and some functionality is missing (e.g., one cannot convert a `__half` to `__nv_
 _Kernel Float_ resolves this by offering a single data type `kernel_float::vec<T, N>` that stores `N` elements of type `T`.
 Internally, the data is stored as a fixed-sized array of elements.
 Operator overloading (like `+`, `*`, `&&`) has been implemented such that the most optimal intrinsic for the available types is selected automatically.
-Many mathetical functions (like `log`, `exp`, `sin`) and common operations (such as `sum`, `range`, `for_each`) are also available.
+Many mathematical functions (like `log`, `exp`, `sin`) and common operations (such as `sum`, `range`, `for_each`) are also available.

-By using this library, developers can avoid the complexity of working with reduced precision floating-point types in CUDA and focus on their applications.
+Using Kernel Float, developers avoid the complexity of reduced precision floating-point types in CUDA and can focus on their applications.


 ## Features
@@ -40,6 +40,7 @@ In a nutshell, _Kernel Float_ offers the following features:
 * Easy integration as a single header file.
 * Written for C++17.
 * Compatible with NVCC (NVIDIA Compiler) and NVRTC (NVIDIA Runtime Compilation).
+* Compatible with HIPCC (AMD HIP Compiler)


 ## Example
@@ -49,7 +50,7 @@ Check out the [examples](https://github.com/KernelTuner/kernel_float/tree/master

 Below shows a simple example of a CUDA kernel that adds a `constant` to the `input` array and writes the results to the `output` array.
 Each thread processes two elements.
-Notice how easy it would be change the precision (for example, `double` to `half`) or the vector size (for example, 4 instead of 2 items per thread).
+Notice how easy it would be to change the precision (for example, `double` to `half`) or the vector size (for example, 4 instead of 2 items per thread).


 ```cpp
@@ -63,14 +64,14 @@ __global__ void kernel(const kf::vec<half, 2>* input, float constant, kf::vec<fl

 ```

-Here is how the same kernel would like without Kernel Float.
+Here is how the same kernel would look for CUDA without Kernel Float.

 ```cpp
 __global__ void kernel(const __half* input, float constant, float* output) {
     int i = blockIdx.x * blockDim.x + threadIdx.x;
     __half in0 = input[2 * i + 0];
     __half in1 = input[2 * i + 1];
-    __half2 a = __halves2half2(in0, int1);
+    __half2 a = __halves2half2(in0, in1);
     float b = float(constant);
     __half c = __float2half(b);
     __half2 d = __half2half2(c);
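
For context: the Kernel Float version of this kernel appears only partially in the hunk above. Below is a minimal sketch of what the README describes, reconstructed from the surrounding text; the kernel body is an assumption, not copied from the repository:

```cpp
#include "kernel_float.h"
namespace kf = kernel_float;

// Each thread loads two half-precision inputs, adds `constant`,
// and writes two single-precision outputs (sketch of the README example).
__global__ void kernel(const kf::vec<half, 2>* input, float constant, kf::vec<float, 2>* output) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    output[i] = kf::cast<float>(input[i]) + constant;
}
```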

docs/guides/accuracy.md (+13 -8)

@@ -25,13 +25,13 @@ kf::vec<float, 4> c = kf::fast_rcp(x);
 kf::vec<float, 4> d = kf::fast_div(a, b);
 ```

-These functions are only functional for 32-bit and 16-bit floats. 
+These functions are only functional for 32-bit and 16-bit floats.
 For other input types, the operation falls back to the regular version.

 ## Approximate Math

-For 16-bit floats, several approximate functions are provided. 
-These use approximations (typically low-degree polynomials) to calculate rough estimates of the functions. 
+For 16-bit floats, several approximate functions are provided.
+These use approximations (typically low-degree polynomials) to calculate rough estimates of the functions.
 This can be very fast but also less accurate.

@@ -69,14 +69,15 @@ kf::vec<half, 4> a = kf::approx_sin<3>(x);

 ## Tuning Accuracy Level

-Many functions in Kernel Float accept an additional Accuracy option as a template parameter.
+Many functions in Kernel Float accept an additional `Accuracy` option as a template parameter.
 This allows you to tune the accuracy level without changing the function name.

-There are four possible values for this parameter:
+There are five possible values for this parameter:

 - `kf::accurate_policy`: Use the most accurate version of the function available.
 - `kf::fast_policy`: Use the "fast math" version.
-- `kf::approx_policy<N>`: Use the approximate version with degree `N`.
+- `kf::approx_level_policy<N>`: Use the approximate version with accuracy level `N` (higher is more accurate).
+- `kf::approx_policy`: Use the approximate version with a default accuracy level.
 - `kf::default_policy`: Use a global default policy (see the next section).

 For example, consider this code:
@@ -97,15 +98,19 @@ kf::vec<float, 2> c = kf::cos<kf::accurate_policy>(input);
 kf::vec<float, 2> d = kf::cos<kf::fast_policy>(input);

 // Use the approximate policy
-kf::vec<float, 2> e = kf::cos<kf::approx_policy<3>>(input);
+kf::vec<float, 2> e = kf::cos<kf::approx_policy>(input);
+
+// Use the approximate policy with degree 3 polynomial.
+kf::vec<float, 2> f = kf::cos<kf::approx_level_policy<3>>(input);

 // You can use aliases to define your own policy
 using my_own_policy = kf::fast_policy;
-kf::vec<float, 2> f = kf::cos<my_own_policy>(input);
+kf::vec<float, 2> g = kf::cos<my_own_policy>(input);
 ```

 ## Setting `default_policy`

+If no policy is explicitly set, any function use the `kf::default_policy`.
 By default, `kf::default_policy` is set to `kf::accurate_policy`.

 Set the preprocessor option `KERNEL_FLOAT_FAST_MATH=1` to change the default policy to `kf::fast_policy`.
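
To illustrate the default-policy mechanism this guide describes: compiling with `KERNEL_FLOAT_FAST_MATH=1` swaps `kf::default_policy` from `kf::accurate_policy` to `kf::fast_policy`. A minimal sketch follows; the kernel itself is made up, only the policy names and the preprocessor flag come from the guide:

```cpp
#include "kernel_float.h"
namespace kf = kernel_float;

// Built with -DKERNEL_FLOAT_FAST_MATH=1, the unqualified call below
// resolves to the fast-math version via kf::default_policy.
__global__ void cos_kernel(const kf::vec<float, 2>* in, kf::vec<float, 2>* out) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    out[i] = kf::cos(in[i]);                          // uses kf::default_policy
    // out[i] = kf::cos<kf::accurate_policy>(in[i]);  // per-call override
}
```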

example.cu (+13)

@@ -0,0 +1,13 @@
+#include "kernel_float.h"
+#include <cuda_fp16.h>
+
+namespace kf = kernel_float;
+
+__global__ void kernel(
+    kf::vec_ptr<half, 4, const __nv_fp8_e5m2> input,
+    float constant,
+    kf::vec_ptr<half, 4> output
+) {
+    int i = blockIdx.x * blockDim.x + threadIdx.x;
+    output(i) = input[i] + kf::cast<half>(constant);
+}
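
The `vec_ptr` accessors in this new example perform memory accesses with on-the-fly conversion between the storage type (`__nv_fp8_e5m2`) and the compute type (`half`). A rough plain-CUDA sketch of what one thread computes is shown below; it uses scalar loads rather than the vectorized access the library actually emits, and the `cuda_fp8.h` cast operator is an assumption about equivalent behavior:

```cpp
#include <cuda_fp16.h>
#include <cuda_fp8.h>

__global__ void kernel_sketch(const __nv_fp8_e5m2* input, float constant, half* output) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    half c = __float2half(constant);
    // input[i] in the vec_ptr version reads 4 consecutive fp8 values and
    // widens them to half; output(i) writes 4 half results back.
    for (int k = 0; k < 4; k++) {
        half v = static_cast<half>(input[4 * i + k]);  // fp8 -> half
        output[4 * i + k] = __hadd(v, c);
    }
}
```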

examples/hip_compat.h (+22)

@@ -0,0 +1,22 @@
+#pragma once
+
+/**
+ * This header file provides a mapping from CUDA-specific function names and types to their equivalent HIP
+ * counterparts, allowing for cross-platform development between CUDA and HIP. By including this header, code
+ * originally written for CUDA can be compiled with the HIP compiler (hipcc) by automatically replacing CUDA API
+ * calls with their HIP equivalents.
+ */
+#ifdef __HIPCC__
+#define cudaError_t hipError_t
+#define cudaSuccess hipSuccess
+#define cudaGetErrorString hipGetErrorString
+#define cudaGetLastError hipGetLastError
+#define cudaMalloc hipMalloc
+#define cudaFree hipFree
+#define cudaMemcpy hipMemcpy
+#define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost
+#define cudaMemcpyDefault hipMemcpyDefault
+#define cudaMemset hipMemset
+#define cudaSetDevice hipSetDevice
+#define cudaDeviceSynchronize hipDeviceSynchronize
+#endif
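
A minimal usage sketch (illustrative, not taken from the repository): CUDA-style host code compiles under both nvcc and hipcc once this header is included, since the macros rewrite the API names only when `__HIPCC__` is defined:

```cpp
#include <stdio.h>
#include "hip_compat.h"  // no-op under nvcc; maps cuda* names to hip* under hipcc

int main() {
    float* d_buf = nullptr;
    // Under hipcc these calls expand to hipMalloc/hipFree.
    if (cudaMalloc(&d_buf, 1024 * sizeof(float)) != cudaSuccess) {
        fprintf(stderr, "allocation failed\n");
        return 1;
    }
    cudaFree(d_buf);
    return 0;
}
```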

examples/pi/CMakeLists.txt (+12 -6)

@@ -1,12 +1,18 @@
-cmake_minimum_required(VERSION 3.17)
+cmake_minimum_required(VERSION 3.20)

 set (PROJECT_NAME kernel_float_pi)
-project(${PROJECT_NAME} LANGUAGES CXX CUDA)
-set (CMAKE_CXX_STANDARD 17)
+project(${PROJECT_NAME} LANGUAGES CXX)

+set (CMAKE_CXX_STANDARD 17)
 add_executable(${PROJECT_NAME} "${PROJECT_SOURCE_DIR}/main.cu")
 target_link_libraries(${PROJECT_NAME} kernel_float)
-set_target_properties(${PROJECT_NAME} PROPERTIES CUDA_ARCHITECTURES "80")

-find_package(CUDA REQUIRED)
-target_include_directories(${PROJECT_NAME} PRIVATE ${CUDA_TOOLKIT_INCLUDE})
+if(${KERNEL_FLOAT_LANGUAGE_CUDA})
+    find_package(CUDA REQUIRED)
+    target_include_directories(${PROJECT_NAME} PRIVATE ${CUDA_TOOLKIT_INCLUDE})
+    set_target_properties(${PROJECT_NAME} PROPERTIES CUDA_ARCHITECTURES "80")
+endif()
+
+if(${KERNEL_FLOAT_LANGUAGE_HIP})
+    set_source_files_properties("${PROJECT_SOURCE_DIR}/main.cu" PROPERTIES LANGUAGE HIP)
+endif()

examples/pi/main.cu (+4 -3)

@@ -1,6 +1,7 @@
 #include <stdio.h>
 #include <stdlib.h>

+#include "../hip_compat.h"
 #include "kernel_float.h"

 #define CUDA_CHECK(call) \
@@ -9,12 +10,12 @@
     if (__err != cudaSuccess) { \
         fprintf( \
             stderr, \
-            "CUDA error at %s:%d code=%d(%s) \"%s\" \n", \
+            "CUDA error at %s:%d (%s): %s (code %d) \n", \
             __FILE__, \
             __LINE__, \
-            __err, \
+            #call, \
             cudaGetErrorString(__err), \
-            #call); \
+            __err); \
         exit(EXIT_FAILURE); \
     } \
 } while (0)
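
The reordered format string changes how a failure reads at the call site. With the new version, a hypothetical failing call would report roughly as follows (file, line, and error values are illustrative):

```cpp
// Hypothetical call site:
CUDA_CHECK(cudaMalloc(&ptr, size));
// stderr: CUDA error at main.cu:42 (cudaMalloc(&ptr, size)): out of memory (code 2)
```

That is, the stringified call now appears before the human-readable error string instead of after it.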

examples/vector_add/CMakeLists.txt (+11 -5)

@@ -1,12 +1,18 @@
 cmake_minimum_required(VERSION 3.17)

 set (PROJECT_NAME kernel_float_vecadd)
-project(${PROJECT_NAME} LANGUAGES CXX CUDA)
-set (CMAKE_CXX_STANDARD 17)
+project(${PROJECT_NAME} LANGUAGES CXX)

+set (CMAKE_CXX_STANDARD 17)
 add_executable(${PROJECT_NAME} "${PROJECT_SOURCE_DIR}/main.cu")
 target_link_libraries(${PROJECT_NAME} kernel_float)
-set_target_properties(${PROJECT_NAME} PROPERTIES CUDA_ARCHITECTURES "80")

-find_package(CUDA REQUIRED)
-target_include_directories(${PROJECT_NAME} PRIVATE ${CUDA_TOOLKIT_INCLUDE})
+if(${KERNEL_FLOAT_LANGUAGE_HIP})
+    set_source_files_properties("${PROJECT_SOURCE_DIR}/main.cu" PROPERTIES LANGUAGE HIP)
+endif()
+
+if(${KERNEL_FLOAT_LANGUAGE_CUDA})
+    find_package(CUDA REQUIRED)
+    target_include_directories(${PROJECT_NAME} PRIVATE ${CUDA_TOOLKIT_INCLUDE})
+    set_target_properties(${PROJECT_NAME} PROPERTIES CUDA_ARCHITECTURES "80")
+endif()

examples/vector_add/main.cu (+2 -1)

@@ -3,6 +3,7 @@
 #include <stdexcept>
 #include <vector>

+#include "../hip_compat.h"
 #include "kernel_float.h"
 namespace kf = kernel_float;

@@ -21,7 +22,7 @@ __global__ void my_kernel(
     int i = blockIdx.x * blockDim.x + threadIdx.x;

     if (i * N < length) {
-        output(i) = kf::fma(input[i], input[i], kf::cast<__half>(constant));
+        output[i] = kf::fma(input[i], input[i], kf::cast<half>(constant));
     }
 }
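
The corrected line calls `kf::fma`, an element-wise fused multiply-add over all `N` lanes of the vector. A hedged scalar sketch of what a single lane computes (`fma_lane` is a hypothetical helper, not part of the library):

```cpp
#include <cuda_fp16.h>

// One lane of kf::fma(input[i], input[i], kf::cast<half>(constant)):
// x * x + c as a single fused operation.
__device__ half fma_lane(half x, half c) {
    return __hfma(x, x, c);
}
```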

examples/vector_add_tiling/CMakeLists.txt (+11 -5)

@@ -1,12 +1,18 @@
 cmake_minimum_required(VERSION 3.17)

 set (PROJECT_NAME kernel_float_vecadd_tiling)
-project(${PROJECT_NAME} LANGUAGES CXX CUDA)
-set (CMAKE_CXX_STANDARD 17)
+project(${PROJECT_NAME} LANGUAGES CXX)

+set (CMAKE_CXX_STANDARD 17)
 add_executable(${PROJECT_NAME} "${PROJECT_SOURCE_DIR}/main.cu")
 target_link_libraries(${PROJECT_NAME} kernel_float)
-set_target_properties(${PROJECT_NAME} PROPERTIES CUDA_ARCHITECTURES "80")

-find_package(CUDA REQUIRED)
-target_include_directories(${PROJECT_NAME} PRIVATE ${CUDA_TOOLKIT_INCLUDE})
+if(${KERNEL_FLOAT_LANGUAGE_HIP})
+    set_source_files_properties("${PROJECT_SOURCE_DIR}/main.cu" PROPERTIES LANGUAGE HIP)
+endif()
+
+if(${KERNEL_FLOAT_LANGUAGE_CUDA})
+    find_package(CUDA REQUIRED)
+    target_include_directories(${PROJECT_NAME} PRIVATE ${CUDA_TOOLKIT_INCLUDE})
+    set_target_properties(${PROJECT_NAME} PROPERTIES CUDA_ARCHITECTURES "80")
+endif()

examples/vector_add_tiling/main.cu (+1)

@@ -3,6 +3,7 @@
 #include <stdexcept>
 #include <vector>

+#include "../hip_compat.h"
 #include "kernel_float.h"
 #include "kernel_float/tiling.h"
 namespace kf = kernel_float;

include/kernel_float.h (+1)

@@ -1,6 +1,7 @@
 #ifndef KERNEL_FLOAT_H
 #define KERNEL_FLOAT_H

+#include "kernel_float/approx.h"
 #include "kernel_float/base.h"
 #include "kernel_float/bf16.h"
 #include "kernel_float/binops.h"
