From b1bbd65aaa52af57e47bc3a1185794025237d635 Mon Sep 17 00:00:00 2001 From: przmk Date: Mon, 22 Jul 2024 00:27:24 +0200 Subject: [PATCH] Add uniform buffers --- build.zig | 8 + build.zig.zon | 1 + libs/zmath/LICENSE | 22 + libs/zmath/README.md | 133 + libs/zmath/build.zig | 62 + libs/zmath/build.zig.zon | 10 + libs/zmath/src/benchmark.zig | 469 ++++ libs/zmath/src/main.zig | 16 + libs/zmath/src/util.zig | 188 ++ libs/zmath/src/zmath.zig | 4568 ++++++++++++++++++++++++++++++++++ src/main.zig | 20 + src/shaders/shader.vert | 8 +- src/vulkan_renderer.zig | 194 +- 13 files changed, 5695 insertions(+), 4 deletions(-) create mode 100644 libs/zmath/LICENSE create mode 100644 libs/zmath/README.md create mode 100644 libs/zmath/build.zig create mode 100644 libs/zmath/build.zig.zon create mode 100644 libs/zmath/src/benchmark.zig create mode 100644 libs/zmath/src/main.zig create mode 100644 libs/zmath/src/util.zig create mode 100644 libs/zmath/src/zmath.zig diff --git a/build.zig b/build.zig index b80b57a..17b8214 100644 --- a/build.zig +++ b/build.zig @@ -14,6 +14,8 @@ pub fn build(b: *std.Build) void { .optimize = optimize, }); + // --- Dependencies --- + // Vulkan const vkzig_dep = b.dependency("vulkan_zig", .{ .registry = @as([]const u8, b.pathFromRoot("./vk.xml")), @@ -35,6 +37,12 @@ pub fn build(b: *std.Build) void { sdl_sdk.link(exe, .dynamic, .SDL2); exe.root_module.addImport("sdl2", sdl_sdk.getWrapperModuleVulkan(vkzig_bindings)); + // zmath + const zmath = b.dependency("zmath", .{}); + exe.root_module.addImport("zmath", zmath.module("root")); + + // --- + b.installArtifact(exe); const check = b.step("check", "Check if vulkan-test compiles"); diff --git a/build.zig.zon b/build.zig.zon index 7a2fd1e..911ff6c 100644 --- a/build.zig.zon +++ b/build.zig.zon @@ -4,6 +4,7 @@ .version = "0.1.0", .dependencies = .{ + .zmath = .{ .path = "libs/zmath" }, .vulkan_zig = .{ .url = "https://github.com/Snektron/vulkan-zig/archive/9f6e6177b1fdb3ed22231d9216a24480e84cfa5e.tar.gz", .hash = "1220f2961df224f7d35dee774b26194b8b937cc252fa8e4023407776c58521d53e38", diff --git a/libs/zmath/LICENSE b/libs/zmath/LICENSE new file mode 100644 index 0000000..3db02da --- /dev/null +++ b/libs/zmath/LICENSE @@ -0,0 +1,22 @@ +MIT License + +Copyright (c) 2021 Michal Ziulek +Copyright (c) 2024 zig-gamedev contributors + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/libs/zmath/README.md b/libs/zmath/README.md new file mode 100644 index 0000000..cc2e8f8 --- /dev/null +++ b/libs/zmath/README.md @@ -0,0 +1,133 @@ +# zmath v0.10.0 - SIMD math library for game developers + +Tested on x86_64 and AArch64. + +Provides ~140 optimized routines and ~70 extensive tests. + +Can be used with any graphics API. + +Documentation can be found [here](https://github.com/michal-z/zig-gamedev/blob/main/libs/zmath/src/zmath.zig). + +Benchamrks can be found [here](https://github.com/michal-z/zig-gamedev/blob/main/libs/zmath/src/benchmark.zig). + +An intro article can be found [here](https://zig.news/michalz/fast-multi-platform-simd-math-library-in-zig-2adn). + +## Getting started + +Copy `zmath` into a subdirectory of your project and add the following to your `build.zig.zon` .dependencies: +```zig + .zmath = .{ .path = "libs/zmath" }, +``` + +Then in your `build.zig` add: + +```zig +pub fn build(b: *std.Build) void { + const exe = b.addExecutable(.{ ... }); + + const zmath = b.dependency("zmath", .{}); + exe.root_module.addImport("zmath", zmath.module("root")); +} +``` + +Now in your code you may import and use zmath: + +```zig +const zm = @import("zmath"); + +pub fn main() !void { + // + // OpenGL/Vulkan example + // + const object_to_world = zm.rotationY(..); + const world_to_view = zm.lookAtRh( + zm.f32x4(3.0, 3.0, 3.0, 1.0), // eye position + zm.f32x4(0.0, 0.0, 0.0, 1.0), // focus point + zm.f32x4(0.0, 1.0, 0.0, 0.0), // up direction ('w' coord is zero because this is a vector not a point) + ); + // `perspectiveFovRhGl` produces Z values in [-1.0, 1.0] range (Vulkan app should use `perspectiveFovRh`) + const view_to_clip = zm.perspectiveFovRhGl(0.25 * math.pi, aspect_ratio, 0.1, 20.0); + + const object_to_view = zm.mul(object_to_world, world_to_view); + const object_to_clip = zm.mul(object_to_view, view_to_clip); + + // Transposition is needed because GLSL uses column-major matrices by default + gl.uniformMatrix4fv(0, 1, gl.TRUE, zm.arrNPtr(&object_to_clip)); + + // In GLSL: gl_Position = vec4(in_position, 1.0) * object_to_clip; + + // + // DirectX example + // + const object_to_world = zm.rotationY(..); + const world_to_view = zm.lookAtLh( + zm.f32x4(3.0, 3.0, -3.0, 1.0), // eye position + zm.f32x4(0.0, 0.0, 0.0, 1.0), // focus point + zm.f32x4(0.0, 1.0, 0.0, 0.0), // up direction ('w' coord is zero because this is a vector not a point) + ); + const view_to_clip = zm.perspectiveFovLh(0.25 * math.pi, aspect_ratio, 0.1, 20.0); + + const object_to_view = zm.mul(object_to_world, world_to_view); + const object_to_clip = zm.mul(object_to_view, view_to_clip); + + // Transposition is needed because HLSL uses column-major matrices by default + const mem = allocateUploadMemory(...); + zm.storeMat(mem, zm.transpose(object_to_clip)); + + // In HLSL: out_position_sv = mul(float4(in_position, 1.0), object_to_clip); + + // + // 'WASD' camera movement example + // + { + const speed = zm.f32x4s(10.0); + const delta_time = zm.f32x4s(demo.frame_stats.delta_time); + const transform = zm.mul(zm.rotationX(demo.camera.pitch), zm.rotationY(demo.camera.yaw)); + var forward = zm.normalize3(zm.mul(zm.f32x4(0.0, 0.0, 1.0, 0.0), transform)); + + zm.storeArr3(&demo.camera.forward, forward); + + const right = speed * delta_time * zm.normalize3(zm.cross3(zm.f32x4(0.0, 1.0, 0.0, 0.0), forward)); + forward = speed * delta_time * forward; + + var cam_pos = zm.loadArr3(demo.camera.position); + + if (keyDown('W')) { + cam_pos += forward; + } else if (keyDown('S')) { + cam_pos -= forward; + } + if (keyDown('D')) { + cam_pos += right; + } else if (keyDown('A')) { + cam_pos -= right; + } + + zm.storeArr3(&demo.camera.position, cam_pos); + } + + // + // SIMD wave equation solver example (works with vector width 4, 8 and 16) + // 'T' can be F32x4, F32x8 or F32x16 + // + var z_index: i32 = 0; + while (z_index < grid_size) : (z_index += 1) { + const z = scale * @intToFloat(f32, z_index - grid_size / 2); + const vz = zm.splat(T, z); + + var x_index: i32 = 0; + while (x_index < grid_size) : (x_index += zm.veclen(T)) { + const x = scale * @intToFloat(f32, x_index - grid_size / 2); + const vx = zm.splat(T, x) + voffset * zm.splat(T, scale); + + const d = zm.sqrt(vx * vx + vz * vz); + const vy = zm.sin(d - vtime); + + const index = @intCast(usize, x_index + z_index * grid_size); + zm.store(xslice[index..], vx, 0); + zm.store(yslice[index..], vy, 0); + zm.store(zslice[index..], vz, 0); + } + } +} +``` diff --git a/libs/zmath/build.zig b/libs/zmath/build.zig new file mode 100644 index 0000000..89f44b7 --- /dev/null +++ b/libs/zmath/build.zig @@ -0,0 +1,62 @@ +const std = @import("std"); + +pub fn build(b: *std.Build) void { + const target = b.standardTargetOptions(.{}); + + const options = .{ + .optimize = b.option( + std.builtin.OptimizeMode, + "optimize", + "Select optimization mode", + ) orelse b.standardOptimizeOption(.{ + .preferred_optimize_mode = .ReleaseFast, + }), + .enable_cross_platform_determinism = b.option( + bool, + "enable_cross_platform_determinism", + "Enable cross-platform determinism", + ) orelse true, + }; + + const options_step = b.addOptions(); + inline for (std.meta.fields(@TypeOf(options))) |field| { + options_step.addOption(field.type, field.name, @field(options, field.name)); + } + + const options_module = options_step.createModule(); + + const zmath = b.addModule("root", .{ + .root_source_file = b.path("src/main.zig"), + .imports = &.{ + .{ .name = "zmath_options", .module = options_module }, + }, + }); + + const test_step = b.step("test", "Run zmath tests"); + + const tests = b.addTest(.{ + .name = "zmath-tests", + .root_source_file = b.path("src/main.zig"), + .target = target, + .optimize = options.optimize, + }); + b.installArtifact(tests); + + tests.root_module.addImport("zmath_options", options_module); + + test_step.dependOn(&b.addRunArtifact(tests).step); + + const benchmark_step = b.step("benchmark", "Run zmath benchmarks"); + + const benchmarks = b.addExecutable(.{ + .name = "zmath-benchmarks", + .root_source_file = b.path("src/benchmark.zig"), + .target = target, + .optimize = options.optimize, + }); + b.installArtifact(benchmarks); + + benchmarks.root_module.addImport("zmath", zmath); + + benchmark_step.dependOn(&b.addRunArtifact(benchmarks).step); +} diff --git a/libs/zmath/build.zig.zon b/libs/zmath/build.zig.zon new file mode 100644 index 0000000..dd7cab2 --- /dev/null +++ b/libs/zmath/build.zig.zon @@ -0,0 +1,10 @@ +.{ + .name = "zmath", + .version = "0.10.0", + .paths = .{ + "build.zig", + "build.zig.zon", + "src", + "README.md", + }, +} diff --git a/libs/zmath/src/benchmark.zig b/libs/zmath/src/benchmark.zig new file mode 100644 index 0000000..eb1aa8a --- /dev/null +++ b/libs/zmath/src/benchmark.zig @@ -0,0 +1,469 @@ +// ------------------------------------------------------------------------------------------------- +// zmath - benchmarks +// ------------------------------------------------------------------------------------------------- +// 'zig build benchmark -Doptimize=ReleaseFast' will build and benchmakrs with all optimisations. +// +// ------------------------------------------------------------------------------------------------- +// 'AMD Ryzen 9 3950X 16-Core Processor', Windows 11, Zig 0.10.0-dev.2620+0e9458a3f, ReleaseFast +// ------------------------------------------------------------------------------------------------- +// matrix mul benchmark (AOS) - scalar version: 1.5880s, zmath version: 1.0642s +// cross3, scale, bias benchmark (AOS) - scalar version: 0.9318s, zmath version: 0.6888s +// cross3, dot3, scale, bias benchmark (AOS) - scalar version: 1.2258s, zmath version: 1.1095s +// quaternion mul benchmark (AOS) - scalar version: 1.4123s, zmath version: 0.6958s +// wave benchmark (SOA) - scalar version: 4.8165s, zmath version: 0.7338s +// +// ------------------------------------------------------------------------------------------------- +// 'AMD Ryzen 7 5800X 8-Core Processer', Linux 5.17.14, Zig 0.10.0-dev.2624+d506275a0, ReleaseFast +// ------------------------------------------------------------------------------------------------- +// matrix mul benchmark (AOS) - scalar version: 1.3672s, zmath version: 0.8617s +// cross3, scale, bias benchmark (AOS) - scalar version: 0.6586s, zmath version: 0.4803s +// cross3, dot3, scale, bias benchmark (AOS) - scalar version: 1.0620s, zmath version: 0.8942s +// quaternion mul benchmark (AOS) - scalar version: 1.1324s, zmath version: 0.6064s +// wave benchmark (SOA) - scalar version: 3.6598s, zmath version: 0.4231s +// +// ------------------------------------------------------------------------------------------------- +// 'Apple M1 Max', macOS Version 12.4, Zig 0.10.0-dev.2657+74442f350, ReleaseFast +// ------------------------------------------------------------------------------------------------- +// matrix mul benchmark (AOS) - scalar version: 1.0297s, zmath version: 1.0538s +// cross3, scale, bias benchmark (AOS) - scalar version: 0.6294s, zmath version: 0.6532s +// cross3, dot3, scale, bias benchmark (AOS) - scalar version: 0.9807s, zmath version: 1.0988s +// quaternion mul benchmark (AOS) - scalar version: 1.5413s, zmath version: 0.7800s +// wave benchmark (SOA) - scalar version: 3.4220s, zmath version: 1.0255s +// +// ------------------------------------------------------------------------------------------------- +// '11th Gen Intel(R) Core(TM) i7-11800H @ 2.30GHz', Windows 11, Zig 0.10.0-dev.2620+0e9458a3f, ReleaseFast +// ------------------------------------------------------------------------------------------------- +// matrix mul benchmark (AOS) - scalar version: 2.2308s, zmath version: 0.9376s +// cross3, scale, bias benchmark (AOS) - scalar version: 1.0821s, zmath version: 0.5110s +// cross3, dot3, scale, bias benchmark (AOS) - scalar version: 1.6580s, zmath version: 0.9167s +// quaternion mul benchmark (AOS) - scalar version: 2.0139s, zmath version: 0.5856s +// wave benchmark (SOA) - scalar version: 3.7832s, zmath version: 0.3642s +// +// ------------------------------------------------------------------------------------------------- + +pub fn main() !void { + var gpa = std.heap.GeneralPurposeAllocator(.{}){}; + defer _ = gpa.deinit(); + const allocator = gpa.allocator(); + + // m = mul(ma, mb); data set fits in L1 cache; AOS data layout. + try mat4MulBenchmark(allocator, 100_000); + + // v = 0.01 * cross3(va, vb) + vec3(1.0); data set fits in L1 cache; AOS data layout. + try cross3ScaleBiasBenchmark(allocator, 10_000); + + // v = dot3(va, vb) * (0.1 * cross3(va, vb) + vec3(1.0)); data set fits in L1 cache; AOS data layout. + try cross3Dot3ScaleBiasBenchmark(allocator, 10_000); + + // q = qmul(qa, qb); data set fits in L1 cache; AOS data layout. + try quatBenchmark(allocator, 10_000); + + // d = sqrt(x * x + z * z); y = sin(d - t); SOA layout. + try waveBenchmark(allocator, 1_000); +} + +const std = @import("std"); +const time = std.time; +const Timer = time.Timer; +const zm = @import("zmath"); + +var prng = std.Random.DefaultPrng.init(0); +const random = prng.random(); + +noinline fn mat4MulBenchmark(allocator: std.mem.Allocator, comptime count: comptime_int) !void { + std.debug.print("\n", .{}); + std.debug.print("{s:>42} - ", .{"matrix mul benchmark (AOS)"}); + + var data0 = std.ArrayList([16]f32).init(allocator); + defer data0.deinit(); + var data1 = std.ArrayList([16]f32).init(allocator); + defer data1.deinit(); + + var i: usize = 0; + while (i < 64) : (i += 1) { + try data0.append([16]f32{ + random.float(f32), random.float(f32), random.float(f32), random.float(f32), + random.float(f32), random.float(f32), random.float(f32), random.float(f32), + random.float(f32), random.float(f32), random.float(f32), random.float(f32), + random.float(f32), random.float(f32), random.float(f32), random.float(f32), + }); + try data1.append([16]f32{ + random.float(f32), random.float(f32), random.float(f32), random.float(f32), + random.float(f32), random.float(f32), random.float(f32), random.float(f32), + random.float(f32), random.float(f32), random.float(f32), random.float(f32), + random.float(f32), random.float(f32), random.float(f32), random.float(f32), + }); + } + + // Warmup, fills L1 cache. + i = 0; + while (i < 100) : (i += 1) { + for (data1.items) |b| { + for (data0.items) |a| { + const ma = zm.loadMat(a[0..]); + const mb = zm.loadMat(b[0..]); + const r = zm.mul(ma, mb); + std.mem.doNotOptimizeAway(&r); + } + } + } + + { + i = 0; + var timer = try Timer.start(); + const start = timer.lap(); + while (i < count) : (i += 1) { + for (data1.items) |b| { + for (data0.items) |a| { + const r = [16]f32{ + a[0] * b[0] + a[1] * b[4] + a[2] * b[8] + a[3] * b[12], + a[0] * b[1] + a[1] * b[5] + a[2] * b[9] + a[3] * b[13], + a[0] * b[2] + a[1] * b[6] + a[2] * b[10] + a[3] * b[14], + a[0] * b[3] + a[1] * b[7] + a[2] * b[11] + a[3] * b[15], + a[4] * b[0] + a[5] * b[4] + a[6] * b[8] + a[7] * b[12], + a[4] * b[1] + a[5] * b[5] + a[6] * b[9] + a[7] * b[13], + a[4] * b[2] + a[5] * b[6] + a[6] * b[10] + a[7] * b[14], + a[4] * b[3] + a[5] * b[7] + a[6] * b[11] + a[7] * b[15], + a[8] * b[0] + a[9] * b[4] + a[10] * b[8] + a[11] * b[12], + a[8] * b[1] + a[9] * b[5] + a[10] * b[9] + a[11] * b[13], + a[8] * b[2] + a[9] * b[6] + a[10] * b[10] + a[11] * b[14], + a[8] * b[3] + a[9] * b[7] + a[10] * b[11] + a[11] * b[15], + a[12] * b[0] + a[13] * b[4] + a[14] * b[8] + a[15] * b[12], + a[12] * b[1] + a[13] * b[5] + a[14] * b[9] + a[15] * b[13], + a[12] * b[2] + a[13] * b[6] + a[14] * b[10] + a[15] * b[14], + a[12] * b[3] + a[13] * b[7] + a[14] * b[11] + a[15] * b[15], + }; + std.mem.doNotOptimizeAway(&r); + } + } + } + const end = timer.read(); + const elapsed_s = @as(f64, @floatFromInt(end - start)) / time.ns_per_s; + + std.debug.print("scalar version: {d:.4}s, ", .{elapsed_s}); + } + + { + i = 0; + var timer = try Timer.start(); + const start = timer.lap(); + while (i < count) : (i += 1) { + for (data1.items) |b| { + for (data0.items) |a| { + const ma = zm.loadMat(a[0..]); + const mb = zm.loadMat(b[0..]); + const r = zm.mul(ma, mb); + std.mem.doNotOptimizeAway(&r); + } + } + } + const end = timer.read(); + const elapsed_s = @as(f64, @floatFromInt(end - start)) / time.ns_per_s; + + std.debug.print("zmath version: {d:.4}s\n", .{elapsed_s}); + } +} + +noinline fn cross3ScaleBiasBenchmark(allocator: std.mem.Allocator, comptime count: comptime_int) !void { + std.debug.print("{s:>42} - ", .{"cross3, scale, bias benchmark (AOS)"}); + + var data0 = std.ArrayList([3]f32).init(allocator); + defer data0.deinit(); + var data1 = std.ArrayList([3]f32).init(allocator); + defer data1.deinit(); + + var i: usize = 0; + while (i < 256) : (i += 1) { + try data0.append([3]f32{ random.float(f32), random.float(f32), random.float(f32) }); + try data1.append([3]f32{ random.float(f32), random.float(f32), random.float(f32) }); + } + + // Warmup, fills L1 cache. + i = 0; + while (i < 100) : (i += 1) { + for (data1.items) |b| { + for (data0.items) |a| { + const va = zm.loadArr3(a); + const vb = zm.loadArr3(b); + const cp = zm.f32x4s(0.01) * zm.cross3(va, vb) + zm.f32x4s(1.0); + std.mem.doNotOptimizeAway(&cp); + } + } + } + + { + i = 0; + var timer = try Timer.start(); + const start = timer.lap(); + while (i < count) : (i += 1) { + for (data1.items) |b| { + for (data0.items) |a| { + const r = [3]f32{ + 0.01 * (a[1] * b[2] - a[2] * b[1]) + 1.0, + 0.01 * (a[2] * b[0] - a[0] * b[2]) + 1.0, + 0.01 * (a[0] * b[1] - a[1] * b[0]) + 1.0, + }; + std.mem.doNotOptimizeAway(&r); + } + } + } + const end = timer.read(); + const elapsed_s = @as(f64, @floatFromInt(end - start)) / time.ns_per_s; + + std.debug.print("scalar version: {d:.4}s, ", .{elapsed_s}); + } + + { + i = 0; + var timer = try Timer.start(); + const start = timer.lap(); + while (i < count) : (i += 1) { + for (data1.items) |b| { + for (data0.items) |a| { + const va = zm.loadArr3(a); + const vb = zm.loadArr3(b); + const cp = zm.f32x4s(0.01) * zm.cross3(va, vb) + zm.f32x4s(1.0); + std.mem.doNotOptimizeAway(&cp); + } + } + } + const end = timer.read(); + const elapsed_s = @as(f64, @floatFromInt(end - start)) / time.ns_per_s; + + std.debug.print("zmath version: {d:.4}s\n", .{elapsed_s}); + } +} + +noinline fn cross3Dot3ScaleBiasBenchmark(allocator: std.mem.Allocator, comptime count: comptime_int) !void { + std.debug.print("{s:>42} - ", .{"cross3, dot3, scale, bias benchmark (AOS)"}); + + var data0 = std.ArrayList([3]f32).init(allocator); + defer data0.deinit(); + var data1 = std.ArrayList([3]f32).init(allocator); + defer data1.deinit(); + + var i: usize = 0; + while (i < 256) : (i += 1) { + try data0.append([3]f32{ random.float(f32), random.float(f32), random.float(f32) }); + try data1.append([3]f32{ random.float(f32), random.float(f32), random.float(f32) }); + } + + // Warmup, fills L1 cache. + i = 0; + while (i < 100) : (i += 1) { + for (data1.items) |b| { + for (data0.items) |a| { + const va = zm.loadArr3(a); + const vb = zm.loadArr3(b); + const r = (zm.dot3(va, vb) * (zm.f32x4s(0.1) * zm.cross3(va, vb) + zm.f32x4s(1.0)))[0]; + std.mem.doNotOptimizeAway(&r); + } + } + } + + { + i = 0; + var timer = try Timer.start(); + const start = timer.lap(); + while (i < count) : (i += 1) { + for (data1.items) |b| { + for (data0.items) |a| { + const d = a[0] * b[0] + a[1] * b[1] + a[2] * b[2]; + const r = [3]f32{ + d * (0.1 * (a[1] * b[2] - a[2] * b[1]) + 1.0), + d * (0.1 * (a[2] * b[0] - a[0] * b[2]) + 1.0), + d * (0.1 * (a[0] * b[1] - a[1] * b[0]) + 1.0), + }; + std.mem.doNotOptimizeAway(&r); + } + } + } + const end = timer.read(); + const elapsed_s = @as(f64, @floatFromInt(end - start)) / time.ns_per_s; + + std.debug.print("scalar version: {d:.4}s, ", .{elapsed_s}); + } + + { + i = 0; + var timer = try Timer.start(); + const start = timer.lap(); + while (i < count) : (i += 1) { + for (data1.items) |b| { + for (data0.items) |a| { + const va = zm.loadArr3(a); + const vb = zm.loadArr3(b); + const r = zm.dot3(va, vb) * (zm.f32x4s(0.1) * zm.cross3(va, vb) + zm.f32x4s(1.0)); + std.mem.doNotOptimizeAway(&r); + } + } + } + const end = timer.read(); + const elapsed_s = @as(f64, @floatFromInt(end - start)) / time.ns_per_s; + + std.debug.print("zmath version: {d:.4}s\n", .{elapsed_s}); + } +} + +noinline fn quatBenchmark(allocator: std.mem.Allocator, comptime count: comptime_int) !void { + std.debug.print("{s:>42} - ", .{"quaternion mul benchmark (AOS)"}); + + var data0 = std.ArrayList([4]f32).init(allocator); + defer data0.deinit(); + var data1 = std.ArrayList([4]f32).init(allocator); + defer data1.deinit(); + + var i: usize = 0; + while (i < 256) : (i += 1) { + try data0.append([4]f32{ random.float(f32), random.float(f32), random.float(f32), random.float(f32) }); + try data1.append([4]f32{ random.float(f32), random.float(f32), random.float(f32), random.float(f32) }); + } + + // Warmup, fills L1 cache. + i = 0; + while (i < 100) : (i += 1) { + for (data1.items) |b| { + for (data0.items) |a| { + const va = zm.loadArr4(a); + const vb = zm.loadArr4(b); + const r = zm.qmul(va, vb); + std.mem.doNotOptimizeAway(&r); + } + } + } + + { + i = 0; + var timer = try Timer.start(); + const start = timer.lap(); + while (i < count) : (i += 1) { + for (data1.items) |b| { + for (data0.items) |a| { + const r = [4]f32{ + (b[3] * a[0]) + (b[0] * a[3]) + (b[1] * a[2]) - (b[2] * a[1]), + (b[3] * a[1]) - (b[0] * a[2]) + (b[1] * a[3]) + (b[2] * a[0]), + (b[3] * a[2]) + (b[0] * a[1]) - (b[1] * a[0]) + (b[2] * a[3]), + (b[3] * a[3]) - (b[0] * a[0]) - (b[1] * a[1]) - (b[2] * a[2]), + }; + std.mem.doNotOptimizeAway(&r); + } + } + } + const end = timer.read(); + const elapsed_s = @as(f64, @floatFromInt(end - start)) / time.ns_per_s; + + std.debug.print("scalar version: {d:.4}s, ", .{elapsed_s}); + } + + { + i = 0; + var timer = try Timer.start(); + const start = timer.lap(); + while (i < count) : (i += 1) { + for (data1.items) |b| { + for (data0.items) |a| { + const va = zm.loadArr4(a); + const vb = zm.loadArr4(b); + const r = zm.qmul(va, vb); + std.mem.doNotOptimizeAway(&r); + } + } + } + const end = timer.read(); + const elapsed_s = @as(f64, @floatFromInt(end - start)) / time.ns_per_s; + + std.debug.print("zmath version: {d:.4}s\n", .{elapsed_s}); + } +} + +noinline fn waveBenchmark(allocator: std.mem.Allocator, comptime count: comptime_int) !void { + _ = allocator; + std.debug.print("{s:>42} - ", .{"wave benchmark (SOA)"}); + + const grid_size = 1024; + { + var t: f32 = 0.0; + + const scale: f32 = 0.05; + + var timer = try Timer.start(); + const start = timer.lap(); + + var iter: usize = 0; + while (iter < count) : (iter += 1) { + var z_index: i32 = 0; + while (z_index < grid_size) : (z_index += 1) { + const z = scale * @as(f32, @floatFromInt(z_index - grid_size / 2)); + + var x_index: i32 = 0; + while (x_index < grid_size) : (x_index += 4) { + const x0 = scale * @as(f32, @floatFromInt(x_index + 0 - grid_size / 2)); + const x1 = scale * @as(f32, @floatFromInt(x_index + 1 - grid_size / 2)); + const x2 = scale * @as(f32, @floatFromInt(x_index + 2 - grid_size / 2)); + const x3 = scale * @as(f32, @floatFromInt(x_index + 3 - grid_size / 2)); + + const d0 = zm.sqrt(x0 * x0 + z * z); + const d1 = zm.sqrt(x1 * x1 + z * z); + const d2 = zm.sqrt(x2 * x2 + z * z); + const d3 = zm.sqrt(x3 * x3 + z * z); + + const y0 = zm.sin(d0 - t); + const y1 = zm.sin(d1 - t); + const y2 = zm.sin(d2 - t); + const y3 = zm.sin(d3 - t); + + std.mem.doNotOptimizeAway(&y0); + std.mem.doNotOptimizeAway(&y1); + std.mem.doNotOptimizeAway(&y2); + std.mem.doNotOptimizeAway(&y3); + } + } + t += 0.001; + } + const end = timer.read(); + const elapsed_s = @as(f64, @floatFromInt(end - start)) / time.ns_per_s; + + std.debug.print("scalar version: {d:.4}s, ", .{elapsed_s}); + } + + { + const T = zm.F32x16; + + const static = struct { + const offsets = [16]f32{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }; + }; + const voffset = zm.load(static.offsets[0..], T, 0); + var vt = zm.splat(T, 0.0); + + const scale: f32 = 0.05; + + var timer = try Timer.start(); + const start = timer.lap(); + + var iter: usize = 0; + while (iter < count) : (iter += 1) { + var z_index: i32 = 0; + while (z_index < grid_size) : (z_index += 1) { + const z = scale * @as(f32, @floatFromInt(z_index - grid_size / 2)); + const vz = zm.splat(T, z); + + var x_index: i32 = 0; + while (x_index < grid_size) : (x_index += zm.veclen(T)) { + const x = scale * @as(f32, @floatFromInt(x_index - grid_size / 2)); + const vx = zm.splat(T, x) + voffset * zm.splat(T, scale); + + const d = zm.sqrt(vx * vx + vz * vz); + + const vy = zm.sin(d - vt); + + std.mem.doNotOptimizeAway(&vy); + } + } + vt += zm.splat(T, 0.001); + } + const end = timer.read(); + const elapsed_s = @as(f64, @floatFromInt(end - start)) / time.ns_per_s; + + std.debug.print("zmath version: {d:.4}s\n", .{elapsed_s}); + } +} diff --git a/libs/zmath/src/main.zig b/libs/zmath/src/main.zig new file mode 100644 index 0000000..3158a6c --- /dev/null +++ b/libs/zmath/src/main.zig @@ -0,0 +1,16 @@ +//-------------------------------------------------------------------------------------------------- +// +// SIMD math library for game developers +// https://github.com/michal-z/zig-gamedev/tree/main/libs/zmath +// +// See zmath.zig for more details. +// See util.zig for additional functionality. +// +//-------------------------------------------------------------------------------------------------- +pub usingnamespace @import("zmath.zig"); +pub const util = @import("util.zig"); + +// ensure transitive closure of test coverage +comptime { + _ = util; +} diff --git a/libs/zmath/src/util.zig b/libs/zmath/src/util.zig new file mode 100644 index 0000000..3a626ef --- /dev/null +++ b/libs/zmath/src/util.zig @@ -0,0 +1,188 @@ +// ============================================================================== +// +// Collection of useful functions building on top of, and extending, core zmath. +// https://github.com/michal-z/zig-gamedev/tree/main/libs/zmath +// +// ------------------------------------------------------------------------------ +// 1. Matrix functions +// ------------------------------------------------------------------------------ +// +// As an example, in a left handed Y-up system: +// getAxisX is equivalent to the right vector +// getAxisY is equivalent to the up vector +// getAxisZ is equivalent to the forward vector +// +// getTranslationVec(m: Mat) Vec +// getAxisX(m: Mat) Vec +// getAxisY(m: Mat) Vec +// getAxisZ(m: Mat) Vec +// +// ============================================================================== + +const zm = @import("zmath.zig"); +const std = @import("std"); +const math = std.math; +const expect = std.testing.expect; + +pub fn getTranslationVec(m: zm.Mat) zm.Vec { + var translation = m[3]; + translation[3] = 0; + return translation; +} + +pub fn setTranslationVec(m: *zm.Mat, translation: zm.Vec) void { + const w = m[3][3]; + m[3] = translation; + m[3][3] = w; +} + +pub fn getScaleVec(m: zm.Mat) zm.Vec { + const scale_x = zm.length3(zm.f32x4(m[0][0], m[1][0], m[2][0], 0))[0]; + const scale_y = zm.length3(zm.f32x4(m[0][1], m[1][1], m[2][1], 0))[0]; + const scale_z = zm.length3(zm.f32x4(m[0][2], m[1][2], m[2][2], 0))[0]; + return zm.f32x4(scale_x, scale_y, scale_z, 0); +} + +pub fn getRotationQuat(_m: zm.Mat) zm.Quat { + // Ortho normalize given matrix. + const c1 = zm.normalize3(zm.f32x4(_m[0][0], _m[1][0], _m[2][0], 0)); + const c2 = zm.normalize3(zm.f32x4(_m[0][1], _m[1][1], _m[2][1], 0)); + const c3 = zm.normalize3(zm.f32x4(_m[0][2], _m[1][2], _m[2][2], 0)); + var m = _m; + m[0][0] = c1[0]; + m[1][0] = c1[1]; + m[2][0] = c1[2]; + m[0][1] = c2[0]; + m[1][1] = c2[1]; + m[2][1] = c2[2]; + m[0][2] = c3[0]; + m[1][2] = c3[1]; + m[2][2] = c3[2]; + + // Extract rotation + return zm.quatFromMat(m); +} + +pub fn getAxisX(m: zm.Mat) zm.Vec { + return zm.normalize3(zm.f32x4(m[0][0], m[0][1], m[0][2], 0.0)); +} + +pub fn getAxisY(m: zm.Mat) zm.Vec { + return zm.normalize3(zm.f32x4(m[1][0], m[1][1], m[1][2], 0.0)); +} + +pub fn getAxisZ(m: zm.Mat) zm.Vec { + return zm.normalize3(zm.f32x4(m[2][0], m[2][1], m[2][2], 0.0)); +} + +test "zmath.util.mat.translation" { + // zig fmt: off + const mat_data = [18]f32{ + 1.0, + 2.0, 3.0, 4.0, 5.0, + 6.0, 7.0, 8.0, 9.0, + 10.0,11.0, 12.0,13.0, + 14.0, 15.0, 16.0, 17.0, + 18.0, + }; + // zig fmt: on + const mat = zm.loadMat(mat_data[1..]); + const translation = getTranslationVec(mat); + try zm.expectVecApproxEqAbs(translation, zm.f32x4(14.0, 15.0, 16.0, 0.0), 0.0001); +} + +test "zmath.util.mat.scale" { + const mat = zm.mul(zm.scaling(3, 4, 5), zm.translation(6, 7, 8)); + const scale = getScaleVec(mat); + try zm.expectVecApproxEqAbs(scale, zm.f32x4(3.0, 4.0, 5.0, 0.0), 0.0001); +} + +test "zmath.util.mat.rotation" { + const rotate_origin = zm.matFromRollPitchYaw(0.1, 1.2, 2.3); + const mat = zm.mul(zm.mul(rotate_origin, zm.scaling(3, 4, 5)), zm.translation(6, 7, 8)); + const rotate_get = getRotationQuat(mat); + const v0 = zm.mul(zm.f32x4s(1), rotate_origin); + const v1 = zm.mul(zm.f32x4s(1), zm.quatToMat(rotate_get)); + try zm.expectVecApproxEqAbs(v0, v1, 0.0001); +} + +test "zmath.util.mat.z_vec" { + const degToRad = std.math.degreesToRadians; + var identity = zm.identity(); + var z_vec = getAxisZ(identity); + try zm.expectVecApproxEqAbs(z_vec, zm.f32x4(0.0, 0.0, 1.0, 0), 0.0001); + const rot_yaw = zm.rotationY(degToRad(90)); + identity = zm.mul(identity, rot_yaw); + z_vec = getAxisZ(identity); + try zm.expectVecApproxEqAbs(z_vec, zm.f32x4(1.0, 0.0, 0.0, 0), 0.0001); +} + +test "zmath.util.mat.y_vec" { + const degToRad = std.math.degreesToRadians; + var identity = zm.identity(); + var y_vec = getAxisY(identity); + try zm.expectVecApproxEqAbs(y_vec, zm.f32x4(0.0, 1.0, 0.0, 0), 0.01); + const rot_yaw = zm.rotationY(degToRad(90)); + identity = zm.mul(identity, rot_yaw); + y_vec = getAxisY(identity); + try zm.expectVecApproxEqAbs(y_vec, zm.f32x4(0.0, 1.0, 0.0, 0), 0.01); + const rot_pitch = zm.rotationX(degToRad(90)); + identity = zm.mul(identity, rot_pitch); + y_vec = getAxisY(identity); + try zm.expectVecApproxEqAbs(y_vec, zm.f32x4(0.0, 0.0, 1.0, 0), 0.01); +} + +test "zmath.util.mat.right" { + const degToRad = std.math.degreesToRadians; + var identity = zm.identity(); + var right = getAxisX(identity); + try zm.expectVecApproxEqAbs(right, zm.f32x4(1.0, 0.0, 0.0, 0), 0.01); + const rot_yaw = zm.rotationY(degToRad(90)); + identity = zm.mul(identity, rot_yaw); + right = getAxisX(identity); + try zm.expectVecApproxEqAbs(right, zm.f32x4(0.0, 0.0, -1.0, 0), 0.01); + const rot_pitch = zm.rotationX(degToRad(90)); + identity = zm.mul(identity, rot_pitch); + right = getAxisX(identity); + try zm.expectVecApproxEqAbs(right, zm.f32x4(0.0, 1.0, 0.0, 0), 0.01); +} + +// ------------------------------------------------------------------------------ +// This software is available under 2 licenses -- choose whichever you prefer. +// ------------------------------------------------------------------------------ +// ALTERNATIVE A - MIT License +// Copyright (c) 2022 Michal Ziulek and Contributors +// Permission is hereby granted, free of charge, to any person obtaining identity copy of +// this software and associated documentation files (the "Software"), to deal in +// the Software without restriction, including without limitation the rights to +// use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +// of the Software, and to permit persons to whom the Software is furnished to do +// so, subject to the following conditions: +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. +// ------------------------------------------------------------------------------ +// ALTERNATIVE B - Public Domain (www.unlicense.org) +// This is free and unencumbered software released into the public domain. +// Anyone is free to copy, modify, publish, use, compile, sell, or distribute this +// software, either in source code form or as identity compiled binary, for any purpose, +// commercial or non-commercial, and by any means. +// In jurisdictions that recognize copyright laws, the author or authors of this +// software dedicate any and all copyright interest in the software to the public +// domain. We make this dedication for the benefit of the public at large and to +// the detriment of our heirs and successors. We intend this dedication to be an +// overt act of relinquishment in perpetuity of all present and future rights to +// this software under copyright law. +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN +// ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION +// WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +// ------------------------------------------------------------------------------ diff --git a/libs/zmath/src/zmath.zig b/libs/zmath/src/zmath.zig new file mode 100644 index 0000000..e38eebf --- /dev/null +++ b/libs/zmath/src/zmath.zig @@ -0,0 +1,4568 @@ +// ============================================================================== +// +// SIMD math library for game developers +// https://github.com/michal-z/zig-gamedev/tree/main/libs/zmath +// +// Should work on all OSes supported by Zig. Works on x86_64 and ARM. +// Provides ~140 optimized routines and ~70 extensive tests. +// Can be used with any graphics API. +// +// zmath uses row-major matrices, row vectors (each row vector is stored in a SIMD register). +// Handedness is determined by which function version is used (Rh vs. Lh), +// otherwise the function works with either left-handed or right-handed view coordinates. +// +// const va = f32x4(1.0, 2.0, 3.0, 1.0); +// const vb = f32x4(-1.0, 1.0, -1.0, 1.0); +// const v0 = va + vb - f32x4(0.0, 1.0, 0.0, 1.0) * f32x4s(3.0); +// const v1 = cross3(va, vb) + f32x4(1.0, 1.0, 1.0, 1.0); +// const v2 = va + dot3(va, vb) / v1; // dotN() returns scalar replicated on all vector components +// +// const m = rotationX(math.pi * 0.25); +// const v = f32x4(...); +// const v0 = mul(v, m); // 'v' treated as a row vector +// const v1 = mul(m, v); // 'v' treated as a column vector +// const f = m[row][column]; +// +// const b = va < vb; +// if (all(b, 0)) { ... } // '0' means check all vector components; if all are 'true' +// if (all(b, 3)) { ... } // '3' means check first three vector components; if all first three are 'true' +// if (any(b, 0)) { ... } // '0' means check all vector components; if any is 'true' +// if (any(b, 3)) { ... } // '3' means check first three vector components; if any from first three is 'true' +// +// var v4 = load(mem[0..], F32x4, 0); +// var v8 = load(mem[100..], F32x8, 0); +// var v16 = load(mem[200..], F32x16, 0); +// +// var camera_position = [3]f32{ 1.0, 2.0, 3.0 }; +// var cam_pos = loadArr3(camera_position); +// ... +// storeArr3(&camera_position, cam_pos); +// +// v4 = sin(v4); // SIMDx4 +// v8 = cos(v8); // .x86_64 -> 2 x SIMDx4, .x86_64+avx+fma -> SIMDx8 +// v16 = atan(v16); // .x86_64 -> 4 x SIMDx4, .x86_64+avx+fma -> 2 x SIMDx8, .x86_64+avx512f -> SIMDx16 +// +// store(mem[0..], v4, 0); +// store(mem[100..], v8, 0); +// store(mem[200..], v16, 0); +// +// ------------------------------------------------------------------------------ +// 1. Initialization functions +// ------------------------------------------------------------------------------ +// +// f32x4(e0: f32, e1: f32, e2: f32, e3: f32) F32x4 +// f32x8(e0: f32, e1: f32, e2: f32, e3: f32, e4: f32, e5: f32, e6: f32, e7: f32) F32x8 +// f32x16(e0: f32, e1: f32, e2: f32, e3: f32, e4: f32, e5: f32, e6: f32, e7: f32, +// e8: f32, e9: f32, ea: f32, eb: f32, ec: f32, ed: f32, ee: f32, ef: f32) F32x16 +// +// f32x4s(e0: f32) F32x4 +// f32x8s(e0: f32) F32x8 +// f32x16s(e0: f32) F32x16 +// +// boolx4(e0: bool, e1: bool, e2: bool, e3: bool) Boolx4 +// boolx8(e0: bool, e1: bool, e2: bool, e3: bool, e4: bool, e5: bool, e6: bool, e7: bool) Boolx8 +// boolx16(e0: bool, e1: bool, e2: bool, e3: bool, e4: bool, e5: bool, e6: bool, e7: bool, +// e8: bool, e9: bool, ea: bool, eb: bool, ec: bool, ed: bool, ee: bool, ef: bool) Boolx16 +// +// load(mem: []const f32, comptime T: type, comptime len: u32) T +// store(mem: []f32, v: anytype, comptime len: u32) void +// +// loadArr2(arr: [2]f32) F32x4 +// loadArr2zw(arr: [2]f32, z: f32, w: f32) F32x4 +// loadArr3(arr: [3]f32) F32x4 +// loadArr3w(arr: [3]f32, w: f32) F32x4 +// loadArr4(arr: [4]f32) F32x4 +// +// storeArr2(arr: *[2]f32, v: F32x4) void +// storeArr3(arr: *[3]f32, v: F32x4) void +// storeArr4(arr: *[4]f32, v: F32x4) void +// +// arr3Ptr(ptr: anytype) *const [3]f32 +// arrNPtr(ptr: anytype) [*]const f32 +// +// splat(comptime T: type, value: f32) T +// splatInt(comptime T: type, value: u32) T +// +// ------------------------------------------------------------------------------ +// 2. Functions that work on all vector components (F32xN = F32x4 or F32x8 or F32x16) +// ------------------------------------------------------------------------------ +// +// all(vb: anytype, comptime len: u32) bool +// any(vb: anytype, comptime len: u32) bool +// +// isNearEqual(v0: F32xN, v1: F32xN, epsilon: F32xN) BoolxN +// isNan(v: F32xN) BoolxN +// isInf(v: F32xN) BoolxN +// isInBounds(v: F32xN, bounds: F32xN) BoolxN +// +// andInt(v0: F32xN, v1: F32xN) F32xN +// andNotInt(v0: F32xN, v1: F32xN) F32xN +// orInt(v0: F32xN, v1: F32xN) F32xN +// norInt(v0: F32xN, v1: F32xN) F32xN +// xorInt(v0: F32xN, v1: F32xN) F32xN +// +// minFast(v0: F32xN, v1: F32xN) F32xN +// maxFast(v0: F32xN, v1: F32xN) F32xN +// min(v0: F32xN, v1: F32xN) F32xN +// max(v0: F32xN, v1: F32xN) F32xN +// round(v: F32xN) F32xN +// floor(v: F32xN) F32xN +// trunc(v: F32xN) F32xN +// ceil(v: F32xN) F32xN +// clamp(v0: F32xN, v1: F32xN) F32xN +// clampFast(v0: F32xN, v1: F32xN) F32xN +// saturate(v: F32xN) F32xN +// saturateFast(v: F32xN) F32xN +// lerp(v0: F32xN, v1: F32xN, t: f32) F32xN +// lerpV(v0: F32xN, v1: F32xN, t: F32xN) F32xN +// lerpInverse(v0: F32xN, v1: F32xN, t: f32) F32xN +// lerpInverseV(v0: F32xN, v1: F32xN, t: F32xN) F32xN +// mapLinear(v: F32xN, min1: f32, max1: f32, min2: f32, max2: f32) F32xN +// mapLinearV(v: F32xN, min1: F32xN, max1: F32xN, min2: F32xN, max2: F32xN) F32xN +// sqrt(v: F32xN) F32xN +// abs(v: F32xN) F32xN +// mod(v0: F32xN, v1: F32xN) F32xN +// modAngle(v: F32xN) F32xN +// mulAdd(v0: F32xN, v1: F32xN, v2: F32xN) F32xN +// select(mask: BoolxN, v0: F32xN, v1: F32xN) +// sin(v: F32xN) F32xN +// cos(v: F32xN) F32xN +// sincos(v: F32xN) [2]F32xN +// asin(v: F32xN) F32xN +// acos(v: F32xN) F32xN +// atan(v: F32xN) F32xN +// atan2(vy: F32xN, vx: F32xN) F32xN +// cmulSoa(re0: F32xN, im0: F32xN, re1: F32xN, im1: F32xN) [2]F32xN +// +// ------------------------------------------------------------------------------ +// 3. 2D, 3D, 4D vector functions +// ------------------------------------------------------------------------------ +// +// swizzle(v: Vec, c, c, c, c) Vec (comptime c = .x | .y | .z | .w) +// dot2(v0: Vec, v1: Vec) F32x4 +// dot3(v0: Vec, v1: Vec) F32x4 +// dot4(v0: Vec, v1: Vec) F32x4 +// cross3(v0: Vec, v1: Vec) Vec +// lengthSq2(v: Vec) F32x4 +// lengthSq3(v: Vec) F32x4 +// lengthSq4(v: Vec) F32x4 +// length2(v: Vec) F32x4 +// length3(v: Vec) F32x4 +// length4(v: Vec) F32x4 +// normalize2(v: Vec) Vec +// normalize3(v: Vec) Vec +// normalize4(v: Vec) Vec +// +// vecToArr2(v: Vec) [2]f32 +// vecToArr3(v: Vec) [3]f32 +// vecToArr4(v: Vec) [4]f32 +// +// ------------------------------------------------------------------------------ +// 4. Matrix functions +// ------------------------------------------------------------------------------ +// +// identity() Mat +// mul(m0: Mat, m1: Mat) Mat +// mul(s: f32, m: Mat) Mat +// mul(m: Mat, s: f32) Mat +// mul(v: Vec, m: Mat) Vec +// mul(m: Mat, v: Vec) Vec +// transpose(m: Mat) Mat +// rotationX(angle: f32) Mat +// rotationY(angle: f32) Mat +// rotationZ(angle: f32) Mat +// translation(x: f32, y: f32, z: f32) Mat +// translationV(v: Vec) Mat +// scaling(x: f32, y: f32, z: f32) Mat +// scalingV(v: Vec) Mat +// lookToLh(eyepos: Vec, eyedir: Vec, updir: Vec) Mat +// lookAtLh(eyepos: Vec, focuspos: Vec, updir: Vec) Mat +// lookToRh(eyepos: Vec, eyedir: Vec, updir: Vec) Mat +// lookAtRh(eyepos: Vec, focuspos: Vec, updir: Vec) Mat +// perspectiveFovLh(fovy: f32, aspect: f32, near: f32, far: f32) Mat +// perspectiveFovRh(fovy: f32, aspect: f32, near: f32, far: f32) Mat +// perspectiveFovLhGl(fovy: f32, aspect: f32, near: f32, far: f32) Mat +// perspectiveFovRhGl(fovy: f32, aspect: f32, near: f32, far: f32) Mat +// orthographicLh(w: f32, h: f32, near: f32, far: f32) Mat +// orthographicRh(w: f32, h: f32, near: f32, far: f32) Mat +// orthographicLhGl(w: f32, h: f32, near: f32, far: f32) Mat +// orthographicRhGl(w: f32, h: f32, near: f32, far: f32) Mat +// orthographicOffCenterLh(left: f32, right: f32, top: f32, bottom: f32, near: f32, far: f32) Mat +// orthographicOffCenterRh(left: f32, right: f32, top: f32, bottom: f32, near: f32, far: f32) Mat +// orthographicOffCenterLhGl(left: f32, right: f32, top: f32, bottom: f32, near: f32, far: f32) Mat +// orthographicOffCenterRhGl(left: f32, right: f32, top: f32, bottom: f32, near: f32, far: f32) Mat +// determinant(m: Mat) F32x4 +// inverse(m: Mat) Mat +// inverseDet(m: Mat, det: ?*F32x4) Mat +// matToQuat(m: Mat) Quat +// matFromAxisAngle(axis: Vec, angle: f32) Mat +// matFromNormAxisAngle(axis: Vec, angle: f32) Mat +// matFromQuat(quat: Quat) Mat +// matFromRollPitchYaw(pitch: f32, yaw: f32, roll: f32) Mat +// matFromRollPitchYawV(angles: Vec) Mat +// matFromArr(arr: [16]f32) Mat +// +// loadMat(mem: []const f32) Mat +// loadMat43(mem: []const f32) Mat +// loadMat34(mem: []const f32) Mat +// storeMat(mem: []f32, m: Mat) void +// storeMat43(mem: []f32, m: Mat) void +// storeMat34(mem: []f32, m: Mat) void +// +// matToArr(m: Mat) [16]f32 +// matToArr43(m: Mat) [12]f32 +// matToArr34(m: Mat) [12]f32 +// +// ------------------------------------------------------------------------------ +// 5. Quaternion functions +// ------------------------------------------------------------------------------ +// +// qmul(q0: Quat, q1: Quat) Quat +// qidentity() Quat +// conjugate(quat: Quat) Quat +// inverse(q: Quat) Quat +// rotate(q: Quat, v: Vec) Vec +// slerp(q0: Quat, q1: Quat, t: f32) Quat +// slerpV(q0: Quat, q1: Quat, t: F32x4) Quat +// quatToMat(quat: Quat) Mat +// quatToAxisAngle(quat: Quat, axis: *Vec, angle: *f32) void +// quatFromMat(m: Mat) Quat +// quatFromAxisAngle(axis: Vec, angle: f32) Quat +// quatFromNormAxisAngle(axis: Vec, angle: f32) Quat +// quatFromRollPitchYaw(pitch: f32, yaw: f32, roll: f32) Quat +// quatFromRollPitchYawV(angles: Vec) Quat +// +// ------------------------------------------------------------------------------ +// 6. Color functions +// ------------------------------------------------------------------------------ +// +// adjustSaturation(color: F32x4, saturation: f32) F32x4 +// adjustContrast(color: F32x4, contrast: f32) F32x4 +// rgbToHsl(rgb: F32x4) F32x4 +// hslToRgb(hsl: F32x4) F32x4 +// rgbToHsv(rgb: F32x4) F32x4 +// hsvToRgb(hsv: F32x4) F32x4 +// rgbToSrgb(rgb: F32x4) F32x4 +// srgbToRgb(srgb: F32x4) F32x4 +// +// ------------------------------------------------------------------------------ +// X. Misc functions +// ------------------------------------------------------------------------------ +// +// linePointDistance(linept0: Vec, linept1: Vec, pt: Vec) F32x4 +// sin(v: f32) f32 +// cos(v: f32) f32 +// sincos(v: f32) [2]f32 +// asin(v: f32) f32 +// acos(v: f32) f32 +// +// fftInitUnityTable(unitytable: []F32x4) void +// fft(re: []F32x4, im: []F32x4, unitytable: []const F32x4) void +// ifft(re: []F32x4, im: []const F32x4, unitytable: []const F32x4) void +// +// ============================================================================== + +// Fundamental types +pub const F32x4 = @Vector(4, f32); +pub const F32x8 = @Vector(8, f32); +pub const F32x16 = @Vector(16, f32); +pub const Boolx4 = @Vector(4, bool); +pub const Boolx8 = @Vector(8, bool); +pub const Boolx16 = @Vector(16, bool); + +// "Higher-level" aliases +pub const Vec = F32x4; +pub const Mat = [4]F32x4; +pub const Quat = F32x4; + +const builtin = @import("builtin"); +const std = @import("std"); +const math = std.math; +const assert = std.debug.assert; +const expect = std.testing.expect; + +const cpu_arch = builtin.cpu.arch; +const has_avx = if (cpu_arch == .x86_64) std.Target.x86.featureSetHas(builtin.cpu.features, .avx) else false; +const has_avx512f = if (cpu_arch == .x86_64) std.Target.x86.featureSetHas(builtin.cpu.features, .avx512f) else false; +const has_fma = if (cpu_arch == .x86_64) std.Target.x86.featureSetHas(builtin.cpu.features, .fma) else false; +// ------------------------------------------------------------------------------ +// +// 1. Initialization functions +// +// ------------------------------------------------------------------------------ +pub inline fn f32x4(e0: f32, e1: f32, e2: f32, e3: f32) F32x4 { + return .{ e0, e1, e2, e3 }; +} +pub inline fn f32x8(e0: f32, e1: f32, e2: f32, e3: f32, e4: f32, e5: f32, e6: f32, e7: f32) F32x8 { + return .{ e0, e1, e2, e3, e4, e5, e6, e7 }; +} +// zig fmt: off +pub inline fn f32x16( + e0: f32, e1: f32, e2: f32, e3: f32, e4: f32, e5: f32, e6: f32, e7: f32, + e8: f32, e9: f32, ea: f32, eb: f32, ec: f32, ed: f32, ee: f32, ef: f32) F32x16 { + return .{ e0, e1, e2, e3, e4, e5, e6, e7, e8, e9, ea, eb, ec, ed, ee, ef }; +} +// zig fmt: on + +pub inline fn f32x4s(e0: f32) F32x4 { + return splat(F32x4, e0); +} +pub inline fn f32x8s(e0: f32) F32x8 { + return splat(F32x8, e0); +} +pub inline fn f32x16s(e0: f32) F32x16 { + return splat(F32x16, e0); +} + +pub inline fn boolx4(e0: bool, e1: bool, e2: bool, e3: bool) Boolx4 { + return .{ e0, e1, e2, e3 }; +} +pub inline fn boolx8(e0: bool, e1: bool, e2: bool, e3: bool, e4: bool, e5: bool, e6: bool, e7: bool) Boolx8 { + return .{ e0, e1, e2, e3, e4, e5, e6, e7 }; +} +// zig fmt: off +pub inline fn boolx16( + e0: bool, e1: bool, e2: bool, e3: bool, e4: bool, e5: bool, e6: bool, e7: bool, + e8: bool, e9: bool, ea: bool, eb: bool, ec: bool, ed: bool, ee: bool, ef: bool) Boolx16 { + return .{ e0, e1, e2, e3, e4, e5, e6, e7, e8, e9, ea, eb, ec, ed, ee, ef }; +} +// zig fmt: on + +pub inline fn veclen(comptime T: type) comptime_int { + return @typeInfo(T).Vector.len; +} + +pub inline fn splat(comptime T: type, value: f32) T { + return @splat(value); +} +pub inline fn splatInt(comptime T: type, value: u32) T { + return @splat(@bitCast(value)); +} + +pub fn load(mem: []const f32, comptime T: type, comptime len: u32) T { + var v = splat(T, 0.0); + const loop_len = if (len == 0) veclen(T) else len; + comptime var i: u32 = 0; + inline while (i < loop_len) : (i += 1) { + v[i] = mem[i]; + } + return v; +} +test "zmath.load" { + const a = [7]f32{ 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0 }; + var ptr = &a; + var i: u32 = 0; + const v0 = load(a[i..], F32x4, 2); + try expectVecEqual(v0, F32x4{ 1.0, 2.0, 0.0, 0.0 }); + i += 2; + const v1 = load(a[i .. i + 2], F32x4, 2); + try expectVecEqual(v1, F32x4{ 3.0, 4.0, 0.0, 0.0 }); + const v2 = load(a[5..7], F32x4, 2); + try expectVecEqual(v2, F32x4{ 6.0, 7.0, 0.0, 0.0 }); + const v3 = load(ptr[1..], F32x4, 2); + try expectVecEqual(v3, F32x4{ 2.0, 3.0, 0.0, 0.0 }); + i += 1; + const v4 = load(ptr[i .. i + 2], F32x4, 2); + try expectVecEqual(v4, F32x4{ 4.0, 5.0, 0.0, 0.0 }); +} + +pub fn store(mem: []f32, v: anytype, comptime len: u32) void { + const T = @TypeOf(v); + const loop_len = if (len == 0) veclen(T) else len; + comptime var i: u32 = 0; + inline while (i < loop_len) : (i += 1) { + mem[i] = v[i]; + } +} +test "zmath.store" { + var a = [7]f32{ 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0 }; + const v = load(a[1..], F32x4, 3); + store(a[2..], v, 4); + try expect(a[0] == 1.0); + try expect(a[1] == 2.0); + try expect(a[2] == 2.0); + try expect(a[3] == 3.0); + try expect(a[4] == 4.0); + try expect(a[5] == 0.0); +} + +pub inline fn loadArr2(arr: [2]f32) F32x4 { + return f32x4(arr[0], arr[1], 0.0, 0.0); +} +pub inline fn loadArr2zw(arr: [2]f32, z: f32, w: f32) F32x4 { + return f32x4(arr[0], arr[1], z, w); +} +pub inline fn loadArr3(arr: [3]f32) F32x4 { + return f32x4(arr[0], arr[1], arr[2], 0.0); +} +pub inline fn loadArr3w(arr: [3]f32, w: f32) F32x4 { + return f32x4(arr[0], arr[1], arr[2], w); +} +pub inline fn loadArr4(arr: [4]f32) F32x4 { + return f32x4(arr[0], arr[1], arr[2], arr[3]); +} + +pub inline fn storeArr2(arr: *[2]f32, v: F32x4) void { + arr.* = .{ v[0], v[1] }; +} +pub inline fn storeArr3(arr: *[3]f32, v: F32x4) void { + arr.* = .{ v[0], v[1], v[2] }; +} +pub inline fn storeArr4(arr: *[4]f32, v: F32x4) void { + arr.* = .{ v[0], v[1], v[2], v[3] }; +} + +pub inline fn arr3Ptr(ptr: anytype) *const [3]f32 { + comptime assert(@typeInfo(@TypeOf(ptr)) == .Pointer); + const T = std.meta.Child(@TypeOf(ptr)); + comptime assert(T == F32x4); + return @as(*const [3]f32, @ptrCast(ptr)); +} + +pub inline fn arrNPtr(ptr: anytype) [*]const f32 { + comptime assert(@typeInfo(@TypeOf(ptr)) == .Pointer); + const T = std.meta.Child(@TypeOf(ptr)); + comptime assert(T == Mat or T == F32x4 or T == F32x8 or T == F32x16); + return @as([*]const f32, @ptrCast(ptr)); +} +test "zmath.arrNPtr" { + { + const mat = identity(); + const f32ptr = arrNPtr(&mat); + try expect(f32ptr[0] == 1.0); + try expect(f32ptr[5] == 1.0); + try expect(f32ptr[10] == 1.0); + try expect(f32ptr[15] == 1.0); + } + { + const v8 = f32x8s(1.0); + const f32ptr = arrNPtr(&v8); + try expect(f32ptr[1] == 1.0); + try expect(f32ptr[7] == 1.0); + } +} + +test "zmath.loadArr" { + { + const camera_position = [3]f32{ 1.0, 2.0, 3.0 }; + const simd_reg = loadArr3(camera_position); + try expectVecEqual(simd_reg, f32x4(1.0, 2.0, 3.0, 0.0)); + } + { + const camera_position = [3]f32{ 1.0, 2.0, 3.0 }; + const simd_reg = loadArr3w(camera_position, 1.0); + try expectVecEqual(simd_reg, f32x4(1.0, 2.0, 3.0, 1.0)); + } +} + +pub inline fn vecToArr2(v: Vec) [2]f32 { + return .{ v[0], v[1] }; +} +pub inline fn vecToArr3(v: Vec) [3]f32 { + return .{ v[0], v[1], v[2] }; +} +pub inline fn vecToArr4(v: Vec) [4]f32 { + return .{ v[0], v[1], v[2], v[3] }; +} +// ------------------------------------------------------------------------------ +// +// 2. Functions that work on all vector components (F32xN = F32x4 or F32x8 or F32x16) +// +// ------------------------------------------------------------------------------ +pub fn all(vb: anytype, comptime len: u32) bool { + const T = @TypeOf(vb); + if (len > veclen(T)) { + @compileError("zmath.all(): 'len' is greater than vector len of type " ++ @typeName(T)); + } + const loop_len = if (len == 0) veclen(T) else len; + const ab: [veclen(T)]bool = vb; + comptime var i: u32 = 0; + var result = true; + inline while (i < loop_len) : (i += 1) { + result = result and ab[i]; + } + return result; +} +test "zmath.all" { + try expect(all(boolx8(true, true, true, true, true, false, true, false), 5) == true); + try expect(all(boolx8(true, true, true, true, true, false, true, false), 6) == false); + try expect(all(boolx8(true, true, true, true, false, false, false, false), 4) == true); + try expect(all(boolx4(true, true, true, false), 3) == true); + try expect(all(boolx4(true, true, true, false), 1) == true); + try expect(all(boolx4(true, false, false, false), 1) == true); + try expect(all(boolx4(false, true, false, false), 1) == false); + try expect(all(boolx8(true, true, true, true, true, false, true, false), 0) == false); + try expect(all(boolx4(false, true, false, false), 0) == false); + try expect(all(boolx4(true, true, true, true), 0) == true); +} + +pub fn any(vb: anytype, comptime len: u32) bool { + const T = @TypeOf(vb); + if (len > veclen(T)) { + @compileError("zmath.any(): 'len' is greater than vector len of type " ++ @typeName(T)); + } + const loop_len = if (len == 0) veclen(T) else len; + const ab: [veclen(T)]bool = vb; + comptime var i: u32 = 0; + var result = false; + inline while (i < loop_len) : (i += 1) { + result = result or ab[i]; + } + return result; +} +test "zmath.any" { + try expect(any(boolx8(true, true, true, true, true, false, true, false), 0) == true); + try expect(any(boolx8(false, false, false, true, true, false, true, false), 3) == false); + try expect(any(boolx8(false, false, false, false, false, true, false, false), 4) == false); +} + +pub inline fn isNearEqual( + v0: anytype, + v1: anytype, + epsilon: anytype, +) @Vector(veclen(@TypeOf(v0)), bool) { + const T = @TypeOf(v0, v1, epsilon); + const delta = v0 - v1; + const temp = maxFast(delta, splat(T, 0.0) - delta); + return temp <= epsilon; +} +test "zmath.isNearEqual" { + { + const v0 = f32x4(1.0, 2.0, -3.0, 4.001); + const v1 = f32x4(1.0, 2.1, 3.0, 4.0); + const b = isNearEqual(v0, v1, splat(F32x4, 0.01)); + try expect(@reduce(.And, b == boolx4(true, false, false, true))); + } + { + const v0 = f32x8(1.0, 2.0, -3.0, 4.001, 1.001, 2.3, -0.0, 0.0); + const v1 = f32x8(1.0, 2.1, 3.0, 4.0, -1.001, 2.1, 0.0, 0.0); + const b = isNearEqual(v0, v1, splat(F32x8, 0.01)); + try expect(@reduce(.And, b == boolx8(true, false, false, true, false, false, true, true))); + } + try expect(all(isNearEqual( + splat(F32x4, math.inf(f32)), + splat(F32x4, math.inf(f32)), + splat(F32x4, 0.0001), + ), 0) == false); + try expect(all(isNearEqual( + splat(F32x4, -math.inf(f32)), + splat(F32x4, math.inf(f32)), + splat(F32x4, 0.0001), + ), 0) == false); + try expect(all(isNearEqual( + splat(F32x4, -math.inf(f32)), + splat(F32x4, -math.inf(f32)), + splat(F32x4, 0.0001), + ), 0) == false); + try expect(all(isNearEqual( + splat(F32x4, -math.nan(f32)), + splat(F32x4, math.inf(f32)), + splat(F32x4, 0.0001), + ), 0) == false); +} + +pub inline fn isNan( + v: anytype, +) @Vector(veclen(@TypeOf(v)), bool) { + return v != v; +} +test "zmath.isNan" { + { + const v0 = f32x4(math.inf(f32), math.nan(f32), math.nan(f32), 7.0); + const b = isNan(v0); + try expect(@reduce(.And, b == boolx4(false, true, true, false))); + } + { + const v0 = f32x8(0, math.nan(f32), 0, 0, math.inf(f32), math.nan(f32), math.snan(f32), 7.0); + const b = isNan(v0); + try expect(@reduce(.And, b == boolx8(false, true, false, false, false, true, true, false))); + } +} + +pub inline fn isInf( + v: anytype, +) @Vector(veclen(@TypeOf(v)), bool) { + const T = @TypeOf(v); + return abs(v) == splat(T, math.inf(f32)); +} +test "zmath.isInf" { + { + const v0 = f32x4(math.inf(f32), math.nan(f32), math.snan(f32), 7.0); + const b = isInf(v0); + try expect(@reduce(.And, b == boolx4(true, false, false, false))); + } + { + const v0 = f32x8(0, math.inf(f32), 0, 0, math.inf(f32), math.nan(f32), math.snan(f32), 7.0); + const b = isInf(v0); + try expect(@reduce(.And, b == boolx8(false, true, false, false, true, false, false, false))); + } +} + +pub inline fn isInBounds( + v: anytype, + bounds: anytype, +) @Vector(veclen(@TypeOf(v)), bool) { + const T = @TypeOf(v, bounds); + const Tu = @Vector(veclen(T), u1); + const Tr = @Vector(veclen(T), bool); + + // 2 x cmpleps, xorps, load, andps + const b0 = v <= bounds; + const b1 = (bounds * splat(T, -1.0)) <= v; + const b0u = @as(Tu, @bitCast(b0)); + const b1u = @as(Tu, @bitCast(b1)); + return @as(Tr, @bitCast(b0u & b1u)); +} +test "zmath.isInBounds" { + { + const v0 = f32x4(0.5, -2.0, -1.0, 1.9); + const v1 = f32x4(-1.6, -2.001, -1.0, 1.9); + const bounds = f32x4(1.0, 2.0, 1.0, 2.0); + const b0 = isInBounds(v0, bounds); + const b1 = isInBounds(v1, bounds); + try expect(@reduce(.And, b0 == boolx4(true, true, true, true))); + try expect(@reduce(.And, b1 == boolx4(false, false, true, true))); + } + { + const v0 = f32x8(2.0, 1.0, 2.0, 1.0, 0.5, -2.0, -1.0, 1.9); + const bounds = f32x8(1.0, 1.0, 1.0, math.inf(f32), 1.0, math.nan(f32), 1.0, 2.0); + const b0 = isInBounds(v0, bounds); + try expect(@reduce(.And, b0 == boolx8(false, true, false, true, true, false, true, true))); + } +} + +pub inline fn andInt(v0: anytype, v1: anytype) @TypeOf(v0, v1) { + const T = @TypeOf(v0, v1); + const Tu = @Vector(veclen(T), u32); + const v0u = @as(Tu, @bitCast(v0)); + const v1u = @as(Tu, @bitCast(v1)); + return @as(T, @bitCast(v0u & v1u)); // andps +} +test "zmath.andInt" { + { + const v0 = f32x4(0, @as(f32, @bitCast(~@as(u32, 0))), 0, @as(f32, @bitCast(~@as(u32, 0)))); + const v1 = f32x4(1.0, 2.0, 3.0, math.inf(f32)); + const v = andInt(v0, v1); + try expect(v[3] == math.inf(f32)); + try expectVecEqual(v, f32x4(0.0, 2.0, 0.0, math.inf(f32))); + } + { + const v0 = f32x8(0, 0, 0, 0, 0, @as(f32, @bitCast(~@as(u32, 0))), 0, @as(f32, @bitCast(~@as(u32, 0)))); + const v1 = f32x8(0, 0, 0, 0, 1.0, 2.0, 3.0, math.inf(f32)); + const v = andInt(v0, v1); + try expect(v[7] == math.inf(f32)); + try expectVecEqual(v, f32x8(0, 0, 0, 0, 0.0, 2.0, 0.0, math.inf(f32))); + } +} + +pub inline fn andNotInt(v0: anytype, v1: anytype) @TypeOf(v0, v1) { + const T = @TypeOf(v0, v1); + const Tu = @Vector(veclen(T), u32); + const v0u = @as(Tu, @bitCast(v0)); + const v1u = @as(Tu, @bitCast(v1)); + return @as(T, @bitCast(~v0u & v1u)); // andnps +} +test "zmath.andNotInt" { + { + const v0 = f32x4(1.0, 2.0, 3.0, 4.0); + const v1 = f32x4(0, @as(f32, @bitCast(~@as(u32, 0))), 0, @as(f32, @bitCast(~@as(u32, 0)))); + const v = andNotInt(v1, v0); + try expectVecEqual(v, f32x4(1.0, 0.0, 3.0, 0.0)); + } + { + const v0 = f32x8(0, 0, 0, 0, 1.0, 2.0, 3.0, 4.0); + const v1 = f32x8(0, 0, 0, 0, 0, @as(f32, @bitCast(~@as(u32, 0))), 0, @as(f32, @bitCast(~@as(u32, 0)))); + const v = andNotInt(v1, v0); + try expectVecEqual(v, f32x8(0, 0, 0, 0, 1.0, 0.0, 3.0, 0.0)); + } +} + +pub inline fn orInt(v0: anytype, v1: anytype) @TypeOf(v0, v1) { + const T = @TypeOf(v0, v1); + const Tu = @Vector(veclen(T), u32); + const v0u = @as(Tu, @bitCast(v0)); + const v1u = @as(Tu, @bitCast(v1)); + return @as(T, @bitCast(v0u | v1u)); // orps +} +test "zmath.orInt" { + { + const v0 = f32x4(0, @as(f32, @bitCast(~@as(u32, 0))), 0, 0); + const v1 = f32x4(1.0, 2.0, 3.0, 4.0); + const v = orInt(v0, v1); + try expect(v[0] == 1.0); + try expect(@as(u32, @bitCast(v[1])) == ~@as(u32, 0)); + try expect(v[2] == 3.0); + try expect(v[3] == 4.0); + } + { + const v0 = f32x8(0, 0, 0, 0, 0, @as(f32, @bitCast(~@as(u32, 0))), 0, 0); + const v1 = f32x8(0, 0, 0, 0, 1.0, 2.0, 3.0, 4.0); + const v = orInt(v0, v1); + try expect(v[4] == 1.0); + try expect(@as(u32, @bitCast(v[5])) == ~@as(u32, 0)); + try expect(v[6] == 3.0); + try expect(v[7] == 4.0); + } +} + +pub inline fn norInt(v0: anytype, v1: anytype) @TypeOf(v0, v1) { + const T = @TypeOf(v0, v1); + const Tu = @Vector(veclen(T), u32); + const v0u = @as(Tu, @bitCast(v0)); + const v1u = @as(Tu, @bitCast(v1)); + return @as(T, @bitCast(~(v0u | v1u))); // por, pcmpeqd, pxor +} + +pub inline fn xorInt(v0: anytype, v1: anytype) @TypeOf(v0, v1) { + const T = @TypeOf(v0, v1); + const Tu = @Vector(veclen(T), u32); + const v0u = @as(Tu, @bitCast(v0)); + const v1u = @as(Tu, @bitCast(v1)); + return @as(T, @bitCast(v0u ^ v1u)); // xorps +} +test "zmath.xorInt" { + { + const v0 = f32x4(1.0, @as(f32, @bitCast(~@as(u32, 0))), 0, 0); + const v1 = f32x4(1.0, 0, 0, 0); + const v = xorInt(v0, v1); + try expect(v[0] == 0.0); + try expect(@as(u32, @bitCast(v[1])) == ~@as(u32, 0)); + try expect(v[2] == 0.0); + try expect(v[3] == 0.0); + } + { + const v0 = f32x8(0, 0, 0, 0, 1.0, @as(f32, @bitCast(~@as(u32, 0))), 0, 0); + const v1 = f32x8(0, 0, 0, 0, 1.0, 0, 0, 0); + const v = xorInt(v0, v1); + try expect(v[4] == 0.0); + try expect(@as(u32, @bitCast(v[5])) == ~@as(u32, 0)); + try expect(v[6] == 0.0); + try expect(v[7] == 0.0); + } +} + +pub inline fn minFast(v0: anytype, v1: anytype) @TypeOf(v0, v1) { + return select(v0 < v1, v0, v1); // minps +} +test "zmath.minFast" { + { + const v0 = f32x4(1.0, 3.0, 2.0, 7.0); + const v1 = f32x4(2.0, 1.0, 4.0, math.inf(f32)); + const v = minFast(v0, v1); + try expectVecEqual(v, f32x4(1.0, 1.0, 2.0, 7.0)); + } + { + const v0 = f32x4(1.0, math.nan(f32), 5.0, math.snan(f32)); + const v1 = f32x4(2.0, 1.0, 4.0, math.inf(f32)); + const v = minFast(v0, v1); + try expect(v[0] == 1.0); + try expect(v[1] == 1.0); + try expect(!math.isNan(v[1])); + try expect(v[2] == 4.0); + try expect(v[3] == math.inf(f32)); + try expect(!math.isNan(v[3])); + } +} + +pub inline fn maxFast(v0: anytype, v1: anytype) @TypeOf(v0, v1) { + return select(v0 > v1, v0, v1); // maxps +} +test "zmath.maxFast" { + { + const v0 = f32x4(1.0, 3.0, 2.0, 7.0); + const v1 = f32x4(2.0, 1.0, 4.0, math.inf(f32)); + const v = maxFast(v0, v1); + try expectVecEqual(v, f32x4(2.0, 3.0, 4.0, math.inf(f32))); + } + { + const v0 = f32x4(1.0, math.nan(f32), 5.0, math.snan(f32)); + const v1 = f32x4(2.0, 1.0, 4.0, math.inf(f32)); + const v = maxFast(v0, v1); + try expect(v[0] == 2.0); + try expect(v[1] == 1.0); + try expect(v[2] == 5.0); + try expect(v[3] == math.inf(f32)); + try expect(!math.isNan(v[3])); + } +} + +pub inline fn min(v0: anytype, v1: anytype) @TypeOf(v0, v1) { + // This will handle inf & nan + return @min(v0, v1); // minps, cmpunordps, andps, andnps, orps +} +test "zmath.min" { + // Calling math.inf causes test to fail! + if (builtin.target.os.tag == .macos and builtin.target.cpu.arch == .aarch64) return error.SkipZigTest; + { + const v0 = f32x4(1.0, 3.0, 2.0, 7.0); + const v1 = f32x4(2.0, 1.0, 4.0, math.inf(f32)); + const v = min(v0, v1); + try expectVecEqual(v, f32x4(1.0, 1.0, 2.0, 7.0)); + } + { + const v0 = f32x8(0, 0, -2.0, 0, 1.0, 3.0, 2.0, 7.0); + const v1 = f32x8(0, 1.0, 0, 0, 2.0, 1.0, 4.0, math.inf(f32)); + const v = min(v0, v1); + try expectVecEqual(v, f32x8(0.0, 0.0, -2.0, 0.0, 1.0, 1.0, 2.0, 7.0)); + } + { + const v0 = f32x4(1.0, math.nan(f32), 5.0, math.snan(f32)); + const v1 = f32x4(2.0, 1.0, 4.0, math.inf(f32)); + const v = min(v0, v1); + try expect(v[0] == 1.0); + try expect(v[1] == 1.0); + try expect(!math.isNan(v[1])); + try expect(v[2] == 4.0); + try expect(v[3] == math.inf(f32)); + try expect(!math.isNan(v[3])); + } + + { + const v0 = f32x4(-math.inf(f32), math.inf(f32), math.inf(f32), math.snan(f32)); + const v1 = f32x4(math.snan(f32), -math.inf(f32), math.snan(f32), math.nan(f32)); + const v = min(v0, v1); + try expect(v[0] == -math.inf(f32)); + try expect(v[1] == -math.inf(f32)); + try expect(v[2] == math.inf(f32)); + try expect(!math.isNan(v[2])); + try expect(math.isNan(v[3])); + try expect(!math.isInf(v[3])); + } +} + +pub inline fn max(v0: anytype, v1: anytype) @TypeOf(v0, v1) { + // This will handle inf & nan + return @max(v0, v1); // maxps, cmpunordps, andps, andnps, orps +} +test "zmath.max" { + // Calling math.inf causes test to fail! + if (builtin.target.os.tag == .macos and builtin.target.cpu.arch == .aarch64) return error.SkipZigTest; + { + const v0 = f32x4(1.0, 3.0, 2.0, 7.0); + const v1 = f32x4(2.0, 1.0, 4.0, math.inf(f32)); + const v = max(v0, v1); + try expectVecEqual(v, f32x4(2.0, 3.0, 4.0, math.inf(f32))); + } + { + const v0 = f32x8(0, 0, -2.0, 0, 1.0, 3.0, 2.0, 7.0); + const v1 = f32x8(0, 1.0, 0, 0, 2.0, 1.0, 4.0, math.inf(f32)); + const v = max(v0, v1); + try expectVecEqual(v, f32x8(0.0, 1.0, 0.0, 0.0, 2.0, 3.0, 4.0, math.inf(f32))); + } + { + const v0 = f32x4(1.0, math.nan(f32), 5.0, math.snan(f32)); + const v1 = f32x4(2.0, 1.0, 4.0, math.inf(f32)); + const v = max(v0, v1); + try expect(v[0] == 2.0); + try expect(v[1] == 1.0); + try expect(v[2] == 5.0); + try expect(v[3] == math.inf(f32)); + try expect(!math.isNan(v[3])); + } + { + const v0 = f32x4(-math.inf(f32), math.inf(f32), math.inf(f32), math.snan(f32)); + const v1 = f32x4(math.snan(f32), -math.inf(f32), math.snan(f32), math.nan(f32)); + const v = max(v0, v1); + try expect(v[0] == -math.inf(f32)); + try expect(v[1] == math.inf(f32)); + try expect(v[2] == math.inf(f32)); + try expect(!math.isNan(v[2])); + try expect(math.isNan(v[3])); + try expect(!math.isInf(v[3])); + } +} + +pub fn round(v: anytype) @TypeOf(v) { + const T = @TypeOf(v); + if (cpu_arch == .x86_64 and has_avx) { + if (T == F32x4) { + return asm ("vroundps $0, %%xmm0, %%xmm0" + : [ret] "={xmm0}" (-> T), + : [v] "{xmm0}" (v), + ); + } else if (T == F32x8) { + return asm ("vroundps $0, %%ymm0, %%ymm0" + : [ret] "={ymm0}" (-> T), + : [v] "{ymm0}" (v), + ); + } else if (T == F32x16 and has_avx512f) { + return asm ("vrndscaleps $0, %%zmm0, %%zmm0" + : [ret] "={zmm0}" (-> T), + : [v] "{zmm0}" (v), + ); + } else if (T == F32x16 and !has_avx512f) { + const arr: [16]f32 = v; + var ymm0 = @as(F32x8, arr[0..8].*); + var ymm1 = @as(F32x8, arr[8..16].*); + ymm0 = asm ("vroundps $0, %%ymm0, %%ymm0" + : [ret] "={ymm0}" (-> F32x8), + : [v] "{ymm0}" (ymm0), + ); + ymm1 = asm ("vroundps $0, %%ymm1, %%ymm1" + : [ret] "={ymm1}" (-> F32x8), + : [v] "{ymm1}" (ymm1), + ); + return @shuffle(f32, ymm0, ymm1, [16]i32{ 0, 1, 2, 3, 4, 5, 6, 7, -1, -2, -3, -4, -5, -6, -7, -8 }); + } + } else { + const sign = andInt(v, splatNegativeZero(T)); + const magic = orInt(splatNoFraction(T), sign); + var r1 = v + magic; + r1 = r1 - magic; + const r2 = abs(v); + const mask = r2 <= splatNoFraction(T); + return select(mask, r1, v); + } +} +test "zmath.round" { + { + try expect(all(round(splat(F32x4, math.inf(f32))) == splat(F32x4, math.inf(f32)), 0)); + try expect(all(round(splat(F32x4, -math.inf(f32))) == splat(F32x4, -math.inf(f32)), 0)); + try expect(all(isNan(round(splat(F32x4, math.nan(f32)))), 0)); + try expect(all(isNan(round(splat(F32x4, -math.nan(f32)))), 0)); + try expect(all(isNan(round(splat(F32x4, math.snan(f32)))), 0)); + try expect(all(isNan(round(splat(F32x4, -math.snan(f32)))), 0)); + } + { + const v = round(f32x16(1.1, -1.1, -1.5, 1.5, 2.1, 2.8, 2.9, 4.1, 5.8, 6.1, 7.9, 8.9, 10.1, 11.2, 12.7, 13.1)); + try expectVecApproxEqAbs( + v, + f32x16(1.0, -1.0, -2.0, 2.0, 2.0, 3.0, 3.0, 4.0, 6.0, 6.0, 8.0, 9.0, 10.0, 11.0, 13.0, 13.0), + 0.0, + ); + } + var v = round(f32x4(1.1, -1.1, -1.5, 1.5)); + try expectVecEqual(v, f32x4(1.0, -1.0, -2.0, 2.0)); + + const v1 = f32x4(-10_000_000.1, -math.inf(f32), 10_000_001.5, math.inf(f32)); + v = round(v1); + try expect(v[3] == math.inf(f32)); + try expectVecEqual(v, f32x4(-10_000_000.1, -math.inf(f32), 10_000_001.5, math.inf(f32))); + + const v2 = f32x4(-math.snan(f32), math.snan(f32), math.nan(f32), -math.inf(f32)); + v = round(v2); + try expect(math.isNan(v2[0])); + try expect(math.isNan(v2[1])); + try expect(math.isNan(v2[2])); + try expect(v2[3] == -math.inf(f32)); + + const v3 = f32x4(1001.5, -201.499, -10000.99, -101.5); + v = round(v3); + try expectVecEqual(v, f32x4(1002.0, -201.0, -10001.0, -102.0)); + + const v4 = f32x4(-1_388_609.9, 1_388_609.5, 1_388_109.01, 2_388_609.5); + v = round(v4); + try expectVecEqual(v, f32x4(-1_388_610.0, 1_388_610.0, 1_388_109.0, 2_388_610.0)); + + var f: f32 = -100.0; + var i: u32 = 0; + while (i < 100) : (i += 1) { + const vr = round(splat(F32x4, f)); + const fr = @round(splat(F32x4, f)); + const vr8 = round(splat(F32x8, f)); + const fr8 = @round(splat(F32x8, f)); + const vr16 = round(splat(F32x16, f)); + const fr16 = @round(splat(F32x16, f)); + try expectVecEqual(vr, fr); + try expectVecEqual(vr8, fr8); + try expectVecEqual(vr16, fr16); + f += 0.12345 * @as(f32, @floatFromInt(i)); + } +} + +pub fn trunc(v: anytype) @TypeOf(v) { + const T = @TypeOf(v); + if (cpu_arch == .x86_64 and has_avx) { + if (T == F32x4) { + return asm ("vroundps $3, %%xmm0, %%xmm0" + : [ret] "={xmm0}" (-> T), + : [v] "{xmm0}" (v), + ); + } else if (T == F32x8) { + return asm ("vroundps $3, %%ymm0, %%ymm0" + : [ret] "={ymm0}" (-> T), + : [v] "{ymm0}" (v), + ); + } else if (T == F32x16 and has_avx512f) { + return asm ("vrndscaleps $3, %%zmm0, %%zmm0" + : [ret] "={zmm0}" (-> T), + : [v] "{zmm0}" (v), + ); + } else if (T == F32x16 and !has_avx512f) { + const arr: [16]f32 = v; + var ymm0 = @as(F32x8, arr[0..8].*); + var ymm1 = @as(F32x8, arr[8..16].*); + ymm0 = asm ("vroundps $3, %%ymm0, %%ymm0" + : [ret] "={ymm0}" (-> F32x8), + : [v] "{ymm0}" (ymm0), + ); + ymm1 = asm ("vroundps $3, %%ymm1, %%ymm1" + : [ret] "={ymm1}" (-> F32x8), + : [v] "{ymm1}" (ymm1), + ); + return @shuffle(f32, ymm0, ymm1, [16]i32{ 0, 1, 2, 3, 4, 5, 6, 7, -1, -2, -3, -4, -5, -6, -7, -8 }); + } + } else { + const mask = abs(v) < splatNoFraction(T); + const result = floatToIntAndBack(v); + return select(mask, result, v); + } +} +test "zmath.trunc" { + { + try expect(all(trunc(splat(F32x4, math.inf(f32))) == splat(F32x4, math.inf(f32)), 0)); + try expect(all(trunc(splat(F32x4, -math.inf(f32))) == splat(F32x4, -math.inf(f32)), 0)); + try expect(all(isNan(trunc(splat(F32x4, math.nan(f32)))), 0)); + try expect(all(isNan(trunc(splat(F32x4, -math.nan(f32)))), 0)); + try expect(all(isNan(trunc(splat(F32x4, math.snan(f32)))), 0)); + try expect(all(isNan(trunc(splat(F32x4, -math.snan(f32)))), 0)); + } + { + const v = trunc(f32x16(1.1, -1.1, -1.5, 1.5, 2.1, 2.8, 2.9, 4.1, 5.8, 6.1, 7.9, 8.9, 10.1, 11.2, 12.7, 13.1)); + try expectVecApproxEqAbs( + v, + f32x16(1.0, -1.0, -1.0, 1.0, 2.0, 2.0, 2.0, 4.0, 5.0, 6.0, 7.0, 8.0, 10.0, 11.0, 12.0, 13.0), + 0.0, + ); + } + var v = trunc(f32x4(1.1, -1.1, -1.5, 1.5)); + try expectVecEqual(v, f32x4(1.0, -1.0, -1.0, 1.0)); + + v = trunc(f32x4(-10_000_002.1, -math.inf(f32), 10_000_001.5, math.inf(f32))); + try expectVecEqual(v, f32x4(-10_000_002.1, -math.inf(f32), 10_000_001.5, math.inf(f32))); + + v = trunc(f32x4(-math.snan(f32), math.snan(f32), math.nan(f32), -math.inf(f32))); + try expect(math.isNan(v[0])); + try expect(math.isNan(v[1])); + try expect(math.isNan(v[2])); + try expect(v[3] == -math.inf(f32)); + + v = trunc(f32x4(1000.5001, -201.499, -10000.99, 100.750001)); + try expectVecEqual(v, f32x4(1000.0, -201.0, -10000.0, 100.0)); + + v = trunc(f32x4(-7_388_609.5, 7_388_609.1, 8_388_109.5, -8_388_509.5)); + try expectVecEqual(v, f32x4(-7_388_609.0, 7_388_609.0, 8_388_109.0, -8_388_509.0)); + + var f: f32 = -100.0; + var i: u32 = 0; + while (i < 100) : (i += 1) { + const vr = trunc(splat(F32x4, f)); + const fr = @trunc(splat(F32x4, f)); + const vr8 = trunc(splat(F32x8, f)); + const fr8 = @trunc(splat(F32x8, f)); + const vr16 = trunc(splat(F32x16, f)); + const fr16 = @trunc(splat(F32x16, f)); + try expectVecEqual(vr, fr); + try expectVecEqual(vr8, fr8); + try expectVecEqual(vr16, fr16); + f += 0.12345 * @as(f32, @floatFromInt(i)); + } +} + +pub fn floor(v: anytype) @TypeOf(v) { + const T = @TypeOf(v); + if (cpu_arch == .x86_64 and has_avx) { + if (T == F32x4) { + return asm ("vroundps $1, %%xmm0, %%xmm0" + : [ret] "={xmm0}" (-> T), + : [v] "{xmm0}" (v), + ); + } else if (T == F32x8) { + return asm ("vroundps $1, %%ymm0, %%ymm0" + : [ret] "={ymm0}" (-> T), + : [v] "{ymm0}" (v), + ); + } else if (T == F32x16 and has_avx512f) { + return asm ("vrndscaleps $1, %%zmm0, %%zmm0" + : [ret] "={zmm0}" (-> T), + : [v] "{zmm0}" (v), + ); + } else if (T == F32x16 and !has_avx512f) { + const arr: [16]f32 = v; + var ymm0 = @as(F32x8, arr[0..8].*); + var ymm1 = @as(F32x8, arr[8..16].*); + ymm0 = asm ("vroundps $1, %%ymm0, %%ymm0" + : [ret] "={ymm0}" (-> F32x8), + : [v] "{ymm0}" (ymm0), + ); + ymm1 = asm ("vroundps $1, %%ymm1, %%ymm1" + : [ret] "={ymm1}" (-> F32x8), + : [v] "{ymm1}" (ymm1), + ); + return @shuffle(f32, ymm0, ymm1, [16]i32{ 0, 1, 2, 3, 4, 5, 6, 7, -1, -2, -3, -4, -5, -6, -7, -8 }); + } + } else { + const mask = abs(v) < splatNoFraction(T); + var result = floatToIntAndBack(v); + const larger_mask = result > v; + const larger = select(larger_mask, splat(T, -1.0), splat(T, 0.0)); + result = result + larger; + return select(mask, result, v); + } +} +test "zmath.floor" { + { + try expect(all(floor(splat(F32x4, math.inf(f32))) == splat(F32x4, math.inf(f32)), 0)); + try expect(all(floor(splat(F32x4, -math.inf(f32))) == splat(F32x4, -math.inf(f32)), 0)); + try expect(all(isNan(floor(splat(F32x4, math.nan(f32)))), 0)); + try expect(all(isNan(floor(splat(F32x4, -math.nan(f32)))), 0)); + try expect(all(isNan(floor(splat(F32x4, math.snan(f32)))), 0)); + try expect(all(isNan(floor(splat(F32x4, -math.snan(f32)))), 0)); + } + { + const v = floor(f32x16(1.1, -1.1, -1.5, 1.5, 2.1, 2.8, 2.9, 4.1, 5.8, 6.1, 7.9, 8.9, 10.1, 11.2, 12.7, 13.1)); + try expectVecApproxEqAbs( + v, + f32x16(1.0, -2.0, -2.0, 1.0, 2.0, 2.0, 2.0, 4.0, 5.0, 6.0, 7.0, 8.0, 10.0, 11.0, 12.0, 13.0), + 0.0, + ); + } + var v = floor(f32x4(1.5, -1.5, -1.7, -2.1)); + try expectVecEqual(v, f32x4(1.0, -2.0, -2.0, -3.0)); + + v = floor(f32x4(-10_000_002.1, -math.inf(f32), 10_000_001.5, math.inf(f32))); + try expectVecEqual(v, f32x4(-10_000_002.1, -math.inf(f32), 10_000_001.5, math.inf(f32))); + + v = floor(f32x4(-math.snan(f32), math.snan(f32), math.nan(f32), -math.inf(f32))); + try expect(math.isNan(v[0])); + try expect(math.isNan(v[1])); + try expect(math.isNan(v[2])); + try expect(v[3] == -math.inf(f32)); + + v = floor(f32x4(1000.5001, -201.499, -10000.99, 100.75001)); + try expectVecEqual(v, f32x4(1000.0, -202.0, -10001.0, 100.0)); + + v = floor(f32x4(-7_388_609.5, 7_388_609.1, 8_388_109.5, -8_388_509.5)); + try expectVecEqual(v, f32x4(-7_388_610.0, 7_388_609.0, 8_388_109.0, -8_388_510.0)); + + var f: f32 = -100.0; + var i: u32 = 0; + while (i < 100) : (i += 1) { + const vr = floor(splat(F32x4, f)); + const fr = @floor(splat(F32x4, f)); + const vr8 = floor(splat(F32x8, f)); + const fr8 = @floor(splat(F32x8, f)); + const vr16 = floor(splat(F32x16, f)); + const fr16 = @floor(splat(F32x16, f)); + try expectVecEqual(vr, fr); + try expectVecEqual(vr8, fr8); + try expectVecEqual(vr16, fr16); + f += 0.12345 * @as(f32, @floatFromInt(i)); + } +} + +pub fn ceil(v: anytype) @TypeOf(v) { + const T = @TypeOf(v); + if (cpu_arch == .x86_64 and has_avx) { + if (T == F32x4) { + return asm ("vroundps $2, %%xmm0, %%xmm0" + : [ret] "={xmm0}" (-> T), + : [v] "{xmm0}" (v), + ); + } else if (T == F32x8) { + return asm ("vroundps $2, %%ymm0, %%ymm0" + : [ret] "={ymm0}" (-> T), + : [v] "{ymm0}" (v), + ); + } else if (T == F32x16 and has_avx512f) { + return asm ("vrndscaleps $2, %%zmm0, %%zmm0" + : [ret] "={zmm0}" (-> T), + : [v] "{zmm0}" (v), + ); + } else if (T == F32x16 and !has_avx512f) { + const arr: [16]f32 = v; + var ymm0 = @as(F32x8, arr[0..8].*); + var ymm1 = @as(F32x8, arr[8..16].*); + ymm0 = asm ("vroundps $2, %%ymm0, %%ymm0" + : [ret] "={ymm0}" (-> F32x8), + : [v] "{ymm0}" (ymm0), + ); + ymm1 = asm ("vroundps $2, %%ymm1, %%ymm1" + : [ret] "={ymm1}" (-> F32x8), + : [v] "{ymm1}" (ymm1), + ); + return @shuffle(f32, ymm0, ymm1, [16]i32{ 0, 1, 2, 3, 4, 5, 6, 7, -1, -2, -3, -4, -5, -6, -7, -8 }); + } + } else { + const mask = abs(v) < splatNoFraction(T); + var result = floatToIntAndBack(v); + const smaller_mask = result < v; + const smaller = select(smaller_mask, splat(T, -1.0), splat(T, 0.0)); + result = result - smaller; + return select(mask, result, v); + } +} +test "zmath.ceil" { + { + try expect(all(ceil(splat(F32x4, math.inf(f32))) == splat(F32x4, math.inf(f32)), 0)); + try expect(all(ceil(splat(F32x4, -math.inf(f32))) == splat(F32x4, -math.inf(f32)), 0)); + try expect(all(isNan(ceil(splat(F32x4, math.nan(f32)))), 0)); + try expect(all(isNan(ceil(splat(F32x4, -math.nan(f32)))), 0)); + try expect(all(isNan(ceil(splat(F32x4, math.snan(f32)))), 0)); + try expect(all(isNan(ceil(splat(F32x4, -math.snan(f32)))), 0)); + } + { + const v = ceil(f32x16(1.1, -1.1, -1.5, 1.5, 2.1, 2.8, 2.9, 4.1, 5.8, 6.1, 7.9, 8.9, 10.1, 11.2, 12.7, 13.1)); + try expectVecApproxEqAbs( + v, + f32x16(2.0, -1.0, -1.0, 2.0, 3.0, 3.0, 3.0, 5.0, 6.0, 7.0, 8.0, 9.0, 11.0, 12.0, 13.0, 14.0), + 0.0, + ); + } + var v = ceil(f32x4(1.5, -1.5, -1.7, -2.1)); + try expectVecEqual(v, f32x4(2.0, -1.0, -1.0, -2.0)); + + v = ceil(f32x4(-10_000_002.1, -math.inf(f32), 10_000_001.5, math.inf(f32))); + try expectVecEqual(v, f32x4(-10_000_002.1, -math.inf(f32), 10_000_001.5, math.inf(f32))); + + v = ceil(f32x4(-math.snan(f32), math.snan(f32), math.nan(f32), -math.inf(f32))); + try expect(math.isNan(v[0])); + try expect(math.isNan(v[1])); + try expect(math.isNan(v[2])); + try expect(v[3] == -math.inf(f32)); + + v = ceil(f32x4(1000.5001, -201.499, -10000.99, 100.75001)); + try expectVecEqual(v, f32x4(1001.0, -201.0, -10000.0, 101.0)); + + v = ceil(f32x4(-1_388_609.5, 1_388_609.1, 1_388_109.9, -1_388_509.9)); + try expectVecEqual(v, f32x4(-1_388_609.0, 1_388_610.0, 1_388_110.0, -1_388_509.0)); + + var f: f32 = -100.0; + var i: u32 = 0; + while (i < 100) : (i += 1) { + const vr = ceil(splat(F32x4, f)); + const fr = @ceil(splat(F32x4, f)); + const vr8 = ceil(splat(F32x8, f)); + const fr8 = @ceil(splat(F32x8, f)); + const vr16 = ceil(splat(F32x16, f)); + const fr16 = @ceil(splat(F32x16, f)); + try expectVecEqual(vr, fr); + try expectVecEqual(vr8, fr8); + try expectVecEqual(vr16, fr16); + f += 0.12345 * @as(f32, @floatFromInt(i)); + } +} + +pub inline fn clamp(v: anytype, vmin: anytype, vmax: anytype) @TypeOf(v, vmin, vmax) { + var result = max(vmin, v); + result = min(vmax, result); + return result; +} +test "zmath.clamp" { + // Calling math.inf causes test to fail! + if (builtin.target.os.tag == .macos and builtin.target.cpu.arch == .aarch64) return error.SkipZigTest; + { + const v0 = f32x4(-1.0, 0.2, 1.1, -0.3); + const v = clamp(v0, splat(F32x4, -0.5), splat(F32x4, 0.5)); + try expectVecApproxEqAbs(v, f32x4(-0.5, 0.2, 0.5, -0.3), 0.0001); + } + { + const v0 = f32x8(-2.0, 0.25, -0.25, 100.0, -1.0, 0.2, 1.1, -0.3); + const v = clamp(v0, splat(F32x8, -0.5), splat(F32x8, 0.5)); + try expectVecApproxEqAbs(v, f32x8(-0.5, 0.25, -0.25, 0.5, -0.5, 0.2, 0.5, -0.3), 0.0001); + } + { + const v0 = f32x4(-math.inf(f32), math.inf(f32), math.nan(f32), math.snan(f32)); + const v = clamp(v0, f32x4(-100.0, 0.0, -100.0, 0.0), f32x4(0.0, 100.0, 0.0, 100.0)); + try expectVecApproxEqAbs(v, f32x4(-100.0, 100.0, -100.0, 0.0), 0.0001); + } + { + const v0 = f32x4(math.inf(f32), math.inf(f32), -math.nan(f32), -math.snan(f32)); + const v = clamp(v0, splat(F32x4, -1.0), splat(F32x4, 1.0)); + try expectVecApproxEqAbs(v, f32x4(1.0, 1.0, -1.0, -1.0), 0.0001); + } +} + +pub inline fn clampFast(v: anytype, vmin: anytype, vmax: anytype) @TypeOf(v, vmin, vmax) { + var result = maxFast(vmin, v); + result = minFast(vmax, result); + return result; +} +test "zmath.clampFast" { + { + const v0 = f32x4(-1.0, 0.2, 1.1, -0.3); + const v = clampFast(v0, splat(F32x4, -0.5), splat(F32x4, 0.5)); + try expectVecApproxEqAbs(v, f32x4(-0.5, 0.2, 0.5, -0.3), 0.0001); + } +} + +pub inline fn saturate(v: anytype) @TypeOf(v) { + const T = @TypeOf(v); + var result = max(v, splat(T, 0.0)); + result = min(result, splat(T, 1.0)); + return result; +} +test "zmath.saturate" { + // Calling math.inf causes test to fail! + if (builtin.target.os.tag == .macos and builtin.target.cpu.arch == .aarch64) return error.SkipZigTest; + { + const v0 = f32x4(-1.0, 0.2, 1.1, -0.3); + const v = saturate(v0); + try expectVecApproxEqAbs(v, f32x4(0.0, 0.2, 1.0, 0.0), 0.0001); + } + { + const v0 = f32x8(0.0, 0.0, 2.0, -2.0, -1.0, 0.2, 1.1, -0.3); + const v = saturate(v0); + try expectVecApproxEqAbs(v, f32x8(0.0, 0.0, 1.0, 0.0, 0.0, 0.2, 1.0, 0.0), 0.0001); + } + { + const v0 = f32x4(-math.inf(f32), math.inf(f32), math.nan(f32), math.snan(f32)); + const v = saturate(v0); + try expectVecApproxEqAbs(v, f32x4(0.0, 1.0, 0.0, 0.0), 0.0001); + } + { + const v0 = f32x4(math.inf(f32), math.inf(f32), -math.nan(f32), -math.snan(f32)); + const v = saturate(v0); + try expectVecApproxEqAbs(v, f32x4(1.0, 1.0, 0.0, 0.0), 0.0001); + } +} + +pub inline fn saturateFast(v: anytype) @TypeOf(v) { + const T = @TypeOf(v); + var result = maxFast(v, splat(T, 0.0)); + result = minFast(result, splat(T, 1.0)); + return result; +} +test "zmath.saturateFast" { + { + const v0 = f32x4(-1.0, 0.2, 1.1, -0.3); + const v = saturateFast(v0); + try expectVecApproxEqAbs(v, f32x4(0.0, 0.2, 1.0, 0.0), 0.0001); + } + { + const v0 = f32x8(0.0, 0.0, 2.0, -2.0, -1.0, 0.2, 1.1, -0.3); + const v = saturateFast(v0); + try expectVecApproxEqAbs(v, f32x8(0.0, 0.0, 1.0, 0.0, 0.0, 0.2, 1.0, 0.0), 0.0001); + } + { + const v0 = f32x4(-math.inf(f32), math.inf(f32), math.nan(f32), math.snan(f32)); + const v = saturateFast(v0); + try expectVecApproxEqAbs(v, f32x4(0.0, 1.0, 0.0, 0.0), 0.0001); + } + { + const v0 = f32x4(math.inf(f32), math.inf(f32), -math.nan(f32), -math.snan(f32)); + const v = saturateFast(v0); + try expectVecApproxEqAbs(v, f32x4(1.0, 1.0, 0.0, 0.0), 0.0001); + } +} + +pub inline fn sqrt(v: anytype) @TypeOf(v) { + return @sqrt(v); // sqrtps +} + +pub inline fn abs(v: anytype) @TypeOf(v) { + return @abs(v); // load, andps +} + +pub inline fn select(mask: anytype, v0: anytype, v1: anytype) @TypeOf(v0, v1) { + return @select(f32, mask, v0, v1); +} + +pub inline fn lerp(v0: anytype, v1: anytype, t: f32) @TypeOf(v0, v1) { + const T = @TypeOf(v0, v1); + return v0 + (v1 - v0) * splat(T, t); // subps, shufps, addps, mulps +} + +pub inline fn lerpV(v0: anytype, v1: anytype, t: anytype) @TypeOf(v0, v1, t) { + return v0 + (v1 - v0) * t; // subps, addps, mulps +} + +pub inline fn lerpInverse(v0: anytype, v1: anytype, t: anytype) @TypeOf(v0, v1) { + const T = @TypeOf(v0, v1); + return (splat(T, t) - v0) / (v1 - v0); +} + +pub inline fn lerpInverseV(v0: anytype, v1: anytype, t: anytype) @TypeOf(v0, v1, t) { + return (t - v0) / (v1 - v0); +} +test "zmath.lerpInverse" { + try expect(math.approxEqAbs(f32, lerpInverseV(10.0, 100.0, 10.0), 0, 0.0005)); + try expect(math.approxEqAbs(f32, lerpInverseV(10.0, 100.0, 100.0), 1, 0.0005)); + try expect(math.approxEqAbs(f32, lerpInverseV(10.0, 100.0, 55.0), 0.5, 0.05)); + try expectVecApproxEqAbs(lerpInverse(f32x4(0, 0, 10, 10), f32x4(100, 200, 100, 100), 10.0), f32x4(0.1, 0.05, 0, 0), 0.0005); +} + +// Frame rate independent lerp (or "damp"), for approaching things over time. +// Reference: https://www.gamedeveloper.com/programming/improved-lerp-smoothing- +pub inline fn lerpOverTime(v0: anytype, v1: anytype, rate: anytype, dt: anytype) @TypeOf(v0, v1) { + const t = std.math.exp2(-rate * dt); + return lerp(v0, v1, t); +} + +pub inline fn lerpVOverTime(v0: anytype, v1: anytype, rate: anytype, dt: anytype) @TypeOf(v0, v1, rate, dt) { + const t = std.math.exp2(-rate * dt); + return lerpV(v0, v1, t); +} +test "zmath.lerpOverTime" { + try expect(math.approxEqAbs(f32, lerpVOverTime(0.0, 1.0, 1.0, 1.0), 0.5, 0.0005)); + try expect(math.approxEqAbs(f32, lerpVOverTime(0.5, 1.0, 1.0, 1.0), 0.75, 0.0005)); + try expectVecApproxEqAbs(lerpOverTime(f32x4(0, 0, 10, 10), f32x4(100, 200, 100, 100), 1.0, 1.0), f32x4(50, 100, 55, 55), 0.0005); +} + +/// To transform a vector of values from one range to another. +pub inline fn mapLinear(v: anytype, min1: anytype, max1: anytype, min2: anytype, max2: anytype) @TypeOf(v) { + const T = @TypeOf(v); + const min1V = splat(T, min1); + const max1V = splat(T, max1); + const min2V = splat(T, min2); + const max2V = splat(T, max2); + const dV = max1V - min1V; + return min2V + (v - min1V) * (max2V - min2V) / dV; +} + +pub inline fn mapLinearV(v: anytype, min1: anytype, max1: anytype, min2: anytype, max2: anytype) @TypeOf(v, min1, max1, min2, max2) { + const d = max1 - min1; + return min2 + (v - min1) * (max2 - min2) / d; +} +test "zmath.mapLinear" { + try expect(math.approxEqAbs(f32, mapLinearV(0, 0, 1.2, 10, 100), 10, 0.0005)); + try expect(math.approxEqAbs(f32, mapLinearV(1.2, 0, 1.2, 10, 100), 100, 0.0005)); + try expect(math.approxEqAbs(f32, mapLinearV(0.6, 0, 1.2, 10, 100), 55, 0.0005)); + try expectVecApproxEqAbs(mapLinearV(splat(F32x4, 0), splat(F32x4, 0), splat(F32x4, 1.2), splat(F32x4, 10), splat(F32x4, 100)), splat(F32x4, 10), 0.0005); + try expectVecApproxEqAbs(mapLinear(f32x4(0, 0, 0.6, 1.2), 0, 1.2, 10, 100), f32x4(10, 10, 55, 100), 0.0005); +} + +pub const F32x4Component = enum { x, y, z, w }; + +pub inline fn swizzle( + v: F32x4, + comptime x: F32x4Component, + comptime y: F32x4Component, + comptime z: F32x4Component, + comptime w: F32x4Component, +) F32x4 { + return @shuffle(f32, v, undefined, [4]i32{ @intFromEnum(x), @intFromEnum(y), @intFromEnum(z), @intFromEnum(w) }); +} + +pub inline fn mod(v0: anytype, v1: anytype) @TypeOf(v0, v1) { + // vdivps, vroundps, vmulps, vsubps + return v0 - v1 * trunc(v0 / v1); +} +test "zmath.mod" { + try expectVecApproxEqAbs(mod(splat(F32x4, 3.1), splat(F32x4, 1.7)), splat(F32x4, 1.4), 0.0005); + try expectVecApproxEqAbs(mod(splat(F32x4, -3.0), splat(F32x4, 2.0)), splat(F32x4, -1.0), 0.0005); + try expectVecApproxEqAbs(mod(splat(F32x4, -3.0), splat(F32x4, -2.0)), splat(F32x4, -1.0), 0.0005); + try expectVecApproxEqAbs(mod(splat(F32x4, 3.0), splat(F32x4, -2.0)), splat(F32x4, 1.0), 0.0005); + try expect(all(isNan(mod(splat(F32x4, math.inf(f32)), splat(F32x4, 1.0))), 0)); + try expect(all(isNan(mod(splat(F32x4, -math.inf(f32)), splat(F32x4, 123.456))), 0)); + try expect(all(isNan(mod(splat(F32x4, math.nan(f32)), splat(F32x4, 123.456))), 0)); + try expect(all(isNan(mod(splat(F32x4, math.snan(f32)), splat(F32x4, 123.456))), 0)); + try expect(all(isNan(mod(splat(F32x4, -math.snan(f32)), splat(F32x4, 123.456))), 0)); + try expect(all(isNan(mod(splat(F32x4, 123.456), splat(F32x4, math.inf(f32)))), 0)); + try expect(all(isNan(mod(splat(F32x4, 123.456), splat(F32x4, -math.inf(f32)))), 0)); + try expect(all(isNan(mod(splat(F32x4, math.inf(f32)), splat(F32x4, math.inf(f32)))), 0)); + try expect(all(isNan(mod(splat(F32x4, 123.456), splat(F32x4, math.nan(f32)))), 0)); + try expect(all(isNan(mod(splat(F32x4, math.inf(f32)), splat(F32x4, math.nan(f32)))), 0)); +} + +pub fn modAngle(v: anytype) @TypeOf(v) { + const T = @TypeOf(v); + return switch (T) { + f32 => modAngle32(v), + F32x4, F32x8, F32x16 => modAngle32xN(v), + else => @compileError("zmath.modAngle() not implemented for " ++ @typeName(T)), + }; +} + +pub inline fn modAngle32xN(v: anytype) @TypeOf(v) { + const T = @TypeOf(v); + return v - splat(T, math.tau) * round(v * splat(T, 1.0 / math.tau)); // 2 x vmulps, 2 x load, vroundps, vaddps +} +test "zmath.modAngle" { + try expectVecApproxEqAbs(modAngle(splat(F32x4, math.tau)), splat(F32x4, 0.0), 0.0005); + try expectVecApproxEqAbs(modAngle(splat(F32x4, 0.0)), splat(F32x4, 0.0), 0.0005); + try expectVecApproxEqAbs(modAngle(splat(F32x4, math.pi)), splat(F32x4, math.pi), 0.0005); + try expectVecApproxEqAbs(modAngle(splat(F32x4, 11 * math.pi)), splat(F32x4, math.pi), 0.0005); + try expectVecApproxEqAbs(modAngle(splat(F32x4, 3.5 * math.pi)), splat(F32x4, -0.5 * math.pi), 0.0005); + try expectVecApproxEqAbs(modAngle(splat(F32x4, 2.5 * math.pi)), splat(F32x4, 0.5 * math.pi), 0.0005); +} + +pub inline fn mulAdd(v0: anytype, v1: anytype, v2: anytype) @TypeOf(v0, v1, v2) { + const T = @TypeOf(v0, v1, v2); + if (@import("zmath_options").enable_cross_platform_determinism) { + return v0 * v1 + v2; // Compiler will generate mul, add sequence (no fma even if the target supports it). + } else { + if (cpu_arch == .x86_64 and has_avx and has_fma) { + return @mulAdd(T, v0, v1, v2); + } else { + // NOTE(mziulek): On .x86_64 without HW fma instructions @mulAdd maps to really slow code! + return v0 * v1 + v2; + } + } +} + +fn sin32xN(v: anytype) @TypeOf(v) { + // 11-degree minimax approximation + const T = @TypeOf(v); + + var x = modAngle(v); + const sign = andInt(x, splatNegativeZero(T)); + const c = orInt(sign, splat(T, math.pi)); + const absx = andNotInt(sign, x); + const rflx = c - x; + const comp = absx <= splat(T, 0.5 * math.pi); + x = select(comp, x, rflx); + const x2 = x * x; + + var result = mulAdd(splat(T, -2.3889859e-08), x2, splat(T, 2.7525562e-06)); + result = mulAdd(result, x2, splat(T, -0.00019840874)); + result = mulAdd(result, x2, splat(T, 0.0083333310)); + result = mulAdd(result, x2, splat(T, -0.16666667)); + result = mulAdd(result, x2, splat(T, 1.0)); + return x * result; +} +test "zmath.sin" { + const epsilon = 0.0001; + + try expectVecApproxEqAbs(sin(splat(F32x4, 0.5 * math.pi)), splat(F32x4, 1.0), epsilon); + try expectVecApproxEqAbs(sin(splat(F32x4, 0.0)), splat(F32x4, 0.0), epsilon); + try expectVecApproxEqAbs(sin(splat(F32x4, -0.0)), splat(F32x4, -0.0), epsilon); + try expectVecApproxEqAbs(sin(splat(F32x4, 89.123)), splat(F32x4, 0.916166), epsilon); + try expectVecApproxEqAbs(sin(splat(F32x8, 89.123)), splat(F32x8, 0.916166), epsilon); + try expectVecApproxEqAbs(sin(splat(F32x16, 89.123)), splat(F32x16, 0.916166), epsilon); + try expect(all(isNan(sin(splat(F32x4, math.inf(f32)))), 0) == true); + try expect(all(isNan(sin(splat(F32x4, -math.inf(f32)))), 0) == true); + try expect(all(isNan(sin(splat(F32x4, math.nan(f32)))), 0) == true); + try expect(all(isNan(sin(splat(F32x4, math.snan(f32)))), 0) == true); + + var f: f32 = -100.0; + var i: u32 = 0; + while (i < 100) : (i += 1) { + const vr = sin(splat(F32x4, f)); + const fr = @sin(splat(F32x4, f)); + const vr8 = sin(splat(F32x8, f)); + const fr8 = @sin(splat(F32x8, f)); + const vr16 = sin(splat(F32x16, f)); + const fr16 = @sin(splat(F32x16, f)); + try expectVecApproxEqAbs(vr, fr, epsilon); + try expectVecApproxEqAbs(vr8, fr8, epsilon); + try expectVecApproxEqAbs(vr16, fr16, epsilon); + f += 0.12345 * @as(f32, @floatFromInt(i)); + } +} + +fn cos32xN(v: anytype) @TypeOf(v) { + // 10-degree minimax approximation + const T = @TypeOf(v); + + var x = modAngle(v); + var sign = andInt(x, splatNegativeZero(T)); + const c = orInt(sign, splat(T, math.pi)); + const absx = andNotInt(sign, x); + const rflx = c - x; + const comp = absx <= splat(T, 0.5 * math.pi); + x = select(comp, x, rflx); + sign = select(comp, splat(T, 1.0), splat(T, -1.0)); + const x2 = x * x; + + var result = mulAdd(splat(T, -2.6051615e-07), x2, splat(T, 2.4760495e-05)); + result = mulAdd(result, x2, splat(T, -0.0013888378)); + result = mulAdd(result, x2, splat(T, 0.041666638)); + result = mulAdd(result, x2, splat(T, -0.5)); + result = mulAdd(result, x2, splat(T, 1.0)); + return sign * result; +} +test "zmath.cos" { + const epsilon = 0.0001; + + try expectVecApproxEqAbs(cos(splat(F32x4, 0.5 * math.pi)), splat(F32x4, 0.0), epsilon); + try expectVecApproxEqAbs(cos(splat(F32x4, 0.0)), splat(F32x4, 1.0), epsilon); + try expectVecApproxEqAbs(cos(splat(F32x4, -0.0)), splat(F32x4, 1.0), epsilon); + try expect(all(isNan(cos(splat(F32x4, math.inf(f32)))), 0) == true); + try expect(all(isNan(cos(splat(F32x4, -math.inf(f32)))), 0) == true); + try expect(all(isNan(cos(splat(F32x4, math.nan(f32)))), 0) == true); + try expect(all(isNan(cos(splat(F32x4, math.snan(f32)))), 0) == true); + + var f: f32 = -100.0; + var i: u32 = 0; + while (i < 100) : (i += 1) { + const vr = cos(splat(F32x4, f)); + const fr = @cos(splat(F32x4, f)); + const vr8 = cos(splat(F32x8, f)); + const fr8 = @cos(splat(F32x8, f)); + const vr16 = cos(splat(F32x16, f)); + const fr16 = @cos(splat(F32x16, f)); + try expectVecApproxEqAbs(vr, fr, epsilon); + try expectVecApproxEqAbs(vr8, fr8, epsilon); + try expectVecApproxEqAbs(vr16, fr16, epsilon); + f += 0.12345 * @as(f32, @floatFromInt(i)); + } +} + +pub fn sin(v: anytype) @TypeOf(v) { + const T = @TypeOf(v); + return switch (T) { + f32 => sin32(v), + F32x4, F32x8, F32x16 => sin32xN(v), + else => @compileError("zmath.sin() not implemented for " ++ @typeName(T)), + }; +} + +pub fn cos(v: anytype) @TypeOf(v) { + const T = @TypeOf(v); + return switch (T) { + f32 => cos32(v), + F32x4, F32x8, F32x16 => cos32xN(v), + else => @compileError("zmath.cos() not implemented for " ++ @typeName(T)), + }; +} + +pub fn sincos(v: anytype) [2]@TypeOf(v) { + const T = @TypeOf(v); + return switch (T) { + f32 => sincos32(v), + F32x4, F32x8, F32x16 => sincos32xN(v), + else => @compileError("zmath.sincos() not implemented for " ++ @typeName(T)), + }; +} + +pub fn asin(v: anytype) @TypeOf(v) { + const T = @TypeOf(v); + return switch (T) { + f32 => asin32(v), + F32x4, F32x8, F32x16 => asin32xN(v), + else => @compileError("zmath.asin() not implemented for " ++ @typeName(T)), + }; +} + +pub fn acos(v: anytype) @TypeOf(v) { + const T = @TypeOf(v); + return switch (T) { + f32 => acos32(v), + F32x4, F32x8, F32x16 => acos32xN(v), + else => @compileError("zmath.acos() not implemented for " ++ @typeName(T)), + }; +} + +fn sincos32xN(v: anytype) [2]@TypeOf(v) { + const T = @TypeOf(v); + + var x = modAngle(v); + var sign = andInt(x, splatNegativeZero(T)); + const c = orInt(sign, splat(T, math.pi)); + const absx = andNotInt(sign, x); + const rflx = c - x; + const comp = absx <= splat(T, 0.5 * math.pi); + x = select(comp, x, rflx); + sign = select(comp, splat(T, 1.0), splat(T, -1.0)); + const x2 = x * x; + + var sresult = mulAdd(splat(T, -2.3889859e-08), x2, splat(T, 2.7525562e-06)); + sresult = mulAdd(sresult, x2, splat(T, -0.00019840874)); + sresult = mulAdd(sresult, x2, splat(T, 0.0083333310)); + sresult = mulAdd(sresult, x2, splat(T, -0.16666667)); + sresult = x * mulAdd(sresult, x2, splat(T, 1.0)); + + var cresult = mulAdd(splat(T, -2.6051615e-07), x2, splat(T, 2.4760495e-05)); + cresult = mulAdd(cresult, x2, splat(T, -0.0013888378)); + cresult = mulAdd(cresult, x2, splat(T, 0.041666638)); + cresult = mulAdd(cresult, x2, splat(T, -0.5)); + cresult = sign * mulAdd(cresult, x2, splat(T, 1.0)); + + return .{ sresult, cresult }; +} +test "zmath.sincos32xN" { + const epsilon = 0.0001; + + var f: f32 = -100.0; + var i: u32 = 0; + while (i < 100) : (i += 1) { + const sc = sincos(splat(F32x4, f)); + const sc8 = sincos(splat(F32x8, f)); + const sc16 = sincos(splat(F32x16, f)); + const s4 = @sin(splat(F32x4, f)); + const s8 = @sin(splat(F32x8, f)); + const s16 = @sin(splat(F32x16, f)); + const c4 = @cos(splat(F32x4, f)); + const c8 = @cos(splat(F32x8, f)); + const c16 = @cos(splat(F32x16, f)); + try expectVecApproxEqAbs(sc[0], s4, epsilon); + try expectVecApproxEqAbs(sc8[0], s8, epsilon); + try expectVecApproxEqAbs(sc16[0], s16, epsilon); + try expectVecApproxEqAbs(sc[1], c4, epsilon); + try expectVecApproxEqAbs(sc8[1], c8, epsilon); + try expectVecApproxEqAbs(sc16[1], c16, epsilon); + f += 0.12345 * @as(f32, @floatFromInt(i)); + } +} + +fn asin32xN(v: anytype) @TypeOf(v) { + // 7-degree minimax approximation + const T = @TypeOf(v); + + const x = abs(v); + const root = sqrt(maxFast(splat(T, 0.0), splat(T, 1.0) - x)); + + var t0 = mulAdd(splat(T, -0.0012624911), x, splat(T, 0.0066700901)); + t0 = mulAdd(t0, x, splat(T, -0.0170881256)); + t0 = mulAdd(t0, x, splat(T, 0.0308918810)); + t0 = mulAdd(t0, x, splat(T, -0.0501743046)); + t0 = mulAdd(t0, x, splat(T, 0.0889789874)); + t0 = mulAdd(t0, x, splat(T, -0.2145988016)); + t0 = root * mulAdd(t0, x, splat(T, 1.5707963050)); + + const t1 = splat(T, math.pi) - t0; + return splat(T, 0.5 * math.pi) - select(v >= splat(T, 0.0), t0, t1); +} + +fn acos32xN(v: anytype) @TypeOf(v) { + // 7-degree minimax approximation + const T = @TypeOf(v); + + const x = abs(v); + const root = sqrt(maxFast(splat(T, 0.0), splat(T, 1.0) - x)); + + var t0 = mulAdd(splat(T, -0.0012624911), x, splat(T, 0.0066700901)); + t0 = mulAdd(t0, x, splat(T, -0.0170881256)); + t0 = mulAdd(t0, x, splat(T, 0.0308918810)); + t0 = mulAdd(t0, x, splat(T, -0.0501743046)); + t0 = mulAdd(t0, x, splat(T, 0.0889789874)); + t0 = mulAdd(t0, x, splat(T, -0.2145988016)); + t0 = root * mulAdd(t0, x, splat(T, 1.5707963050)); + + const t1 = splat(T, math.pi) - t0; + return select(v >= splat(T, 0.0), t0, t1); +} + +pub fn atan(v: anytype) @TypeOf(v) { + // 17-degree minimax approximation + const T = @TypeOf(v); + + const vabs = abs(v); + const vinv = splat(T, 1.0) / v; + var sign = select(v > splat(T, 1.0), splat(T, 1.0), splat(T, -1.0)); + const comp = vabs <= splat(T, 1.0); + sign = select(comp, splat(T, 0.0), sign); + const x = select(comp, v, vinv); + const x2 = x * x; + + var result = mulAdd(splat(T, 0.0028662257), x2, splat(T, -0.0161657367)); + result = mulAdd(result, x2, splat(T, 0.0429096138)); + result = mulAdd(result, x2, splat(T, -0.0752896400)); + result = mulAdd(result, x2, splat(T, 0.1065626393)); + result = mulAdd(result, x2, splat(T, -0.1420889944)); + result = mulAdd(result, x2, splat(T, 0.1999355085)); + result = mulAdd(result, x2, splat(T, -0.3333314528)); + result = x * mulAdd(result, x2, splat(T, 1.0)); + + const result1 = sign * splat(T, 0.5 * math.pi) - result; + return select(sign == splat(T, 0.0), result, result1); +} +test "zmath.atan" { + const epsilon = 0.0001; + { + const v = f32x4(0.25, 0.5, 1.0, 1.25); + const e = f32x4(math.atan(v[0]), math.atan(v[1]), math.atan(v[2]), math.atan(v[3])); + try expectVecApproxEqAbs(e, atan(v), epsilon); + } + { + const v = f32x8(-0.25, 0.5, -1.0, 1.25, 100.0, -200.0, 300.0, 400.0); + // zig fmt: off + const e = f32x8( + math.atan(v[0]), math.atan(v[1]), math.atan(v[2]), math.atan(v[3]), + math.atan(v[4]), math.atan(v[5]), math.atan(v[6]), math.atan(v[7]), + ); + // zig fmt: on + try expectVecApproxEqAbs(e, atan(v), epsilon); + } + { + // zig fmt: off + const v = f32x16( + -0.25, 0.5, -1.0, 0.0, 0.1, -0.2, 30.0, 400.0, + -0.25, 0.5, -1.0, -0.0, -0.05, -0.125, 0.0625, 4000.0 + ); + const e = f32x16( + math.atan(v[0]), math.atan(v[1]), math.atan(v[2]), math.atan(v[3]), + math.atan(v[4]), math.atan(v[5]), math.atan(v[6]), math.atan(v[7]), + math.atan(v[8]), math.atan(v[9]), math.atan(v[10]), math.atan(v[11]), + math.atan(v[12]), math.atan(v[13]), math.atan(v[14]), math.atan(v[15]), + ); + // zig fmt: on + try expectVecApproxEqAbs(e, atan(v), epsilon); + } + { + try expectVecApproxEqAbs(atan(splat(F32x4, math.inf(f32))), splat(F32x4, 0.5 * math.pi), epsilon); + try expectVecApproxEqAbs(atan(splat(F32x4, -math.inf(f32))), splat(F32x4, -0.5 * math.pi), epsilon); + try expect(all(isNan(atan(splat(F32x4, math.nan(f32)))), 0) == true); + try expect(all(isNan(atan(splat(F32x4, -math.nan(f32)))), 0) == true); + } +} + +pub fn atan2(vy: anytype, vx: anytype) @TypeOf(vx, vy) { + const T = @TypeOf(vx, vy); + const Tu = @Vector(veclen(T), u32); + + const vx_is_positive = + (@as(Tu, @bitCast(vx)) & @as(Tu, @splat(0x8000_0000))) == @as(Tu, @splat(0)); + + const vy_sign = andInt(vy, splatNegativeZero(T)); + const c0_25pi = orInt(vy_sign, @as(T, @splat(0.25 * math.pi))); + const c0_50pi = orInt(vy_sign, @as(T, @splat(0.50 * math.pi))); + const c0_75pi = orInt(vy_sign, @as(T, @splat(0.75 * math.pi))); + const c1_00pi = orInt(vy_sign, @as(T, @splat(1.00 * math.pi))); + + var r1 = select(vx_is_positive, vy_sign, c1_00pi); + var r2 = select(vx == splat(T, 0.0), c0_50pi, splatInt(T, 0xffff_ffff)); + const r3 = select(vy == splat(T, 0.0), r1, r2); + const r4 = select(vx_is_positive, c0_25pi, c0_75pi); + const r5 = select(isInf(vx), r4, c0_50pi); + const result = select(isInf(vy), r5, r3); + const result_valid = @as(Tu, @bitCast(result)) == @as(Tu, @splat(0xffff_ffff)); + + const v = vy / vx; + const r0 = atan(v); + + r1 = select(vx_is_positive, splatNegativeZero(T), c1_00pi); + r2 = r0 + r1; + + return select(result_valid, r2, result); +} +test "zmath.atan2" { + // From DirectXMath XMVectorATan2(): + // + // Return the inverse tangent of Y / X in the range of -Pi to Pi with the following exceptions: + + // Y == 0 and X is Negative -> Pi with the sign of Y + // y == 0 and x is positive -> 0 with the sign of y + // Y != 0 and X == 0 -> Pi / 2 with the sign of Y + // Y != 0 and X is Negative -> atan(y/x) + (PI with the sign of Y) + // X == -Infinity and Finite Y -> Pi with the sign of Y + // X == +Infinity and Finite Y -> 0 with the sign of Y + // Y == Infinity and X is Finite -> Pi / 2 with the sign of Y + // Y == Infinity and X == -Infinity -> 3Pi / 4 with the sign of Y + // Y == Infinity and X == +Infinity -> Pi / 4 with the sign of Y + + const epsilon = 0.0001; + try expectVecApproxEqAbs(atan2(splat(F32x4, 0.0), splat(F32x4, -1.0)), splat(F32x4, math.pi), epsilon); + try expectVecApproxEqAbs(atan2(splat(F32x4, -0.0), splat(F32x4, -1.0)), splat(F32x4, -math.pi), epsilon); + try expectVecApproxEqAbs(atan2(splat(F32x4, 1.0), splat(F32x4, 0.0)), splat(F32x4, 0.5 * math.pi), epsilon); + try expectVecApproxEqAbs(atan2(splat(F32x4, -1.0), splat(F32x4, 0.0)), splat(F32x4, -0.5 * math.pi), epsilon); + try expectVecApproxEqAbs( + atan2(splat(F32x4, 1.0), splat(F32x4, -1.0)), + splat(F32x4, math.atan(@as(f32, -1.0)) + math.pi), + epsilon, + ); + try expectVecApproxEqAbs( + atan2(splat(F32x4, -10.0), splat(F32x4, -2.0)), + splat(F32x4, math.atan(@as(f32, 5.0)) - math.pi), + epsilon, + ); + try expectVecApproxEqAbs(atan2(splat(F32x4, 1.0), splat(F32x4, -math.inf(f32))), splat(F32x4, math.pi), epsilon); + try expectVecApproxEqAbs(atan2(splat(F32x4, -1.0), splat(F32x4, -math.inf(f32))), splat(F32x4, -math.pi), epsilon); + try expectVecApproxEqAbs(atan2(splat(F32x4, 1.0), splat(F32x4, math.inf(f32))), splat(F32x4, 0.0), epsilon); + try expectVecApproxEqAbs(atan2(splat(F32x4, -1.0), splat(F32x4, math.inf(f32))), splat(F32x4, -0.0), epsilon); + try expectVecApproxEqAbs( + atan2(splat(F32x4, math.inf(f32)), splat(F32x4, 2.0)), + splat(F32x4, 0.5 * math.pi), + epsilon, + ); + try expectVecApproxEqAbs( + atan2(splat(F32x4, -math.inf(f32)), splat(F32x4, 2.0)), + splat(F32x4, -0.5 * math.pi), + epsilon, + ); + try expectVecApproxEqAbs( + atan2(splat(F32x4, math.inf(f32)), splat(F32x4, -math.inf(f32))), + splat(F32x4, 0.75 * math.pi), + epsilon, + ); + try expectVecApproxEqAbs( + atan2(splat(F32x4, -math.inf(f32)), splat(F32x4, -math.inf(f32))), + splat(F32x4, -0.75 * math.pi), + epsilon, + ); + try expectVecApproxEqAbs( + atan2(splat(F32x4, math.inf(f32)), splat(F32x4, math.inf(f32))), + splat(F32x4, 0.25 * math.pi), + epsilon, + ); + try expectVecApproxEqAbs( + atan2(splat(F32x4, -math.inf(f32)), splat(F32x4, math.inf(f32))), + splat(F32x4, -0.25 * math.pi), + epsilon, + ); + try expectVecApproxEqAbs( + atan2( + f32x8(0.0, -math.inf(f32), -0.0, 2.0, math.inf(f32), math.inf(f32), 1.0, -math.inf(f32)), + f32x8(-2.0, math.inf(f32), 1.0, 0.0, 10.0, -math.inf(f32), 1.0, -math.inf(f32)), + ), + f32x8( + math.pi, + -0.25 * math.pi, + -0.0, + 0.5 * math.pi, + 0.5 * math.pi, + 0.75 * math.pi, + math.atan(@as(f32, 1.0)), + -0.75 * math.pi, + ), + epsilon, + ); + try expectVecApproxEqAbs(atan2(splat(F32x4, 0.0), splat(F32x4, 0.0)), splat(F32x4, 0.0), epsilon); + try expectVecApproxEqAbs(atan2(splat(F32x4, -0.0), splat(F32x4, 0.0)), splat(F32x4, 0.0), epsilon); + try expect(all(isNan(atan2(splat(F32x4, 1.0), splat(F32x4, math.nan(f32)))), 0) == true); + try expect(all(isNan(atan2(splat(F32x4, -1.0), splat(F32x4, math.nan(f32)))), 0) == true); + try expect(all(isNan(atan2(splat(F32x4, math.nan(f32)), splat(F32x4, -1.0))), 0) == true); + try expect(all(isNan(atan2(splat(F32x4, -math.nan(f32)), splat(F32x4, 1.0))), 0) == true); +} +// ------------------------------------------------------------------------------ +// +// 3. 2D, 3D, 4D vector functions +// +// ------------------------------------------------------------------------------ +pub inline fn dot2(v0: Vec, v1: Vec) F32x4 { + var xmm0 = v0 * v1; // | x0*x1 | y0*y1 | -- | -- | + const xmm1 = swizzle(xmm0, .y, .x, .x, .x); // | y0*y1 | -- | -- | -- | + xmm0 = f32x4(xmm0[0] + xmm1[0], xmm0[1], xmm0[2], xmm0[3]); // | x0*x1 + y0*y1 | -- | -- | -- | + return swizzle(xmm0, .x, .x, .x, .x); +} +test "zmath.dot2" { + const v0 = f32x4(-1.0, 2.0, 300.0, -2.0); + const v1 = f32x4(4.0, 5.0, 600.0, 2.0); + const v = dot2(v0, v1); + try expectVecApproxEqAbs(v, splat(F32x4, 6.0), 0.0001); +} + +pub inline fn dot3(v0: Vec, v1: Vec) F32x4 { + const dot = v0 * v1; + return f32x4s(dot[0] + dot[1] + dot[2]); +} +test "zmath.dot3" { + const v0 = f32x4(-1.0, 2.0, 3.0, 1.0); + const v1 = f32x4(4.0, 5.0, 6.0, 1.0); + const v = dot3(v0, v1); + try expectVecApproxEqAbs(v, splat(F32x4, 24.0), 0.0001); +} + +pub inline fn dot4(v0: Vec, v1: Vec) F32x4 { + var xmm0 = v0 * v1; // | x0*x1 | y0*y1 | z0*z1 | w0*w1 | + var xmm1 = swizzle(xmm0, .y, .x, .w, .x); // | y0*y1 | -- | w0*w1 | -- | + xmm1 = xmm0 + xmm1; // | x0*x1 + y0*y1 | -- | z0*z1 + w0*w1 | -- | + xmm0 = swizzle(xmm1, .z, .x, .x, .x); // | z0*z1 + w0*w1 | -- | -- | -- | + xmm0 = f32x4(xmm0[0] + xmm1[0], xmm0[1], xmm0[2], xmm0[2]); // addss + return swizzle(xmm0, .x, .x, .x, .x); +} +test "zmath.dot4" { + const v0 = f32x4(-1.0, 2.0, 3.0, -2.0); + const v1 = f32x4(4.0, 5.0, 6.0, 2.0); + const v = dot4(v0, v1); + try expectVecApproxEqAbs(v, splat(F32x4, 20.0), 0.0001); +} + +pub inline fn cross3(v0: Vec, v1: Vec) Vec { + var xmm0 = swizzle(v0, .y, .z, .x, .w); + var xmm1 = swizzle(v1, .z, .x, .y, .w); + var result = xmm0 * xmm1; + xmm0 = swizzle(xmm0, .y, .z, .x, .w); + xmm1 = swizzle(xmm1, .z, .x, .y, .w); + result = result - xmm0 * xmm1; + return andInt(result, f32x4_mask3); +} +test "zmath.cross3" { + { + const v0 = f32x4(1.0, 0.0, 0.0, 1.0); + const v1 = f32x4(0.0, 1.0, 0.0, 1.0); + const v = cross3(v0, v1); + try expectVecApproxEqAbs(v, f32x4(0.0, 0.0, 1.0, 0.0), 0.0001); + } + { + const v0 = f32x4(1.0, 0.0, 0.0, 1.0); + const v1 = f32x4(0.0, -1.0, 0.0, 1.0); + const v = cross3(v0, v1); + try expectVecApproxEqAbs(v, f32x4(0.0, 0.0, -1.0, 0.0), 0.0001); + } + { + const v0 = f32x4(-3.0, 0, -2.0, 1.0); + const v1 = f32x4(5.0, -1.0, 2.0, 1.0); + const v = cross3(v0, v1); + try expectVecApproxEqAbs(v, f32x4(-2.0, -4.0, 3.0, 0.0), 0.0001); + } +} + +pub inline fn lengthSq2(v: Vec) F32x4 { + return dot2(v, v); +} +pub inline fn lengthSq3(v: Vec) F32x4 { + return dot3(v, v); +} +pub inline fn lengthSq4(v: Vec) F32x4 { + return dot4(v, v); +} + +pub inline fn length2(v: Vec) F32x4 { + return sqrt(dot2(v, v)); +} +pub inline fn length3(v: Vec) F32x4 { + return sqrt(dot3(v, v)); +} +pub inline fn length4(v: Vec) F32x4 { + return sqrt(dot4(v, v)); +} +test "zmath.length3" { + { + const v = length3(f32x4(1.0, -2.0, 3.0, 1000.0)); + try expectVecApproxEqAbs(v, splat(F32x4, math.sqrt(14.0)), 0.001); + } + { + const v = length3(f32x4(1.0, math.nan(f32), math.nan(f32), 1000.0)); + try expect(all(isNan(v), 0)); + } + { + const v = length3(f32x4(1.0, math.inf(f32), 3.0, 1000.0)); + try expect(all(isInf(v), 0)); + } + { + const v = length3(f32x4(3.0, 2.0, 1.0, math.nan(f32))); + try expectVecApproxEqAbs(v, splat(F32x4, math.sqrt(14.0)), 0.001); + } +} + +pub inline fn normalize2(v: Vec) Vec { + return v * splat(F32x4, 1.0) / sqrt(dot2(v, v)); +} +pub inline fn normalize3(v: Vec) Vec { + return v * splat(F32x4, 1.0) / sqrt(dot3(v, v)); +} +pub inline fn normalize4(v: Vec) Vec { + return v * splat(F32x4, 1.0) / sqrt(dot4(v, v)); +} +test "zmath.normalize3" { + { + const v0 = f32x4(1.0, -2.0, 3.0, 1000.0); + const v = normalize3(v0); + try expectVecApproxEqAbs(v, v0 * splat(F32x4, 1.0 / math.sqrt(14.0)), 0.0005); + } + { + try expect(any(isNan(normalize3(f32x4(1.0, math.inf(f32), 1.0, 1.0))), 0)); + try expect(any(isNan(normalize3(f32x4(-math.inf(f32), math.inf(f32), 0.0, 0.0))), 0)); + try expect(any(isNan(normalize3(f32x4(-math.nan(f32), math.snan(f32), 0.0, 0.0))), 0)); + try expect(any(isNan(normalize3(f32x4(0, 0, 0, 0))), 0)); + } +} +test "zmath.normalize4" { + { + const v0 = f32x4(1.0, -2.0, 3.0, 10.0); + const v = normalize4(v0); + try expectVecApproxEqAbs(v, v0 * splat(F32x4, 1.0 / math.sqrt(114.0)), 0.0005); + } + { + try expect(any(isNan(normalize4(f32x4(1.0, math.inf(f32), 1.0, 1.0))), 0)); + try expect(any(isNan(normalize4(f32x4(-math.inf(f32), math.inf(f32), 0.0, 0.0))), 0)); + try expect(any(isNan(normalize4(f32x4(-math.nan(f32), math.snan(f32), 0.0, 0.0))), 0)); + try expect(any(isNan(normalize4(f32x4(0, 0, 0, 0))), 0)); + } +} + +fn vecMulMat(v: Vec, m: Mat) Vec { + const vx = @shuffle(f32, v, undefined, [4]i32{ 0, 0, 0, 0 }); + const vy = @shuffle(f32, v, undefined, [4]i32{ 1, 1, 1, 1 }); + const vz = @shuffle(f32, v, undefined, [4]i32{ 2, 2, 2, 2 }); + const vw = @shuffle(f32, v, undefined, [4]i32{ 3, 3, 3, 3 }); + return vx * m[0] + vy * m[1] + vz * m[2] + vw * m[3]; +} +fn matMulVec(m: Mat, v: Vec) Vec { + return .{ dot4(m[0], v)[0], dot4(m[1], v)[0], dot4(m[2], v)[0], dot4(m[3], v)[0] }; +} +test "zmath.vecMulMat" { + const m = Mat{ + f32x4(1.0, 0.0, 0.0, 0.0), + f32x4(0.0, 1.0, 0.0, 0.0), + f32x4(0.0, 0.0, 1.0, 0.0), + f32x4(2.0, 3.0, 4.0, 1.0), + }; + const vm = mul(f32x4(1.0, 2.0, 3.0, 1.0), m); + const mv = mul(m, f32x4(1.0, 2.0, 3.0, 1.0)); + const v = mul(transpose(m), f32x4(1.0, 2.0, 3.0, 1.0)); + try expectVecApproxEqAbs(vm, f32x4(3.0, 5.0, 7.0, 1.0), 0.0001); + try expectVecApproxEqAbs(mv, f32x4(1.0, 2.0, 3.0, 21.0), 0.0001); + try expectVecApproxEqAbs(v, f32x4(3.0, 5.0, 7.0, 1.0), 0.0001); +} +// ------------------------------------------------------------------------------ +// +// 4. Matrix functions +// +// ------------------------------------------------------------------------------ +pub fn identity() Mat { + const static = struct { + const identity = Mat{ + f32x4(1.0, 0.0, 0.0, 0.0), + f32x4(0.0, 1.0, 0.0, 0.0), + f32x4(0.0, 0.0, 1.0, 0.0), + f32x4(0.0, 0.0, 0.0, 1.0), + }; + }; + return static.identity; +} + +pub fn matFromArr(arr: [16]f32) Mat { + return Mat{ + f32x4(arr[0], arr[1], arr[2], arr[3]), + f32x4(arr[4], arr[5], arr[6], arr[7]), + f32x4(arr[8], arr[9], arr[10], arr[11]), + f32x4(arr[12], arr[13], arr[14], arr[15]), + }; +} + +fn mulRetType(comptime Ta: type, comptime Tb: type) type { + if (Ta == Mat and Tb == Mat) { + return Mat; + } else if ((Ta == f32 and Tb == Mat) or (Ta == Mat and Tb == f32)) { + return Mat; + } else if ((Ta == Vec and Tb == Mat) or (Ta == Mat and Tb == Vec)) { + return Vec; + } + @compileError("zmath.mul() not implemented for types: " ++ @typeName(Ta) ++ @typeName(Tb)); +} + +pub fn mul(a: anytype, b: anytype) mulRetType(@TypeOf(a), @TypeOf(b)) { + const Ta = @TypeOf(a); + const Tb = @TypeOf(b); + if (Ta == Mat and Tb == Mat) { + return mulMat(a, b); + } else if (Ta == f32 and Tb == Mat) { + const va = splat(F32x4, a); + return Mat{ va * b[0], va * b[1], va * b[2], va * b[3] }; + } else if (Ta == Mat and Tb == f32) { + const vb = splat(F32x4, b); + return Mat{ a[0] * vb, a[1] * vb, a[2] * vb, a[3] * vb }; + } else if (Ta == Vec and Tb == Mat) { + return vecMulMat(a, b); + } else if (Ta == Mat and Tb == Vec) { + return matMulVec(a, b); + } else { + @compileError("zmath.mul() not implemented for types: " ++ @typeName(Ta) ++ ", " ++ @typeName(Tb)); + } +} +test "zmath.mul" { + { + const m = Mat{ + f32x4(0.1, 0.2, 0.3, 0.4), + f32x4(0.5, 0.6, 0.7, 0.8), + f32x4(0.9, 1.0, 1.1, 1.2), + f32x4(1.3, 1.4, 1.5, 1.6), + }; + const ms = mul(@as(f32, 2.0), m); + try expectVecApproxEqAbs(ms[0], f32x4(0.2, 0.4, 0.6, 0.8), 0.0001); + try expectVecApproxEqAbs(ms[1], f32x4(1.0, 1.2, 1.4, 1.6), 0.0001); + try expectVecApproxEqAbs(ms[2], f32x4(1.8, 2.0, 2.2, 2.4), 0.0001); + try expectVecApproxEqAbs(ms[3], f32x4(2.6, 2.8, 3.0, 3.2), 0.0001); + } +} + +fn mulMat(m0: Mat, m1: Mat) Mat { + var result: Mat = undefined; + comptime var row: u32 = 0; + inline while (row < 4) : (row += 1) { + const vx = swizzle(m0[row], .x, .x, .x, .x); + const vy = swizzle(m0[row], .y, .y, .y, .y); + const vz = swizzle(m0[row], .z, .z, .z, .z); + const vw = swizzle(m0[row], .w, .w, .w, .w); + result[row] = mulAdd(vx, m1[0], vz * m1[2]) + mulAdd(vy, m1[1], vw * m1[3]); + } + return result; +} +test "zmath.matrix.mul" { + const a = Mat{ + f32x4(0.1, 0.2, 0.3, 0.4), + f32x4(0.5, 0.6, 0.7, 0.8), + f32x4(0.9, 1.0, 1.1, 1.2), + f32x4(1.3, 1.4, 1.5, 1.6), + }; + const b = Mat{ + f32x4(1.7, 1.8, 1.9, 2.0), + f32x4(2.1, 2.2, 2.3, 2.4), + f32x4(2.5, 2.6, 2.7, 2.8), + f32x4(2.9, 3.0, 3.1, 3.2), + }; + const c = mul(a, b); + try expectVecApproxEqAbs(c[0], f32x4(2.5, 2.6, 2.7, 2.8), 0.0001); + try expectVecApproxEqAbs(c[1], f32x4(6.18, 6.44, 6.7, 6.96), 0.0001); + try expectVecApproxEqAbs(c[2], f32x4(9.86, 10.28, 10.7, 11.12), 0.0001); + try expectVecApproxEqAbs(c[3], f32x4(13.54, 14.12, 14.7, 15.28), 0.0001); +} + +pub fn transpose(m: Mat) Mat { + const temp1 = @shuffle(f32, m[0], m[1], [4]i32{ 0, 1, ~@as(i32, 0), ~@as(i32, 1) }); + const temp3 = @shuffle(f32, m[0], m[1], [4]i32{ 2, 3, ~@as(i32, 2), ~@as(i32, 3) }); + const temp2 = @shuffle(f32, m[2], m[3], [4]i32{ 0, 1, ~@as(i32, 0), ~@as(i32, 1) }); + const temp4 = @shuffle(f32, m[2], m[3], [4]i32{ 2, 3, ~@as(i32, 2), ~@as(i32, 3) }); + return .{ + @shuffle(f32, temp1, temp2, [4]i32{ 0, 2, ~@as(i32, 0), ~@as(i32, 2) }), + @shuffle(f32, temp1, temp2, [4]i32{ 1, 3, ~@as(i32, 1), ~@as(i32, 3) }), + @shuffle(f32, temp3, temp4, [4]i32{ 0, 2, ~@as(i32, 0), ~@as(i32, 2) }), + @shuffle(f32, temp3, temp4, [4]i32{ 1, 3, ~@as(i32, 1), ~@as(i32, 3) }), + }; +} +test "zmath.matrix.transpose" { + const m = Mat{ + f32x4(1.0, 2.0, 3.0, 4.0), + f32x4(5.0, 6.0, 7.0, 8.0), + f32x4(9.0, 10.0, 11.0, 12.0), + f32x4(13.0, 14.0, 15.0, 16.0), + }; + const mt = transpose(m); + try expectVecApproxEqAbs(mt[0], f32x4(1.0, 5.0, 9.0, 13.0), 0.0001); + try expectVecApproxEqAbs(mt[1], f32x4(2.0, 6.0, 10.0, 14.0), 0.0001); + try expectVecApproxEqAbs(mt[2], f32x4(3.0, 7.0, 11.0, 15.0), 0.0001); + try expectVecApproxEqAbs(mt[3], f32x4(4.0, 8.0, 12.0, 16.0), 0.0001); +} + +pub fn rotationX(angle: f32) Mat { + const sc = sincos(angle); + return .{ + f32x4(1.0, 0.0, 0.0, 0.0), + f32x4(0.0, sc[1], sc[0], 0.0), + f32x4(0.0, -sc[0], sc[1], 0.0), + f32x4(0.0, 0.0, 0.0, 1.0), + }; +} + +pub fn rotationY(angle: f32) Mat { + const sc = sincos(angle); + return .{ + f32x4(sc[1], 0.0, -sc[0], 0.0), + f32x4(0.0, 1.0, 0.0, 0.0), + f32x4(sc[0], 0.0, sc[1], 0.0), + f32x4(0.0, 0.0, 0.0, 1.0), + }; +} + +pub fn rotationZ(angle: f32) Mat { + const sc = sincos(angle); + return .{ + f32x4(sc[1], sc[0], 0.0, 0.0), + f32x4(-sc[0], sc[1], 0.0, 0.0), + f32x4(0.0, 0.0, 1.0, 0.0), + f32x4(0.0, 0.0, 0.0, 1.0), + }; +} + +pub fn translation(x: f32, y: f32, z: f32) Mat { + return .{ + f32x4(1.0, 0.0, 0.0, 0.0), + f32x4(0.0, 1.0, 0.0, 0.0), + f32x4(0.0, 0.0, 1.0, 0.0), + f32x4(x, y, z, 1.0), + }; +} +pub fn translationV(v: Vec) Mat { + return translation(v[0], v[1], v[2]); +} + +pub fn scaling(x: f32, y: f32, z: f32) Mat { + return .{ + f32x4(x, 0.0, 0.0, 0.0), + f32x4(0.0, y, 0.0, 0.0), + f32x4(0.0, 0.0, z, 0.0), + f32x4(0.0, 0.0, 0.0, 1.0), + }; +} +pub fn scalingV(v: Vec) Mat { + return scaling(v[0], v[1], v[2]); +} + +pub fn lookToLh(eyepos: Vec, eyedir: Vec, updir: Vec) Mat { + const az = normalize3(eyedir); + const ax = normalize3(cross3(updir, az)); + const ay = normalize3(cross3(az, ax)); + return .{ + f32x4(ax[0], ay[0], az[0], 0), + f32x4(ax[1], ay[1], az[1], 0), + f32x4(ax[2], ay[2], az[2], 0), + f32x4(-dot3(ax, eyepos)[0], -dot3(ay, eyepos)[0], -dot3(az, eyepos)[0], 1.0), + }; +} +pub fn lookToRh(eyepos: Vec, eyedir: Vec, updir: Vec) Mat { + return lookToLh(eyepos, -eyedir, updir); +} +pub fn lookAtLh(eyepos: Vec, focuspos: Vec, updir: Vec) Mat { + return lookToLh(eyepos, focuspos - eyepos, updir); +} +pub fn lookAtRh(eyepos: Vec, focuspos: Vec, updir: Vec) Mat { + return lookToLh(eyepos, eyepos - focuspos, updir); +} +test "zmath.matrix.lookToLh" { + const m = lookToLh(f32x4(0.0, 0.0, -3.0, 1.0), f32x4(0.0, 0.0, 1.0, 0.0), f32x4(0.0, 1.0, 0.0, 0.0)); + try expectVecApproxEqAbs(m[0], f32x4(1.0, 0.0, 0.0, 0.0), 0.001); + try expectVecApproxEqAbs(m[1], f32x4(0.0, 1.0, 0.0, 0.0), 0.001); + try expectVecApproxEqAbs(m[2], f32x4(0.0, 0.0, 1.0, 0.0), 0.001); + try expectVecApproxEqAbs(m[3], f32x4(0.0, 0.0, 3.0, 1.0), 0.001); +} + +pub fn perspectiveFovLh(fovy: f32, aspect: f32, near: f32, far: f32) Mat { + const scfov = sincos(0.5 * fovy); + + assert(near > 0.0 and far > 0.0); + assert(!math.approxEqAbs(f32, scfov[0], 0.0, 0.001)); + assert(!math.approxEqAbs(f32, far, near, 0.001)); + assert(!math.approxEqAbs(f32, aspect, 0.0, 0.01)); + + const h = scfov[1] / scfov[0]; + const w = h / aspect; + const r = far / (far - near); + return .{ + f32x4(w, 0.0, 0.0, 0.0), + f32x4(0.0, h, 0.0, 0.0), + f32x4(0.0, 0.0, r, 1.0), + f32x4(0.0, 0.0, -r * near, 0.0), + }; +} +pub fn perspectiveFovRh(fovy: f32, aspect: f32, near: f32, far: f32) Mat { + const scfov = sincos(0.5 * fovy); + + assert(near > 0.0 and far > 0.0); + assert(!math.approxEqAbs(f32, scfov[0], 0.0, 0.001)); + assert(!math.approxEqAbs(f32, far, near, 0.001)); + assert(!math.approxEqAbs(f32, aspect, 0.0, 0.01)); + + const h = scfov[1] / scfov[0]; + const w = h / aspect; + const r = far / (near - far); + return .{ + f32x4(w, 0.0, 0.0, 0.0), + f32x4(0.0, h, 0.0, 0.0), + f32x4(0.0, 0.0, r, -1.0), + f32x4(0.0, 0.0, r * near, 0.0), + }; +} + +// Produces Z values in [-1.0, 1.0] range (OpenGL defaults) +pub fn perspectiveFovLhGl(fovy: f32, aspect: f32, near: f32, far: f32) Mat { + const scfov = sincos(0.5 * fovy); + + assert(near > 0.0 and far > 0.0); + assert(!math.approxEqAbs(f32, scfov[0], 0.0, 0.001)); + assert(!math.approxEqAbs(f32, far, near, 0.001)); + assert(!math.approxEqAbs(f32, aspect, 0.0, 0.01)); + + const h = scfov[1] / scfov[0]; + const w = h / aspect; + const r = far - near; + return .{ + f32x4(w, 0.0, 0.0, 0.0), + f32x4(0.0, h, 0.0, 0.0), + f32x4(0.0, 0.0, (near + far) / r, 1.0), + f32x4(0.0, 0.0, 2.0 * near * far / -r, 0.0), + }; +} + +// Produces Z values in [-1.0, 1.0] range (OpenGL defaults) +pub fn perspectiveFovRhGl(fovy: f32, aspect: f32, near: f32, far: f32) Mat { + const scfov = sincos(0.5 * fovy); + + assert(near > 0.0 and far > 0.0); + assert(!math.approxEqAbs(f32, scfov[0], 0.0, 0.001)); + assert(!math.approxEqAbs(f32, far, near, 0.001)); + assert(!math.approxEqAbs(f32, aspect, 0.0, 0.01)); + + const h = scfov[1] / scfov[0]; + const w = h / aspect; + const r = near - far; + return .{ + f32x4(w, 0.0, 0.0, 0.0), + f32x4(0.0, h, 0.0, 0.0), + f32x4(0.0, 0.0, (near + far) / r, -1.0), + f32x4(0.0, 0.0, 2.0 * near * far / r, 0.0), + }; +} + +pub fn orthographicLh(w: f32, h: f32, near: f32, far: f32) Mat { + assert(!math.approxEqAbs(f32, w, 0.0, 0.001)); + assert(!math.approxEqAbs(f32, h, 0.0, 0.001)); + assert(!math.approxEqAbs(f32, far, near, 0.001)); + + const r = 1 / (far - near); + return .{ + f32x4(2 / w, 0.0, 0.0, 0.0), + f32x4(0.0, 2 / h, 0.0, 0.0), + f32x4(0.0, 0.0, r, 0.0), + f32x4(0.0, 0.0, -r * near, 1.0), + }; +} + +pub fn orthographicRh(w: f32, h: f32, near: f32, far: f32) Mat { + assert(!math.approxEqAbs(f32, w, 0.0, 0.001)); + assert(!math.approxEqAbs(f32, h, 0.0, 0.001)); + assert(!math.approxEqAbs(f32, far, near, 0.001)); + + const r = 1 / (near - far); + return .{ + f32x4(2 / w, 0.0, 0.0, 0.0), + f32x4(0.0, 2 / h, 0.0, 0.0), + f32x4(0.0, 0.0, r, 0.0), + f32x4(0.0, 0.0, r * near, 1.0), + }; +} + +// Produces Z values in [-1.0, 1.0] range (OpenGL defaults) +pub fn orthographicLhGl(w: f32, h: f32, near: f32, far: f32) Mat { + assert(!math.approxEqAbs(f32, w, 0.0, 0.001)); + assert(!math.approxEqAbs(f32, h, 0.0, 0.001)); + assert(!math.approxEqAbs(f32, far, near, 0.001)); + + const r = far - near; + return .{ + f32x4(2 / w, 0.0, 0.0, 0.0), + f32x4(0.0, 2 / h, 0.0, 0.0), + f32x4(0.0, 0.0, 2 / r, 0.0), + f32x4(0.0, 0.0, (near + far) / -r, 1.0), + }; +} + +// Produces Z values in [-1.0, 1.0] range (OpenGL defaults) +pub fn orthographicRhGl(w: f32, h: f32, near: f32, far: f32) Mat { + assert(!math.approxEqAbs(f32, w, 0.0, 0.001)); + assert(!math.approxEqAbs(f32, h, 0.0, 0.001)); + assert(!math.approxEqAbs(f32, far, near, 0.001)); + + const r = near - far; + return .{ + f32x4(2 / w, 0.0, 0.0, 0.0), + f32x4(0.0, 2 / h, 0.0, 0.0), + f32x4(0.0, 0.0, 2 / r, 0.0), + f32x4(0.0, 0.0, (near + far) / r, 1.0), + }; +} + +pub fn orthographicOffCenterLh(left: f32, right: f32, top: f32, bottom: f32, near: f32, far: f32) Mat { + assert(!math.approxEqAbs(f32, far, near, 0.001)); + + const r = 1 / (far - near); + return .{ + f32x4(2 / (right - left), 0.0, 0.0, 0.0), + f32x4(0.0, 2 / (top - bottom), 0.0, 0.0), + f32x4(0.0, 0.0, r, 0.0), + f32x4(-(right + left) / (right - left), -(top + bottom) / (top - bottom), -r * near, 1.0), + }; +} + +pub fn orthographicOffCenterRh(left: f32, right: f32, top: f32, bottom: f32, near: f32, far: f32) Mat { + assert(!math.approxEqAbs(f32, far, near, 0.001)); + + const r = 1 / (near - far); + return .{ + f32x4(2 / (right - left), 0.0, 0.0, 0.0), + f32x4(0.0, 2 / (top - bottom), 0.0, 0.0), + f32x4(0.0, 0.0, r, 0.0), + f32x4(-(right + left) / (right - left), -(top + bottom) / (top - bottom), r * near, 1.0), + }; +} + +// Produces Z values in [-1.0, 1.0] range (OpenGL defaults) +pub fn orthographicOffCenterLhGl(left: f32, right: f32, top: f32, bottom: f32, near: f32, far: f32) Mat { + assert(!math.approxEqAbs(f32, far, near, 0.001)); + + const r = far - near; + return .{ + f32x4(2 / (right - left), 0.0, 0.0, 0.0), + f32x4(0.0, 2 / (top - bottom), 0.0, 0.0), + f32x4(0.0, 0.0, 2 / r, 0.0), + f32x4(-(right + left) / (right - left), -(top + bottom) / (top - bottom), (near + far) / -r, 1.0), + }; +} + +// Produces Z values in [-1.0, 1.0] range (OpenGL defaults) +pub fn orthographicOffCenterRhGl(left: f32, right: f32, top: f32, bottom: f32, near: f32, far: f32) Mat { + assert(!math.approxEqAbs(f32, far, near, 0.001)); + + const r = near - far; + return .{ + f32x4(2 / (right - left), 0.0, 0.0, 0.0), + f32x4(0.0, 2 / (top - bottom), 0.0, 0.0), + f32x4(0.0, 0.0, 2 / r, 0.0), + f32x4(-(right + left) / (right - left), -(top + bottom) / (top - bottom), (near + far) / r, 1.0), + }; +} + +pub fn determinant(m: Mat) F32x4 { + var v0 = swizzle(m[2], .y, .x, .x, .x); + var v1 = swizzle(m[3], .z, .z, .y, .y); + var v2 = swizzle(m[2], .y, .x, .x, .x); + var v3 = swizzle(m[3], .w, .w, .w, .z); + var v4 = swizzle(m[2], .z, .z, .y, .y); + var v5 = swizzle(m[3], .w, .w, .w, .z); + + var p0 = v0 * v1; + var p1 = v2 * v3; + var p2 = v4 * v5; + + v0 = swizzle(m[2], .z, .z, .y, .y); + v1 = swizzle(m[3], .y, .x, .x, .x); + v2 = swizzle(m[2], .w, .w, .w, .z); + v3 = swizzle(m[3], .y, .x, .x, .x); + v4 = swizzle(m[2], .w, .w, .w, .z); + v5 = swizzle(m[3], .z, .z, .y, .y); + + p0 = mulAdd(-v0, v1, p0); + p1 = mulAdd(-v2, v3, p1); + p2 = mulAdd(-v4, v5, p2); + + v0 = swizzle(m[1], .w, .w, .w, .z); + v1 = swizzle(m[1], .z, .z, .y, .y); + v2 = swizzle(m[1], .y, .x, .x, .x); + + const s = m[0] * f32x4(1.0, -1.0, 1.0, -1.0); + var r = v0 * p0; + r = mulAdd(-v1, p1, r); + r = mulAdd(v2, p2, r); + return dot4(s, r); +} +test "zmath.matrix.determinant" { + const m = Mat{ + f32x4(10.0, -9.0, -12.0, 1.0), + f32x4(7.0, -12.0, 11.0, 1.0), + f32x4(-10.0, 10.0, 3.0, 1.0), + f32x4(1.0, 2.0, 3.0, 4.0), + }; + try expectVecApproxEqAbs(determinant(m), splat(F32x4, 2939.0), 0.0001); +} + +pub fn inverse(a: anytype) @TypeOf(a) { + const T = @TypeOf(a); + return switch (T) { + Mat => inverseMat(a), + Quat => inverseQuat(a), + else => @compileError("zmath.inverse() not implemented for " ++ @typeName(T)), + }; +} + +fn inverseMat(m: Mat) Mat { + return inverseDet(m, null); +} + +pub fn inverseDet(m: Mat, out_det: ?*F32x4) Mat { + const mt = transpose(m); + var v0: [4]F32x4 = undefined; + var v1: [4]F32x4 = undefined; + + v0[0] = swizzle(mt[2], .x, .x, .y, .y); + v1[0] = swizzle(mt[3], .z, .w, .z, .w); + v0[1] = swizzle(mt[0], .x, .x, .y, .y); + v1[1] = swizzle(mt[1], .z, .w, .z, .w); + v0[2] = @shuffle(f32, mt[2], mt[0], [4]i32{ 0, 2, ~@as(i32, 0), ~@as(i32, 2) }); + v1[2] = @shuffle(f32, mt[3], mt[1], [4]i32{ 1, 3, ~@as(i32, 1), ~@as(i32, 3) }); + + var d0 = v0[0] * v1[0]; + var d1 = v0[1] * v1[1]; + var d2 = v0[2] * v1[2]; + + v0[0] = swizzle(mt[2], .z, .w, .z, .w); + v1[0] = swizzle(mt[3], .x, .x, .y, .y); + v0[1] = swizzle(mt[0], .z, .w, .z, .w); + v1[1] = swizzle(mt[1], .x, .x, .y, .y); + v0[2] = @shuffle(f32, mt[2], mt[0], [4]i32{ 1, 3, ~@as(i32, 1), ~@as(i32, 3) }); + v1[2] = @shuffle(f32, mt[3], mt[1], [4]i32{ 0, 2, ~@as(i32, 0), ~@as(i32, 2) }); + + d0 = mulAdd(-v0[0], v1[0], d0); + d1 = mulAdd(-v0[1], v1[1], d1); + d2 = mulAdd(-v0[2], v1[2], d2); + + v0[0] = swizzle(mt[1], .y, .z, .x, .y); + v1[0] = @shuffle(f32, d0, d2, [4]i32{ ~@as(i32, 1), 1, 3, 0 }); + v0[1] = swizzle(mt[0], .z, .x, .y, .x); + v1[1] = @shuffle(f32, d0, d2, [4]i32{ 3, ~@as(i32, 1), 1, 2 }); + v0[2] = swizzle(mt[3], .y, .z, .x, .y); + v1[2] = @shuffle(f32, d1, d2, [4]i32{ ~@as(i32, 3), 1, 3, 0 }); + v0[3] = swizzle(mt[2], .z, .x, .y, .x); + v1[3] = @shuffle(f32, d1, d2, [4]i32{ 3, ~@as(i32, 3), 1, 2 }); + + var c0 = v0[0] * v1[0]; + var c2 = v0[1] * v1[1]; + var c4 = v0[2] * v1[2]; + var c6 = v0[3] * v1[3]; + + v0[0] = swizzle(mt[1], .z, .w, .y, .z); + v1[0] = @shuffle(f32, d0, d2, [4]i32{ 3, 0, 1, ~@as(i32, 0) }); + v0[1] = swizzle(mt[0], .w, .z, .w, .y); + v1[1] = @shuffle(f32, d0, d2, [4]i32{ 2, 1, ~@as(i32, 0), 0 }); + v0[2] = swizzle(mt[3], .z, .w, .y, .z); + v1[2] = @shuffle(f32, d1, d2, [4]i32{ 3, 0, 1, ~@as(i32, 2) }); + v0[3] = swizzle(mt[2], .w, .z, .w, .y); + v1[3] = @shuffle(f32, d1, d2, [4]i32{ 2, 1, ~@as(i32, 2), 0 }); + + c0 = mulAdd(-v0[0], v1[0], c0); + c2 = mulAdd(-v0[1], v1[1], c2); + c4 = mulAdd(-v0[2], v1[2], c4); + c6 = mulAdd(-v0[3], v1[3], c6); + + v0[0] = swizzle(mt[1], .w, .x, .w, .x); + v1[0] = @shuffle(f32, d0, d2, [4]i32{ 2, ~@as(i32, 1), ~@as(i32, 0), 2 }); + v0[1] = swizzle(mt[0], .y, .w, .x, .z); + v1[1] = @shuffle(f32, d0, d2, [4]i32{ ~@as(i32, 1), 0, 3, ~@as(i32, 0) }); + v0[2] = swizzle(mt[3], .w, .x, .w, .x); + v1[2] = @shuffle(f32, d1, d2, [4]i32{ 2, ~@as(i32, 3), ~@as(i32, 2), 2 }); + v0[3] = swizzle(mt[2], .y, .w, .x, .z); + v1[3] = @shuffle(f32, d1, d2, [4]i32{ ~@as(i32, 3), 0, 3, ~@as(i32, 2) }); + + const c1 = mulAdd(-v0[0], v1[0], c0); + const c3 = mulAdd(v0[1], v1[1], c2); + const c5 = mulAdd(-v0[2], v1[2], c4); + const c7 = mulAdd(v0[3], v1[3], c6); + + c0 = mulAdd(v0[0], v1[0], c0); + c2 = mulAdd(-v0[1], v1[1], c2); + c4 = mulAdd(v0[2], v1[2], c4); + c6 = mulAdd(-v0[3], v1[3], c6); + + var mr = Mat{ + f32x4(c0[0], c1[1], c0[2], c1[3]), + f32x4(c2[0], c3[1], c2[2], c3[3]), + f32x4(c4[0], c5[1], c4[2], c5[3]), + f32x4(c6[0], c7[1], c6[2], c7[3]), + }; + + const det = dot4(mr[0], mt[0]); + if (out_det != null) { + out_det.?.* = det; + } + + if (math.approxEqAbs(f32, det[0], 0.0, math.floatEps(f32))) { + return .{ + f32x4(0.0, 0.0, 0.0, 0.0), + f32x4(0.0, 0.0, 0.0, 0.0), + f32x4(0.0, 0.0, 0.0, 0.0), + f32x4(0.0, 0.0, 0.0, 0.0), + }; + } + + const scale = splat(F32x4, 1.0) / det; + mr[0] *= scale; + mr[1] *= scale; + mr[2] *= scale; + mr[3] *= scale; + return mr; +} +test "zmath.matrix.inverse" { + const m = Mat{ + f32x4(10.0, -9.0, -12.0, 1.0), + f32x4(7.0, -12.0, 11.0, 1.0), + f32x4(-10.0, 10.0, 3.0, 1.0), + f32x4(1.0, 2.0, 3.0, 4.0), + }; + var det: F32x4 = undefined; + const mi = inverseDet(m, &det); + try expectVecApproxEqAbs(det, splat(F32x4, 2939.0), 0.0001); + + try expectVecApproxEqAbs(mi[0], f32x4(-0.170806, -0.13576, -0.349439, 0.164001), 0.0001); + try expectVecApproxEqAbs(mi[1], f32x4(-0.163661, -0.14801, -0.253147, 0.141204), 0.0001); + try expectVecApproxEqAbs(mi[2], f32x4(-0.0871045, 0.00646478, -0.0785982, 0.0398095), 0.0001); + try expectVecApproxEqAbs(mi[3], f32x4(0.18986, 0.103096, 0.272882, 0.10854), 0.0001); +} + +pub fn matFromNormAxisAngle(axis: Vec, angle: f32) Mat { + const sincos_angle = sincos(angle); + + const c2 = splat(F32x4, 1.0 - sincos_angle[1]); + const c1 = splat(F32x4, sincos_angle[1]); + const c0 = splat(F32x4, sincos_angle[0]); + + const n0 = swizzle(axis, .y, .z, .x, .w); + const n1 = swizzle(axis, .z, .x, .y, .w); + + var v0 = c2 * n0 * n1; + const r0 = c2 * axis * axis + c1; + const r1 = c0 * axis + v0; + var r2 = v0 - c0 * axis; + + v0 = andInt(r0, f32x4_mask3); + + var v1 = @shuffle(f32, r1, r2, [4]i32{ 0, 2, ~@as(i32, 1), ~@as(i32, 2) }); + v1 = swizzle(v1, .y, .z, .w, .x); + + var v2 = @shuffle(f32, r1, r2, [4]i32{ 1, 1, ~@as(i32, 0), ~@as(i32, 0) }); + v2 = swizzle(v2, .x, .z, .x, .z); + + r2 = @shuffle(f32, v0, v1, [4]i32{ 0, 3, ~@as(i32, 0), ~@as(i32, 1) }); + r2 = swizzle(r2, .x, .z, .w, .y); + + var m: Mat = undefined; + m[0] = r2; + + r2 = @shuffle(f32, v0, v1, [4]i32{ 1, 3, ~@as(i32, 2), ~@as(i32, 3) }); + r2 = swizzle(r2, .z, .x, .w, .y); + m[1] = r2; + + v2 = @shuffle(f32, v2, v0, [4]i32{ 0, 1, ~@as(i32, 2), ~@as(i32, 3) }); + m[2] = v2; + m[3] = f32x4(0.0, 0.0, 0.0, 1.0); + return m; +} +pub fn matFromAxisAngle(axis: Vec, angle: f32) Mat { + assert(!all(axis == splat(F32x4, 0.0), 3)); + assert(!all(isInf(axis), 3)); + const normal = normalize3(axis); + return matFromNormAxisAngle(normal, angle); +} +test "zmath.matrix.matFromAxisAngle" { + { + const m0 = matFromAxisAngle(f32x4(1.0, 0.0, 0.0, 0.0), math.pi * 0.25); + const m1 = rotationX(math.pi * 0.25); + try expectVecApproxEqAbs(m0[0], m1[0], 0.001); + try expectVecApproxEqAbs(m0[1], m1[1], 0.001); + try expectVecApproxEqAbs(m0[2], m1[2], 0.001); + try expectVecApproxEqAbs(m0[3], m1[3], 0.001); + } + { + const m0 = matFromAxisAngle(f32x4(0.0, 1.0, 0.0, 0.0), math.pi * 0.125); + const m1 = rotationY(math.pi * 0.125); + try expectVecApproxEqAbs(m0[0], m1[0], 0.001); + try expectVecApproxEqAbs(m0[1], m1[1], 0.001); + try expectVecApproxEqAbs(m0[2], m1[2], 0.001); + try expectVecApproxEqAbs(m0[3], m1[3], 0.001); + } + { + const m0 = matFromAxisAngle(f32x4(0.0, 0.0, 1.0, 0.0), math.pi * 0.333); + const m1 = rotationZ(math.pi * 0.333); + try expectVecApproxEqAbs(m0[0], m1[0], 0.001); + try expectVecApproxEqAbs(m0[1], m1[1], 0.001); + try expectVecApproxEqAbs(m0[2], m1[2], 0.001); + try expectVecApproxEqAbs(m0[3], m1[3], 0.001); + } +} + +pub fn matFromQuat(quat: Quat) Mat { + const q0 = quat + quat; + var q1 = quat * q0; + + var v0 = swizzle(q1, .y, .x, .x, .w); + v0 = andInt(v0, f32x4_mask3); + + var v1 = swizzle(q1, .z, .z, .y, .w); + v1 = andInt(v1, f32x4_mask3); + + const r0 = (f32x4(1.0, 1.0, 1.0, 0.0) - v0) - v1; + + v0 = swizzle(quat, .x, .x, .y, .w); + v1 = swizzle(q0, .z, .y, .z, .w); + v0 = v0 * v1; + + v1 = swizzle(quat, .w, .w, .w, .w); + const v2 = swizzle(q0, .y, .z, .x, .w); + v1 = v1 * v2; + + const r1 = v0 + v1; + const r2 = v0 - v1; + + v0 = @shuffle(f32, r1, r2, [4]i32{ 1, 2, ~@as(i32, 0), ~@as(i32, 1) }); + v0 = swizzle(v0, .x, .z, .w, .y); + v1 = @shuffle(f32, r1, r2, [4]i32{ 0, 0, ~@as(i32, 2), ~@as(i32, 2) }); + v1 = swizzle(v1, .x, .z, .x, .z); + + q1 = @shuffle(f32, r0, v0, [4]i32{ 0, 3, ~@as(i32, 0), ~@as(i32, 1) }); + q1 = swizzle(q1, .x, .z, .w, .y); + + var m: Mat = undefined; + m[0] = q1; + + q1 = @shuffle(f32, r0, v0, [4]i32{ 1, 3, ~@as(i32, 2), ~@as(i32, 3) }); + q1 = swizzle(q1, .z, .x, .w, .y); + m[1] = q1; + + q1 = @shuffle(f32, v1, r0, [4]i32{ 0, 1, ~@as(i32, 2), ~@as(i32, 3) }); + m[2] = q1; + m[3] = f32x4(0.0, 0.0, 0.0, 1.0); + return m; +} +test "zmath.matrix.matFromQuat" { + { + const m = matFromQuat(f32x4(0.0, 0.0, 0.0, 1.0)); + try expectVecApproxEqAbs(m[0], f32x4(1.0, 0.0, 0.0, 0.0), 0.0001); + try expectVecApproxEqAbs(m[1], f32x4(0.0, 1.0, 0.0, 0.0), 0.0001); + try expectVecApproxEqAbs(m[2], f32x4(0.0, 0.0, 1.0, 0.0), 0.0001); + try expectVecApproxEqAbs(m[3], f32x4(0.0, 0.0, 0.0, 1.0), 0.0001); + } +} + +pub fn matFromRollPitchYaw(pitch: f32, yaw: f32, roll: f32) Mat { + return matFromRollPitchYawV(f32x4(pitch, yaw, roll, 0.0)); +} +pub fn matFromRollPitchYawV(angles: Vec) Mat { + return matFromQuat(quatFromRollPitchYawV(angles)); +} + +pub fn matToQuat(m: Mat) Quat { + return quatFromMat(m); +} + +pub inline fn loadMat(mem: []const f32) Mat { + return .{ + load(mem[0..4], F32x4, 0), + load(mem[4..8], F32x4, 0), + load(mem[8..12], F32x4, 0), + load(mem[12..16], F32x4, 0), + }; +} +test "zmath.loadMat" { + const a = [18]f32{ + 1.0, 2.0, 3.0, 4.0, + 5.0, 6.0, 7.0, 8.0, + 9.0, 10.0, 11.0, 12.0, + 13.0, 14.0, 15.0, 16.0, + 17.0, 18.0, + }; + const m = loadMat(a[1..]); + try expectVecEqual(m[0], f32x4(2.0, 3.0, 4.0, 5.0)); + try expectVecEqual(m[1], f32x4(6.0, 7.0, 8.0, 9.0)); + try expectVecEqual(m[2], f32x4(10.0, 11.0, 12.0, 13.0)); + try expectVecEqual(m[3], f32x4(14.0, 15.0, 16.0, 17.0)); +} + +pub inline fn storeMat(mem: []f32, m: Mat) void { + store(mem[0..4], m[0], 0); + store(mem[4..8], m[1], 0); + store(mem[8..12], m[2], 0); + store(mem[12..16], m[3], 0); +} + +pub inline fn loadMat43(mem: []const f32) Mat { + return .{ + f32x4(mem[0], mem[1], mem[2], 0.0), + f32x4(mem[3], mem[4], mem[5], 0.0), + f32x4(mem[6], mem[7], mem[8], 0.0), + f32x4(mem[9], mem[10], mem[11], 1.0), + }; +} + +pub inline fn storeMat43(mem: []f32, m: Mat) void { + store(mem[0..3], m[0], 3); + store(mem[3..6], m[1], 3); + store(mem[6..9], m[2], 3); + store(mem[9..12], m[3], 3); +} + +pub inline fn loadMat34(mem: []const f32) Mat { + return .{ + load(mem[0..4], F32x4, 0), + load(mem[4..8], F32x4, 0), + load(mem[8..12], F32x4, 0), + f32x4(0.0, 0.0, 0.0, 1.0), + }; +} + +pub inline fn storeMat34(mem: []f32, m: Mat) void { + store(mem[0..4], m[0], 0); + store(mem[4..8], m[1], 0); + store(mem[8..12], m[2], 0); +} + +pub inline fn matToArr(m: Mat) [16]f32 { + var array: [16]f32 = undefined; + storeMat(array[0..], m); + return array; +} + +pub inline fn matToArr43(m: Mat) [12]f32 { + var array: [12]f32 = undefined; + storeMat43(array[0..], m); + return array; +} + +pub inline fn matToArr34(m: Mat) [12]f32 { + var array: [12]f32 = undefined; + storeMat34(array[0..], m); + return array; +} +// ------------------------------------------------------------------------------ +// +// 5. Quaternion functions +// +// ------------------------------------------------------------------------------ +pub fn qmul(q0: Quat, q1: Quat) Quat { + var result = swizzle(q1, .w, .w, .w, .w); + var q1x = swizzle(q1, .x, .x, .x, .x); + var q1y = swizzle(q1, .y, .y, .y, .y); + var q1z = swizzle(q1, .z, .z, .z, .z); + result = result * q0; + var q0_shuf = swizzle(q0, .w, .z, .y, .x); + q1x = q1x * q0_shuf; + q0_shuf = swizzle(q0_shuf, .y, .x, .w, .z); + result = mulAdd(q1x, f32x4(1.0, -1.0, 1.0, -1.0), result); + q1y = q1y * q0_shuf; + q0_shuf = swizzle(q0_shuf, .w, .z, .y, .x); + q1y = q1y * f32x4(1.0, 1.0, -1.0, -1.0); + q1z = q1z * q0_shuf; + q1y = mulAdd(q1z, f32x4(-1.0, 1.0, 1.0, -1.0), q1y); + return result + q1y; +} +test "zmath.quaternion.mul" { + { + const q0 = f32x4(2.0, 3.0, 4.0, 1.0); + const q1 = f32x4(3.0, 2.0, 1.0, 4.0); + try expectVecApproxEqAbs(qmul(q0, q1), f32x4(16.0, 4.0, 22.0, -12.0), 0.0001); + } +} + +pub fn quatToMat(quat: Quat) Mat { + return matFromQuat(quat); +} + +pub fn quatToAxisAngle(quat: Quat, axis: *Vec, angle: *f32) void { + axis.* = quat; + angle.* = 2.0 * acos(quat[3]); +} +test "zmath.quaternion.quatToAxisAngle" { + { + const q0 = quatFromNormAxisAngle(f32x4(1.0, 0.0, 0.0, 0.0), 0.25 * math.pi); + var axis: Vec = f32x4(4.0, 3.0, 2.0, 1.0); + var angle: f32 = 10.0; + quatToAxisAngle(q0, &axis, &angle); + try expect(math.approxEqAbs(f32, axis[0], @sin(@as(f32, 0.25) * math.pi * 0.5), 0.0001)); + try expect(axis[1] == 0.0); + try expect(axis[2] == 0.0); + try expect(math.approxEqAbs(f32, angle, 0.25 * math.pi, 0.0001)); + } +} + +pub fn quatFromMat(m: Mat) Quat { + const r0 = m[0]; + const r1 = m[1]; + const r2 = m[2]; + const r00 = swizzle(r0, .x, .x, .x, .x); + const r11 = swizzle(r1, .y, .y, .y, .y); + const r22 = swizzle(r2, .z, .z, .z, .z); + + const x2gey2 = (r11 - r00) <= splat(F32x4, 0.0); + const z2gew2 = (r11 + r00) <= splat(F32x4, 0.0); + const x2py2gez2pw2 = r22 <= splat(F32x4, 0.0); + + var t0 = mulAdd(r00, f32x4(1.0, -1.0, -1.0, 1.0), splat(F32x4, 1.0)); + var t1 = r11 * f32x4(-1.0, 1.0, -1.0, 1.0); + var t2 = mulAdd(r22, f32x4(-1.0, -1.0, 1.0, 1.0), t0); + const x2y2z2w2 = t1 + t2; + + t0 = @shuffle(f32, r0, r1, [4]i32{ 1, 2, ~@as(i32, 2), ~@as(i32, 1) }); + t1 = @shuffle(f32, r1, r2, [4]i32{ 0, 0, ~@as(i32, 0), ~@as(i32, 1) }); + t1 = swizzle(t1, .x, .z, .w, .y); + const xyxzyz = t0 + t1; + + t0 = @shuffle(f32, r2, r1, [4]i32{ 1, 0, ~@as(i32, 0), ~@as(i32, 0) }); + t1 = @shuffle(f32, r1, r0, [4]i32{ 2, 2, ~@as(i32, 2), ~@as(i32, 1) }); + t1 = swizzle(t1, .x, .z, .w, .y); + const xwywzw = (t0 - t1) * f32x4(-1.0, 1.0, -1.0, 1.0); + + t0 = @shuffle(f32, x2y2z2w2, xyxzyz, [4]i32{ 0, 1, ~@as(i32, 0), ~@as(i32, 0) }); + t1 = @shuffle(f32, x2y2z2w2, xwywzw, [4]i32{ 2, 3, ~@as(i32, 2), ~@as(i32, 0) }); + t2 = @shuffle(f32, xyxzyz, xwywzw, [4]i32{ 1, 2, ~@as(i32, 0), ~@as(i32, 1) }); + + const tensor0 = @shuffle(f32, t0, t2, [4]i32{ 0, 2, ~@as(i32, 0), ~@as(i32, 2) }); + const tensor1 = @shuffle(f32, t0, t2, [4]i32{ 2, 1, ~@as(i32, 1), ~@as(i32, 3) }); + const tensor2 = @shuffle(f32, t2, t1, [4]i32{ 0, 1, ~@as(i32, 0), ~@as(i32, 2) }); + const tensor3 = @shuffle(f32, t2, t1, [4]i32{ 2, 3, ~@as(i32, 2), ~@as(i32, 1) }); + + t0 = select(x2gey2, tensor0, tensor1); + t1 = select(z2gew2, tensor2, tensor3); + t2 = select(x2py2gez2pw2, t0, t1); + + return t2 / length4(t2); +} +test "zmath.quatFromMat" { + { + const q0 = quatFromAxisAngle(f32x4(1.0, 0.0, 0.0, 0.0), 0.25 * math.pi); + const q1 = quatFromMat(rotationX(0.25 * math.pi)); + try expectVecApproxEqAbs(q0, q1, 0.0001); + } + { + const q0 = quatFromAxisAngle(f32x4(1.0, 2.0, 0.5, 0.0), 0.25 * math.pi); + const q1 = quatFromMat(matFromAxisAngle(f32x4(1.0, 2.0, 0.5, 0.0), 0.25 * math.pi)); + try expectVecApproxEqAbs(q0, q1, 0.0001); + } + { + const q0 = quatFromRollPitchYaw(0.1 * math.pi, -0.2 * math.pi, 0.3 * math.pi); + const q1 = quatFromMat(matFromRollPitchYaw(0.1 * math.pi, -0.2 * math.pi, 0.3 * math.pi)); + try expectVecApproxEqAbs(q0, q1, 0.0001); + } +} + +pub fn quatFromNormAxisAngle(axis: Vec, angle: f32) Quat { + const n = f32x4(axis[0], axis[1], axis[2], 1.0); + const sc = sincos(0.5 * angle); + return n * f32x4(sc[0], sc[0], sc[0], sc[1]); +} +pub fn quatFromAxisAngle(axis: Vec, angle: f32) Quat { + assert(!all(axis == splat(F32x4, 0.0), 3)); + assert(!all(isInf(axis), 3)); + const normal = normalize3(axis); + return quatFromNormAxisAngle(normal, angle); +} +test "zmath.quaternion.quatFromNormAxisAngle" { + { + const q0 = quatFromAxisAngle(f32x4(1.0, 0.0, 0.0, 0.0), 0.25 * math.pi); + const q1 = quatFromAxisAngle(f32x4(0.0, 1.0, 0.0, 0.0), 0.125 * math.pi); + const m0 = rotationX(0.25 * math.pi); + const m1 = rotationY(0.125 * math.pi); + const mr0 = quatToMat(qmul(q0, q1)); + const mr1 = mul(m0, m1); + try expectVecApproxEqAbs(mr0[0], mr1[0], 0.0001); + try expectVecApproxEqAbs(mr0[1], mr1[1], 0.0001); + try expectVecApproxEqAbs(mr0[2], mr1[2], 0.0001); + try expectVecApproxEqAbs(mr0[3], mr1[3], 0.0001); + } + { + const m0 = quatToMat(quatFromAxisAngle(f32x4(1.0, 2.0, 0.5, 0.0), 0.25 * math.pi)); + const m1 = matFromAxisAngle(f32x4(1.0, 2.0, 0.5, 0.0), 0.25 * math.pi); + try expectVecApproxEqAbs(m0[0], m1[0], 0.0001); + try expectVecApproxEqAbs(m0[1], m1[1], 0.0001); + try expectVecApproxEqAbs(m0[2], m1[2], 0.0001); + try expectVecApproxEqAbs(m0[3], m1[3], 0.0001); + } +} + +pub inline fn qidentity() Quat { + return f32x4(@as(f32, 0.0), @as(f32, 0.0), @as(f32, 0.0), @as(f32, 1.0)); +} + +pub inline fn conjugate(quat: Quat) Quat { + return quat * f32x4(-1.0, -1.0, -1.0, 1.0); +} + +fn inverseQuat(quat: Quat) Quat { + const l = lengthSq4(quat); + const conj = conjugate(quat); + return select(l <= splat(F32x4, math.floatEps(f32)), splat(F32x4, 0.0), conj / l); +} +test "zmath.quaternion.inverseQuat" { + try expectVecApproxEqAbs( + inverse(f32x4(2.0, 3.0, 4.0, 1.0)), + f32x4(-1.0 / 15.0, -1.0 / 10.0, -2.0 / 15.0, 1.0 / 30.0), + 0.0001, + ); + try expectVecApproxEqAbs(inverse(qidentity()), qidentity(), 0.0001); +} + +// Algorithm from: https://github.com/g-truc/glm/blob/master/glm/detail/type_quat.inl +pub fn rotate(q: Quat, v: Vec) Vec { + const w = splat(F32x4, q[3]); + const axis = f32x4(q[0], q[1], q[2], 0.0); + const uv = cross3(axis, v); + return v + ((uv * w) + cross3(axis, uv)) * splat(F32x4, 2.0); +} +test "zmath.quaternion.rotate" { + const quat = quatFromRollPitchYaw(0.1 * math.pi, 0.2 * math.pi, 0.3 * math.pi); + const mat = matFromQuat(quat); + const forward = f32x4(0.0, 0.0, -1.0, 0.0); + const up = f32x4(0.0, 1.0, 0.0, 0.0); + const right = f32x4(1.0, 0.0, 0.0, 0.0); + try expectVecApproxEqAbs(rotate(quat, forward), mul(forward, mat), 0.0001); + try expectVecApproxEqAbs(rotate(quat, up), mul(up, mat), 0.0001); + try expectVecApproxEqAbs(rotate(quat, right), mul(right, mat), 0.0001); +} + +pub fn slerp(q0: Quat, q1: Quat, t: f32) Quat { + return slerpV(q0, q1, splat(F32x4, t)); +} +pub fn slerpV(q0: Quat, q1: Quat, t: F32x4) Quat { + var cos_omega = dot4(q0, q1); + const sign = select(cos_omega < splat(F32x4, 0.0), splat(F32x4, -1.0), splat(F32x4, 1.0)); + + cos_omega = cos_omega * sign; + const sin_omega = sqrt(splat(F32x4, 1.0) - cos_omega * cos_omega); + + const omega = atan2(sin_omega, cos_omega); + + var v01 = t; + v01 = xorInt(andInt(v01, f32x4_mask2), f32x4_sign_mask1); + v01 = f32x4(1.0, 0.0, 0.0, 0.0) + v01; + + var s0 = sin(v01 * omega) / sin_omega; + s0 = select(cos_omega < splat(F32x4, 1.0 - 0.00001), s0, v01); + + const s1 = swizzle(s0, .y, .y, .y, .y); + s0 = swizzle(s0, .x, .x, .x, .x); + + return q0 * s0 + sign * q1 * s1; +} +test "zmath.quaternion.slerp" { + const from = f32x4(0.0, 0.0, 0.0, 1.0); + const to = f32x4(0.5, 0.5, -0.5, 0.5); + const result = slerp(from, to, 0.5); + try expectVecApproxEqAbs(result, f32x4(0.28867513, 0.28867513, -0.28867513, 0.86602540), 0.0001); +} + +// Converts q back to euler angles, assuming a YXZ rotation order. +// See: http://www.euclideanspace.com/maths/geometry/rotations/conversions/quaternionToEuler +pub fn quatToRollPitchYaw(q: Quat) [3]f32 { + var angles: [3]f32 = undefined; + + const p = swizzle(q, .w, .y, .x, .z); + const sign = -1.0; + + const singularity = p[0] * p[2] + sign * p[1] * p[3]; + if (singularity > 0.499) { + angles[0] = math.pi * 0.5; + angles[1] = 2.0 * math.atan2(p[1], p[0]); + angles[2] = 0.0; + } else if (singularity < -0.499) { + angles[0] = -math.pi * 0.5; + angles[1] = 2.0 * math.atan2(p[1], p[0]); + angles[2] = 0.0; + } else { + const sq = p * p; + const y = splat(F32x4, 2.0) * f32x4(p[0] * p[1] - sign * p[2] * p[3], p[0] * p[3] - sign * p[1] * p[2], 0.0, 0.0); + const x = splat(F32x4, 1.0) - (splat(F32x4, 2.0) * f32x4(sq[1] + sq[2], sq[2] + sq[3], 0.0, 0.0)); + const res = atan2(y, x); + angles[0] = math.asin(2.0 * singularity); + angles[1] = res[0]; + angles[2] = res[1]; + } + + return angles; +} + +test "zmath.quaternion.quatToRollPitchYaw" { + { + const expected = f32x4(0.1 * math.pi, 0.2 * math.pi, 0.3 * math.pi, 0.0); + const quat = quatFromRollPitchYaw(expected[0], expected[1], expected[2]); + const result = quatToRollPitchYaw(quat); + try expectVecApproxEqAbs(loadArr3(result), expected, 0.0001); + } + + { + const expected = f32x4(0.3 * math.pi, 0.1 * math.pi, 0.2 * math.pi, 0.0); + const quat = quatFromRollPitchYaw(expected[0], expected[1], expected[2]); + const result = quatToRollPitchYaw(quat); + try expectVecApproxEqAbs(loadArr3(result), expected, 0.0001); + } + + // North pole singularity + { + const angle = f32x4(0.5 * math.pi, 0.2 * math.pi, 0.3 * math.pi, 0.0); + const expected = f32x4(0.5 * math.pi, -0.1 * math.pi, 0.0, 0.0); + const quat = quatFromRollPitchYaw(angle[0], angle[1], angle[2]); + const result = quatToRollPitchYaw(quat); + try expectVecApproxEqAbs(loadArr3(result), expected, 0.0001); + } + + // South pole singularity + { + const angle = f32x4(-0.5 * math.pi, 0.2 * math.pi, 0.3 * math.pi, 0.0); + const expected = f32x4(-0.5 * math.pi, 0.5 * math.pi, 0.0, 0.0); + const quat = quatFromRollPitchYaw(angle[0], angle[1], angle[2]); + const result = quatToRollPitchYaw(quat); + try expectVecApproxEqAbs(loadArr3(result), expected, 0.0001); + } +} + +pub fn quatFromRollPitchYaw(pitch: f32, yaw: f32, roll: f32) Quat { + return quatFromRollPitchYawV(f32x4(pitch, yaw, roll, 0.0)); +} +pub fn quatFromRollPitchYawV(angles: Vec) Quat { // | pitch | yaw | roll | 0 | + const sc = sincos(splat(Vec, 0.5) * angles); + const p0 = @shuffle(f32, sc[1], sc[0], [4]i32{ ~@as(i32, 0), 0, 0, 0 }); + const p1 = @shuffle(f32, sc[0], sc[1], [4]i32{ ~@as(i32, 0), 0, 0, 0 }); + const y0 = @shuffle(f32, sc[1], sc[0], [4]i32{ 1, ~@as(i32, 1), 1, 1 }); + const y1 = @shuffle(f32, sc[0], sc[1], [4]i32{ 1, ~@as(i32, 1), 1, 1 }); + const r0 = @shuffle(f32, sc[1], sc[0], [4]i32{ 2, 2, ~@as(i32, 2), 2 }); + const r1 = @shuffle(f32, sc[0], sc[1], [4]i32{ 2, 2, ~@as(i32, 2), 2 }); + const q1 = p1 * f32x4(1.0, -1.0, -1.0, 1.0) * y1; + const q0 = p0 * y0 * r0; + return mulAdd(q1, r1, q0); +} +test "zmath.quaternion.quatFromRollPitchYawV" { + { + const m0 = quatToMat(quatFromRollPitchYawV(f32x4(0.25 * math.pi, 0.0, 0.0, 0.0))); + const m1 = rotationX(0.25 * math.pi); + try expectVecApproxEqAbs(m0[0], m1[0], 0.0001); + try expectVecApproxEqAbs(m0[1], m1[1], 0.0001); + try expectVecApproxEqAbs(m0[2], m1[2], 0.0001); + try expectVecApproxEqAbs(m0[3], m1[3], 0.0001); + } + { + const m0 = quatToMat(quatFromRollPitchYaw(0.1 * math.pi, 0.2 * math.pi, 0.3 * math.pi)); + const m1 = mul( + rotationZ(0.3 * math.pi), + mul(rotationX(0.1 * math.pi), rotationY(0.2 * math.pi)), + ); + try expectVecApproxEqAbs(m0[0], m1[0], 0.0001); + try expectVecApproxEqAbs(m0[1], m1[1], 0.0001); + try expectVecApproxEqAbs(m0[2], m1[2], 0.0001); + try expectVecApproxEqAbs(m0[3], m1[3], 0.0001); + } +} +// ------------------------------------------------------------------------------ +// +// 6. Color functions +// +// ------------------------------------------------------------------------------ +pub fn adjustSaturation(color: F32x4, saturation: f32) F32x4 { + const luminance = dot3(f32x4(0.2125, 0.7154, 0.0721, 0.0), color); + var result = mulAdd(color - luminance, f32x4s(saturation), luminance); + result[3] = color[3]; + return result; +} + +pub fn adjustContrast(color: F32x4, contrast: f32) F32x4 { + var result = mulAdd(color - f32x4s(0.5), f32x4s(contrast), f32x4s(0.5)); + result[3] = color[3]; + return result; +} + +pub fn rgbToHsl(rgb: F32x4) F32x4 { + const r = swizzle(rgb, .x, .x, .x, .x); + const g = swizzle(rgb, .y, .y, .y, .y); + const b = swizzle(rgb, .z, .z, .z, .z); + + const minv = min(r, min(g, b)); + const maxv = max(r, max(g, b)); + + const l = (minv + maxv) * f32x4s(0.5); + const d = maxv - minv; + const la = select(boolx4(true, true, true, false), l, rgb); + + if (all(d < f32x4s(math.floatEps(f32)), 3)) { + return select(boolx4(true, true, false, false), f32x4s(0.0), la); + } else { + var s: F32x4 = undefined; + var h: F32x4 = undefined; + + const d2 = minv + maxv; + + if (all(l > f32x4s(0.5), 3)) { + s = d / (f32x4s(2.0) - d2); + } else { + s = d / d2; + } + + if (all(r == maxv, 3)) { + h = (g - b) / d; + } else if (all(g == maxv, 3)) { + h = f32x4s(2.0) + (b - r) / d; + } else { + h = f32x4s(4.0) + (r - g) / d; + } + + h /= f32x4s(6.0); + + if (all(h < f32x4s(0.0), 3)) { + h += f32x4s(1.0); + } + + const lha = select(boolx4(true, true, false, false), h, la); + return select(boolx4(true, false, true, true), lha, s); + } +} +test "zmath.color.rgbToHsl" { + try expectVecApproxEqAbs(rgbToHsl(f32x4(0.2, 0.4, 0.8, 1.0)), f32x4(0.6111, 0.6, 0.5, 1.0), 0.0001); + try expectVecApproxEqAbs(rgbToHsl(f32x4(1.0, 0.0, 0.0, 0.5)), f32x4(0.0, 1.0, 0.5, 0.5), 0.0001); + try expectVecApproxEqAbs(rgbToHsl(f32x4(0.0, 1.0, 0.0, 0.25)), f32x4(0.3333, 1.0, 0.5, 0.25), 0.0001); + try expectVecApproxEqAbs(rgbToHsl(f32x4(0.0, 0.0, 1.0, 1.0)), f32x4(0.6666, 1.0, 0.5, 1.0), 0.0001); + try expectVecApproxEqAbs(rgbToHsl(f32x4(0.0, 0.0, 0.0, 1.0)), f32x4(0.0, 0.0, 0.0, 1.0), 0.0001); + try expectVecApproxEqAbs(rgbToHsl(f32x4(1.0, 1.0, 1.0, 1.0)), f32x4(0.0, 0.0, 1.0, 1.0), 0.0001); +} + +fn hueToClr(p: F32x4, q: F32x4, h: F32x4) F32x4 { + var t = h; + + if (all(t < f32x4s(0.0), 3)) + t += f32x4s(1.0); + + if (all(t > f32x4s(1.0), 3)) + t -= f32x4s(1.0); + + if (all(t < f32x4s(1.0 / 6.0), 3)) + return mulAdd(q - p, f32x4s(6.0) * t, p); + + if (all(t < f32x4s(0.5), 3)) + return q; + + if (all(t < f32x4s(2.0 / 3.0), 3)) + return mulAdd(q - p, f32x4s(6.0) * (f32x4s(2.0 / 3.0) - t), p); + + return p; +} + +pub fn hslToRgb(hsl: F32x4) F32x4 { + const s = swizzle(hsl, .y, .y, .y, .y); + const l = swizzle(hsl, .z, .z, .z, .z); + + if (all(isNearEqual(s, f32x4s(0.0), f32x4s(math.floatEps(f32))), 3)) { + return select(boolx4(true, true, true, false), l, hsl); + } else { + const h = swizzle(hsl, .x, .x, .x, .x); + var q: F32x4 = undefined; + if (all(l < f32x4s(0.5), 3)) { + q = l * (f32x4s(1.0) + s); + } else { + q = (l + s) - (l * s); + } + + const p = f32x4s(2.0) * l - q; + + const r = hueToClr(p, q, h + f32x4s(1.0 / 3.0)); + const g = hueToClr(p, q, h); + const b = hueToClr(p, q, h - f32x4s(1.0 / 3.0)); + + const rg = select(boolx4(true, false, false, false), r, g); + const ba = select(boolx4(true, true, true, false), b, hsl); + return select(boolx4(true, true, false, false), rg, ba); + } +} +test "zmath.color.hslToRgb" { + try expectVecApproxEqAbs(f32x4(0.2, 0.4, 0.8, 1.0), hslToRgb(f32x4(0.6111, 0.6, 0.5, 1.0)), 0.0001); + try expectVecApproxEqAbs(f32x4(1.0, 0.0, 0.0, 0.5), hslToRgb(f32x4(0.0, 1.0, 0.5, 0.5)), 0.0001); + try expectVecApproxEqAbs(f32x4(0.0, 1.0, 0.0, 0.25), hslToRgb(f32x4(0.3333, 1.0, 0.5, 0.25)), 0.0005); + try expectVecApproxEqAbs(f32x4(0.0, 0.0, 1.0, 1.0), hslToRgb(f32x4(0.6666, 1.0, 0.5, 1.0)), 0.0005); + try expectVecApproxEqAbs(f32x4(0.0, 0.0, 0.0, 1.0), hslToRgb(f32x4(0.0, 0.0, 0.0, 1.0)), 0.0001); + try expectVecApproxEqAbs(f32x4(1.0, 1.0, 1.0, 1.0), hslToRgb(f32x4(0.0, 0.0, 1.0, 1.0)), 0.0001); + try expectVecApproxEqAbs(hslToRgb(rgbToHsl(f32x4(1.0, 1.0, 1.0, 1.0))), f32x4(1.0, 1.0, 1.0, 1.0), 0.0005); + try expectVecApproxEqAbs( + hslToRgb(rgbToHsl(f32x4(0.82198, 0.1839, 0.632, 1.0))), + f32x4(0.82198, 0.1839, 0.632, 1.0), + 0.0005, + ); + try expectVecApproxEqAbs( + rgbToHsl(hslToRgb(f32x4(0.82198, 0.1839, 0.632, 1.0))), + f32x4(0.82198, 0.1839, 0.632, 1.0), + 0.0005, + ); + try expectVecApproxEqAbs( + rgbToHsl(hslToRgb(f32x4(0.1839, 0.82198, 0.632, 1.0))), + f32x4(0.1839, 0.82198, 0.632, 1.0), + 0.0005, + ); + try expectVecApproxEqAbs( + hslToRgb(rgbToHsl(f32x4(0.1839, 0.632, 0.82198, 1.0))), + f32x4(0.1839, 0.632, 0.82198, 1.0), + 0.0005, + ); +} + +pub fn rgbToHsv(rgb: F32x4) F32x4 { + const r = swizzle(rgb, .x, .x, .x, .x); + const g = swizzle(rgb, .y, .y, .y, .y); + const b = swizzle(rgb, .z, .z, .z, .z); + + const minv = min(r, min(g, b)); + const v = max(r, max(g, b)); + const d = v - minv; + const s = if (all(isNearEqual(v, f32x4s(0.0), f32x4s(math.floatEps(f32))), 3)) f32x4s(0.0) else d / v; + + if (all(d < f32x4s(math.floatEps(f32)), 3)) { + const hv = select(boolx4(true, false, false, false), f32x4s(0.0), v); + const hva = select(boolx4(true, true, true, false), hv, rgb); + return select(boolx4(true, false, true, true), hva, s); + } else { + var h: F32x4 = undefined; + if (all(r == v, 3)) { + h = (g - b) / d; + if (all(g < b, 3)) + h += f32x4s(6.0); + } else if (all(g == v, 3)) { + h = f32x4s(2.0) + (b - r) / d; + } else { + h = f32x4s(4.0) + (r - g) / d; + } + + h /= f32x4s(6.0); + const hv = select(boolx4(true, false, false, false), h, v); + const hva = select(boolx4(true, true, true, false), hv, rgb); + return select(boolx4(true, false, true, true), hva, s); + } +} +test "zmath.color.rgbToHsv" { + try expectVecApproxEqAbs(rgbToHsv(f32x4(0.2, 0.4, 0.8, 1.0)), f32x4(0.6111, 0.75, 0.8, 1.0), 0.0001); + try expectVecApproxEqAbs(rgbToHsv(f32x4(0.4, 0.2, 0.8, 1.0)), f32x4(0.7222, 0.75, 0.8, 1.0), 0.0001); + try expectVecApproxEqAbs(rgbToHsv(f32x4(0.4, 0.8, 0.2, 1.0)), f32x4(0.2777, 0.75, 0.8, 1.0), 0.0001); + try expectVecApproxEqAbs(rgbToHsv(f32x4(1.0, 0.0, 0.0, 0.5)), f32x4(0.0, 1.0, 1.0, 0.5), 0.0001); + try expectVecApproxEqAbs(rgbToHsv(f32x4(0.0, 1.0, 0.0, 0.25)), f32x4(0.3333, 1.0, 1.0, 0.25), 0.0001); + try expectVecApproxEqAbs(rgbToHsv(f32x4(0.0, 0.0, 1.0, 1.0)), f32x4(0.6666, 1.0, 1.0, 1.0), 0.0001); + try expectVecApproxEqAbs(rgbToHsv(f32x4(0.0, 0.0, 0.0, 1.0)), f32x4(0.0, 0.0, 0.0, 1.0), 0.0001); + try expectVecApproxEqAbs(rgbToHsv(f32x4(1.0, 1.0, 1.0, 1.0)), f32x4(0.0, 0.0, 1.0, 1.0), 0.0001); +} + +pub fn hsvToRgb(hsv: F32x4) F32x4 { + const h = swizzle(hsv, .x, .x, .x, .x); + const s = swizzle(hsv, .y, .y, .y, .y); + const v = swizzle(hsv, .z, .z, .z, .z); + + const h6 = h * f32x4s(6.0); + const i = floor(h6); + const f = h6 - i; + + const p = v * (f32x4s(1.0) - s); + const q = v * (f32x4s(1.0) - f * s); + const t = v * (f32x4s(1.0) - (f32x4s(1.0) - f) * s); + + const ii = @as(i32, @intFromFloat(mod(i, f32x4s(6.0))[0])); + const rgb = switch (ii) { + 0 => blk: { + const vt = select(boolx4(true, false, false, false), v, t); + break :blk select(boolx4(true, true, false, false), vt, p); + }, + 1 => blk: { + const qv = select(boolx4(true, false, false, false), q, v); + break :blk select(boolx4(true, true, false, false), qv, p); + }, + 2 => blk: { + const pv = select(boolx4(true, false, false, false), p, v); + break :blk select(boolx4(true, true, false, false), pv, t); + }, + 3 => blk: { + const pq = select(boolx4(true, false, false, false), p, q); + break :blk select(boolx4(true, true, false, false), pq, v); + }, + 4 => blk: { + const tp = select(boolx4(true, false, false, false), t, p); + break :blk select(boolx4(true, true, false, false), tp, v); + }, + 5 => blk: { + const vp = select(boolx4(true, false, false, false), v, p); + break :blk select(boolx4(true, true, false, false), vp, q); + }, + else => unreachable, + }; + return select(boolx4(true, true, true, false), rgb, hsv); +} +test "zmath.color.hsvToRgb" { + const epsilon = 0.0005; + try expectVecApproxEqAbs(f32x4(0.2, 0.4, 0.8, 1.0), hsvToRgb(f32x4(0.6111, 0.75, 0.8, 1.0)), epsilon); + try expectVecApproxEqAbs(f32x4(0.4, 0.2, 0.8, 1.0), hsvToRgb(f32x4(0.7222, 0.75, 0.8, 1.0)), epsilon); + try expectVecApproxEqAbs(f32x4(0.4, 0.8, 0.2, 1.0), hsvToRgb(f32x4(0.2777, 0.75, 0.8, 1.0)), epsilon); + try expectVecApproxEqAbs(f32x4(1.0, 0.0, 0.0, 0.5), hsvToRgb(f32x4(0.0, 1.0, 1.0, 0.5)), epsilon); + try expectVecApproxEqAbs(f32x4(0.0, 1.0, 0.0, 0.25), hsvToRgb(f32x4(0.3333, 1.0, 1.0, 0.25)), epsilon); + try expectVecApproxEqAbs(f32x4(0.0, 0.0, 1.0, 1.0), hsvToRgb(f32x4(0.6666, 1.0, 1.0, 1.0)), epsilon); + try expectVecApproxEqAbs(f32x4(0.0, 0.0, 0.0, 1.0), hsvToRgb(f32x4(0.0, 0.0, 0.0, 1.0)), epsilon); + try expectVecApproxEqAbs(f32x4(1.0, 1.0, 1.0, 1.0), hsvToRgb(f32x4(0.0, 0.0, 1.0, 1.0)), epsilon); + try expectVecApproxEqAbs( + hsvToRgb(rgbToHsv(f32x4(0.1839, 0.632, 0.82198, 1.0))), + f32x4(0.1839, 0.632, 0.82198, 1.0), + epsilon, + ); + try expectVecApproxEqAbs( + hsvToRgb(rgbToHsv(f32x4(0.82198, 0.1839, 0.632, 1.0))), + f32x4(0.82198, 0.1839, 0.632, 1.0), + epsilon, + ); + try expectVecApproxEqAbs( + rgbToHsv(hsvToRgb(f32x4(0.82198, 0.1839, 0.632, 1.0))), + f32x4(0.82198, 0.1839, 0.632, 1.0), + epsilon, + ); + try expectVecApproxEqAbs( + rgbToHsv(hsvToRgb(f32x4(0.1839, 0.82198, 0.632, 1.0))), + f32x4(0.1839, 0.82198, 0.632, 1.0), + epsilon, + ); +} + +pub fn rgbToSrgb(rgb: F32x4) F32x4 { + const static = struct { + const cutoff = f32x4(0.0031308, 0.0031308, 0.0031308, 1.0); + const linear = f32x4(12.92, 12.92, 12.92, 1.0); + const scale = f32x4(1.055, 1.055, 1.055, 1.0); + const bias = f32x4(0.055, 0.055, 0.055, 1.0); + const rgamma = 1.0 / 2.4; + }; + var v = saturate(rgb); + const v0 = v * static.linear; + const v1 = static.scale * f32x4( + math.pow(f32, v[0], static.rgamma), + math.pow(f32, v[1], static.rgamma), + math.pow(f32, v[2], static.rgamma), + v[3], + ) - static.bias; + v = select(v < static.cutoff, v0, v1); + return select(boolx4(true, true, true, false), v, rgb); +} +test "zmath.color.rgbToSrgb" { + const epsilon = 0.001; + try expectVecApproxEqAbs(rgbToSrgb(f32x4(0.2, 0.4, 0.8, 1.0)), f32x4(0.484, 0.665, 0.906, 1.0), epsilon); +} + +pub fn srgbToRgb(srgb: F32x4) F32x4 { + const static = struct { + const cutoff = f32x4(0.04045, 0.04045, 0.04045, 1.0); + const rlinear = f32x4(1.0 / 12.92, 1.0 / 12.92, 1.0 / 12.92, 1.0); + const scale = f32x4(1.0 / 1.055, 1.0 / 1.055, 1.0 / 1.055, 1.0); + const bias = f32x4(0.055, 0.055, 0.055, 1.0); + const gamma = 2.4; + }; + var v = saturate(srgb); + const v0 = v * static.rlinear; + var v1 = static.scale * (v + static.bias); + v1 = f32x4( + math.pow(f32, v1[0], static.gamma), + math.pow(f32, v1[1], static.gamma), + math.pow(f32, v1[2], static.gamma), + v1[3], + ); + v = select(v > static.cutoff, v1, v0); + return select(boolx4(true, true, true, false), v, srgb); +} +test "zmath.color.srgbToRgb" { + const epsilon = 0.0007; + try expectVecApproxEqAbs(f32x4(0.2, 0.4, 0.8, 1.0), srgbToRgb(f32x4(0.484, 0.665, 0.906, 1.0)), epsilon); + try expectVecApproxEqAbs( + rgbToSrgb(srgbToRgb(f32x4(0.1839, 0.82198, 0.632, 1.0))), + f32x4(0.1839, 0.82198, 0.632, 1.0), + epsilon, + ); +} +// ------------------------------------------------------------------------------ +// +// X. Misc functions +// +// ------------------------------------------------------------------------------ +pub fn linePointDistance(linept0: Vec, linept1: Vec, pt: Vec) F32x4 { + const ptvec = pt - linept0; + const linevec = linept1 - linept0; + const scale = dot3(ptvec, linevec) / lengthSq3(linevec); + return length3(ptvec - linevec * scale); +} +test "zmath.linePointDistance" { + { + const linept0 = f32x4(-1.0, -2.0, -3.0, 1.0); + const linept1 = f32x4(1.0, 2.0, 3.0, 1.0); + const pt = f32x4(1.0, 1.0, 1.0, 1.0); + const v = linePointDistance(linept0, linept1, pt); + try expectVecApproxEqAbs(v, splat(F32x4, 0.654), 0.001); + } +} + +fn sin32(v: f32) f32 { + var y = v - math.tau * @round(v * 1.0 / math.tau); + + if (y > 0.5 * math.pi) { + y = math.pi - y; + } else if (y < -math.pi * 0.5) { + y = -math.pi - y; + } + const y2 = y * y; + + // 11-degree minimax approximation + var sinv = mulAdd(@as(f32, -2.3889859e-08), y2, 2.7525562e-06); + sinv = mulAdd(sinv, y2, -0.00019840874); + sinv = mulAdd(sinv, y2, 0.0083333310); + sinv = mulAdd(sinv, y2, -0.16666667); + return y * mulAdd(sinv, y2, 1.0); +} +fn cos32(v: f32) f32 { + var y = v - math.tau * @round(v * 1.0 / math.tau); + + const sign = blk: { + if (y > 0.5 * math.pi) { + y = math.pi - y; + break :blk @as(f32, -1.0); + } else if (y < -math.pi * 0.5) { + y = -math.pi - y; + break :blk @as(f32, -1.0); + } else { + break :blk @as(f32, 1.0); + } + }; + const y2 = y * y; + + // 10-degree minimax approximation + var cosv = mulAdd(@as(f32, -2.6051615e-07), y2, 2.4760495e-05); + cosv = mulAdd(cosv, y2, -0.0013888378); + cosv = mulAdd(cosv, y2, 0.041666638); + cosv = mulAdd(cosv, y2, -0.5); + return sign * mulAdd(cosv, y2, 1.0); +} +fn sincos32(v: f32) [2]f32 { + var y = v - math.tau * @round(v * 1.0 / math.tau); + + const sign = blk: { + if (y > 0.5 * math.pi) { + y = math.pi - y; + break :blk @as(f32, -1.0); + } else if (y < -math.pi * 0.5) { + y = -math.pi - y; + break :blk @as(f32, -1.0); + } else { + break :blk @as(f32, 1.0); + } + }; + const y2 = y * y; + + // 11-degree minimax approximation + var sinv = mulAdd(@as(f32, -2.3889859e-08), y2, 2.7525562e-06); + sinv = mulAdd(sinv, y2, -0.00019840874); + sinv = mulAdd(sinv, y2, 0.0083333310); + sinv = mulAdd(sinv, y2, -0.16666667); + sinv = y * mulAdd(sinv, y2, 1.0); + + // 10-degree minimax approximation + var cosv = mulAdd(@as(f32, -2.6051615e-07), y2, 2.4760495e-05); + cosv = mulAdd(cosv, y2, -0.0013888378); + cosv = mulAdd(cosv, y2, 0.041666638); + cosv = mulAdd(cosv, y2, -0.5); + cosv = sign * mulAdd(cosv, y2, 1.0); + + return .{ sinv, cosv }; +} +test "zmath.sincos32" { + const epsilon = 0.0001; + + try expect(math.isNan(sincos32(math.inf(f32))[0])); + try expect(math.isNan(sincos32(math.inf(f32))[1])); + try expect(math.isNan(sincos32(-math.inf(f32))[0])); + try expect(math.isNan(sincos32(-math.inf(f32))[1])); + try expect(math.isNan(sincos32(math.nan(f32))[0])); + try expect(math.isNan(sincos32(-math.nan(f32))[1])); + + try expect(math.isNan(sin32(math.inf(f32)))); + try expect(math.isNan(cos32(math.inf(f32)))); + try expect(math.isNan(sin32(-math.inf(f32)))); + try expect(math.isNan(cos32(-math.inf(f32)))); + try expect(math.isNan(sin32(math.nan(f32)))); + try expect(math.isNan(cos32(-math.nan(f32)))); + + var f: f32 = -100.0; + var i: u32 = 0; + while (i < 100) : (i += 1) { + const sc = sincos32(f); + const s0 = sin32(f); + const c0 = cos32(f); + const s = @sin(f); + const c = @cos(f); + try expect(math.approxEqAbs(f32, sc[0], s, epsilon)); + try expect(math.approxEqAbs(f32, sc[1], c, epsilon)); + try expect(math.approxEqAbs(f32, s0, s, epsilon)); + try expect(math.approxEqAbs(f32, c0, c, epsilon)); + f += 0.12345 * @as(f32, @floatFromInt(i)); + } +} + +fn asin32(v: f32) f32 { + const x = @abs(v); + var omx = 1.0 - x; + if (omx < 0.0) { + omx = 0.0; + } + const root = @sqrt(omx); + + // 7-degree minimax approximation + var result = mulAdd(@as(f32, -0.0012624911), x, 0.0066700901); + result = mulAdd(result, x, -0.0170881256); + result = mulAdd(result, x, 0.0308918810); + result = mulAdd(result, x, -0.0501743046); + result = mulAdd(result, x, 0.0889789874); + result = mulAdd(result, x, -0.2145988016); + result = root * mulAdd(result, x, 1.5707963050); + + return if (v >= 0.0) 0.5 * math.pi - result else result - 0.5 * math.pi; +} +test "zmath.asin32" { + const epsilon = 0.0001; + + try expect(math.approxEqAbs(f32, asin(@as(f32, -1.1)), -0.5 * math.pi, epsilon)); + try expect(math.approxEqAbs(f32, asin(@as(f32, 1.1)), 0.5 * math.pi, epsilon)); + try expect(math.approxEqAbs(f32, asin(@as(f32, -1000.1)), -0.5 * math.pi, epsilon)); + try expect(math.approxEqAbs(f32, asin(@as(f32, 100000.1)), 0.5 * math.pi, epsilon)); + try expect(math.isNan(asin(math.inf(f32)))); + try expect(math.isNan(asin(-math.inf(f32)))); + try expect(math.isNan(asin(math.nan(f32)))); + try expect(math.isNan(asin(-math.nan(f32)))); + + try expectVecApproxEqAbs(asin(splat(F32x8, -100.0)), splat(F32x8, -0.5 * math.pi), epsilon); + try expectVecApproxEqAbs(asin(splat(F32x16, 100.0)), splat(F32x16, 0.5 * math.pi), epsilon); + try expect(all(isNan(asin(splat(F32x4, math.inf(f32)))), 0) == true); + try expect(all(isNan(asin(splat(F32x4, -math.inf(f32)))), 0) == true); + try expect(all(isNan(asin(splat(F32x4, math.nan(f32)))), 0) == true); + try expect(all(isNan(asin(splat(F32x4, math.snan(f32)))), 0) == true); + + var f: f32 = -1.0; + var i: u32 = 0; + while (i < 8) : (i += 1) { + const r0 = asin32(f); + const r1 = math.asin(f); + const r4 = asin(splat(F32x4, f)); + const r8 = asin(splat(F32x8, f)); + const r16 = asin(splat(F32x16, f)); + try expect(math.approxEqAbs(f32, r0, r1, epsilon)); + try expectVecApproxEqAbs(r4, splat(F32x4, r1), epsilon); + try expectVecApproxEqAbs(r8, splat(F32x8, r1), epsilon); + try expectVecApproxEqAbs(r16, splat(F32x16, r1), epsilon); + f += 0.09 * @as(f32, @floatFromInt(i)); + } +} + +fn acos32(v: f32) f32 { + const x = @abs(v); + var omx = 1.0 - x; + if (omx < 0.0) { + omx = 0.0; + } + const root = @sqrt(omx); + + // 7-degree minimax approximation + var result = mulAdd(@as(f32, -0.0012624911), x, 0.0066700901); + result = mulAdd(result, x, -0.0170881256); + result = mulAdd(result, x, 0.0308918810); + result = mulAdd(result, x, -0.0501743046); + result = mulAdd(result, x, 0.0889789874); + result = mulAdd(result, x, -0.2145988016); + result = root * mulAdd(result, x, 1.5707963050); + + return if (v >= 0.0) result else math.pi - result; +} +test "zmath.acos32" { + const epsilon = 0.1; + + try expect(math.approxEqAbs(f32, acos(@as(f32, -1.1)), math.pi, epsilon)); + try expect(math.approxEqAbs(f32, acos(@as(f32, -10000.1)), math.pi, epsilon)); + try expect(math.approxEqAbs(f32, acos(@as(f32, 1.1)), 0.0, epsilon)); + try expect(math.approxEqAbs(f32, acos(@as(f32, 1000.1)), 0.0, epsilon)); + try expect(math.isNan(acos(math.inf(f32)))); + try expect(math.isNan(acos(-math.inf(f32)))); + try expect(math.isNan(acos(math.nan(f32)))); + try expect(math.isNan(acos(-math.nan(f32)))); + + try expectVecApproxEqAbs(acos(splat(F32x8, -100.0)), splat(F32x8, math.pi), epsilon); + try expectVecApproxEqAbs(acos(splat(F32x16, 100.0)), splat(F32x16, 0.0), epsilon); + try expect(all(isNan(acos(splat(F32x4, math.inf(f32)))), 0) == true); + try expect(all(isNan(acos(splat(F32x4, -math.inf(f32)))), 0) == true); + try expect(all(isNan(acos(splat(F32x4, math.nan(f32)))), 0) == true); + try expect(all(isNan(acos(splat(F32x4, math.snan(f32)))), 0) == true); + + var f: f32 = -1.0; + var i: u32 = 0; + while (i < 8) : (i += 1) { + const r0 = acos32(f); + const r1 = math.acos(f); + const r4 = acos(splat(F32x4, f)); + const r8 = acos(splat(F32x8, f)); + const r16 = acos(splat(F32x16, f)); + try expect(math.approxEqAbs(f32, r0, r1, epsilon)); + try expectVecApproxEqAbs(r4, splat(F32x4, r1), epsilon); + try expectVecApproxEqAbs(r8, splat(F32x8, r1), epsilon); + try expectVecApproxEqAbs(r16, splat(F32x16, r1), epsilon); + f += 0.09 * @as(f32, @floatFromInt(i)); + } +} + +pub fn modAngle32(in_angle: f32) f32 { + const angle = in_angle + math.pi; + var temp: f32 = @abs(angle); + temp = temp - (2.0 * math.pi * @as(f32, @floatFromInt(@as(i32, @intFromFloat(temp / math.pi))))); + temp = temp - math.pi; + if (angle < 0.0) { + temp = -temp; + } + return temp; +} + +pub fn cmulSoa(re0: anytype, im0: anytype, re1: anytype, im1: anytype) [2]@TypeOf(re0, im0, re1, im1) { + const re0_re1 = re0 * re1; + const re0_im1 = re0 * im1; + return .{ + mulAdd(-im0, im1, re0_re1), // re + mulAdd(re1, im0, re0_im1), // im + }; +} +// ------------------------------------------------------------------------------ +// +// FFT (implementation based on xdsp.h from DirectXMath) +// +// ------------------------------------------------------------------------------ +fn fftButterflyDit4_1(re0: *F32x4, im0: *F32x4) void { + const re0l = swizzle(re0.*, .x, .x, .y, .y); + const re0h = swizzle(re0.*, .z, .z, .w, .w); + + const im0l = swizzle(im0.*, .x, .x, .y, .y); + const im0h = swizzle(im0.*, .z, .z, .w, .w); + + const re_temp = mulAdd(re0h, f32x4(1.0, -1.0, 1.0, -1.0), re0l); + const im_temp = mulAdd(im0h, f32x4(1.0, -1.0, 1.0, -1.0), im0l); + + const re_shuf0 = @shuffle(f32, re_temp, im_temp, [4]i32{ 2, 3, ~@as(i32, 2), ~@as(i32, 3) }); + const re_shuf = swizzle(re_shuf0, .x, .w, .x, .w); + const im_shuf = swizzle(re_shuf0, .z, .y, .z, .y); + + const re_templ = swizzle(re_temp, .x, .y, .x, .y); + const im_templ = swizzle(im_temp, .x, .y, .x, .y); + + re0.* = mulAdd(re_shuf, f32x4(1.0, 1.0, -1.0, -1.0), re_templ); + im0.* = mulAdd(im_shuf, f32x4(1.0, -1.0, -1.0, 1.0), im_templ); +} + +fn fftButterflyDit4_4( + re0: *F32x4, + re1: *F32x4, + re2: *F32x4, + re3: *F32x4, + im0: *F32x4, + im1: *F32x4, + im2: *F32x4, + im3: *F32x4, + unity_table_re: []const F32x4, + unity_table_im: []const F32x4, + stride: u32, + last: bool, +) void { + const re_temp0 = re0.* + re2.*; + const im_temp0 = im0.* + im2.*; + + const re_temp2 = re1.* + re3.*; + const im_temp2 = im1.* + im3.*; + + const re_temp1 = re0.* - re2.*; + const im_temp1 = im0.* - im2.*; + + const re_temp3 = re1.* - re3.*; + const im_temp3 = im1.* - im3.*; + + var re_temp4 = re_temp0 + re_temp2; + var im_temp4 = im_temp0 + im_temp2; + + var re_temp5 = re_temp1 + im_temp3; + var im_temp5 = im_temp1 - re_temp3; + + var re_temp6 = re_temp0 - re_temp2; + var im_temp6 = im_temp0 - im_temp2; + + var re_temp7 = re_temp1 - im_temp3; + var im_temp7 = im_temp1 + re_temp3; + + { + const re_im = cmulSoa(re_temp5, im_temp5, unity_table_re[stride], unity_table_im[stride]); + re_temp5 = re_im[0]; + im_temp5 = re_im[1]; + } + { + const re_im = cmulSoa(re_temp6, im_temp6, unity_table_re[stride * 2], unity_table_im[stride * 2]); + re_temp6 = re_im[0]; + im_temp6 = re_im[1]; + } + { + const re_im = cmulSoa(re_temp7, im_temp7, unity_table_re[stride * 3], unity_table_im[stride * 3]); + re_temp7 = re_im[0]; + im_temp7 = re_im[1]; + } + + if (last) { + fftButterflyDit4_1(&re_temp4, &im_temp4); + fftButterflyDit4_1(&re_temp5, &im_temp5); + fftButterflyDit4_1(&re_temp6, &im_temp6); + fftButterflyDit4_1(&re_temp7, &im_temp7); + } + + re0.* = re_temp4; + im0.* = im_temp4; + + re1.* = re_temp5; + im1.* = im_temp5; + + re2.* = re_temp6; + im2.* = im_temp6; + + re3.* = re_temp7; + im3.* = im_temp7; +} + +fn fft4(re: []F32x4, im: []F32x4, count: u32) void { + assert(std.math.isPowerOfTwo(count)); + assert(re.len >= count); + assert(im.len >= count); + + var index: u32 = 0; + while (index < count) : (index += 1) { + fftButterflyDit4_1(&re[index], &im[index]); + } +} +test "zmath.fft4" { + const epsilon = 0.0001; + var re = [_]F32x4{f32x4(1.0, 2.0, 3.0, 4.0)}; + var im = [_]F32x4{f32x4s(0.0)}; + fft4(re[0..], im[0..], 1); + + var re_uns: [1]F32x4 = undefined; + var im_uns: [1]F32x4 = undefined; + fftUnswizzle(re[0..], re_uns[0..]); + fftUnswizzle(im[0..], im_uns[0..]); + + try expectVecApproxEqAbs(re_uns[0], f32x4(10.0, -2.0, -2.0, -2.0), epsilon); + try expectVecApproxEqAbs(im_uns[0], f32x4(0.0, 2.0, 0.0, -2.0), epsilon); +} + +fn fft8(re: []F32x4, im: []F32x4, count: u32) void { + assert(std.math.isPowerOfTwo(count)); + assert(re.len >= 2 * count); + assert(im.len >= 2 * count); + + var index: u32 = 0; + while (index < count) : (index += 1) { + var pre = re[index * 2 ..]; + var pim = im[index * 2 ..]; + + var odds_re = @shuffle(f32, pre[0], pre[1], [4]i32{ 1, 3, ~@as(i32, 1), ~@as(i32, 3) }); + var evens_re = @shuffle(f32, pre[0], pre[1], [4]i32{ 0, 2, ~@as(i32, 0), ~@as(i32, 2) }); + var odds_im = @shuffle(f32, pim[0], pim[1], [4]i32{ 1, 3, ~@as(i32, 1), ~@as(i32, 3) }); + var evens_im = @shuffle(f32, pim[0], pim[1], [4]i32{ 0, 2, ~@as(i32, 0), ~@as(i32, 2) }); + fftButterflyDit4_1(&odds_re, &odds_im); + fftButterflyDit4_1(&evens_re, &evens_im); + + { + const re_im = cmulSoa( + odds_re, + odds_im, + f32x4(1.0, 0.70710677, 0.0, -0.70710677), + f32x4(0.0, -0.70710677, -1.0, -0.70710677), + ); + pre[0] = evens_re + re_im[0]; + pim[0] = evens_im + re_im[1]; + } + { + const re_im = cmulSoa( + odds_re, + odds_im, + f32x4(-1.0, -0.70710677, 0.0, 0.70710677), + f32x4(0.0, 0.70710677, 1.0, 0.70710677), + ); + pre[1] = evens_re + re_im[0]; + pim[1] = evens_im + re_im[1]; + } + } +} +test "zmath.fft8" { + const epsilon = 0.0001; + var re = [_]F32x4{ f32x4(1.0, 2.0, 3.0, 4.0), f32x4(5.0, 6.0, 7.0, 8.0) }; + var im = [_]F32x4{ f32x4s(0.0), f32x4s(0.0) }; + fft8(re[0..], im[0..], 1); + + var re_uns: [2]F32x4 = undefined; + var im_uns: [2]F32x4 = undefined; + fftUnswizzle(re[0..], re_uns[0..]); + fftUnswizzle(im[0..], im_uns[0..]); + + try expectVecApproxEqAbs(re_uns[0], f32x4(36.0, -4.0, -4.0, -4.0), epsilon); + try expectVecApproxEqAbs(re_uns[1], f32x4(-4.0, -4.0, -4.0, -4.0), epsilon); + try expectVecApproxEqAbs(im_uns[0], f32x4(0.0, 9.656854, 4.0, 1.656854), epsilon); + try expectVecApproxEqAbs(im_uns[1], f32x4(0.0, -1.656854, -4.0, -9.656854), epsilon); +} + +fn fft16(re: []F32x4, im: []F32x4, count: u32) void { + assert(std.math.isPowerOfTwo(count)); + assert(re.len >= 4 * count); + assert(im.len >= 4 * count); + + const static = struct { + const unity_table_re = [4]F32x4{ + f32x4(1.0, 1.0, 1.0, 1.0), + f32x4(1.0, 0.92387950, 0.70710677, 0.38268343), + f32x4(1.0, 0.70710677, -4.3711388e-008, -0.70710677), + f32x4(1.0, 0.38268343, -0.70710677, -0.92387950), + }; + const unity_table_im = [4]F32x4{ + f32x4(-0.0, -0.0, -0.0, -0.0), + f32x4(-0.0, -0.38268343, -0.70710677, -0.92387950), + f32x4(-0.0, -0.70710677, -1.0, -0.70710677), + f32x4(-0.0, -0.92387950, -0.70710677, 0.38268343), + }; + }; + + var index: u32 = 0; + while (index < count) : (index += 1) { + fftButterflyDit4_4( + &re[index * 4], + &re[index * 4 + 1], + &re[index * 4 + 2], + &re[index * 4 + 3], + &im[index * 4], + &im[index * 4 + 1], + &im[index * 4 + 2], + &im[index * 4 + 3], + static.unity_table_re[0..], + static.unity_table_im[0..], + 1, + true, + ); + } +} +test "zmath.fft16" { + const epsilon = 0.0001; + var re = [_]F32x4{ + f32x4(1.0, 2.0, 3.0, 4.0), + f32x4(5.0, 6.0, 7.0, 8.0), + f32x4(9.0, 10.0, 11.0, 12.0), + f32x4(13.0, 14.0, 15.0, 16.0), + }; + var im = [_]F32x4{ f32x4s(0.0), f32x4s(0.0), f32x4s(0.0), f32x4s(0.0) }; + fft16(re[0..], im[0..], 1); + + var re_uns: [4]F32x4 = undefined; + var im_uns: [4]F32x4 = undefined; + fftUnswizzle(re[0..], re_uns[0..]); + fftUnswizzle(im[0..], im_uns[0..]); + + try expectVecApproxEqAbs(re_uns[0], f32x4(136.0, -8.0, -8.0, -8.0), epsilon); + try expectVecApproxEqAbs(re_uns[1], f32x4(-8.0, -8.0, -8.0, -8.0), epsilon); + try expectVecApproxEqAbs(re_uns[2], f32x4(-8.0, -8.0, -8.0, -8.0), epsilon); + try expectVecApproxEqAbs(re_uns[3], f32x4(-8.0, -8.0, -8.0, -8.0), epsilon); + try expectVecApproxEqAbs(im_uns[0], f32x4(0.0, 40.218716, 19.313708, 11.972846), epsilon); + try expectVecApproxEqAbs(im_uns[1], f32x4(8.0, 5.345429, 3.313708, 1.591299), epsilon); + try expectVecApproxEqAbs(im_uns[2], f32x4(0.0, -1.591299, -3.313708, -5.345429), epsilon); + try expectVecApproxEqAbs(im_uns[3], f32x4(-8.0, -11.972846, -19.313708, -40.218716), epsilon); +} + +fn fftN(re: []F32x4, im: []F32x4, unity_table: []const F32x4, length: u32, count: u32) void { + assert(length > 16); + assert(std.math.isPowerOfTwo(length)); + assert(std.math.isPowerOfTwo(count)); + assert(re.len >= length * count / 4); + assert(re.len == im.len); + + const total = count * length; + const total_vectors = total / 4; + const stage_vectors = length / 4; + const stage_vectors_mask = stage_vectors - 1; + const stride = length / 16; + const stride_mask = stride - 1; + const stride_inv_mask = ~stride_mask; + + var unity_table_re = unity_table; + var unity_table_im = unity_table[length / 4 ..]; + + var index: u32 = 0; + while (index < total_vectors / 4) : (index += 1) { + const n = (index & stride_inv_mask) * 4 + (index & stride_mask); + fftButterflyDit4_4( + &re[n], + &re[n + stride], + &re[n + stride * 2], + &re[n + stride * 3], + &im[n], + &im[n + stride], + &im[n + stride * 2], + &im[n + stride * 3], + unity_table_re[(n & stage_vectors_mask)..], + unity_table_im[(n & stage_vectors_mask)..], + stride, + false, + ); + } + + if (length > 16 * 4) { + fftN(re, im, unity_table[(length / 2)..], length / 4, count * 4); + } else if (length == 16 * 4) { + fft16(re, im, count * 4); + } else if (length == 8 * 4) { + fft8(re, im, count * 4); + } else if (length == 4 * 4) { + fft4(re, im, count * 4); + } +} +test "zmath.fftN" { + var unity_table: [128]F32x4 = undefined; + const epsilon = 0.0001; + + // 32 samples + { + var re = [_]F32x4{ + f32x4(1.0, 2.0, 3.0, 4.0), f32x4(5.0, 6.0, 7.0, 8.0), + f32x4(9.0, 10.0, 11.0, 12.0), f32x4(13.0, 14.0, 15.0, 16.0), + f32x4(17.0, 18.0, 19.0, 20.0), f32x4(21.0, 22.0, 23.0, 24.0), + f32x4(25.0, 26.0, 27.0, 28.0), f32x4(29.0, 30.0, 31.0, 32.0), + }; + var im = [_]F32x4{ + f32x4s(0.0), f32x4s(0.0), f32x4s(0.0), f32x4s(0.0), + f32x4s(0.0), f32x4s(0.0), f32x4s(0.0), f32x4s(0.0), + }; + + fftInitUnityTable(unity_table[0..32]); + fft(re[0..], im[0..], unity_table[0..32]); + + try expectVecApproxEqAbs(re[0], f32x4(528.0, -16.0, -16.0, -16.0), epsilon); + try expectVecApproxEqAbs(re[1], f32x4(-16.0, -16.0, -16.0, -16.0), epsilon); + try expectVecApproxEqAbs(re[2], f32x4(-16.0, -16.0, -16.0, -16.0), epsilon); + try expectVecApproxEqAbs(re[3], f32x4(-16.0, -16.0, -16.0, -16.0), epsilon); + try expectVecApproxEqAbs(re[4], f32x4(-16.0, -16.0, -16.0, -16.0), epsilon); + try expectVecApproxEqAbs(re[5], f32x4(-16.0, -16.0, -16.0, -16.0), epsilon); + try expectVecApproxEqAbs(re[6], f32x4(-16.0, -16.0, -16.0, -16.0), epsilon); + try expectVecApproxEqAbs(re[7], f32x4(-16.0, -16.0, -16.0, -16.0), epsilon); + try expectVecApproxEqAbs(im[0], f32x4(0.0, 162.450726, 80.437432, 52.744931), epsilon); + try expectVecApproxEqAbs(im[1], f32x4(38.627417, 29.933895, 23.945692, 19.496056), epsilon); + try expectVecApproxEqAbs(im[2], f32x4(16.0, 13.130861, 10.690858, 8.552178), epsilon); + try expectVecApproxEqAbs(im[3], f32x4(6.627417, 4.853547, 3.182598, 1.575862), epsilon); + try expectVecApproxEqAbs(im[4], f32x4(0.0, -1.575862, -3.182598, -4.853547), epsilon); + try expectVecApproxEqAbs(im[5], f32x4(-6.627417, -8.552178, -10.690858, -13.130861), epsilon); + try expectVecApproxEqAbs(im[6], f32x4(-16.0, -19.496056, -23.945692, -29.933895), epsilon); + try expectVecApproxEqAbs(im[7], f32x4(-38.627417, -52.744931, -80.437432, -162.450726), epsilon); + } + + // 64 samples + { + var re = [_]F32x4{ + f32x4(1.0, 2.0, 3.0, 4.0), f32x4(5.0, 6.0, 7.0, 8.0), + f32x4(9.0, 10.0, 11.0, 12.0), f32x4(13.0, 14.0, 15.0, 16.0), + f32x4(17.0, 18.0, 19.0, 20.0), f32x4(21.0, 22.0, 23.0, 24.0), + f32x4(25.0, 26.0, 27.0, 28.0), f32x4(29.0, 30.0, 31.0, 32.0), + f32x4(1.0, 2.0, 3.0, 4.0), f32x4(5.0, 6.0, 7.0, 8.0), + f32x4(9.0, 10.0, 11.0, 12.0), f32x4(13.0, 14.0, 15.0, 16.0), + f32x4(17.0, 18.0, 19.0, 20.0), f32x4(21.0, 22.0, 23.0, 24.0), + f32x4(25.0, 26.0, 27.0, 28.0), f32x4(29.0, 30.0, 31.0, 32.0), + }; + var im = [_]F32x4{ + f32x4s(0.0), f32x4s(0.0), f32x4s(0.0), f32x4s(0.0), + f32x4s(0.0), f32x4s(0.0), f32x4s(0.0), f32x4s(0.0), + f32x4s(0.0), f32x4s(0.0), f32x4s(0.0), f32x4s(0.0), + f32x4s(0.0), f32x4s(0.0), f32x4s(0.0), f32x4s(0.0), + }; + + fftInitUnityTable(unity_table[0..64]); + fft(re[0..], im[0..], unity_table[0..64]); + + try expectVecApproxEqAbs(re[0], f32x4(1056.0, 0.0, -32.0, 0.0), epsilon); + var i: u32 = 1; + while (i < 16) : (i += 1) { + try expectVecApproxEqAbs(re[i], f32x4(-32.0, 0.0, -32.0, 0.0), epsilon); + } + + const expected = [_]f32{ + 0.0, 0.0, 324.901452, 0.000000, 160.874864, 0.0, 105.489863, 0.000000, + 77.254834, 0.0, 59.867789, 0.0, 47.891384, 0.0, 38.992113, 0.0, + 32.000000, 0.000000, 26.261721, 0.000000, 21.381716, 0.000000, 17.104356, 0.000000, + 13.254834, 0.000000, 9.707094, 0.000000, 6.365196, 0.000000, 3.151725, 0.000000, + 0.000000, 0.000000, -3.151725, 0.000000, -6.365196, 0.000000, -9.707094, 0.000000, + -13.254834, 0.000000, -17.104356, 0.000000, -21.381716, 0.000000, -26.261721, 0.000000, + -32.000000, 0.000000, -38.992113, 0.000000, -47.891384, 0.000000, -59.867789, 0.000000, + -77.254834, 0.000000, -105.489863, 0.000000, -160.874864, 0.000000, -324.901452, 0.000000, + }; + for (expected, 0..) |e, ie| { + try expect(std.math.approxEqAbs(f32, e, im[(ie / 4)][ie % 4], epsilon)); + } + } + + // 128 samples + { + var re = [_]F32x4{ + f32x4(1.0, 2.0, 3.0, 4.0), f32x4(5.0, 6.0, 7.0, 8.0), + f32x4(9.0, 10.0, 11.0, 12.0), f32x4(13.0, 14.0, 15.0, 16.0), + f32x4(17.0, 18.0, 19.0, 20.0), f32x4(21.0, 22.0, 23.0, 24.0), + f32x4(25.0, 26.0, 27.0, 28.0), f32x4(29.0, 30.0, 31.0, 32.0), + f32x4(1.0, 2.0, 3.0, 4.0), f32x4(5.0, 6.0, 7.0, 8.0), + f32x4(9.0, 10.0, 11.0, 12.0), f32x4(13.0, 14.0, 15.0, 16.0), + f32x4(17.0, 18.0, 19.0, 20.0), f32x4(21.0, 22.0, 23.0, 24.0), + f32x4(25.0, 26.0, 27.0, 28.0), f32x4(29.0, 30.0, 31.0, 32.0), + f32x4(1.0, 2.0, 3.0, 4.0), f32x4(5.0, 6.0, 7.0, 8.0), + f32x4(9.0, 10.0, 11.0, 12.0), f32x4(13.0, 14.0, 15.0, 16.0), + f32x4(17.0, 18.0, 19.0, 20.0), f32x4(21.0, 22.0, 23.0, 24.0), + f32x4(25.0, 26.0, 27.0, 28.0), f32x4(29.0, 30.0, 31.0, 32.0), + f32x4(1.0, 2.0, 3.0, 4.0), f32x4(5.0, 6.0, 7.0, 8.0), + f32x4(9.0, 10.0, 11.0, 12.0), f32x4(13.0, 14.0, 15.0, 16.0), + f32x4(17.0, 18.0, 19.0, 20.0), f32x4(21.0, 22.0, 23.0, 24.0), + f32x4(25.0, 26.0, 27.0, 28.0), f32x4(29.0, 30.0, 31.0, 32.0), + }; + var im = [_]F32x4{ + f32x4s(0.0), f32x4s(0.0), f32x4s(0.0), f32x4s(0.0), + f32x4s(0.0), f32x4s(0.0), f32x4s(0.0), f32x4s(0.0), + f32x4s(0.0), f32x4s(0.0), f32x4s(0.0), f32x4s(0.0), + f32x4s(0.0), f32x4s(0.0), f32x4s(0.0), f32x4s(0.0), + f32x4s(0.0), f32x4s(0.0), f32x4s(0.0), f32x4s(0.0), + f32x4s(0.0), f32x4s(0.0), f32x4s(0.0), f32x4s(0.0), + f32x4s(0.0), f32x4s(0.0), f32x4s(0.0), f32x4s(0.0), + f32x4s(0.0), f32x4s(0.0), f32x4s(0.0), f32x4s(0.0), + }; + + fftInitUnityTable(unity_table[0..128]); + fft(re[0..], im[0..], unity_table[0..128]); + + try expectVecApproxEqAbs(re[0], f32x4(2112.0, 0.0, 0.0, 0.0), epsilon); + var i: u32 = 1; + while (i < 32) : (i += 1) { + try expectVecApproxEqAbs(re[i], f32x4(-64.0, 0.0, 0.0, 0.0), epsilon); + } + + const expected = [_]f32{ + 0.000000, 0.000000, 0.000000, 0.000000, 649.802905, 0.000000, 0.000000, 0.000000, + 321.749727, 0.000000, 0.000000, 0.000000, 210.979725, 0.000000, 0.000000, 0.000000, + 154.509668, 0.000000, 0.000000, 0.000000, 119.735578, 0.000000, 0.000000, 0.000000, + 95.782769, 0.000000, 0.000000, 0.000000, 77.984226, 0.000000, 0.000000, 0.000000, + 64.000000, 0.000000, 0.000000, 0.000000, 52.523443, 0.000000, 0.000000, 0.000000, + 42.763433, 0.000000, 0.000000, 0.000000, 34.208713, 0.000000, 0.000000, 0.000000, + 26.509668, 0.000000, 0.000000, 0.000000, 19.414188, 0.000000, 0.000000, 0.000000, + 12.730392, 0.000000, 0.000000, 0.000000, 6.303450, 0.000000, 0.000000, 0.000000, + 0.000000, 0.000000, 0.000000, 0.000000, -6.303450, 0.000000, 0.000000, 0.000000, + -12.730392, 0.000000, 0.000000, 0.000000, -19.414188, 0.000000, 0.000000, 0.000000, + -26.509668, 0.000000, 0.000000, 0.000000, -34.208713, 0.000000, 0.000000, 0.000000, + -42.763433, 0.000000, 0.000000, 0.000000, -52.523443, 0.000000, 0.000000, 0.000000, + -64.000000, 0.000000, 0.000000, 0.000000, -77.984226, 0.000000, 0.000000, 0.000000, + -95.782769, 0.000000, 0.000000, 0.000000, -119.735578, 0.000000, 0.000000, 0.000000, + -154.509668, 0.000000, 0.000000, 0.000000, -210.979725, 0.000000, 0.000000, 0.000000, + -321.749727, 0.000000, 0.000000, 0.000000, -649.802905, 0.000000, 0.000000, 0.000000, + }; + for (expected, 0..) |e, ie| { + try expect(std.math.approxEqAbs(f32, e, im[(ie / 4)][ie % 4], epsilon)); + } + } +} + +fn fftUnswizzle(input: []const F32x4, output: []F32x4) void { + assert(std.math.isPowerOfTwo(input.len)); + assert(input.len == output.len); + assert(input.ptr != output.ptr); + + const log2_length = std.math.log2_int(usize, input.len * 4); + assert(log2_length >= 2); + + const length = input.len; + + const f32_output = @as([*]f32, @ptrCast(output.ptr))[0 .. output.len * 4]; + + const static = struct { + const swizzle_table = [256]u8{ + 0x00, 0x40, 0x80, 0xC0, 0x10, 0x50, 0x90, 0xD0, 0x20, 0x60, 0xA0, 0xE0, 0x30, 0x70, 0xB0, 0xF0, + 0x04, 0x44, 0x84, 0xC4, 0x14, 0x54, 0x94, 0xD4, 0x24, 0x64, 0xA4, 0xE4, 0x34, 0x74, 0xB4, 0xF4, + 0x08, 0x48, 0x88, 0xC8, 0x18, 0x58, 0x98, 0xD8, 0x28, 0x68, 0xA8, 0xE8, 0x38, 0x78, 0xB8, 0xF8, + 0x0C, 0x4C, 0x8C, 0xCC, 0x1C, 0x5C, 0x9C, 0xDC, 0x2C, 0x6C, 0xAC, 0xEC, 0x3C, 0x7C, 0xBC, 0xFC, + 0x01, 0x41, 0x81, 0xC1, 0x11, 0x51, 0x91, 0xD1, 0x21, 0x61, 0xA1, 0xE1, 0x31, 0x71, 0xB1, 0xF1, + 0x05, 0x45, 0x85, 0xC5, 0x15, 0x55, 0x95, 0xD5, 0x25, 0x65, 0xA5, 0xE5, 0x35, 0x75, 0xB5, 0xF5, + 0x09, 0x49, 0x89, 0xC9, 0x19, 0x59, 0x99, 0xD9, 0x29, 0x69, 0xA9, 0xE9, 0x39, 0x79, 0xB9, 0xF9, + 0x0D, 0x4D, 0x8D, 0xCD, 0x1D, 0x5D, 0x9D, 0xDD, 0x2D, 0x6D, 0xAD, 0xED, 0x3D, 0x7D, 0xBD, 0xFD, + 0x02, 0x42, 0x82, 0xC2, 0x12, 0x52, 0x92, 0xD2, 0x22, 0x62, 0xA2, 0xE2, 0x32, 0x72, 0xB2, 0xF2, + 0x06, 0x46, 0x86, 0xC6, 0x16, 0x56, 0x96, 0xD6, 0x26, 0x66, 0xA6, 0xE6, 0x36, 0x76, 0xB6, 0xF6, + 0x0A, 0x4A, 0x8A, 0xCA, 0x1A, 0x5A, 0x9A, 0xDA, 0x2A, 0x6A, 0xAA, 0xEA, 0x3A, 0x7A, 0xBA, 0xFA, + 0x0E, 0x4E, 0x8E, 0xCE, 0x1E, 0x5E, 0x9E, 0xDE, 0x2E, 0x6E, 0xAE, 0xEE, 0x3E, 0x7E, 0xBE, 0xFE, + 0x03, 0x43, 0x83, 0xC3, 0x13, 0x53, 0x93, 0xD3, 0x23, 0x63, 0xA3, 0xE3, 0x33, 0x73, 0xB3, 0xF3, + 0x07, 0x47, 0x87, 0xC7, 0x17, 0x57, 0x97, 0xD7, 0x27, 0x67, 0xA7, 0xE7, 0x37, 0x77, 0xB7, 0xF7, + 0x0B, 0x4B, 0x8B, 0xCB, 0x1B, 0x5B, 0x9B, 0xDB, 0x2B, 0x6B, 0xAB, 0xEB, 0x3B, 0x7B, 0xBB, 0xFB, + 0x0F, 0x4F, 0x8F, 0xCF, 0x1F, 0x5F, 0x9F, 0xDF, 0x2F, 0x6F, 0xAF, 0xEF, 0x3F, 0x7F, 0xBF, 0xFF, + }; + }; + + if ((log2_length & 1) == 0) { + const rev32 = @as(u6, @intCast(32 - log2_length)); + var index: usize = 0; + while (index < length) : (index += 1) { + const n = index * 4; + const addr = + (@as(usize, @intCast(static.swizzle_table[n & 0xff])) << 24) | + (@as(usize, @intCast(static.swizzle_table[(n >> 8) & 0xff])) << 16) | + (@as(usize, @intCast(static.swizzle_table[(n >> 16) & 0xff])) << 8) | + @as(usize, @intCast(static.swizzle_table[(n >> 24) & 0xff])); + f32_output[addr >> rev32] = input[index][0]; + f32_output[(0x40000000 | addr) >> rev32] = input[index][1]; + f32_output[(0x80000000 | addr) >> rev32] = input[index][2]; + f32_output[(0xC0000000 | addr) >> rev32] = input[index][3]; + } + } else { + const rev7 = @as(usize, 1) << @as(u6, @intCast(log2_length - 3)); + const rev32 = @as(u6, @intCast(32 - (log2_length - 3))); + var index: usize = 0; + while (index < length) : (index += 1) { + const n = index / 2; + var addr = + (((@as(usize, @intCast(static.swizzle_table[n & 0xff])) << 24) | + (@as(usize, @intCast(static.swizzle_table[(n >> 8) & 0xff])) << 16) | + (@as(usize, @intCast(static.swizzle_table[(n >> 16) & 0xff])) << 8) | + (@as(usize, @intCast(static.swizzle_table[(n >> 24) & 0xff])))) >> rev32) | + ((index & 1) * rev7 * 4); + f32_output[addr] = input[index][0]; + addr += rev7; + f32_output[addr] = input[index][1]; + addr += rev7; + f32_output[addr] = input[index][2]; + addr += rev7; + f32_output[addr] = input[index][3]; + } + } +} + +pub fn fftInitUnityTable(out_unity_table: []F32x4) void { + assert(std.math.isPowerOfTwo(out_unity_table.len)); + assert(out_unity_table.len >= 32 and out_unity_table.len <= 512); + + var unity_table = out_unity_table; + + const v0123 = f32x4(0.0, 1.0, 2.0, 3.0); + var length = out_unity_table.len / 4; + var vlstep = f32x4s(0.5 * math.pi / @as(f32, @floatFromInt(length))); + + while (true) { + length /= 4; + var vjp = v0123; + + var j: u32 = 0; + while (j < length) : (j += 1) { + unity_table[j] = f32x4s(1.0); + unity_table[j + length * 4] = f32x4s(0.0); + + var vls = vjp * vlstep; + var sin_cos = sincos(vls); + unity_table[j + length] = sin_cos[1]; + unity_table[j + length * 5] = sin_cos[0] * f32x4s(-1.0); + + var vijp = vjp + vjp; + vls = vijp * vlstep; + sin_cos = sincos(vls); + unity_table[j + length * 2] = sin_cos[1]; + unity_table[j + length * 6] = sin_cos[0] * f32x4s(-1.0); + + vijp = vijp + vjp; + vls = vijp * vlstep; + sin_cos = sincos(vls); + unity_table[j + length * 3] = sin_cos[1]; + unity_table[j + length * 7] = sin_cos[0] * f32x4s(-1.0); + + vjp += f32x4s(4.0); + } + vlstep *= f32x4s(4.0); + unity_table = unity_table[8 * length ..]; + + if (length <= 4) + break; + } +} + +pub fn fft(re: []F32x4, im: []F32x4, unity_table: []const F32x4) void { + const length = @as(u32, @intCast(re.len * 4)); + assert(std.math.isPowerOfTwo(length)); + assert(length >= 4 and length <= 512); + assert(re.len == im.len); + + var re_temp_storage: [128]F32x4 = undefined; + var im_temp_storage: [128]F32x4 = undefined; + const re_temp = re_temp_storage[0..re.len]; + const im_temp = im_temp_storage[0..im.len]; + + @memcpy(re_temp, re); + @memcpy(im_temp, im); + + if (length > 16) { + assert(unity_table.len == length); + fftN(re_temp, im_temp, unity_table, length, 1); + } else if (length == 16) { + fft16(re_temp, im_temp, 1); + } else if (length == 8) { + fft8(re_temp, im_temp, 1); + } else if (length == 4) { + fft4(re_temp, im_temp, 1); + } + + fftUnswizzle(re_temp, re); + fftUnswizzle(im_temp, im); +} + +pub fn ifft(re: []F32x4, im: []const F32x4, unity_table: []const F32x4) void { + const length = @as(u32, @intCast(re.len * 4)); + assert(std.math.isPowerOfTwo(length)); + assert(length >= 4 and length <= 512); + assert(re.len == im.len); + + var re_temp_storage: [128]F32x4 = undefined; + var im_temp_storage: [128]F32x4 = undefined; + var re_temp = re_temp_storage[0..re.len]; + var im_temp = im_temp_storage[0..im.len]; + + const rnp = f32x4s(1.0 / @as(f32, @floatFromInt(length))); + const rnm = f32x4s(-1.0 / @as(f32, @floatFromInt(length))); + + for (re, 0..) |_, i| { + re_temp[i] = re[i] * rnp; + im_temp[i] = im[i] * rnm; + } + + if (length > 16) { + assert(unity_table.len == length); + fftN(re_temp, im_temp, unity_table, length, 1); + } else if (length == 16) { + fft16(re_temp, im_temp, 1); + } else if (length == 8) { + fft8(re_temp, im_temp, 1); + } else if (length == 4) { + fft4(re_temp, im_temp, 1); + } + + fftUnswizzle(re_temp, re); +} +test "zmath.ifft" { + var unity_table: [512]F32x4 = undefined; + const epsilon = 0.0001; + + // 64 samples + { + var re = [_]F32x4{ + f32x4(1.0, 2.0, 3.0, 4.0), f32x4(5.0, 6.0, 7.0, 8.0), + f32x4(9.0, 10.0, 11.0, 12.0), f32x4(13.0, 14.0, 15.0, 16.0), + f32x4(17.0, 18.0, 19.0, 20.0), f32x4(21.0, 22.0, 23.0, 24.0), + f32x4(25.0, 26.0, 27.0, 28.0), f32x4(29.0, 30.0, 31.0, 32.0), + f32x4(1.0, 2.0, 3.0, 4.0), f32x4(5.0, 6.0, 7.0, 8.0), + f32x4(9.0, 10.0, 11.0, 12.0), f32x4(13.0, 14.0, 15.0, 16.0), + f32x4(17.0, 18.0, 19.0, 20.0), f32x4(21.0, 22.0, 23.0, 24.0), + f32x4(25.0, 26.0, 27.0, 28.0), f32x4(29.0, 30.0, 31.0, 32.0), + }; + var im = [_]F32x4{ + f32x4s(0.0), f32x4s(0.0), f32x4s(0.0), f32x4s(0.0), + f32x4s(0.0), f32x4s(0.0), f32x4s(0.0), f32x4s(0.0), + f32x4s(0.0), f32x4s(0.0), f32x4s(0.0), f32x4s(0.0), + f32x4s(0.0), f32x4s(0.0), f32x4s(0.0), f32x4s(0.0), + }; + + fftInitUnityTable(unity_table[0..64]); + fft(re[0..], im[0..], unity_table[0..64]); + + try expectVecApproxEqAbs(re[0], f32x4(1056.0, 0.0, -32.0, 0.0), epsilon); + var i: u32 = 1; + while (i < 16) : (i += 1) { + try expectVecApproxEqAbs(re[i], f32x4(-32.0, 0.0, -32.0, 0.0), epsilon); + } + + ifft(re[0..], im[0..], unity_table[0..64]); + + try expectVecApproxEqAbs(re[0], f32x4(1.0, 2.0, 3.0, 4.0), epsilon); + try expectVecApproxEqAbs(re[1], f32x4(5.0, 6.0, 7.0, 8.0), epsilon); + try expectVecApproxEqAbs(re[2], f32x4(9.0, 10.0, 11.0, 12.0), epsilon); + try expectVecApproxEqAbs(re[3], f32x4(13.0, 14.0, 15.0, 16.0), epsilon); + try expectVecApproxEqAbs(re[4], f32x4(17.0, 18.0, 19.0, 20.0), epsilon); + try expectVecApproxEqAbs(re[5], f32x4(21.0, 22.0, 23.0, 24.0), epsilon); + try expectVecApproxEqAbs(re[6], f32x4(25.0, 26.0, 27.0, 28.0), epsilon); + try expectVecApproxEqAbs(re[7], f32x4(29.0, 30.0, 31.0, 32.0), epsilon); + } + + // 512 samples + { + var re: [128]F32x4 = undefined; + var im = [_]F32x4{f32x4s(0.0)} ** 128; + + for (&re, 0..) |*v, i| { + const f = @as(f32, @floatFromInt(i * 4)); + v.* = f32x4(f + 1.0, f + 2.0, f + 3.0, f + 4.0); + } + + fftInitUnityTable(unity_table[0..512]); + fft(re[0..], im[0..], unity_table[0..512]); + + for (re, 0..) |v, i| { + const f = @as(f32, @floatFromInt(i * 4)); + try expect(!approxEqAbs(v, f32x4(f + 1.0, f + 2.0, f + 3.0, f + 4.0), epsilon)); + } + + ifft(re[0..], im[0..], unity_table[0..512]); + + for (re, 0..) |v, i| { + const f = @as(f32, @floatFromInt(i * 4)); + try expectVecApproxEqAbs(v, f32x4(f + 1.0, f + 2.0, f + 3.0, f + 4.0), epsilon); + } + } +} +// ------------------------------------------------------------------------------ +// +// Private functions and constants +// +// ------------------------------------------------------------------------------ +const f32x4_sign_mask1: F32x4 = F32x4{ @as(f32, @bitCast(@as(u32, 0x8000_0000))), 0, 0, 0 }; +const f32x4_mask2: F32x4 = F32x4{ + @as(f32, @bitCast(@as(u32, 0xffff_ffff))), + @as(f32, @bitCast(@as(u32, 0xffff_ffff))), + 0, + 0, +}; +const f32x4_mask3: F32x4 = F32x4{ + @as(f32, @bitCast(@as(u32, 0xffff_ffff))), + @as(f32, @bitCast(@as(u32, 0xffff_ffff))), + @as(f32, @bitCast(@as(u32, 0xffff_ffff))), + 0, +}; + +inline fn splatNegativeZero(comptime T: type) T { + return @splat(@as(f32, @bitCast(@as(u32, 0x8000_0000)))); +} +inline fn splatNoFraction(comptime T: type) T { + return @splat(@as(f32, 8_388_608.0)); +} +inline fn splatAbsMask(comptime T: type) T { + return @splat(@as(f32, @bitCast(@as(u32, 0x7fff_ffff)))); +} + +fn floatToIntAndBack(v: anytype) @TypeOf(v) { + // This routine won't handle nan, inf and numbers greater than 8_388_608.0 (will generate undefined values). + @setRuntimeSafety(false); + + const T = @TypeOf(v); + const len = veclen(T); + + var vi32: [len]i32 = undefined; + comptime var i: u32 = 0; + // vcvttps2dq + inline while (i < len) : (i += 1) { + vi32[i] = @as(i32, @intFromFloat(v[i])); + } + + var vf32: [len]f32 = undefined; + i = 0; + // vcvtdq2ps + inline while (i < len) : (i += 1) { + vf32[i] = @as(f32, @floatFromInt(vi32[i])); + } + + return vf32; +} +test "zmath.floatToIntAndBack" { + { + const v = floatToIntAndBack(f32x4(1.1, 2.9, 3.0, -4.5)); + try expectVecEqual(v, f32x4(1.0, 2.0, 3.0, -4.0)); + } + { + const v = floatToIntAndBack(f32x8(1.1, 2.9, 3.0, -4.5, 2.5, -2.5, 1.1, -100.2)); + try expectVecEqual(v, f32x8(1.0, 2.0, 3.0, -4.0, 2.0, -2.0, 1.0, -100.0)); + } + { + const v = floatToIntAndBack(f32x4(math.inf(f32), 2.9, math.nan(f32), math.snan(f32))); + try expect(v[1] == 2.0); + } +} + +pub fn expectVecEqual(expected: anytype, actual: anytype) !void { + const T = @TypeOf(expected, actual); + inline for (0..veclen(T)) |i| { + try std.testing.expectEqual(expected[i], actual[i]); + } +} + +pub fn expectVecApproxEqAbs(expected: anytype, actual: anytype, eps: f32) !void { + const T = @TypeOf(expected, actual); + inline for (0..veclen(T)) |i| { + try std.testing.expectApproxEqAbs(expected[i], actual[i], eps); + } +} + +pub fn approxEqAbs(v0: anytype, v1: anytype, eps: f32) bool { + const T = @TypeOf(v0, v1); + comptime var i: comptime_int = 0; + inline while (i < veclen(T)) : (i += 1) { + if (!math.approxEqAbs(f32, v0[i], v1[i], eps)) { + return false; + } + } + return true; +} + +// ------------------------------------------------------------------------------ +// This software is available under 2 licenses -- choose whichever you prefer. +// ------------------------------------------------------------------------------ +// ALTERNATIVE A - MIT License +// Copyright (c) 2022 Michal Ziulek and Contributors +// Permission is hereby granted, free of charge, to any person obtaining a copy of +// this software and associated documentation files (the "Software"), to deal in +// the Software without restriction, including without limitation the rights to +// use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +// of the Software, and to permit persons to whom the Software is furnished to do +// so, subject to the following conditions: +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. +// ------------------------------------------------------------------------------ +// ALTERNATIVE B - Public Domain (www.unlicense.org) +// This is free and unencumbered software released into the public domain. +// Anyone is free to copy, modify, publish, use, compile, sell, or distribute this +// software, either in source code form or as a compiled binary, for any purpose, +// commercial or non-commercial, and by any means. +// In jurisdictions that recognize copyright laws, the author or authors of this +// software dedicate any and all copyright interest in the software to the public +// domain. We make this dedication for the benefit of the public at large and to +// the detriment of our heirs and successors. We intend this dedication to be an +// overt act of relinquishment in perpetuity of all present and future rights to +// this software under copyright law. +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN +// ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION +// WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +// ------------------------------------------------------------------------------ diff --git a/src/main.zig b/src/main.zig index 5265ec6..026747e 100644 --- a/src/main.zig +++ b/src/main.zig @@ -1,7 +1,10 @@ const std = @import("std"); const vk = @import("vulkan"); const sdl = @import("sdl2"); +const zm = @import("zmath"); + const VulkanRenderer = @import("vulkan_renderer.zig").VulkanRenderer; +const Vector3 = @import("utilities.zig").Vector3; pub fn main() !void { const window = try initWindow(); @@ -16,6 +19,11 @@ pub fn main() !void { var vulkan_renderer = try VulkanRenderer.init(window, allocator); defer vulkan_renderer.deinit(); + var angle: f32 = 0.0; + var now: u64 = sdl.getPerformanceCounter(); + var last: u64 = 0; + var delta_time: f32 = 0.0; + mainLoop: while (true) { while (sdl.pollEvent()) |ev| { switch (ev) { @@ -29,6 +37,18 @@ pub fn main() !void { } } + last = now; + now = sdl.getPerformanceCounter(); + delta_time = @as(f32, @floatFromInt(now - last)) * 1000.0 / @as(f32, @floatFromInt(now)); + + angle += 10.0 * delta_time; + + if (angle > 360.0) { + angle -= 360.0; + } + + try vulkan_renderer.updateModel(zm.rotationZ(angle)); + try vulkan_renderer.draw(); } } diff --git a/src/shaders/shader.vert b/src/shaders/shader.vert index eceb2d8..19219b0 100644 --- a/src/shaders/shader.vert +++ b/src/shaders/shader.vert @@ -3,9 +3,15 @@ layout(location = 0) in vec3 pos; layout(location = 1) in vec3 col; +layout(binding = 0) uniform MVP { + mat4 projection; + mat4 view; + mat4 model; +} mvp; + layout(location = 0) out vec3 fragCol; void main() { - gl_Position = vec4(pos, 1.0); + gl_Position = mvp.projection * mvp.view * mvp.model * vec4(pos, 1.0); fragCol = col; } diff --git a/src/vulkan_renderer.zig b/src/vulkan_renderer.zig index b518e9d..5d41005 100644 --- a/src/vulkan_renderer.zig +++ b/src/vulkan_renderer.zig @@ -3,6 +3,7 @@ const sdl = @import("sdl2"); const vk = @import("vulkan"); const builtin = @import("builtin"); const shaders = @import("shaders"); +const zm = @import("zmath"); const Utilities = @import("utilities.zig"); const QueueFamilyIndices = Utilities.QueueFamilyIndices; @@ -39,6 +40,12 @@ pub const CommandBuffer = vk.CommandBufferProxy(apis); pub const VulkanRenderer = struct { const Self = @This(); + const Mvp = struct { + projection: zm.Mat, + view: zm.Mat, + model: zm.Mat, + }; + allocator: std.mem.Allocator, vkb: BaseDispatch, @@ -50,6 +57,9 @@ pub const VulkanRenderer = struct { // Scene objects meshes: [2]Mesh, + // Scene settings + mvp: Mvp, + // Main instance: Instance, physical_device: vk.PhysicalDevice, @@ -65,6 +75,15 @@ pub const VulkanRenderer = struct { swapchain_framebuffers: []vk.Framebuffer, command_buffers: []CommandBuffer, + // Descriptors + descriptor_set_layout: vk.DescriptorSetLayout, + + descriptor_pool: vk.DescriptorPool, + descriptor_sets: []vk.DescriptorSet, + + uniform_buffer: []vk.Buffer, + uniform_buffer_memory: []vk.DeviceMemory, + // Pipeline graphics_pipeline: vk.Pipeline, pipeline_layout: vk.PipelineLayout, @@ -103,10 +122,28 @@ pub const VulkanRenderer = struct { try self.createLogicalDevice(); try self.createSwapchain(); try self.createRenderPass(); + try self.createDescriptorSetLayout(); try self.createGraphicsPipeline(); try self.createFramebuffers(); try self.createCommandPool(); + const aspect: f32 = @as(f32, @floatFromInt(self.extent.width)) / @as(f32, @floatFromInt(self.extent.height)); + self.mvp.projection = zm.perspectiveFovRh( + std.math.degreesToRadians(45.0), + aspect, + 0.1, + 100.0, + ); + self.mvp.view = zm.lookAtRh( + zm.Vec{ 0.0, 0.0, 2.0, 0.0 }, + zm.Vec{ 0.0, 0.0, 0.0, 0.0 }, + zm.Vec{ 0.0, 1.0, 0.0, 0.0 }, + ); + self.mvp.model = zm.identity(); + + // Invert y scale + self.mvp.projection[1][1] *= -1; + // Create meshes // Vertex Data var mesh_vertices = [_]Vertex{ @@ -154,12 +191,21 @@ pub const VulkanRenderer = struct { self.meshes = [_]Mesh{ first_mesh, second_mesh }; try self.createCommandBuffers(); + + try self.createUniformBuffers(); + try self.createDescriptorPool(); + try self.createDescriptorSets(); + try self.recordCommands(); try self.createSynchronisation(); return self; } + pub fn updateModel(self: *Self, new_model: zm.Mat) !void { + self.mvp.model = new_model; + } + pub fn draw(self: *Self) !void { // Wait for given fence to signal (open) from last draw before continuing _ = try self.device.waitForFences(1, @ptrCast(&self.draw_fences[self.current_frame]), vk.TRUE, std.math.maxInt(u64)); @@ -175,6 +221,8 @@ pub const VulkanRenderer = struct { .null_handle, ); + try self.updateUniformBuffer(image_index_result.image_index); + // -- Submit command buffer to render // Queue submission information const wait_stages = [_]vk.PipelineStageFlags{.{ .color_attachment_output_bit = true }}; @@ -215,6 +263,17 @@ pub const VulkanRenderer = struct { self.instance.destroyDebugUtilsMessengerEXT(self.debug_utils.?, null); } + self.device.destroyDescriptorPool(self.descriptor_pool, null); + self.device.destroyDescriptorSetLayout(self.descriptor_set_layout, null); + + for (self.uniform_buffer, self.uniform_buffer_memory) |buffer, buffer_memory| { + self.device.destroyBuffer(buffer, null); + self.device.freeMemory(buffer_memory, null); + } + self.allocator.free(self.uniform_buffer); + self.allocator.free(self.uniform_buffer_memory); + self.allocator.free(self.descriptor_sets); + for (self.meshes) |mesh| { mesh.destroyBuffers(); } @@ -491,6 +550,26 @@ pub const VulkanRenderer = struct { self.render_pass = try self.device.createRenderPass(&render_pass_create_info, null); } + fn createDescriptorSetLayout(self: *Self) !void { + // MVP binding info + const mvp_layout_binding: vk.DescriptorSetLayoutBinding = .{ + .binding = 0, // Binding point in shader (designated by binding number in shader) + .descriptor_type = .uniform_buffer, // Type of descriptor (unifor, dynamic uniform, image sampler, etc) + .descriptor_count = 1, // Number of descriptors for binding + .stage_flags = .{ .vertex_bit = true }, // Shader stage to bind to + .p_immutable_samplers = null, // For texture: can make smapler data immutable by specifying in layout + }; + + // Create descriptor set layout with given bindings + const layout_create_info: vk.DescriptorSetLayoutCreateInfo = .{ + .binding_count = 1, // Number of binding infos + .p_bindings = @ptrCast(&mvp_layout_binding), // Array of binding infos + }; + + // Create descriptor set layout + self.descriptor_set_layout = try self.device.createDescriptorSetLayout(&layout_create_info, null); + } + fn createGraphicsPipeline(self: *Self) !void { // Create shader modules const vert = try self.device.createShaderModule(&.{ @@ -606,7 +685,7 @@ pub const VulkanRenderer = struct { .polygon_mode = .fill, // How to handle filling points between vertices .line_width = 1.0, // How thick the lines should be when drawn .cull_mode = .{ .back_bit = true }, // Which face of a triangle to cull - .front_face = .clockwise, // Winding to determine which side is front + .front_face = .counter_clockwise, // Winding to determine which side is front .depth_bias_enable = vk.FALSE, // Whether to add depth bias to fragments (good for stopping "shadow acne" in shadow mapping) .depth_bias_constant_factor = 0, .depth_bias_clamp = 0, @@ -647,8 +726,11 @@ pub const VulkanRenderer = struct { .blend_constants = [_]f32{ 0, 0, 0, 0 }, }; - // -- Pipeline layout (TODO: Apply future descriptor set layouts) -- - const pipeline_layout_create_info: vk.PipelineLayoutCreateInfo = .{}; + // -- Pipeline layout -- + const pipeline_layout_create_info: vk.PipelineLayoutCreateInfo = .{ + .set_layout_count = 1, + .p_set_layouts = @ptrCast(&self.descriptor_set_layout), + }; self.pipeline_layout = try self.device.createPipelineLayout(&pipeline_layout_create_info, null); @@ -748,6 +830,101 @@ pub const VulkanRenderer = struct { } } + fn createUniformBuffers(self: *Self) !void { + // Buffer size will be size of all three variables (will offset to access) + const buffer_size: vk.DeviceSize = @sizeOf(@TypeOf(self.mvp)); + + // One uniform buffer for each image (and by extension, command buffer) + self.uniform_buffer = try self.allocator.alloc(vk.Buffer, self.swapchain_images.len); + self.uniform_buffer_memory = try self.allocator.alloc(vk.DeviceMemory, self.swapchain_images.len); + + // Create the uniform buffers + for (0..self.uniform_buffer.len) |i| { + try Utilities.createBuffer( + self.physical_device, + self.instance, + self.device, + buffer_size, + .{ .uniform_buffer_bit = true }, + .{ .host_visible_bit = true, .host_coherent_bit = true }, + &self.uniform_buffer[i], + &self.uniform_buffer_memory[i], + ); + } + } + + fn createDescriptorPool(self: *Self) !void { + // Type of descriptors + how many descriptors (!= descriptor sets) (combined makes the pool size) + const pool_size: vk.DescriptorPoolSize = .{ + .type = .uniform_buffer, + .descriptor_count = @intCast(self.uniform_buffer.len), + }; + + // Data to create descriptor pool + const pool_create_info: vk.DescriptorPoolCreateInfo = .{ + .max_sets = @intCast(self.uniform_buffer.len), // Maximum number of descriptor sets that can be created from pool + .pool_size_count = 1, // Amount of pool sizes being passed + .p_pool_sizes = @ptrCast(&pool_size), // Pool sizes to create pool with + }; + + // Create descriptor pool + self.descriptor_pool = try self.device.createDescriptorPool(&pool_create_info, null); + } + + fn createDescriptorSets(self: *Self) !void { + // One descriptor set for every buffer + self.descriptor_sets = try self.allocator.alloc(vk.DescriptorSet, self.uniform_buffer.len); + + var set_layouts = try self.allocator.alloc(vk.DescriptorSetLayout, self.uniform_buffer.len); + defer self.allocator.free(set_layouts); + for (0..set_layouts.len) |i| { + set_layouts[i] = self.descriptor_set_layout; + } + + // Descriptor set allocation info + const set_alloc_info: vk.DescriptorSetAllocateInfo = .{ + .descriptor_pool = self.descriptor_pool, // Pool to allocate descriptor set from + .descriptor_set_count = @intCast(self.descriptor_sets.len), // Number of sets to allocate + .p_set_layouts = set_layouts.ptr, // Layouts to use to allocate sets (1:1 relationship) + }; + + // Allocate descriptor sets (multiple) + try self.device.allocateDescriptorSets(&set_alloc_info, self.descriptor_sets.ptr); + + // Update all of descriptor set buffer bindings + for (0..self.descriptor_sets.len) |i| { + // Buffer info and data offset info + const mvp_buffer_info: vk.DescriptorBufferInfo = .{ + .buffer = self.uniform_buffer[i], // Bufer to get data from + .offset = 0, // Position of start of data + .range = @sizeOf(@TypeOf(self.mvp)), // Size of data + }; + + // Data about connection between binding and buffer + const mvp_set_write: vk.WriteDescriptorSet = .{ + .dst_set = self.descriptor_sets[i], // Descriptor set to update + .dst_binding = 0, // Binding to update (matches with binding on layout/shader) + .dst_array_element = 0, // Index in array to update + .descriptor_type = .uniform_buffer, // Type of descriptor + .descriptor_count = 1, // Amount to update + .p_buffer_info = @ptrCast(&mvp_buffer_info), // Information about buffer data to bind + .p_image_info = undefined, + .p_texel_buffer_view = undefined, + }; + + // Update the descriptor sets with new buffer/binding info + self.device.updateDescriptorSets(1, @ptrCast(&mvp_set_write), 0, null); + } + } + + fn updateUniformBuffer(self: Self, image_index: u32) !void { + const data = try self.device.mapMemory(self.uniform_buffer_memory[image_index], 0, @sizeOf(Mvp), .{}); + + const mvp_data: *Mvp = @ptrCast(@alignCast(data)); + mvp_data.* = self.mvp; + self.device.unmapMemory(self.uniform_buffer_memory[image_index]); + } + fn recordCommands(self: *Self) !void { // Information about how to begin each command const buffer_begin_info: vk.CommandBufferBeginInfo = .{ @@ -799,6 +976,17 @@ pub const VulkanRenderer = struct { // Bind mesh index buffer, with 0 offset and using the uint32 type command_buffer.bindIndexBuffer(mesh.index_buffer, 0, .uint32); + // Bind descriptor sets + command_buffer.bindDescriptorSets( + .graphics, + self.pipeline_layout, + 0, + 1, + @ptrCast(&self.descriptor_sets[i]), + 0, + null, + ); + // Execute a pipeline command_buffer.drawIndexed(mesh.index_count, 1, 0, 0, 0); }