swar

C++20 SIMD Within A Register library

Stars
19

// Overview / Examples / API / FAQ

SWAR: SIMD Within A Register library

https://en.wikipedia.org/wiki/SWAR

Use cases

  • Performance (branchless)
  • Portable (uses 'normal' registers)

Features

Requirements


Overview

API (https://godbolt.org/z/b4v9aTEYs)

constexpr u8 data[]{1, 2, 3, 5, 5, 6, 7, 8};
constexpr swar<u8> lhs{data}; // copy_from
constexpr swar<u8> rhs{5};    // broadcast (native: u64)

static_assert(8u == lhs.size());
static_assert(sizeof(u64) == sizeof(lhs));

constexpr auto match = lhs == rhs;

static_assert(any_of(match));
static_assert(some_of(match));
static_assert(not all_of(match));
static_assert(not none_of(match));

static_assert(3u == find_first_set(match));
static_assert(4u == find_last_set(match));
static_assert(2u == popcount(match));
static_assert(match[3u] and match[4u]);

static_assert(sizeof(u32)  == sizeof(swar<u8,  4u>));
static_assert(sizeof(u64)  == sizeof(swar<u8,  8u>));
static_assert(sizeof(u32)  == sizeof(swar<u16, 2u>));
static_assert(sizeof(u64)  == sizeof(swar<u16, 4u>));
static_assert(sizeof(u64)  == sizeof(swar<u32, 2u>));
static_assert(sizeof(u128) == sizeof(swar<u32, 4u>));

// and more (see API)...

Performance (https://godbolt.org/z/ManGb8aso)

auto eq(swar<u8> lhs, swar<u8> rhs) {
  return lhs == rhs;
}
eq: // $CXX -O3 -mno-sse -mno-sse2 -mno-sse3 -mno-avx
  movabs  rdx, -9187201950435737472
  xor     rdi, rsi
  movabs  rax, 72340172838076672
  or      rdi, rdx
  sub     rax, rdi
  and     rax, rdx
  ret
auto contains(swar<u8> lhs, u8 value) {
  const auto rhs = swar<u8>{value};
  const auto match = lhs == rhs;
  return any_of(match);
}
contains: // $CXX -O3 -mno-sse -mno-sse2 -mno-sse3 -mno-avx
  movabs  rax, 72340172838076673
  movzx   esi, sil
  movabs  rdx, -9187201950435737472
  imul    rsi, rax
  sub     rax, 1
  xor     rdi, rsi
  or      rdi, rdx
  sub     rax, rdi
  test    rax, rdx
  setne   al
  ret
auto find(swar<u8> lhs, u8 value) {
  const auto rhs = swar<u8>{value};
  const auto match = lhs == rhs;
  return any_of(match) * find_first_set(match);
}
find: // $CXX -O3 -mno-sse -mno-sse2 -mno-sse3 -mno-avx
  movabs  rax, 72340172838076673
  movzx   esi, sil
  movabs  rdx, 72340172838076672
  imul    rsi, rax
  movabs  rax, -9187201950435737472
  xor     rdi, rsi
  or      rdi, rax
  sub     rdx, rdi
  and     rdx, rax
  xor     eax, eax
  rep bsf rax, rdx
  test    rdx, rdx
  mov     edx, 0
  cmove   rax, rdx
  ret

Examples

swar vs simd (https://godbolt.org/z/YsG8evqr8)

template<class T> auto eq(T lhs, T rhs) { return lhs == rhs; }
eq(swar<u8>, swar<u8>): // $CXX -O3 -mno-sse -mno-sse2 -mno-sse3 -mno-avx
  movabs  rdx, -9187201950435737472
  xor     rdi, rsi
  movabs  rax, 72340172838076672
  or      rdi, rdx
  sub     rax, rdi
  and     rax, rdx
  ret

eq(simd<u8>, simd<u8>): // $CXX -O3 -mavx512f
  vpcmpeqb xmm0, xmm0, xmm1
  ret
template<class T> auto contains(T lhs, auto value) {
  const auto rhs = T{value};
  const auto match = lhs == rhs;
  return any_of(match);
}
cointains(swar<u8>, swar<u8>): // $CXX -O3 -mno-sse -mno-sse2 -mno-sse3 -mno-avx
  movabs  rax, 72340172838076673
  movzx   esi, sil
  movabs  rdx, -9187201950435737472
  imul    rsi, rax
  sub     rax, 1
  xor     rdi, rsi
  or      rdi, rdx
  sub     rax, rdi
  test    rax, rdx
  setne   al
  ret

contains(simd<u8>, simd<u8>): // $CXX -O3 -mavx512f
  vmovd        xmm1, edi
  vpbroadcastb xmm1, xmm1
  vpcmpeqb     xmm0, xmm1, xmm0
  vptest       xmm0, xmm0
  setne        al
  ret
template<class T> auto find(T lhs, auto value) {
  const auto rhs = T{value};
  const auto match = lhs == rhs;
  return any_of(match) * find_first_set(match);
}
find(swar<u8>, swar<u8>): // $CXX -O3 -mno-sse -mno-sse2 -mno-sse3 -mno-avx
  movabs  rax, 72340172838076673
  movzx   esi, sil
  movabs  rdx, 72340172838076672
  imul    rsi, rax
  movabs  rax, -9187201950435737472
  xor     rdi, rsi
  or      rdi, rax
  sub     rdx, rdi
  and     rdx, rax
  xor     eax, eax
  rep bsf rax, rdx
  test    rdx, rdx
  mov     edx, 0
  cmove   rax, rdx
  ret

find(simd<u8>, simd<u8>): // $CXX -O3 -mavx512f
  vmovd         xmm1, edi
  vpbroadcastb  xmm1, xmm1
  vpcmpeqb      xmm0, xmm1, xmm0
  vpmovmskb     eax, xmm0
  or            eax, 65536
  rep           bsf ecx, eax
  xor           eax, eax
  vptest        xmm0, xmm0
  cmovne        eax, ecx
  ret

API

template<class T, size_t Width = sizeof(u64) / sizeof(T), class TAbi = abi_t<T, Width>>
  requires ((sizeof(T) * Width) <= sizeof(TAbi))
struct swar {
  using value_type = T;
  using abi_type = TAbi;

  constexpr swar() noexcept = default;
  constexpr swar(const swar&) noexcept = default;
  constexpr swar(swar&&) noexcept = default;
  constexpr explicit swar(const auto value) noexcept;
  constexpr explicit swar(const auto* mem) noexcept;
  constexpr explicit swar(const auto& gen) noexcept;
  [[nodiscard]] constexpr explicit operator abi_type() const noexcept;
  [[nodiscard]] constexpr auto operator[](size_t) const noexcept -> T;
  [[nodiscard]] static constexpr auto size() noexcept -> size_t;
  [[nodiscard]] friend constexpr auto operator==(const swar&, const swar&) noexcept;
};

template<class T, size_t Width = sizeof(u64) / sizeof(T), class TAbi = abi_t<T, Width>>
  requires ((sizeof(T) * Width) <= sizeof(TAbi))
struct swar_mask {
  using value_type = bool; /// predefined
  using abi_type = TAbi;

  constexpr swar_mask() noexcept = default;
  constexpr swar_mask(const swar_mask&) noexcept = default;
  constexpr swar_mask(swar_mask&&) noexcept = default;
  constexpr explicit swar_mask(const abi_type value) noexcept;

  [[nodiscard]] constexpr auto operator[](const size_t index) const noexcept -> bool;
  [[nodiscard]] static constexpr auto size() noexcept -> size_t { return Width; }
};

template<class T, size_t Width, class TAbi>
[[nodiscard]] constexpr auto all_of(const swar_mask<T, Width, TAbi>& s) noexcept -> bool;

template<class T, size_t Width, class TAbi>
[[nodiscard]] constexpr auto any_of(const swar_mask<T, Width, TAbi>& s) noexcept -> bool;

template<class T, size_t Width, class TAbi>
[[nodiscard]] constexpr auto some_of(const swar_mask<T, Width, TAbi>& s) noexcept -> bool;

template<class T, size_t Width, class TAbi>
[[nodiscard]] constexpr auto none_of(const swar_mask<T, Width, TAbi>& s) noexcept -> bool;

template<class T, size_t Width, class TAbi>
[[nodiscard]] constexpr auto find_first_set(const swar_mask<T, Width, TAbi>& s) noexcept;

template<class T, size_t Width, class TAbi>
[[nodiscard]] constexpr auto find_last_set(const swar_mask<T, Width, TAbi>& s) noexcept;

template<class T, size_t Width, class TAbi>
[[nodiscard]] constexpr auto popcount(const swar_mask<T, Width, TAbi>& s) noexcept;

template<class T> inline constexpr bool is_swar_v = /* unspecified */;
template<class T> inline constexpr bool is_swar_mask_v = /* unspecified */;

FAQ

  • How to disable running tests at compile-time?

    When -DNTEST is defined static_asserts tests wont be executed upon include. Note: Use with caution as disabling tests means that there are no gurantees upon include that given compiler/env combination works as expected.

  • How to integrate with CMake.FetchContent?

    include(FetchContent)
    
    FetchContent_Declare(
      qlibs.swar
      GIT_REPOSITORY https://github.com/qlibs/swar
      GIT_TAG v1.0.0
    )
    
    FetchContent_MakeAvailable(qlibs.swar)
    
    target_link_libraries(${PROJECT_NAME} PUBLIC qlibs.swar);
    
  • Acknowledgments

    https://www.intel.com/content/www/us/en/developer/articles/technical/intel-sdm.html, https://wg21.link/P1928