Improve: Faster split-loads on Haswell
ashvardanian committed Oct 14, 2024
1 parent 3740cf4 commit 3458950
Showing 5 changed files with 42 additions and 32 deletions.
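The change itself is mechanical: every unaligned 16-byte load in the Haswell-targeted kernels switches from `_mm_loadu_si128` to `_mm_lddqu_si128`. Both intrinsics load 128 bits from a possibly unaligned address; `LDDQU` (an SSE3 instruction) is the variant documented as intended for loads that may split across a cache-line boundary, which is what the commit title refers to. A minimal sketch of the resulting load-and-widen pattern, using a hypothetical helper name that is not part of SimSIMD:

```c
#include <immintrin.h> // SSE3, F16C, AVX intrinsics

// Hypothetical helper, not part of SimSIMD: load 8 consecutive IEEE
// half-precision values from a possibly unaligned address and widen
// them to 8 single-precision floats. The only difference from the
// pre-commit code is `_mm_lddqu_si128` in place of `_mm_loadu_si128`.
static inline __m256 load_f16x8_as_f32x8(unsigned short const *ptr) {
    __m128i raw = _mm_lddqu_si128((__m128i const *)ptr); // was: _mm_loadu_si128
    return _mm256_cvtph_ps(raw);                         // F16C: f16 -> f32
}
```

Compiled for a Haswell-class target (for example with `-march=haswell -mf16c`), both variants emit an unaligned vector load; the swap targets the split-load case named in the commit title rather than changing the results.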
8 changes: 6 additions & 2 deletions README.md
@@ -93,7 +93,7 @@ You can learn more about the technical implementation details in the following b
For reference, we use 1536-dimensional vectors, like the embeddings produced by the OpenAI Ada API.
Comparing the serial code throughput produced by GCC 12 to hand-optimized kernels in SimSIMD, we see the following single-core improvements:

- | Type | Apple M2 Pro | Intel Sapphire Rapids | AWS Graviton 4 |
+ | Type | Apple M2 Pro | AMD Genoa | AWS Graviton 4 |
| :----- | ---------------------------------: | ---------------------------------: | ---------------------------------: |
| `f64` | 18.5 → 28.8 GB/s <br/> + 56 % | 21.9 → 41.4 GB/s <br/> + 89 % | 20.7 → 41.3 GB/s <br/> + 99 % |
| `f32` | 9.2 → 29.6 GB/s <br/> + 221 % | 10.9 → 95.8 GB/s <br/> + 779 % | 4.9 → 41.9 GB/s <br/> + 755 % |
@@ -748,12 +748,16 @@ In general there are a few principles that SimSIMD follows:
- Avoid loop unrolling.
- Never allocate memory.
- Never throw exceptions or set `errno`.
- - Detect overflows and report the distance with a "signaling" `NaN`.
- Keep all function arguments the size of the pointer.
- Avoid returning from public interfaces, use out-arguments instead.
- Don't over-optimize for old CPUs and single- and double-precision floating-point numbers.
- Prioritize mixed-precision and integer operations, and new ISA extensions.
+ Possibly, in the future:
+ - Best effort computation silencing `NaN` components in low-precision inputs.
+ - Detect overflows and report the distance with a "signaling" `NaN`.
Last but not least: don't build unless there is a demand for it.
So if you have a specific use-case, please open an issue or a pull request, and ideally, bring in more users with similar needs.
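One of the "possibly, in the future" items above is reporting overflows through a "signaling" `NaN`. Purely as an illustration of what that convention means, and not code from this repository: a single-precision signaling `NaN` has an all-ones exponent, a cleared quiet bit (the top mantissa bit), and at least one lower mantissa bit set.

```c
#include <stdint.h>
#include <string.h>

// Illustrative sketch only: craft a single-precision signaling NaN.
// Exponent is all ones, the quiet bit (mantissa MSB) is zero, and a
// lower mantissa bit is set so the value is a NaN rather than infinity.
static inline float signaling_nan_f32(void) {
    uint32_t const bits = 0x7FA00000u;
    float result;
    memcpy(&result, &bits, sizeof(result)); // bit-cast without strict-aliasing issues
    return result;
}
```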
20 changes: 10 additions & 10 deletions include/simsimd/curved.h
@@ -420,8 +420,8 @@ SIMSIMD_PUBLIC void simsimd_bilinear_f16_haswell(simsimd_f16_t const* a, simsimd
__m256 a_vec = _mm256_cvtph_ps(_mm_set1_epi16(*(short const*)(a + i)));
__m256 partial_sum_vec = _mm256_setzero_ps();
for (simsimd_size_t j = 0; j + 8 <= n; j += 8) {
- __m256 b_vec = _mm256_cvtph_ps(_mm_loadu_si128((__m128i const*)(b + j)));
- __m256 c_vec = _mm256_cvtph_ps(_mm_loadu_si128((__m128i const*)(c + i * n + j)));
+ __m256 b_vec = _mm256_cvtph_ps(_mm_lddqu_si128((__m128i const*)(b + j)));
+ __m256 c_vec = _mm256_cvtph_ps(_mm_lddqu_si128((__m128i const*)(c + i * n + j)));
partial_sum_vec = _mm256_fmadd_ps(b_vec, c_vec, partial_sum_vec);
}
sum_vec = _mm256_fmadd_ps(a_vec, partial_sum_vec, sum_vec);
@@ -455,9 +455,9 @@ SIMSIMD_PUBLIC void simsimd_mahalanobis_f16_haswell(simsimd_f16_t const* a, sims
__m256 partial_sum_vec = _mm256_setzero_ps();
for (simsimd_size_t j = 0; j + 8 <= n; j += 8) {
__m256 diff_j_vec = _mm256_sub_ps( //
- _mm256_cvtph_ps(_mm_loadu_si128((__m128i const*)(a + j))),
- _mm256_cvtph_ps(_mm_loadu_si128((__m128i const*)(b + j))));
- __m256 c_vec = _mm256_cvtph_ps(_mm_loadu_si128((__m128i const*)(c + i * n + j)));
+ _mm256_cvtph_ps(_mm_lddqu_si128((__m128i const*)(a + j))),
+ _mm256_cvtph_ps(_mm_lddqu_si128((__m128i const*)(b + j))));
+ __m256 c_vec = _mm256_cvtph_ps(_mm_lddqu_si128((__m128i const*)(c + i * n + j)));
partial_sum_vec = _mm256_fmadd_ps(diff_j_vec, c_vec, partial_sum_vec);
}
sum_vec = _mm256_fmadd_ps(diff_i_vec, partial_sum_vec, sum_vec);
@@ -493,8 +493,8 @@ SIMSIMD_PUBLIC void simsimd_bilinear_bf16_haswell(simsimd_bf16_t const* a, simsi
__m256 a_vec = _mm256_set1_ps(simsimd_bf16_to_f32(a + i));
__m256 partial_sum_vec = _mm256_setzero_ps();
for (simsimd_size_t j = 0; j + 8 <= n; j += 8) {
- __m256 b_vec = _simsimd_bf16x8_to_f32x8_haswell(_mm_loadu_si128((__m128i const*)(b + j)));
- __m256 c_vec = _simsimd_bf16x8_to_f32x8_haswell(_mm_loadu_si128((__m128i const*)(c + i * n + j)));
+ __m256 b_vec = _simsimd_bf16x8_to_f32x8_haswell(_mm_lddqu_si128((__m128i const*)(b + j)));
+ __m256 c_vec = _simsimd_bf16x8_to_f32x8_haswell(_mm_lddqu_si128((__m128i const*)(c + i * n + j)));
partial_sum_vec = _mm256_fmadd_ps(b_vec, c_vec, partial_sum_vec);
}
sum_vec = _mm256_fmadd_ps(a_vec, partial_sum_vec, sum_vec);
@@ -530,9 +530,9 @@ SIMSIMD_PUBLIC void simsimd_mahalanobis_bf16_haswell(simsimd_bf16_t const* a, si
__m256 partial_sum_vec = _mm256_setzero_ps();
for (simsimd_size_t j = 0; j + 8 <= n; j += 8) {
__m256 diff_j_vec = _mm256_sub_ps( //
- _simsimd_bf16x8_to_f32x8_haswell(_mm_loadu_si128((__m128i const*)(a + j))), //
- _simsimd_bf16x8_to_f32x8_haswell(_mm_loadu_si128((__m128i const*)(b + j))));
- __m256 c_vec = _simsimd_bf16x8_to_f32x8_haswell(_mm_loadu_si128((__m128i const*)(c + i * n + j)));
+ _simsimd_bf16x8_to_f32x8_haswell(_mm_lddqu_si128((__m128i const*)(a + j))), //
+ _simsimd_bf16x8_to_f32x8_haswell(_mm_lddqu_si128((__m128i const*)(b + j))));
+ __m256 c_vec = _simsimd_bf16x8_to_f32x8_haswell(_mm_lddqu_si128((__m128i const*)(c + i * n + j)));
partial_sum_vec = _mm256_fmadd_ps(diff_j_vec, c_vec, partial_sum_vec);
}
sum_vec = _mm256_fmadd_ps(diff_i_vec, partial_sum_vec, sum_vec);
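For reference, the accumulation pattern visible in the loops above evaluates a bilinear form over the row-major `n × n` matrix `c`, and the Mahalanobis kernels apply the same pattern to the element-wise difference of `a` and `b`; any post-processing of the accumulated sum (for example, a square root) lies outside the lines shown here:

```latex
% Accumulation implemented by the loops above, with c stored row-major:
\sum_{i=0}^{n-1} a_i \Big( \sum_{j=0}^{n-1} c_{i\,n + j}\, b_j \Big) = a^\top C\, b,
\qquad
\sum_{i=0}^{n-1} (a_i - b_i) \Big( \sum_{j=0}^{n-1} c_{i\,n + j}\, (a_j - b_j) \Big) = (a - b)^\top C\, (a - b).
```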
22 changes: 14 additions & 8 deletions include/simsimd/dot.h
@@ -997,10 +997,16 @@ SIMSIMD_PUBLIC void simsimd_dot_f16_haswell(simsimd_f16_t const* a, simsimd_f16_
b_vec = _simsimd_partial_load_f16x8_haswell(b, n);
n = 0;
} else {
- a_vec = _mm256_cvtph_ps(_mm_loadu_si128((__m128i const*)a));
- b_vec = _mm256_cvtph_ps(_mm_loadu_si128((__m128i const*)b));
+ a_vec = _mm256_cvtph_ps(_mm_lddqu_si128((__m128i const*)a));
+ b_vec = _mm256_cvtph_ps(_mm_lddqu_si128((__m128i const*)b));
n -= 8, a += 8, b += 8;
}
+ // We can silence the NaNs using blends:
+ //
+ // __m256 a_is_nan = _mm256_cmp_ps(a_vec, a_vec, _CMP_UNORD_Q);
+ // __m256 b_is_nan = _mm256_cmp_ps(b_vec, b_vec, _CMP_UNORD_Q);
+ // ab_vec = _mm256_blendv_ps(_mm256_fmadd_ps(a_vec, b_vec, ab_vec), ab_vec, _mm256_or_ps(a_is_nan, b_is_nan));
+ //
ab_vec = _mm256_fmadd_ps(a_vec, b_vec, ab_vec);
if (n)
goto simsimd_dot_f16_haswell_cycle;
@@ -1032,8 +1038,8 @@ SIMSIMD_PUBLIC void simsimd_dot_f16c_haswell(simsimd_f16_t const* a, simsimd_f16
);

while (n >= 8) {
- __m256 a_vec = _mm256_cvtph_ps(_mm_loadu_si128((__m128i const*)a));
- __m256 b_vec = _mm256_cvtph_ps(_mm_loadu_si128((__m128i const*)b));
+ __m256 a_vec = _mm256_cvtph_ps(_mm_lddqu_si128((__m128i const*)a));
+ __m256 b_vec = _mm256_cvtph_ps(_mm_lddqu_si128((__m128i const*)b));
__m256 b_swapped_vec = _mm256_castsi256_ps(_mm256_shuffle_epi8(_mm256_castps_si256(b_vec), swap_adjacent_vec));
ab_real_vec = _mm256_fmadd_ps(a_vec, b_vec, ab_real_vec);
ab_imag_vec = _mm256_fmadd_ps(a_vec, b_swapped_vec, ab_imag_vec);
@@ -1068,8 +1074,8 @@ SIMSIMD_PUBLIC void simsimd_vdot_f16c_haswell(simsimd_f16_t const* a, simsimd_f1
);

while (n >= 8) {
- __m256 a_vec = _mm256_cvtph_ps(_mm_loadu_si128((__m128i const*)a));
- __m256 b_vec = _mm256_cvtph_ps(_mm_loadu_si128((__m128i const*)b));
+ __m256 a_vec = _mm256_cvtph_ps(_mm_lddqu_si128((__m128i const*)a));
+ __m256 b_vec = _mm256_cvtph_ps(_mm_lddqu_si128((__m128i const*)b));
ab_real_vec = _mm256_fmadd_ps(a_vec, b_vec, ab_real_vec);
b_vec = _mm256_castsi256_ps(_mm256_shuffle_epi8(_mm256_castps_si256(b_vec), swap_adjacent_vec));
ab_imag_vec = _mm256_fmadd_ps(a_vec, b_vec, ab_imag_vec);
@@ -1161,8 +1167,8 @@ SIMSIMD_PUBLIC void simsimd_dot_bf16_haswell(simsimd_bf16_t const* a, simsimd_bf
b_vec = _simsimd_partial_load_bf16x8_haswell(b, n);
n = 0;
} else {
- a_vec = _mm_loadu_si128((__m128i const*)a);
- b_vec = _mm_loadu_si128((__m128i const*)b);
+ a_vec = _mm_lddqu_si128((__m128i const*)a);
+ b_vec = _mm_lddqu_si128((__m128i const*)b);
a += 8, b += 8, n -= 8;
}
ab_vec = _mm256_fmadd_ps(_simsimd_bf16x8_to_f32x8_haswell(a_vec), _simsimd_bf16x8_to_f32x8_haswell(b_vec), ab_vec);
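The comment added to `simsimd_dot_f16_haswell` above sketches how `NaN` lanes could be silenced with unordered compares and a blend. A standalone, compilable version of the same idea follows; it mirrors the commented-out snippet and is not what the shipped kernel executes:

```c
#include <immintrin.h>

// Sketch of the commented-out NaN-silencing idea: lanes where either input
// is NaN keep the previous accumulator value instead of propagating NaN.
static inline __m256 fmadd_silencing_nans(__m256 a_vec, __m256 b_vec, __m256 ab_vec) {
    __m256 a_is_nan = _mm256_cmp_ps(a_vec, a_vec, _CMP_UNORD_Q); // NaN compares unordered to itself
    __m256 b_is_nan = _mm256_cmp_ps(b_vec, b_vec, _CMP_UNORD_Q);
    __m256 any_nan = _mm256_or_ps(a_is_nan, b_is_nan);
    __m256 updated = _mm256_fmadd_ps(a_vec, b_vec, ab_vec);
    // Where any_nan is set, keep the old accumulator; otherwise take the FMA result.
    return _mm256_blendv_ps(updated, ab_vec, any_nan);
}
```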
8 changes: 4 additions & 4 deletions include/simsimd/probability.h
@@ -351,8 +351,8 @@ SIMSIMD_PUBLIC void simsimd_kl_f16_haswell(simsimd_f16_t const* a, simsimd_f16_t
b_vec = _simsimd_partial_load_f16x8_haswell(b, n);
n = 0;
} else {
- a_vec = _mm256_cvtph_ps(_mm_loadu_si128((__m128i const*)a));
- b_vec = _mm256_cvtph_ps(_mm_loadu_si128((__m128i const*)b));
+ a_vec = _mm256_cvtph_ps(_mm_lddqu_si128((__m128i const*)a));
+ b_vec = _mm256_cvtph_ps(_mm_lddqu_si128((__m128i const*)b));
n -= 8, a += 8, b += 8;
}
a_vec = _mm256_add_ps(a_vec, epsilon_vec);
@@ -383,8 +383,8 @@ SIMSIMD_PUBLIC void simsimd_js_f16_haswell(simsimd_f16_t const* a, simsimd_f16_t
b_vec = _simsimd_partial_load_f16x8_haswell(b, n);
n = 0;
} else {
- a_vec = _mm256_cvtph_ps(_mm_loadu_si128((__m128i const*)a));
- b_vec = _mm256_cvtph_ps(_mm_loadu_si128((__m128i const*)b));
+ a_vec = _mm256_cvtph_ps(_mm_lddqu_si128((__m128i const*)a));
+ b_vec = _mm256_cvtph_ps(_mm_lddqu_si128((__m128i const*)b));
n -= 8, a += 8, b += 8;
}
a_vec = _mm256_add_ps(a_vec, epsilon_vec);
16 changes: 8 additions & 8 deletions include/simsimd/spatial.h
@@ -1037,8 +1037,8 @@ SIMSIMD_PUBLIC void simsimd_l2sq_f16_haswell(simsimd_f16_t const* a, simsimd_f16
b_vec = _simsimd_partial_load_f16x8_haswell(b, n);
n = 0;
} else {
- a_vec = _mm256_cvtph_ps(_mm_loadu_si128((__m128i const*)a));
- b_vec = _mm256_cvtph_ps(_mm_loadu_si128((__m128i const*)b));
+ a_vec = _mm256_cvtph_ps(_mm_lddqu_si128((__m128i const*)a));
+ b_vec = _mm256_cvtph_ps(_mm_lddqu_si128((__m128i const*)b));
n -= 8, a += 8, b += 8;
}
__m256 d_vec = _mm256_sub_ps(a_vec, b_vec);
@@ -1060,8 +1060,8 @@ SIMSIMD_PUBLIC void simsimd_cos_f16_haswell(simsimd_f16_t const* a, simsimd_f16_
b_vec = _simsimd_partial_load_f16x8_haswell(b, n);
n = 0;
} else {
- a_vec = _mm256_cvtph_ps(_mm_loadu_si128((__m128i const*)a));
- b_vec = _mm256_cvtph_ps(_mm_loadu_si128((__m128i const*)b));
+ a_vec = _mm256_cvtph_ps(_mm_lddqu_si128((__m128i const*)a));
+ b_vec = _mm256_cvtph_ps(_mm_lddqu_si128((__m128i const*)b));
n -= 8, a += 8, b += 8;
}
ab_vec = _mm256_fmadd_ps(a_vec, b_vec, ab_vec);
@@ -1092,8 +1092,8 @@ SIMSIMD_PUBLIC void simsimd_l2sq_bf16_haswell(simsimd_bf16_t const* a, simsimd_b
b_vec = _simsimd_bf16x8_to_f32x8_haswell(_simsimd_partial_load_bf16x8_haswell(b, n));
n = 0;
} else {
- a_vec = _simsimd_bf16x8_to_f32x8_haswell(_mm_loadu_si128((__m128i const*)a));
- b_vec = _simsimd_bf16x8_to_f32x8_haswell(_mm_loadu_si128((__m128i const*)b));
+ a_vec = _simsimd_bf16x8_to_f32x8_haswell(_mm_lddqu_si128((__m128i const*)a));
+ b_vec = _simsimd_bf16x8_to_f32x8_haswell(_mm_lddqu_si128((__m128i const*)b));
n -= 8, a += 8, b += 8;
}
__m256 d_vec = _mm256_sub_ps(a_vec, b_vec);
@@ -1115,8 +1115,8 @@ SIMSIMD_PUBLIC void simsimd_cos_bf16_haswell(simsimd_bf16_t const* a, simsimd_bf
b_vec = _simsimd_bf16x8_to_f32x8_haswell(_simsimd_partial_load_bf16x8_haswell(b, n));
n = 0;
} else {
- a_vec = _simsimd_bf16x8_to_f32x8_haswell(_mm_loadu_si128((__m128i const*)a));
- b_vec = _simsimd_bf16x8_to_f32x8_haswell(_mm_loadu_si128((__m128i const*)b));
+ a_vec = _simsimd_bf16x8_to_f32x8_haswell(_mm_lddqu_si128((__m128i const*)a));
+ b_vec = _simsimd_bf16x8_to_f32x8_haswell(_mm_lddqu_si128((__m128i const*)b));
n -= 8, a += 8, b += 8;
}
ab_vec = _mm256_fmadd_ps(a_vec, b_vec, ab_vec);