Improve: Faster split-loads on Haswell
ashvardanian committed Oct 14, 2024
1 parent 3740cf4 commit 3458950
Showing 5 changed files with 42 additions and 32 deletions.
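The change itself is mechanical: every unaligned 16-byte load in the Haswell-targeted kernels switches from `_mm_loadu_si128` to `_mm_lddqu_si128`. Both intrinsics load 128 bits from a possibly unaligned address; `LDDQU` (an SSE3 instruction) is the variant documented as intended for loads that may split across a cache-line boundary, which is what the commit title refers to. A minimal sketch of the resulting load-and-widen pattern, using a hypothetical helper name that is not part of SimSIMD:

```c
#include <immintrin.h> // SSE3, F16C, AVX intrinsics

// Hypothetical helper, not part of SimSIMD: load 8 consecutive IEEE
// half-precision values from a possibly unaligned address and widen
// them to 8 single-precision floats. The only difference from the
// pre-commit code is `_mm_lddqu_si128` in place of `_mm_loadu_si128`.
static inline __m256 load_f16x8_as_f32x8(unsigned short const *ptr) {
    __m128i raw = _mm_lddqu_si128((__m128i const *)ptr); // was: _mm_loadu_si128
    return _mm256_cvtph_ps(raw);                         // F16C: f16 -> f32
}
```

Compiled for a Haswell-class target (for example with `-march=haswell -mf16c`), both variants emit an unaligned vector load; the swap targets the split-load case named in the commit title rather than changing the results.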
8 changes: 6 additions & 2 deletions README.md
@@ -93,7 +93,7 @@ You can learn more about the technical implementation details in the following b
For reference, we use 1536-dimensional vectors, like the embeddings produced by the OpenAI Ada API.
Comparing the serial code throughput produced by GCC 12 to hand-optimized kernels in SimSIMD, we see the following single-core improvements:

- | Type | Apple M2 Pro | Intel Sapphire Rapids | AWS Graviton 4 |
+ | Type | Apple M2 Pro | AMD Genoa | AWS Graviton 4 |
| :----- | ---------------------------------: | ---------------------------------: | ---------------------------------: |
| `f64` | 18.5 → 28.8 GB/s <br/> + 56 % | 21.9 → 41.4 GB/s <br/> + 89 % | 20.7 → 41.3 GB/s <br/> + 99 % |
| `f32` | 9.2 → 29.6 GB/s <br/> + 221 % | 10.9 → 95.8 GB/s <br/> + 779 % | 4.9 → 41.9 GB/s <br/> + 755 % |
@@ -748,12 +748,16 @@ In general there are a few principles that SimSIMD follows:
- Avoid loop unrolling.
- Never allocate memory.
- Never throw exceptions or set `errno`.
- - Detect overflows and report the distance with a "signaling" `NaN`.
- Keep all function arguments the size of the pointer.
- Avoid returning from public interfaces, use out-arguments instead.
- Don't over-optimize for old CPUs and single- and double-precision floating-point numbers.
- Prioritize mixed-precision and integer operations, and new ISA extensions.
+ Possibly, in the future:
+ - Best effort computation silencing `NaN` components in low-precision inputs.
+ - Detect overflows and report the distance with a "signaling" `NaN`.
Last but not least: don't build unless there is a demand for it.
So if you have a specific use-case, please open an issue or a pull request, and ideally, bring in more users with similar needs.
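One of the "possibly, in the future" items above is reporting overflows through a "signaling" `NaN`. Purely as an illustration of what that convention means, and not code from this repository: a single-precision signaling `NaN` has an all-ones exponent, a cleared quiet bit (the top mantissa bit), and at least one lower mantissa bit set.

```c
#include <stdint.h>
#include <string.h>

// Illustrative sketch only: craft a single-precision signaling NaN.
// Exponent is all ones, the quiet bit (mantissa MSB) is zero, and a
// lower mantissa bit is set so the value is a NaN rather than infinity.
static inline float signaling_nan_f32(void) {
    uint32_t const bits = 0x7FA00000u;
    float result;
    memcpy(&result, &bits, sizeof(result)); // bit-cast without strict-aliasing issues
    return result;
}
```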
20 changes: 10 additions & 10 deletions include/simsimd/curved.h
@@ -420,8 +420,8 @@ SIMSIMD_PUBLIC void simsimd_bilinear_f16_haswell(simsimd_f16_t const* a, simsimd
__m256 a_vec = _mm256_cvtph_ps(_mm_set1_epi16(*(short const*)(a + i)));
__m256 partial_sum_vec = _mm256_setzero_ps();
for (simsimd_size_t j = 0; j + 8 <= n; j += 8) {
- __m256 b_vec = _mm256_cvtph_ps(_mm_loadu_si128((__m128i const*)(b + j)));
- __m256 c_vec = _mm256_cvtph_ps(_mm_loadu_si128((__m128i const*)(c + i * n + j)));
+ __m256 b_vec = _mm256_cvtph_ps(_mm_lddqu_si128((__m128i const*)(b + j)));
+ __m256 c_vec = _mm256_cvtph_ps(_mm_lddqu_si128((__m128i const*)(c + i * n + j)));
partial_sum_vec = _mm256_fmadd_ps(b_vec, c_vec, partial_sum_vec);
}
sum_vec = _mm256_fmadd_ps(a_vec, partial_sum_vec, sum_vec);
@@ -455,9 +455,9 @@ SIMSIMD_PUBLIC void simsimd_mahalanobis_f16_haswell(simsimd_f16_t const* a, sims
__m256 partial_sum_vec = _mm256_setzero_ps();
for (simsimd_size_t j = 0; j + 8 <= n; j += 8) {
__m256 diff_j_vec = _mm256_sub_ps( //
- _mm256_cvtph_ps(_mm_loadu_si128((__m128i const*)(a + j))),
- _mm256_cvtph_ps(_mm_loadu_si128((__m128i const*)(b + j))));
- __m256 c_vec = _mm256_cvtph_ps(_mm_loadu_si128((__m128i const*)(c + i * n + j)));
+ _mm256_cvtph_ps(_mm_lddqu_si128((__m128i const*)(a + j))),
+ _mm256_cvtph_ps(_mm_lddqu_si128((__m128i const*)(b + j))));
+ __m256 c_vec = _mm256_cvtph_ps(_mm_lddqu_si128((__m128i const*)(c + i * n + j)));
partial_sum_vec = _mm256_fmadd_ps(diff_j_vec, c_vec, partial_sum_vec);
}
sum_vec = _mm256_fmadd_ps(diff_i_vec, partial_sum_vec, sum_vec);
@@ -493,8 +493,8 @@ SIMSIMD_PUBLIC void simsimd_bilinear_bf16_haswell(simsimd_bf16_t const* a, simsi
__m256 a_vec = _mm256_set1_ps(simsimd_bf16_to_f32(a + i));
__m256 partial_sum_vec = _mm256_setzero_ps();
for (simsimd_size_t j = 0; j + 8 <= n; j += 8) {
- __m256 b_vec = _simsimd_bf16x8_to_f32x8_haswell(_mm_loadu_si128((__m128i const*)(b + j)));
- __m256 c_vec = _simsimd_bf16x8_to_f32x8_haswell(_mm_loadu_si128((__m128i const*)(c + i * n + j)));
+ __m256 b_vec = _simsimd_bf16x8_to_f32x8_haswell(_mm_lddqu_si128((__m128i const*)(b + j)));
+ __m256 c_vec = _simsimd_bf16x8_to_f32x8_haswell(_mm_lddqu_si128((__m128i const*)(c + i * n + j)));
partial_sum_vec = _mm256_fmadd_ps(b_vec, c_vec, partial_sum_vec);
}
sum_vec = _mm256_fmadd_ps(a_vec, partial_sum_vec, sum_vec);
@@ -530,9 +530,9 @@ SIMSIMD_PUBLIC void simsimd_mahalanobis_bf16_haswell(simsimd_bf16_t const* a, si
__m256 partial_sum_vec = _mm256_setzero_ps();
for (simsimd_size_t j = 0; j + 8 <= n; j += 8) {
__m256 diff_j_vec = _mm256_sub_ps( //
- _simsimd_bf16x8_to_f32x8_haswell(_mm_loadu_si128((__m128i const*)(a + j))), //
- _simsimd_bf16x8_to_f32x8_haswell(_mm_loadu_si128((__m128i const*)(b + j))));
- __m256 c_vec = _simsimd_bf16x8_to_f32x8_haswell(_mm_loadu_si128((__m128i const*)(c + i * n + j)));
+ _simsimd_bf16x8_to_f32x8_haswell(_mm_lddqu_si128((__m128i const*)(a + j))), //
+ _simsimd_bf16x8_to_f32x8_haswell(_mm_lddqu_si128((__m128i const*)(b + j))));
+ __m256 c_vec = _simsimd_bf16x8_to_f32x8_haswell(_mm_lddqu_si128((__m128i const*)(c + i * n + j)));
partial_sum_vec = _mm256_fmadd_ps(diff_j_vec, c_vec, partial_sum_vec);
}
sum_vec = _mm256_fmadd_ps(diff_i_vec, partial_sum_vec, sum_vec);
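For reference, the accumulation pattern visible in the loops above evaluates a bilinear form over the row-major `n × n` matrix `c`, and the Mahalanobis kernels apply the same pattern to the element-wise difference of `a` and `b`; any post-processing of the accumulated sum (for example, a square root) lies outside the lines shown here:

```latex
% Accumulation implemented by the loops above, with c stored row-major:
\sum_{i=0}^{n-1} a_i \Big( \sum_{j=0}^{n-1} c_{i\,n + j}\, b_j \Big) = a^\top C\, b,
\qquad
\sum_{i=0}^{n-1} (a_i - b_i) \Big( \sum_{j=0}^{n-1} c_{i\,n + j}\, (a_j - b_j) \Big) = (a - b)^\top C\, (a - b).
```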
22 changes: 14 additions & 8 deletions include/simsimd/dot.h
@@ -997,10 +997,16 @@ SIMSIMD_PUBLIC void simsimd_dot_f16_haswell(simsimd_f16_t const* a, simsimd_f16_
b_vec = _simsimd_partial_load_f16x8_haswell(b, n);
n = 0;
} else {
- a_vec = _mm256_cvtph_ps(_mm_loadu_si128((__m128i const*)a));
- b_vec = _mm256_cvtph_ps(_mm_loadu_si128((__m128i const*)b));
+ a_vec = _mm256_cvtph_ps(_mm_lddqu_si128((__m128i const*)a));
+ b_vec = _mm256_cvtph_ps(_mm_lddqu_si128((__m128i const*)b));
n -= 8, a += 8, b += 8;
}
+ // We can silence the NaNs using blends:
+ //
+ // __m256 a_is_nan = _mm256_cmp_ps(a_vec, a_vec, _CMP_UNORD_Q);
+ // __m256 b_is_nan = _mm256_cmp_ps(b_vec, b_vec, _CMP_UNORD_Q);
+ // ab_vec = _mm256_blendv_ps(_mm256_fmadd_ps(a_vec, b_vec, ab_vec), ab_vec, _mm256_or_ps(a_is_nan, b_is_nan));
+ //
ab_vec = _mm256_fmadd_ps(a_vec, b_vec, ab_vec);
if (n)
goto simsimd_dot_f16_haswell_cycle;
@@ -1032,8 +1038,8 @@ SIMSIMD_PUBLIC void simsimd_dot_f16c_haswell(simsimd_f16_t const* a, simsimd_f16
);

while (n >= 8) {
- __m256 a_vec = _mm256_cvtph_ps(_mm_loadu_si128((__m128i const*)a));
- __m256 b_vec = _mm256_cvtph_ps(_mm_loadu_si128((__m128i const*)b));
+ __m256 a_vec = _mm256_cvtph_ps(_mm_lddqu_si128((__m128i const*)a));
+ __m256 b_vec = _mm256_cvtph_ps(_mm_lddqu_si128((__m128i const*)b));
__m256 b_swapped_vec = _mm256_castsi256_ps(_mm256_shuffle_epi8(_mm256_castps_si256(b_vec), swap_adjacent_vec));
ab_real_vec = _mm256_fmadd_ps(a_vec, b_vec, ab_real_vec);
ab_imag_vec = _mm256_fmadd_ps(a_vec, b_swapped_vec, ab_imag_vec);
@@ -1068,8 +1074,8 @@ SIMSIMD_PUBLIC void simsimd_vdot_f16c_haswell(simsimd_f16_t const* a, simsimd_f1
);

while (n >= 8) {
- __m256 a_vec = _mm256_cvtph_ps(_mm_loadu_si128((__m128i const*)a));
- __m256 b_vec = _mm256_cvtph_ps(_mm_loadu_si128((__m128i const*)b));
+ __m256 a_vec = _mm256_cvtph_ps(_mm_lddqu_si128((__m128i const*)a));
+ __m256 b_vec = _mm256_cvtph_ps(_mm_lddqu_si128((__m128i const*)b));
ab_real_vec = _mm256_fmadd_ps(a_vec, b_vec, ab_real_vec);
b_vec = _mm256_castsi256_ps(_mm256_shuffle_epi8(_mm256_castps_si256(b_vec), swap_adjacent_vec));
ab_imag_vec = _mm256_fmadd_ps(a_vec, b_vec, ab_imag_vec);
@@ -1161,8 +1167,8 @@ SIMSIMD_PUBLIC void simsimd_dot_bf16_haswell(simsimd_bf16_t const* a, simsimd_bf
b_vec = _simsimd_partial_load_bf16x8_haswell(b, n);
n = 0;
} else {
- a_vec = _mm_loadu_si128((__m128i const*)a);
- b_vec = _mm_loadu_si128((__m128i const*)b);
+ a_vec = _mm_lddqu_si128((__m128i const*)a);
+ b_vec = _mm_lddqu_si128((__m128i const*)b);
a += 8, b += 8, n -= 8;
}
ab_vec = _mm256_fmadd_ps(_simsimd_bf16x8_to_f32x8_haswell(a_vec), _simsimd_bf16x8_to_f32x8_haswell(b_vec), ab_vec);
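The comment added to `simsimd_dot_f16_haswell` above sketches how `NaN` lanes could be silenced with unordered compares and a blend. A standalone, compilable version of the same idea follows; it mirrors the commented-out snippet and is not what the shipped kernel executes:

```c
#include <immintrin.h>

// Sketch of the commented-out NaN-silencing idea: lanes where either input
// is NaN keep the previous accumulator value instead of propagating NaN.
static inline __m256 fmadd_silencing_nans(__m256 a_vec, __m256 b_vec, __m256 ab_vec) {
    __m256 a_is_nan = _mm256_cmp_ps(a_vec, a_vec, _CMP_UNORD_Q); // NaN compares unordered to itself
    __m256 b_is_nan = _mm256_cmp_ps(b_vec, b_vec, _CMP_UNORD_Q);
    __m256 any_nan = _mm256_or_ps(a_is_nan, b_is_nan);
    __m256 updated = _mm256_fmadd_ps(a_vec, b_vec, ab_vec);
    // Where any_nan is set, keep the old accumulator; otherwise take the FMA result.
    return _mm256_blendv_ps(updated, ab_vec, any_nan);
}
```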
8 changes: 4 additions & 4 deletions include/simsimd/probability.h
@@ -351,8 +351,8 @@ SIMSIMD_PUBLIC void simsimd_kl_f16_haswell(simsimd_f16_t const* a, simsimd_f16_t
b_vec = _simsimd_partial_load_f16x8_haswell(b, n);
n = 0;
} else {
- a_vec = _mm256_cvtph_ps(_mm_loadu_si128((__m128i const*)a));
- b_vec = _mm256_cvtph_ps(_mm_loadu_si128((__m128i const*)b));
+ a_vec = _mm256_cvtph_ps(_mm_lddqu_si128((__m128i const*)a));
+ b_vec = _mm256_cvtph_ps(_mm_lddqu_si128((__m128i const*)b));
n -= 8, a += 8, b += 8;
}
a_vec = _mm256_add_ps(a_vec, epsilon_vec);
@@ -383,8 +383,8 @@ SIMSIMD_PUBLIC void simsimd_js_f16_haswell(simsimd_f16_t const* a, simsimd_f16_t
b_vec = _simsimd_partial_load_f16x8_haswell(b, n);
n = 0;
} else {
- a_vec = _mm256_cvtph_ps(_mm_loadu_si128((__m128i const*)a));
- b_vec = _mm256_cvtph_ps(_mm_loadu_si128((__m128i const*)b));
+ a_vec = _mm256_cvtph_ps(_mm_lddqu_si128((__m128i const*)a));
+ b_vec = _mm256_cvtph_ps(_mm_lddqu_si128((__m128i const*)b));
n -= 8, a += 8, b += 8;
}
a_vec = _mm256_add_ps(a_vec, epsilon_vec);
16 changes: 8 additions & 8 deletions include/simsimd/spatial.h
@@ -1037,8 +1037,8 @@ SIMSIMD_PUBLIC void simsimd_l2sq_f16_haswell(simsimd_f16_t const* a, simsimd_f16
b_vec = _simsimd_partial_load_f16x8_haswell(b, n);
n = 0;
} else {
- a_vec = _mm256_cvtph_ps(_mm_loadu_si128((__m128i const*)a));
- b_vec = _mm256_cvtph_ps(_mm_loadu_si128((__m128i const*)b));
+ a_vec = _mm256_cvtph_ps(_mm_lddqu_si128((__m128i const*)a));
+ b_vec = _mm256_cvtph_ps(_mm_lddqu_si128((__m128i const*)b));
n -= 8, a += 8, b += 8;
}
__m256 d_vec = _mm256_sub_ps(a_vec, b_vec);
@@ -1060,8 +1060,8 @@ SIMSIMD_PUBLIC void simsimd_cos_f16_haswell(simsimd_f16_t const* a, simsimd_f16_
b_vec = _simsimd_partial_load_f16x8_haswell(b, n);
n = 0;
} else {
- a_vec = _mm256_cvtph_ps(_mm_loadu_si128((__m128i const*)a));
- b_vec = _mm256_cvtph_ps(_mm_loadu_si128((__m128i const*)b));
+ a_vec = _mm256_cvtph_ps(_mm_lddqu_si128((__m128i const*)a));
+ b_vec = _mm256_cvtph_ps(_mm_lddqu_si128((__m128i const*)b));
n -= 8, a += 8, b += 8;
}
ab_vec = _mm256_fmadd_ps(a_vec, b_vec, ab_vec);
@@ -1092,8 +1092,8 @@ SIMSIMD_PUBLIC void simsimd_l2sq_bf16_haswell(simsimd_bf16_t const* a, simsimd_b
b_vec = _simsimd_bf16x8_to_f32x8_haswell(_simsimd_partial_load_bf16x8_haswell(b, n));
n = 0;
} else {
- a_vec = _simsimd_bf16x8_to_f32x8_haswell(_mm_loadu_si128((__m128i const*)a));
- b_vec = _simsimd_bf16x8_to_f32x8_haswell(_mm_loadu_si128((__m128i const*)b));
+ a_vec = _simsimd_bf16x8_to_f32x8_haswell(_mm_lddqu_si128((__m128i const*)a));
+ b_vec = _simsimd_bf16x8_to_f32x8_haswell(_mm_lddqu_si128((__m128i const*)b));
n -= 8, a += 8, b += 8;
}
__m256 d_vec = _mm256_sub_ps(a_vec, b_vec);
@@ -1115,8 +1115,8 @@ SIMSIMD_PUBLIC void simsimd_cos_bf16_haswell(simsimd_bf16_t const* a, simsimd_bf
b_vec = _simsimd_bf16x8_to_f32x8_haswell(_simsimd_partial_load_bf16x8_haswell(b, n));
n = 0;
} else {
- a_vec = _simsimd_bf16x8_to_f32x8_haswell(_mm_loadu_si128((__m128i const*)a));
- b_vec = _simsimd_bf16x8_to_f32x8_haswell(_mm_loadu_si128((__m128i const*)b));
+ a_vec = _simsimd_bf16x8_to_f32x8_haswell(_mm_lddqu_si128((__m128i const*)a));
+ b_vec = _simsimd_bf16x8_to_f32x8_haswell(_mm_lddqu_si128((__m128i const*)b));
n -= 8, a += 8, b += 8;
}
ab_vec = _mm256_fmadd_ps(a_vec, b_vec, ab_vec);