strided_perm/hptt/mod.rs
1//! HPTT-faithful cache-efficient tensor permutation.
2//!
3//! Based on the algorithm described in HPTT (High-Performance Tensor Transpose)
4//! by Paul Springer, Tong Su, and Paolo Bientinesi.
5//! Original C++ implementation: <https://github.com/springer13/hptt>
6//! Licensed under BSD-3-Clause. See THIRD-PARTY-LICENSES for details.
7//!
8//! Implements the key techniques from HPTT:
9//! 1. Bilateral dimension fusion (fuse dims contiguous in both src and dst)
10//! 2. 2D micro-kernel transpose (4×4 scalar for f64, 8×8 for f32)
11//! 3. Macro-kernel: BLOCK × BLOCK tile via grid of micro-kernel calls
12//! 4. Recursive ComputeNode loop nest (only stride-1 dims get blocked)
13//! 5. ConstStride1 fast path when src and dst stride-1 dims coincide
14
// Submodules. `micro_kernel` is declared `pub(crate)`, so it is reachable from
// the rest of the crate; `execute`, `macro_kernel`, and `plan` are private to
// this module and are only exposed through the re-exports below.
15mod execute;
16mod macro_kernel;
17pub(crate) mod micro_kernel;
18mod plan;
19
// Public surface of this module: items re-exported from the private
// `execute` and `plan` submodules.
20pub use execute::execute_permute_blocked;
// `execute_permute_blocked_par` is re-exported only when the crate is built
// with the `parallel` feature enabled.
21#[cfg(feature = "parallel")]
22pub use execute::execute_permute_blocked_par;
23pub use plan::{build_permute_plan, PermutePlan};