// strided_kernel/lib.rs

1//! Cache-optimized kernels for strided multidimensional array operations.
2//!
3//! This crate is a Rust port of Julia's [Strided.jl](https://github.com/Jutho/Strided.jl)
4//! and [StridedViews.jl](https://github.com/Jutho/StridedViews.jl) libraries, providing
5//! efficient operations on strided multidimensional array views.
6//!
7//! # Core Types
8//!
9//! - [`StridedView`] / [`StridedViewMut`]: Dynamic-rank strided views over existing data
10//! - [`StridedArray`]: Owned strided multidimensional array
11//! - [`ElementOp`] trait and implementations ([`Identity`], [`Conj`], [`Transpose`], [`Adjoint`]):
12//!   Type-level element operations applied lazily on access
13//!
14//! # Primary API (view-based, Julia-compatible)
15//!
16//! ## Map Operations
17//!
18//! - [`map_into`]: Apply a function element-wise from source to destination
19//! - [`zip_map2_into`], [`zip_map3_into`], [`zip_map4_into`]: Multi-array element-wise operations
20//!
21//! ## Reduce Operations
22//!
23//! - [`reduce`]: Full reduction with map function
24//! - [`reduce_axis`]: Reduce along a single axis
25//!
26//! ## Basic Operations
27//!
28//! - [`copy_into`]: Copy array contents
29//! - [`add`], [`mul`]: Element-wise arithmetic
30//! - [`axpy`]: y = alpha*x + y (array version)
31//! - [`sum`], [`dot`]: Reductions
32//! - [`symmetrize_into`], [`symmetrize_conj_into`]: Matrix symmetrization
33//!
34//! # Example
35//!
36//! ```rust
37//! use strided_kernel::{StridedView, StridedViewMut, StridedArray, Identity, map_into};
38//!
39//! // Create a column-major array (Julia default)
40//! let src = StridedArray::<f64>::from_fn_col_major(&[2, 3], |idx| {
41//!     (idx[0] * 10 + idx[1]) as f64
42//! });
43//! let mut dest = StridedArray::<f64>::col_major(&[2, 3]);
44//!
45//! // Map with view-based API
46//! map_into(&mut dest.view_mut(), &src.view(), |x| x * 2.0).unwrap();
47//! assert_eq!(dest.get(&[1, 2]), 24.0); // (1*10 + 2) * 2
48//! ```
49//!
50//! # Cache Optimization
51//!
52//! The library uses Julia's blocking strategy for cache efficiency:
53//! - Dimensions are sorted by stride magnitude for optimal memory access
54//! - Operations are blocked into tiles fitting L1 cache ([`BLOCK_MEMORY_SIZE`] = 32KB)
55//! - Contiguous arrays use fast paths bypassing the blocking machinery
56
// ============================================================================
// Internal implementation modules
// ============================================================================
mod block; // cache-blocking (tiling) machinery; presumably tiles against `BLOCK_MEMORY_SIZE` — confirm
mod fuse; // NOTE(review): presumably fuses adjacent dimensions with compatible strides — confirm
mod kernel; // NOTE(review): presumably the per-block inner-loop kernels — confirm
mod order; // presumably sorts dimensions by stride magnitude (see "Cache Optimization" in crate docs) — confirm
mod simd; // defines the `MaybeSimdOps` trait (re-exported below)
#[cfg(feature = "parallel")]
mod threading; // multi-threaded execution; compiled only with the `parallel` feature

mod maybe_sync; // `MaybeSend`/`MaybeSync`/`MaybeSendSync` marker traits (re-exported just below)
pub use maybe_sync::{MaybeSend, MaybeSendSync, MaybeSync};

// View-based operation modules
mod map_view; // element-wise map operations: `map_into`, `zip_map2_into`, ...
mod ops_view; // high-level operations: `add`, `axpy`, `dot`, `sum`, ...
mod reduce_view; // reductions: `reduce`, `reduce_axis`
73// ============================================================================
74// Re-exports from strided_view for backward compatibility
75// ============================================================================
76pub use strided_view::view;
77pub use strided_view::{
78    col_major_strides, row_major_strides, Adjoint, ComposableElementOp, Compose, Conj, ElementOp,
79    ElementOpApply, Identity, Result, StridedArray, StridedError, StridedView, StridedViewMut,
80    Transpose,
81};
82
83// ============================================================================
84// Map operations
85// ============================================================================
86pub use map_view::{map_into, zip_map2_into, zip_map3_into, zip_map4_into};
87
88// ============================================================================
89// High-level operations
90// ============================================================================
91pub use ops_view::{
92    add, axpy, copy_conj, copy_into, copy_scale, copy_transpose_scale_into, dot, fma, mul, sum,
93    symmetrize_conj_into, symmetrize_into,
94};
95
96// ============================================================================
97// Reduce operations
98// ============================================================================
99pub use reduce_view::{reduce, reduce_axis};
100
101// ============================================================================
102// SIMD trait
103// ============================================================================
104pub use simd::MaybeSimdOps;
105
106// ============================================================================
107// Constants
108// ============================================================================
109
/// Target tile size, in bytes, for cache-blocked iteration.
///
/// The blocking machinery splits work into tiles no larger than this so that
/// each tile's working set stays resident in the L1 data cache. The default of
/// 32 KiB matches a typical L1 data cache.
pub const BLOCK_MEMORY_SIZE: usize = 1 << 15; // 32 KiB

/// Size of a CPU cache line, in bytes.
///
/// Used when computing block sizes to reason about how many distinct memory
/// regions (cache lines) a tile touches.
pub const CACHE_LINE_SIZE: usize = 64;