High-level einsum with N-ary contraction tree optimization.
This crate provides Einstein summation notation for [Tensor] values. It supports:

- String notation: `"ij,jk->ik"` (NumPy/PyTorch compatible)
- Parenthesized contraction order: `"ij,(jk,kl)->il"` to control the pairwise contraction sequence in string notation
- Integer label notation: omeinsum-rs compatible, using `u32` labels
- N-ary contraction: automatic or manual optimization of the pairwise contraction order via `ContractionTree`
- Accumulating variants: `einsum_into`, `einsum_with_subscripts_into`, and `einsum_with_plan_into` write into a pre-allocated output buffer with BLAS-style `alpha`/`beta` scaling, avoiding allocation in hot loops
§Backend dispatch
The backend is selected automatically from the tensor's `LogicalMemorySpace` (PyTorch-style).
There is no backend type parameter in the public API.
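A minimal sketch of what this means in practice, using only constructors that appear in the examples below: the same `einsum` call dispatches to a CPU or GPU backend depending solely on where its operands live.

```rust
use tenferro_einsum::einsum;
use tenferro_tensor::{Tensor, MemoryOrder};
use tenferro_device::LogicalMemorySpace;

let col = MemoryOrder::ColumnMajor;

// Operands in main memory: einsum dispatches to the CPU backend.
let a = Tensor::<f64>::zeros(&[3, 4], LogicalMemorySpace::MainMemory, col);
let b = Tensor::<f64>::zeros(&[4, 5], LogicalMemorySpace::MainMemory, col);
let c_cpu = einsum("ij,jk->ik", &[&a, &b]).unwrap();

// The identical call on GPU-resident operands dispatches to the GPU
// backend; no type parameter or explicit backend selection is needed.
let gpu_mem = LogicalMemorySpace::GpuMemory { device_id: 0 };
let a_gpu = Tensor::<f64>::zeros(&[3, 4], gpu_mem, col);
let b_gpu = Tensor::<f64>::zeros(&[4, 5], gpu_mem, col);
let c_gpu = einsum("ij,jk->ik", &[&a_gpu, &b_gpu]).unwrap();
```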
§Examples
§Common operations
```rust
use tenferro_einsum::einsum;
use tenferro_tensor::{Tensor, MemoryOrder};
use tenferro_device::LogicalMemorySpace;
let col = MemoryOrder::ColumnMajor;
let a = Tensor::<f64>::from_slice(&[1.0, 2.0, 3.0, 4.0], &[2, 2], col).unwrap();
let b = Tensor::<f64>::from_slice(&[5.0, 6.0, 7.0, 8.0], &[2, 2], col).unwrap();
let v = Tensor::<f64>::from_slice(&[1.0, 2.0, 3.0], &[3], col).unwrap();
// Matrix multiplication: C = A @ B
let c = einsum("ij,jk->ik", &[&a, &b]).unwrap();
// Trace: tr(A)
let tr = einsum("ii->", &[&a]).unwrap();
// Outer product: v_i * v_j -> M_{ij}
let outer = einsum("i,j->ij", &[&v, &v]).unwrap();
// Dot product: v . v
let dot = einsum("i,i->", &[&v, &v]).unwrap();
// Matrix-vector product: A @ v
let mv = einsum("ij,j->i", &[&a, &v]).unwrap();
// Diagonal embedding: vector -> diagonal matrix
// v = [1, 2, 3] -> [[1,0,0],[0,2,0],[0,0,3]]
let diag = einsum("i->ii", &[&v]).unwrap();
assert_eq!(diag.dims(), &[3, 3]);
// Diagonal extraction: matrix -> diagonal vector
let d = einsum("ii->i", &[&a]).unwrap();
// Higher-order diagonal: 3D tensor with repeated index
// Creates T_{iii} from v_i
let t = einsum("i->iii", &[&v]).unwrap();
assert_eq!(t.dims(), &[3, 3, 3]);
// Consuming variant: operands are moved, buffers may be reused
use tenferro_einsum::einsum_owned;
let x = Tensor::<f64>::from_slice(&[1.0, 2.0, 3.0, 4.0], &[2, 2], col).unwrap();
let y = Tensor::<f64>::from_slice(&[5.0, 6.0, 7.0, 8.0], &[2, 2], col).unwrap();
let z = einsum_owned("ij,jk->ik", vec![x, y]).unwrap(); // x, y consumed
```
§Batch operations
```rust
use tenferro_einsum::einsum;
use tenferro_tensor::{Tensor, MemoryOrder};
use tenferro_device::LogicalMemorySpace;

let col = MemoryOrder::ColumnMajor;
// Batched GEMM: 10 independent matrix multiplications in one call
// A: (batch=10, m=3, k=4), B: (batch=10, k=4, n=5) -> C: (batch=10, m=3, n=5)
let a = Tensor::<f64>::zeros(&[10, 3, 4], LogicalMemorySpace::MainMemory, col);
let b = Tensor::<f64>::zeros(&[10, 4, 5], LogicalMemorySpace::MainMemory, col);
let c = einsum("bij,bjk->bik", &[&a, &b]).unwrap();
assert_eq!(c.dims(), &[10, 3, 5]);
// Multiple batch dimensions: (batch1=2, batch2=3, m, k) x (batch1=2, batch2=3, k, n)
let a = Tensor::<f64>::zeros(&[2, 3, 4, 5], LogicalMemorySpace::MainMemory, col);
let b = Tensor::<f64>::zeros(&[2, 3, 5, 6], LogicalMemorySpace::MainMemory, col);
let c = einsum("abij,abjk->abik", &[&a, &b]).unwrap();
assert_eq!(c.dims(), &[2, 3, 4, 6]);
// Broadcast batch: A has batch dim, B is shared across batch
// A: (batch=10, m=3, k=4), B: (k=4, n=5) -> C: (batch=10, m=3, n=5)
let a = Tensor::<f64>::zeros(&[10, 3, 4], LogicalMemorySpace::MainMemory, col);
let b = Tensor::<f64>::zeros(&[4, 5], LogicalMemorySpace::MainMemory, col);
let c = einsum("bij,jk->bik", &[&a, &b]).unwrap();
assert_eq!(c.dims(), &[10, 3, 5]);
```
§Integer label notation
```rust
use tenferro_einsum::{einsum_with_subscripts, Subscripts};
use tenferro_tensor::{Tensor, MemoryOrder};

let col = MemoryOrder::ColumnMajor;
let a = Tensor::<f64>::from_slice(&[1.0, 2.0, 3.0, 4.0], &[2, 2], col).unwrap();
let b = Tensor::<f64>::from_slice(&[5.0, 6.0, 7.0, 8.0], &[2, 2], col).unwrap();
// Same as "ij,jk->ik" but with integer labels
// Useful when indices exceed 52 (a-z, A-Z) or are computed programmatically
let subs = Subscripts::new(&[&[0, 1], &[1, 2]], &[0, 2]);
let c = einsum_with_subscripts(&subs, &[&a, &b]).unwrap();
```
§Contraction order control
```rust
use tenferro_einsum::{einsum, einsum_with_plan, Subscripts};
use tenferro_tensor::{Tensor, MemoryOrder};
use tenferro_device::LogicalMemorySpace;

let col = MemoryOrder::ColumnMajor;
let a = Tensor::<f64>::zeros(&[3, 4], LogicalMemorySpace::MainMemory, col);
let b = Tensor::<f64>::zeros(&[4, 100], LogicalMemorySpace::MainMemory, col);
let c = Tensor::<f64>::zeros(&[100, 5], LogicalMemorySpace::MainMemory, col);
// Three matrices: D = A @ B @ C
// Parentheses specify: contract B*C first, then A*(BC)
let d = einsum("ij,(jk,kl)->il", &[&a, &b, &c]).unwrap();
// Or use ContractionTree for programmatic control
use tenferro_einsum::ContractionTree;
let subs = Subscripts::new(&[&[0, 1], &[1, 2], &[2, 3]], &[0, 3]);
let tree = ContractionTree::from_pairs(
&subs,
&[&[3, 4], &[4, 100], &[100, 5]],
&[(1, 2), (0, 3)], // B*C first (avoids large intermediate)
).unwrap();
let d = einsum_with_plan(&tree, &[&a, &b, &c]).unwrap();
```
§Accumulating into a pre-allocated output
```rust
use tenferro_einsum::{einsum_with_plan_into, ContractionTree, Subscripts};
use tenferro_tensor::{Tensor, MemoryOrder};
use tenferro_device::LogicalMemorySpace;
let col = MemoryOrder::ColumnMajor;
let subs = Subscripts::new(&[&[0, 1], &[1, 2]], &[0, 2]);
let tree = ContractionTree::optimize(&subs, &[&[3, 4], &[4, 5]]).unwrap();
let a = Tensor::<f64>::zeros(&[3, 4], LogicalMemorySpace::MainMemory, col);
let b = Tensor::<f64>::zeros(&[4, 5], LogicalMemorySpace::MainMemory, col);
let mut c = Tensor::<f64>::zeros(&[3, 5], LogicalMemorySpace::MainMemory, col);
// Hot loop: reuse output buffer, zero allocation per iteration
for _ in 0..1000 {
// C = 1.0 * (A @ B) + 0.0 * C (overwrite)
einsum_with_plan_into(&tree, &[&a, &b], 1.0, 0.0, &mut c).unwrap();
}
```
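The string-notation variant `einsum_into` uses the same `alpha`/`beta` convention. Below is a hedged sketch, assuming its signature mirrors `einsum_with_plan_into` with string subscripts in place of the plan (the exact signature is not shown in this documentation):

```rust
use tenferro_einsum::einsum_into;
use tenferro_tensor::{Tensor, MemoryOrder};
use tenferro_device::LogicalMemorySpace;

let col = MemoryOrder::ColumnMajor;
let a = Tensor::<f64>::zeros(&[3, 4], LogicalMemorySpace::MainMemory, col);
let b = Tensor::<f64>::zeros(&[4, 5], LogicalMemorySpace::MainMemory, col);
let mut c = Tensor::<f64>::zeros(&[3, 5], LogicalMemorySpace::MainMemory, col);

// Assumed convention, matching einsum_with_plan_into:
// C = alpha * (A @ B) + beta * C
// Here alpha = 2.0, beta = 1.0 accumulates onto C's existing contents.
einsum_into("ij,jk->ik", &[&a, &b], 2.0, 1.0, &mut c).unwrap();
```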
§GPU async chaining (deferred evaluation)
GPU einsum operations return immediately. The result tensor carries a
`CompletionEvent` that tracks the pending accelerator work. Passing this
tensor to another einsum chains via GPU stream dependencies; no CPU
synchronization occurs until data is accessed from the host.

- `wait()` explicitly blocks until computation completes
- `view()`, `dims()`, `strides()` implicitly call `wait()`
- For CPU tensors, `event` is always `None` (zero overhead)
```rust
use tenferro_einsum::einsum;
use tenferro_tensor::{Tensor, MemoryOrder};
use tenferro_device::LogicalMemorySpace;
// In production, obtain memory spaces via BackendRegistry (future API).
let gpu_mem = LogicalMemorySpace::GpuMemory { device_id: 0 };
let col = MemoryOrder::ColumnMajor;
let a = Tensor::<f64>::zeros(&[3, 4], gpu_mem, col);
let b = Tensor::<f64>::zeros(&[4, 5], gpu_mem, col);
// Both einsum calls submit work to the GPU and return immediately.
// The second call detects c's pending event and chains on the stream.
let c = einsum("ij,jk->ik", &[&a, &b]).unwrap();
let d = einsum("ij,jk->ik", &[&c, &b]).unwrap();
// wait() blocks until GPU computation completes
d.wait();
```
§Specifying a compute device
```rust
use tenferro_einsum::einsum;
use tenferro_tensor::{Tensor, MemoryOrder};
use tenferro_device::{LogicalMemorySpace, ComputeDevice};
let col = MemoryOrder::ColumnMajor;
// In production, obtain memory spaces via BackendRegistry (future API).
let gpu_mem = LogicalMemorySpace::GpuMemory { device_id: 0 };
let mut a = Tensor::<f64>::zeros(&[3, 4], gpu_mem, col);
let mut b = Tensor::<f64>::zeros(&[4, 5], gpu_mem, col);
// Pin tensors to CUDA device 1 (overrides automatic device selection).
// This works when CUDA device 1 can access GpuMemory { device_id: 0 }
// (e.g., same physical GPU or NVLink-connected peer).
// If the device cannot access the memory space, einsum returns
// Err(NoCompatibleComputeDevice). In that case, transfer explicitly:
// let a = a.to_memory_space_async(GpuMemory { device_id: 1 }).unwrap();
a.set_preferred_compute_device(Some(ComputeDevice::Cuda { device_id: 1 }));
b.set_preferred_compute_device(Some(ComputeDevice::Cuda { device_id: 1 }));
// einsum dispatches to the specified CUDA device
let c = einsum("ij,jk->ik", &[&a, &b]).unwrap();
// Clear override — revert to automatic selection
// a.set_preferred_compute_device(None);
```

§Structs

- `ContractionTree` - Contraction tree determining pairwise contraction order for N-ary einsum.
- `Subscripts` - Einsum subscripts using integer labels (omeinsum-rs compatible).

§Functions

- `dual_einsum` - Dual einsum (forward-mode JVP propagation).
- `einsum` - Execute einsum using string notation.
- `einsum_frule` - Forward-mode rule (frule) for einsum without building a global tape.
- `einsum_hvp` - Local HVP rule for einsum without building a global tape.
- `einsum_into` - Execute einsum using string notation, accumulating into an existing output.
- `einsum_owned` - Execute einsum using string notation, consuming the input tensors.
- `einsum_rrule` - Reverse-mode rule (rrule) for einsum without building a global tape.
- `einsum_with_plan` - Execute einsum with a pre-optimized `ContractionTree`.
- `einsum_with_plan_into` - Execute einsum with a pre-optimized `ContractionTree`, accumulating into an existing output.
- `einsum_with_plan_owned` - Execute einsum with a pre-optimized `ContractionTree`, consuming the input tensors.
- `einsum_with_subscripts` - Execute einsum with pre-built `Subscripts`.
- `einsum_with_subscripts_into` - Execute einsum with pre-built `Subscripts`, accumulating into an existing output.
- `einsum_with_subscripts_owned` - Execute einsum with pre-built `Subscripts`, consuming the input tensors.
- `tracked_einsum` - Tracked einsum (reverse-mode AD).