tenferro_einsum/
eager_ad.rs

1//! EagerTensor einsum extension API.
2
3use std::collections::hash_map::DefaultHasher;
4use std::error::Error as StdError;
5use std::hash::{Hash, Hasher};
6use std::mem::size_of;
7use std::sync::{Arc, OnceLock};
8
9use computegraph::compile::{compile, CompiledProgram, Instruction};
10use computegraph::graph::GraphBuilder;
11use computegraph::materialize::materialize_merge;
12use computegraph::resolve::resolve;
13use computegraph::types::{ValueKey, ValueRef};
14use tenferro_ad::extension::{adopt_untracked_eager_value, apply_eager_with_extension_session};
15use tenferro_ad::{EagerRuntime, EagerTensor};
16use tenferro_ops::dim_expr::DimExpr;
17use tenferro_ops::input_key::TensorInputKey;
18use tenferro_ops::std_tensor_op::StdTensorOp;
19use tenferro_runtime::{ErrorPhase, ExtensionCacheKey};
20use tenferro_tensor::{ErrorKind, ShapeMismatch, ValidationError, ValidationKind};
21
22use crate::binary_dot::{try_build_exact_output_binary_dot_plan, BinaryDotOperandOrder};
23use crate::builder::build_einsum_graph;
24use crate::cache::{
25    saturating_sum, vec_retained_bytes, EINSUM_EAGER_EXPANDED_PROGRAMS_CACHE,
26    EINSUM_EXTENSION_FAMILY_ID,
27};
28use crate::extension::{execute_einsum_extension_session_reads, EinsumExtensionOp};
29use crate::optimize::{
30    default_auto_options, hash_einsum_plan_spec, plan_specs_equal, resolve_plan_spec,
31    EinsumPlanSpec,
32};
33use crate::{parse_einsum_subscripts, EinsumSubscripts, Error, Result, Subscripts, TensorDotAxes};
34
/// Eager einsum extension methods for slices or arrays of [`EagerTensor`] refs.
///
/// This module implements the trait for `[&EagerTensor]` and
/// `[&EagerTensor; N]`; both implementations forward to the free functions
/// [`einsum`] and [`einsum_subscripts`].
pub trait EagerEinsumExt {
    /// Execute an einsum from string notation.
    ///
    /// `self` supplies the operands; `subscripts` is the notation string, for
    /// example `"ij,jk->ik"`.
    ///
    /// # Errors
    ///
    /// Returns [`Error::InvalidSubscripts`] for malformed notation,
    /// [`Error::Validation`] for rank/shape/dtype mismatches, or
    /// [`Error::Planning`] / [`Error::Runtime`] for contraction planning and
    /// execution failures.
    fn einsum(&self, subscripts: &str) -> Result<EagerTensor>;

    /// Execute an einsum from parsed integer labels.
    ///
    /// `self` supplies the operands; `subscripts` carries one label list per
    /// operand plus the output label list.
    ///
    /// # Errors
    ///
    /// Returns [`Error::Validation`] for rank/shape/dtype mismatches,
    /// [`Error::Planning`] for an invalid contraction plan, or
    /// [`Error::Runtime`] for extension registration or backend execution
    /// failures.
    fn einsum_subscripts(&self, subscripts: &EinsumSubscripts) -> Result<EagerTensor>;
}
57
58fn eager_cpu_extension_module() -> Result<Arc<dyn tenferro_runtime::ExtensionModule>> {
59    static MODULE: OnceLock<Arc<dyn tenferro_runtime::ExtensionModule>> = OnceLock::new();
60    if let Some(module) = MODULE.get() {
61        return Ok(Arc::clone(module));
62    }
63
64    let engine_id = tenferro_cpu::runtime_engine_id().map_err(eager_runtime_config_error)?;
65    let module = crate::extension::extension_module::<tenferro_cpu::CpuBackend>(engine_id)
66        .map_err(eager_runtime_config_error)?;
67    let _ = MODULE.set(Arc::clone(&module));
68    Ok(MODULE.get().cloned().unwrap_or(module))
69}
70
71fn eager_runtime_config_error(source: tenferro_runtime::RuntimeConfigError) -> Error {
72    Error::Runtime(tenferro_runtime::Error::runtime_state_source(
73        "tenferro_einsum::eager_extension_module",
74        ErrorPhase::Execution,
75        source,
76    ))
77}
78
/// Slice implementation: each method forwards `self` unchanged to the free
/// function of the same name in this module.
impl EagerEinsumExt for [&EagerTensor] {
    /// Forwards to the free function [`einsum`].
    fn einsum(&self, subscripts: &str) -> Result<EagerTensor> {
        einsum(self, subscripts)
    }

    /// Forwards to the free function [`einsum_subscripts`].
    fn einsum_subscripts(&self, subscripts: &EinsumSubscripts) -> Result<EagerTensor> {
        einsum_subscripts(self, subscripts)
    }
}
88
89impl<const N: usize> EagerEinsumExt for [&EagerTensor; N] {
90    fn einsum(&self, subscripts: &str) -> Result<EagerTensor> {
91        einsum(self.as_slice(), subscripts)
92    }
93
94    fn einsum_subscripts(&self, subscripts: &EinsumSubscripts) -> Result<EagerTensor> {
95        einsum_subscripts(self.as_slice(), subscripts)
96    }
97}
98
/// Eager tensor contraction-sugar methods.
///
/// This module implements the trait for [`EagerTensor`] by forwarding to the
/// free function [`tensordot`].
pub trait EagerTensorEinsumExt {
    /// Contract two eager tensors over the requested axes.
    ///
    /// `self` is the left operand, `rhs` the right operand, and `axes` selects
    /// the contracted axes.
    ///
    /// # Errors
    ///
    /// Returns [`Error::Validation`] with `AxisOutOfBounds`, `DuplicateAxis`,
    /// `RankMismatch`, or `ShapeMismatch` for invalid axes/shapes, or
    /// [`Error::Runtime`] for backend execution failures.
    fn tensordot(&self, rhs: &EagerTensor, axes: TensorDotAxes<'_>) -> Result<EagerTensor>;
}
110
/// Method-call sugar over the free function [`tensordot`].
impl EagerTensorEinsumExt for EagerTensor {
    /// Forwards to the free function [`tensordot`] with `self` as the left
    /// operand.
    fn tensordot(&self, rhs: &EagerTensor, axes: TensorDotAxes<'_>) -> Result<EagerTensor> {
        tensordot(self, rhs, axes)
    }
}
116
117/// Execute an einsum eagerly on [`EagerTensor`] values.
118///
119/// # Examples
120///
121/// ```
122/// use tenferro_ad::{EagerRuntime, EagerTensor};
123/// use tenferro_cpu::CpuBackend;
124/// use tenferro_einsum::EagerEinsumExt;
125/// use tenferro_tensor::Tensor;
126///
127/// let runtime = EagerRuntime::with_cpu_backend(CpuBackend::new())?;
128/// let a = EagerTensor::from_tensor_in(
129///     Tensor::from_vec_col_major(vec![2, 3], vec![1.0_f64; 6]).unwrap(),
130///     runtime.clone(),
131/// ).unwrap();
132/// let b = EagerTensor::from_tensor_in(
133///     Tensor::from_vec_col_major(vec![3, 4], vec![1.0_f64; 12]).unwrap(),
134///     runtime,
135/// ).unwrap();
136/// let out = [&a, &b].einsum("ij,jk->ik")?;
137/// assert_eq!(out.shape(), &[2, 4]);
138/// # Ok::<(), tenferro_einsum::Error>(())
139/// ```
140///
141/// # Errors
142///
143/// Returns [`Error::InvalidSubscripts`] for malformed notation,
144/// [`Error::Validation`] for input count/rank/shape/dtype mismatches,
145/// [`Error::Planning`] when no contraction path is valid, or [`Error::Runtime`]
146/// for extension registration and backend execution failures.
147pub fn einsum(inputs: &[&EagerTensor], subscripts: &str) -> Result<EagerTensor> {
148    let subscripts = parse_einsum_subscripts(subscripts)?;
149    einsum_subscripts(inputs, &subscripts)
150}
151
152/// Execute an einsum eagerly from integer labels.
153///
154/// # Examples
155///
156/// ```
157/// use tenferro_ad::{EagerRuntime, EagerTensor};
158/// use tenferro_cpu::CpuBackend;
159/// use tenferro_einsum::{EagerEinsumExt, parse_einsum_subscripts};
160/// use tenferro_tensor::Tensor;
161///
162/// let runtime = EagerRuntime::with_cpu_backend(CpuBackend::new())?;
163/// let a = EagerTensor::from_tensor_in(
164///     Tensor::from_vec_col_major(vec![2, 3], vec![1.0_f64; 6]).unwrap(),
165///     runtime.clone(),
166/// ).unwrap();
167/// let b = EagerTensor::from_tensor_in(
168///     Tensor::from_vec_col_major(vec![3, 4], vec![1.0_f64; 12]).unwrap(),
169///     runtime,
170/// ).unwrap();
171/// let subscripts = parse_einsum_subscripts("ij,jk->ik").unwrap();
172/// let out = [&a, &b].einsum_subscripts(&subscripts)?;
173/// assert_eq!(out.shape(), &[2, 4]);
174/// # Ok::<(), tenferro_einsum::Error>(())
175/// ```
176///
177/// # Errors
178///
179/// Returns [`Error::Validation`] for input count/rank/shape/dtype mismatches,
180/// [`Error::Planning`] when no contraction path is valid, or [`Error::Runtime`]
181/// for extension registration and backend execution failures.
182pub fn einsum_subscripts(
183    inputs: &[&EagerTensor],
184    subscripts: &EinsumSubscripts,
185) -> Result<EagerTensor> {
186    if let Some(result) = try_direct_binary_dot_general(inputs, subscripts) {
187        return result;
188    }
189
190    if let Some(result) = try_whole_program_untracked(inputs, subscripts)? {
191        return Ok(result);
192    }
193
194    let output_shape_hint = infer_eager_output_shape(subscripts, inputs)?;
195    if let Some(result) = try_expand_eager_einsum(inputs, subscripts)? {
196        return Ok(result);
197    }
198
199    let op = Arc::new(EinsumExtensionOp::with_output_shape_hint(
200        subscripts.clone(),
201        output_shape_hint,
202        EinsumPlanSpec::Auto(default_auto_options()),
203    ));
204    let execute_op = Arc::clone(&op);
205    let module = eager_cpu_extension_module()?;
206    let mut outputs =
207        apply_eager_with_extension_session(op, inputs, module, move |_op, input_reads, ctx| {
208            execute_einsum_extension_session_reads(&execute_op, input_reads, ctx)
209        })?;
210    outputs.pop().ok_or_else(|| {
211        Error::Runtime(tenferro_runtime::Error::MissingInput(
212            "einsum extension produced no eager output".into(),
213        ))
214    })
215}
216
217fn try_direct_binary_dot_general(
218    inputs: &[&EagerTensor],
219    subscripts: &EinsumSubscripts,
220) -> Option<Result<EagerTensor>> {
221    if inputs.len() != 2 || subscripts.inputs.len() != 2 {
222        return None;
223    }
224
225    let lhs_labels = &subscripts.inputs[0];
226    let rhs_labels = &subscripts.inputs[1];
227    if lhs_labels.len() != inputs[0].shape().len() || rhs_labels.len() != inputs[1].shape().len() {
228        return None;
229    }
230
231    if let Some(plan) =
232        try_build_exact_output_binary_dot_plan(lhs_labels, rhs_labels, &subscripts.output)
233    {
234        return Some(match plan.operand_order {
235            BinaryDotOperandOrder::Original => inputs[0]
236                .dot_general(inputs[1], plan.config)
237                .map_err(Error::Runtime),
238            BinaryDotOperandOrder::Swapped => inputs[1]
239                .dot_general(inputs[0], plan.config)
240                .map_err(Error::Runtime),
241        });
242    }
243    None
244}
245
/// Report whether the untracked whole-program eager einsum path is switched on.
///
/// Prototype gate (issue #1060 follow-up). The gate is on whenever the
/// `TENFERRO_EAGER_WHOLE_PROGRAM` environment variable is present; its value
/// is not inspected. The gate is consulted by [`try_whole_program_untracked`],
/// which leaves tracked (`requires_grad`) inputs on the per-op path.
fn whole_program_untracked_enabled() -> bool {
    const GATE_VAR: &str = "TENFERRO_EAGER_WHOLE_PROGRAM";
    std::env::var_os(GATE_VAR).is_some()
}
256
257/// Run an untracked eager einsum as a single backend-session program.
258///
259/// Returns `None` (so the caller falls back to the per-op expanded path) when
260/// the gate is off, there are no inputs, any input tracks gradients, or the
261/// inputs do not all share one runtime.
262fn try_whole_program_untracked(
263    inputs: &[&EagerTensor],
264    subscripts: &EinsumSubscripts,
265) -> Result<Option<EagerTensor>> {
266    if !whole_program_untracked_enabled() {
267        return Ok(None);
268    }
269    let Some(first) = inputs.first() else {
270        return Ok(None);
271    };
272    if inputs.iter().any(|tensor| tensor.tracks_grad()) {
273        return Ok(None);
274    }
275    let runtime = first.runtime();
276    if inputs
277        .iter()
278        .any(|tensor| !Arc::ptr_eq(tensor.runtime(), runtime))
279    {
280        return Ok(None);
281    }
282
283    let subs = Subscripts::from(subscripts);
284    let tensor_arcs = inputs
285        .iter()
286        .map(|tensor| tensor.materialized().map_err(Error::Runtime))
287        .collect::<Result<Vec<_>>>()?;
288    let tensors: Vec<_> = tensor_arcs.iter().map(|tensor| tensor.as_ref()).collect();
289    let result = runtime.with_execution_session(|backend| {
290        crate::eager::eager_einsum_subscripts_with_session(backend, &tensors, &subs)
291    })??;
292    Ok(Some(EagerTensor::from_tensor_in(result, runtime.clone())?))
293}
294
/// Run an untracked whole-program eager einsum on an explicit contraction tree.
///
/// Prototype/benchmark entry (issue #1060 follow-up). The whole contraction is
/// executed in one backend session on the caller-provided `tree` (e.g. an
/// externally optimized order built with [`crate::ContractionTree::from_pairs`])
/// instead of one eager op per expanded step. All inputs must be untracked and
/// share one runtime.
///
/// # Examples
///
/// This function is private and only compiled under `cfg(test)`, so the
/// example is illustrative and is not run as a doctest; it assumes the
/// function is in scope.
///
/// ```ignore
/// use tenferro_ad::{EagerRuntime, EagerTensor};
/// use tenferro_cpu::CpuBackend;
/// use tenferro_einsum::{ContractionTree, Subscripts};
/// use tenferro_tensor::Tensor;
///
/// let runtime = EagerRuntime::with_cpu_backend(CpuBackend::new())?;
/// let a = EagerTensor::from_tensor_in(
///     Tensor::from_vec_col_major(vec![2, 3], vec![1.0_f64; 6]).unwrap(),
///     runtime.clone(),
/// ).unwrap();
/// let b = EagerTensor::from_tensor_in(
///     Tensor::from_vec_col_major(vec![3, 4], vec![1.0_f64; 12]).unwrap(),
///     runtime,
/// ).unwrap();
/// let subs = Subscripts::parse("ij,jk->ik").unwrap();
/// let tree = ContractionTree::from_pairs(&subs, &[&[2, 3], &[3, 4]], &[(0, 1)]).unwrap();
/// let out = einsum_whole_program_untracked(&[&a, &b], &tree)?;
/// assert_eq!(out.shape(), &[2, 4]);
/// # Ok::<(), tenferro_einsum::Error>(())
/// ```
///
/// # Errors
///
/// Returns the error built by `Error::invalid_argument` when `inputs` is
/// empty, when any input tracks gradients, or when the inputs do not share one
/// runtime. Returns [`Error::Runtime`] when an input cannot be materialized or
/// the result cannot be wrapped, and propagates backend session failures.
#[cfg(test)]
fn einsum_whole_program_untracked(
    inputs: &[&EagerTensor],
    tree: &crate::ContractionTree,
) -> Result<EagerTensor> {
    let Some(first) = inputs.first() else {
        return Err(Error::invalid_argument(
            "einsum",
            "inputs",
            "einsum requires at least one input tensor",
        ));
    };
    let any_tracked = inputs.iter().any(|tensor| tensor.tracks_grad());
    if any_tracked {
        return Err(Error::invalid_argument(
            "einsum",
            "inputs",
            "whole-program eager einsum requires untracked inputs",
        ));
    }
    let runtime = first.runtime();
    let single_runtime = inputs
        .iter()
        .all(|tensor| Arc::ptr_eq(tensor.runtime(), runtime));
    if !single_runtime {
        return Err(Error::invalid_argument(
            "einsum",
            "inputs",
            "whole-program eager einsum requires inputs from one runtime",
        ));
    }

    // Keep the materialized handles alive while the session borrows them.
    let mut tensor_arcs = Vec::with_capacity(inputs.len());
    for tensor in inputs {
        tensor_arcs.push(tensor.materialized().map_err(Error::Runtime)?);
    }
    let tensors: Vec<_> = tensor_arcs.iter().map(|tensor| tensor.as_ref()).collect();
    let result = runtime.with_execution_session(|backend| {
        crate::eager::eager_einsum_with_tree(backend, &tensors, tree)
    })??;
    EagerTensor::from_tensor_in(result, runtime.clone()).map_err(Error::Runtime)
}
367
368fn try_expand_eager_einsum(
369    inputs: &[&EagerTensor],
370    subscripts: &EinsumSubscripts,
371) -> Result<Option<EagerTensor>> {
372    if inputs.len() <= 1 {
373        return Ok(None);
374    }
375
376    let shapes: Vec<Vec<usize>> = inputs
377        .iter()
378        .map(|tensor| tensor.shape().to_vec())
379        .collect();
380    let shape_refs: Vec<&[usize]> = shapes.iter().map(Vec::as_slice).collect();
381    let subs = Subscripts::from(subscripts);
382    let plan_spec = EinsumPlanSpec::Auto(default_auto_options());
383
384    let program = cached_expanded_eager_program(
385        inputs[0].runtime(),
386        subscripts,
387        &subs,
388        &plan_spec,
389        &shape_refs,
390        &shapes,
391    )?;
392    execute_eager_einsum_program(inputs, &program)
393}
394
/// A compiled einsum program together with the mapping from its input slots
/// to positions in the caller's input list.
struct ExpandedEagerProgram {
    /// Compiled instruction sequence over [`StdTensorOp`].
    compiled: CompiledProgram<StdTensorOp>,
    /// `(slot, input index)` pairs: which caller input seeds which slot.
    input_slots: Vec<(usize, usize)>,
}
399
/// Full key material stored next to a cached program so that a cache hit can
/// be confirmed by comparing the actual request, not only its hash.
#[derive(Clone)]
struct ExpandedEagerProgramCacheKeyData {
    /// Parsed subscripts of the cached request.
    subscripts: EinsumSubscripts,
    /// Shape of every input of the cached request, in input order.
    shapes: Vec<Vec<usize>>,
    /// Plan specification the cached program was built with.
    plan_spec: EinsumPlanSpec,
}
406
407impl ExpandedEagerProgramCacheKeyData {
408    fn new(
409        subscripts: &EinsumSubscripts,
410        shapes: &[Vec<usize>],
411        plan_spec: &EinsumPlanSpec,
412    ) -> Self {
413        Self {
414            subscripts: subscripts.clone(),
415            shapes: shapes.to_vec(),
416            plan_spec: plan_spec.clone(),
417        }
418    }
419
420    fn matches_expanded_eager_program(
421        &self,
422        subscripts: &EinsumSubscripts,
423        shapes: &[Vec<usize>],
424        plan_spec: &EinsumPlanSpec,
425    ) -> bool {
426        self.subscripts == *subscripts
427            && self.shapes.as_slice() == shapes
428            && plan_specs_equal(&self.plan_spec, plan_spec)
429    }
430
431    fn retained_bytes(&self) -> usize {
432        saturating_sum([
433            crate::cache::einsum_subscripts_retained_bytes(&self.subscripts),
434            saturating_sum(self.shapes.iter().map(vec_retained_bytes)),
435            plan_spec_retained_bytes(&self.plan_spec),
436        ])
437    }
438}
439
/// Cache entry: a shared expanded program plus the key data it was built for.
struct CachedExpandedEagerProgram {
    /// Request the program was built for; checked again on every lookup.
    key_data: ExpandedEagerProgramCacheKeyData,
    /// The shared, ready-to-run program.
    program: Arc<ExpandedEagerProgram>,
}
444
445fn cached_expanded_eager_program(
446    runtime: &Arc<EagerRuntime>,
447    subscripts: &EinsumSubscripts,
448    subs: &Subscripts,
449    plan_spec: &EinsumPlanSpec,
450    shape_refs: &[&[usize]],
451    shapes: &[Vec<usize>],
452) -> Result<Arc<ExpandedEagerProgram>> {
453    runtime.with_extension_execution_context(|extension_ctx| {
454        let caches = extension_ctx.caches_mut();
455        let plan_hash = plan_spec_hash(plan_spec);
456        let key = expanded_eager_program_cache_key(subscripts, shapes, plan_hash);
457        if let Some(cached) = caches.get::<CachedExpandedEagerProgram>(&key) {
458            let key_data = &cached.key_data;
459            if key_data.matches_expanded_eager_program(subscripts, shapes, plan_spec) {
460                return Ok(Arc::clone(&cached.program));
461            }
462        }
463
464        let tree = resolve_plan_spec(plan_spec, subs, shape_refs)?;
465        let program = Arc::new(build_expanded_eager_program(&tree, shapes)?);
466        let key_data = ExpandedEagerProgramCacheKeyData::new(subscripts, shapes, plan_spec);
467        let retained_bytes = saturating_sum([
468            key_data.retained_bytes(),
469            expanded_eager_program_retained_bytes(&program),
470        ]);
471        caches.put(
472            key,
473            CachedExpandedEagerProgram {
474                key_data,
475                program: Arc::clone(&program),
476            },
477            retained_bytes,
478        );
479        Ok(program)
480    })?
481}
482
483fn expanded_eager_program_cache_key(
484    subscripts: &EinsumSubscripts,
485    shapes: &[Vec<usize>],
486    plan_hash: u64,
487) -> ExtensionCacheKey {
488    let mut hasher = DefaultHasher::new();
489    subscripts.hash(&mut hasher);
490    shapes.hash(&mut hasher);
491    plan_hash.hash(&mut hasher);
492    ExtensionCacheKey::new(
493        EINSUM_EXTENSION_FAMILY_ID,
494        EINSUM_EAGER_EXPANDED_PROGRAMS_CACHE,
495        hasher.finish(),
496    )
497}
498
499fn plan_spec_hash(plan_spec: &EinsumPlanSpec) -> u64 {
500    let mut hasher = DefaultHasher::new();
501    hash_einsum_plan_spec(plan_spec, &mut hasher);
502    hasher.finish()
503}
504
505fn plan_spec_retained_bytes(plan_spec: &EinsumPlanSpec) -> usize {
506    match plan_spec {
507        EinsumPlanSpec::Auto(options) => saturating_sum([
508            std::mem::size_of::<EinsumPlanSpec>(),
509            vec_retained_bytes(&options.betas),
510        ]),
511        EinsumPlanSpec::LeftToRight => std::mem::size_of::<EinsumPlanSpec>(),
512        EinsumPlanSpec::Path(path) | EinsumPlanSpec::FixedPairs(path) => saturating_sum([
513            std::mem::size_of::<EinsumPlanSpec>(),
514            vec_retained_bytes(path),
515        ]),
516    }
517}
518
519fn build_expanded_eager_program(
520    tree: &crate::ContractionTree,
521    shapes: &[Vec<usize>],
522) -> Result<ExpandedEagerProgram> {
523    let mut builder = GraphBuilder::<StdTensorOp>::new();
524    let mut input_vals = Vec::with_capacity(shapes.len());
525    for input_idx in 0..shapes.len() {
526        let local = builder.add_input(TensorInputKey::User {
527            id: input_idx as u64,
528        });
529        input_vals.push(ValueRef::Local(local));
530    }
531
532    let result_ref = build_einsum_graph(&mut builder, tree, &input_vals, shapes)?;
533    let ValueRef::Local(result_local) = result_ref else {
534        return Err(Error::Runtime(tenferro_runtime::Error::Internal(
535            "expanded eager einsum returned an external value".into(),
536        )));
537    };
538    builder.set_outputs(vec![result_local]);
539    let graph = Arc::new(builder.build());
540    let output_key = graph.values()[result_local].key.clone();
541    let view = resolve(vec![graph]);
542    let graph = materialize_merge(&view, &[output_key]);
543    let compiled = compile(&graph);
544    let input_slots = compiled
545        .input_slots
546        .iter()
547        .zip(graph.inputs.iter())
548        .map(|(&slot, key)| {
549            let ValueKey::Input(TensorInputKey::User { id }) = key else {
550                return Err(runtime_internal(format!(
551                    "expanded eager einsum saw unexpected input key: {key:?}"
552                )));
553            };
554            Ok((slot, *id as usize))
555        })
556        .collect::<Result<_>>()?;
557
558    Ok(ExpandedEagerProgram {
559        compiled,
560        input_slots,
561    })
562}
563
564fn execute_eager_einsum_program(
565    inputs: &[&EagerTensor],
566    program: &ExpandedEagerProgram,
567) -> Result<Option<EagerTensor>> {
568    let mut slots: Vec<Option<EagerTensor>> = vec![None; program.compiled.n_slots];
569    for &(slot, input_idx) in &program.input_slots {
570        let tensor = inputs.get(input_idx).ok_or_else(|| {
571            runtime_missing(format!(
572                "expanded eager einsum input {input_idx} is missing"
573            ))
574        })?;
575        slots[slot] = Some((*tensor).clone());
576    }
577
578    let mut instruction_idx = 0;
579    while instruction_idx < program.compiled.instructions.len() {
580        if let Some((output_slot, output)) = try_execute_eager_broadcast_multiply_pattern(
581            &program.compiled.instructions,
582            instruction_idx,
583            &slots,
584            &program.compiled.output_slots,
585        )? {
586            slots[output_slot] = Some(output);
587            instruction_idx += 3;
588            continue;
589        }
590
591        let instr = &program.compiled.instructions[instruction_idx];
592        if instr.outputs.len() != 1 {
593            return Err(runtime_internal(format!(
594                "expanded eager einsum expected single-output op, got {} outputs",
595                instr.outputs.len()
596            )));
597        }
598        let input_refs: Vec<&EagerTensor> = instr
599            .inputs
600            .iter()
601            .map(|&slot| slot_tensor(&slots, slot))
602            .collect::<Result<_>>()?;
603        let output =
604            tenferro_ad::extension::apply_standard_op(instr.operation.clone(), &input_refs)?;
605        slots[instr.outputs[0]] = Some(output);
606        instruction_idx += 1;
607    }
608
609    let [output_slot] = program.compiled.output_slots.as_slice() else {
610        return Err(runtime_internal(format!(
611            "expanded eager einsum expected one graph output, got {}",
612            program.compiled.output_slots.len()
613        )));
614    };
615    slots
616        .get_mut(*output_slot)
617        .and_then(Option::take)
618        .map(Some)
619        .ok_or_else(|| runtime_missing("expanded eager einsum output slot is missing"))
620}
621
622fn expanded_eager_program_retained_bytes(program: &ExpandedEagerProgram) -> usize {
623    saturating_sum([
624        size_of::<ExpandedEagerProgram>(),
625        vec_retained_bytes(&program.input_slots),
626        compiled_program_retained_bytes(&program.compiled),
627    ])
628}
629
630fn compiled_program_retained_bytes(program: &CompiledProgram<StdTensorOp>) -> usize {
631    saturating_sum([
632        size_of::<CompiledProgram<StdTensorOp>>(),
633        vec_retained_bytes(&program.instructions),
634        vec_retained_bytes(&program.input_slots),
635        vec_retained_bytes(&program.output_slots),
636        saturating_sum(program.instructions.iter().map(instruction_retained_bytes)),
637    ])
638}
639
640fn instruction_retained_bytes(instruction: &Instruction<StdTensorOp>) -> usize {
641    saturating_sum([
642        size_of::<Instruction<StdTensorOp>>(),
643        std_tensor_op_retained_bytes(&instruction.operation),
644        vec_retained_bytes(&instruction.inputs),
645        vec_retained_bytes(&instruction.outputs),
646    ])
647}
648
649fn std_tensor_op_retained_bytes(op: &StdTensorOp) -> usize {
650    match op {
651        StdTensorOp::DotGeneral { config } => saturating_sum([
652            vec_retained_bytes(&config.lhs_contracting_dims),
653            vec_retained_bytes(&config.rhs_contracting_dims),
654            vec_retained_bytes(&config.lhs_batch_dims),
655            vec_retained_bytes(&config.rhs_batch_dims),
656        ]),
657        StdTensorOp::Transpose { perm } => vec_retained_bytes(perm),
658        StdTensorOp::Reshape { to_shape } => vec_retained_bytes(to_shape),
659        StdTensorOp::BroadcastInDim { shape, dims } => {
660            saturating_sum([vec_retained_bytes(shape), vec_retained_bytes(dims)])
661        }
662        StdTensorOp::Constant { bytes, .. } => vec_retained_bytes(bytes),
663        StdTensorOp::ReduceSum { axes }
664        | StdTensorOp::ReduceProd { axes }
665        | StdTensorOp::ReduceMax { axes }
666        | StdTensorOp::ReduceMin { axes }
667        | StdTensorOp::Reverse { axes } => vec_retained_bytes(axes),
668        StdTensorOp::DynamicSlice { slice_sizes } => vec_retained_bytes(slice_sizes),
669        StdTensorOp::GatherDynamicSliceSizes {
670            offset_dims,
671            collapsed_slice_dims,
672            start_index_map,
673            slice_sizes,
674            ..
675        } => saturating_sum([
676            vec_retained_bytes(offset_dims),
677            vec_retained_bytes(collapsed_slice_dims),
678            vec_retained_bytes(start_index_map),
679            vec_retained_bytes(slice_sizes),
680        ]),
681        _ => 0,
682    }
683}
684
685fn try_execute_eager_broadcast_multiply_pattern(
686    instructions: &[Instruction<StdTensorOp>],
687    instruction_idx: usize,
688    slots: &[Option<EagerTensor>],
689    output_slots: &[usize],
690) -> Result<Option<(usize, EagerTensor)>> {
691    if instruction_idx + 2 >= instructions.len() {
692        return Ok(None);
693    }
694    let lhs_bc = &instructions[instruction_idx];
695    let rhs_bc = &instructions[instruction_idx + 1];
696    let multiply = &instructions[instruction_idx + 2];
697
698    let StdTensorOp::BroadcastInDim {
699        shape: lhs_shape_exprs,
700        dims: lhs_dims,
701    } = &lhs_bc.operation
702    else {
703        return Ok(None);
704    };
705    let StdTensorOp::BroadcastInDim {
706        shape: rhs_shape_exprs,
707        dims: rhs_dims,
708    } = &rhs_bc.operation
709    else {
710        return Ok(None);
711    };
712    if !matches!(multiply.operation, StdTensorOp::Mul)
713        || lhs_bc.outputs.len() != 1
714        || rhs_bc.outputs.len() != 1
715        || multiply.outputs.len() != 1
716        || multiply.inputs.len() != 2
717        || lhs_bc.inputs.is_empty()
718        || rhs_bc.inputs.is_empty()
719        || multiply.inputs[0] != lhs_bc.outputs[0]
720        || multiply.inputs[1] != rhs_bc.outputs[0]
721    {
722        return Ok(None);
723    }
724
725    let lhs_bc_slot = lhs_bc.outputs[0];
726    let rhs_bc_slot = rhs_bc.outputs[0];
727    if output_slots.contains(&lhs_bc_slot)
728        || output_slots.contains(&rhs_bc_slot)
729        || instructions[instruction_idx + 3..]
730            .iter()
731            .any(|instr| instr.inputs.contains(&lhs_bc_slot) || instr.inputs.contains(&rhs_bc_slot))
732    {
733        return Ok(None);
734    }
735
736    let lhs = slot_tensor(slots, lhs_bc.inputs[0])?;
737    let rhs = slot_tensor(slots, rhs_bc.inputs[0])?;
738    let lhs_shape = eval_shape_exprs(slots, &lhs_bc.inputs, lhs_shape_exprs)?;
739    let rhs_shape = eval_shape_exprs(slots, &rhs_bc.inputs, rhs_shape_exprs)?;
740    let Some(output) =
741        backend_broadcast_multiply_untracked(lhs, &lhs_shape, lhs_dims, rhs, &rhs_shape, rhs_dims)?
742    else {
743        return Ok(None);
744    };
745
746    Ok(Some((multiply.outputs[0], output)))
747}
748
749#[allow(clippy::too_many_arguments)]
750fn backend_broadcast_multiply_untracked(
751    lhs: &EagerTensor,
752    lhs_shape: &[usize],
753    lhs_dims: &[usize],
754    rhs: &EagerTensor,
755    rhs_shape: &[usize],
756    rhs_dims: &[usize],
757) -> Result<Option<EagerTensor>> {
758    if !Arc::ptr_eq(lhs.runtime(), rhs.runtime()) {
759        return Err(tenferro_runtime::Error::ContextMismatch {
760            lhs: lhs.ctx_id(),
761            rhs: rhs.ctx_id(),
762        }
763        .into());
764    }
765    if lhs.tracks_grad() || rhs.tracks_grad() {
766        return Ok(None);
767    }
768
769    let runtime = lhs.runtime();
770    let value = runtime.with_execution_session(|backend| {
771        backend.execute_broadcast_multiply_value(
772            lhs.tensor_read(),
773            lhs_shape,
774            lhs_dims,
775            rhs.tensor_read(),
776            rhs_shape,
777            rhs_dims,
778        )
779    })??;
780
781    Ok(value.map(|value| adopt_untracked_eager_value(runtime.clone(), value)))
782}
783
784fn eval_shape_exprs(
785    slots: &[Option<EagerTensor>],
786    input_slots: &[usize],
787    shape: &[DimExpr],
788) -> Result<Vec<usize>> {
789    let inputs = input_slots
790        .iter()
791        .map(|&slot| slot_tensor(slots, slot))
792        .collect::<Result<Vec<_>>>()?;
793    let input_shapes = inputs
794        .iter()
795        .map(|tensor| tensor.shape())
796        .collect::<Vec<_>>();
797    DimExpr::eval_all(shape, &input_shapes).map_err(|error| {
798        runtime_extension_error(
799            "einsum",
800            ErrorKind::Validation(ValidationKind::InvalidArgument),
801            error,
802        )
803    })
804}
805
806fn slot_tensor(slots: &[Option<EagerTensor>], slot: usize) -> Result<&EagerTensor> {
807    slots.get(slot).and_then(Option::as_ref).ok_or_else(|| {
808        Error::Runtime(tenferro_runtime::Error::MissingInput(format!(
809            "expanded eager einsum missing value for slot {slot}"
810        )))
811    })
812}
813
814fn infer_eager_output_shape(
815    subscripts: &EinsumSubscripts,
816    inputs: &[&EagerTensor],
817) -> Result<Vec<tenferro_runtime::SymDim>> {
818    if inputs.is_empty() {
819        return Err(Error::invalid_argument(
820            "einsum",
821            "inputs",
822            "einsum requires at least one input tensor",
823        ));
824    }
825    if subscripts.inputs.len() != inputs.len() {
826        return Err(Error::invalid_argument(
827            "einsum",
828            "inputs",
829            format!(
830                "einsum subscripts expect {} inputs, got {}",
831                subscripts.inputs.len(),
832                inputs.len()
833            ),
834        ));
835    }
836
837    let mut label_dims = std::collections::HashMap::new();
838    for (labels, tensor) in subscripts.inputs.iter().zip(inputs.iter()) {
839        let shape = tensor.shape();
840        if labels.len() != shape.len() {
841            return Err(Error::validation(
842                "einsum",
843                ValidationError::RankMismatch {
844                    expected: labels.len(),
845                    actual: shape.len(),
846                },
847            ));
848        }
849        for (&label, &dim) in labels.iter().zip(shape.iter()) {
850            if let Some(existing) = label_dims.insert(label, dim) {
851                if existing != dim {
852                    return Err(Error::validation(
853                        "einsum",
854                        ShapeMismatch::ExpectedActual {
855                            expected: tenferro_tensor::ShapeVec::from_vec(vec![existing]),
856                            actual: tenferro_tensor::ShapeVec::from_vec(vec![dim]),
857                        }
858                        .into(),
859                    ));
860                }
861            }
862        }
863    }
864
865    subscripts
866        .output
867        .iter()
868        .map(|label| {
869            label_dims
870                .get(label)
871                .copied()
872                .map(tenferro_runtime::SymDim::from)
873                .ok_or_else(|| {
874                    Error::invalid_argument(
875                        "einsum",
876                        "output",
877                        format!("einsum output label {label} is missing from input labels"),
878                    )
879                })
880        })
881        .collect()
882}
883
884fn runtime_extension_error<E>(op: &'static str, kind: ErrorKind, source: E) -> Error
885where
886    E: StdError + Send + Sync + 'static,
887{
888    Error::Runtime(tenferro_runtime::Error::extension(
889        op,
890        ErrorPhase::Execution,
891        EINSUM_EXTENSION_FAMILY_ID,
892        kind,
893        source,
894    ))
895}
896
897fn runtime_internal(message: impl Into<String>) -> Error {
898    Error::Runtime(tenferro_runtime::Error::Internal(message.into()))
899}
900
901fn runtime_missing(message: impl Into<String>) -> Error {
902    Error::Runtime(tenferro_runtime::Error::MissingInput(message.into()))
903}
904
905/// Execute a NumPy-style tensor contraction on [`EagerTensor`] values.
906///
907/// This helper lives in the einsum extension trait surface because it is
908/// contraction sugar over `dot_general`, not a linear algebra facade.
909///
910/// # Examples
911///
912/// ```
913/// use tenferro_tensor::Tensor;
914/// use tenferro_cpu::CpuBackend;
915/// use tenferro_ad::{EagerRuntime, EagerTensor};
916/// use tenferro_einsum::{EagerTensorEinsumExt, TensorDotAxes};
917///
918/// let ctx = EagerRuntime::with_cpu_backend(CpuBackend::new())?;
919/// let lhs = EagerTensor::from_tensor_in(
920///     Tensor::from_vec_col_major(vec![2, 3], vec![1.0_f64; 6]).unwrap(),
921///     ctx.clone(),
922/// ).unwrap();
923/// let rhs = EagerTensor::from_tensor_in(
924///     Tensor::from_vec_col_major(vec![3, 4], vec![1.0_f64; 12]).unwrap(),
925///     ctx,
926/// ).unwrap();
927/// let out = lhs.tensordot(&rhs, TensorDotAxes::Count(1)).unwrap();
928///
929/// assert_eq!(out.shape(), &[2, 4]);
930/// # Ok::<(), tenferro_einsum::Error>(())
931/// ```
932///
933/// # Errors
934///
935/// Returns [`Error::Validation`] with `AxisOutOfBounds`, `DuplicateAxis`,
936/// `RankMismatch`, or `ShapeMismatch` for invalid contraction axes and shapes,
937/// or [`Error::Runtime`] for eager backend execution failures.
938pub fn tensordot(
939    lhs: &EagerTensor,
940    rhs: &EagerTensor,
941    axes: TensorDotAxes<'_>,
942) -> Result<EagerTensor> {
943    let config = crate::tensordot::dot_general_config(axes, lhs.shape().len(), rhs.shape().len())?;
944    crate::tensordot::validate_concrete_contract_dims(lhs.shape(), rhs.shape(), &config)?;
945    lhs.dot_general(rhs, config).map_err(Error::Runtime)
946}
947
948#[cfg(test)]
949mod tests;
tenferro_einsum/eager_ad.rs

tenferro_einsum/
eager_ad.rs