tenferro_tensor/
backend.rs

1use crate::config::{
2    CompareDir, DotGeneralConfig, GatherConfig, PadConfig, ScatterConfig, SliceConfig,
3};
4use crate::types::{
5    Buffer, TensorRank, TensorScalar, TensorView, TensorViewMut, TypedTensor, TypedTensorView,
6    TypedTensorViewMut,
7};
8use crate::validate::validate_convert_dtype;
9use crate::{
10    AllocationDomainId, AllocationId, DType, Error, RuntimeCacheControl, ShapeMismatch, Tensor,
11    TensorRead, TensorValue, TensorWrite, ValidationError,
12};
13use num_complex::{Complex32, Complex64};
14use smallvec::SmallVec;
15use std::ptr::NonNull;
16use strided_kernel::{
17    erased_map_into, erased_zip_into, ErasedMapOp, ErasedRawStridedMut, ErasedRawStridedPtr,
18    ErasedZipOp, ExecContext, KernelDType,
19};
20
21#[cfg(test)]
22mod tests;
23
24fn read_boundary_error(op: &'static str) -> crate::Error {
25    crate::Error::unsupported(
26        op,
27        "backend does not accept borrowed tensor views at this execution boundary",
28    )
29}
30
/// Wrap a [`ValidationError`] raised while checking `op` into the crate-level
/// error by delegating to `Error::validation`.
fn validation(op: &'static str, source: ValidationError) -> crate::Error {
    Error::validation(op, source)
}
34
/// Build an invalid-argument error for `op`, naming the offending `argument`
/// and carrying a human-readable `message`, by delegating to
/// `Error::invalid_argument`.
fn invalid_argument(op: &'static str, argument: &'static str, message: impl Into<String>) -> Error {
    Error::invalid_argument(op, argument, message)
}
38
39fn read_tensor<'a>(op: &'static str, input: TensorRead<'a>) -> crate::Result<&'a Tensor> {
40    input.as_tensor().ok_or_else(|| read_boundary_error(op))
41}
42
43fn validate_axis_list(
44    op: &'static str,
45    role: &'static str,
46    axes: &[usize],
47    rank: usize,
48) -> crate::Result<()> {
49    let mut seen = vec![false; rank];
50    for &axis in axes {
51        if axis >= rank {
52            return Err(validation(
53                op,
54                ValidationError::AxisOutOfBounds { axis, rank },
55            ));
56        }
57        if seen[axis] {
58            return Err(validation(
59                op,
60                ValidationError::DuplicateAxis { axis, role },
61            ));
62        }
63        seen[axis] = true;
64    }
65    Ok(())
66}
67
68fn validate_role_disjoint(
69    op: &'static str,
70    first_role: &'static str,
71    first_axes: &[usize],
72    second_role: &'static str,
73    second_axes: &[usize],
74) -> crate::Result<()> {
75    for &axis in first_axes {
76        if second_axes.contains(&axis) {
77            return Err(validation(
78                op,
79                ValidationError::AxisRoleConflict {
80                    axis,
81                    first_role,
82                    second_role,
83                },
84            ));
85        }
86    }
87    Ok(())
88}
89
90/// Infer the output shape for a validated dot-general operation.
91#[doc(hidden)]
92pub fn dot_general_output_shape(
93    lhs_shape: &[usize],
94    rhs_shape: &[usize],
95    config: &DotGeneralConfig,
96    op: &'static str,
97) -> crate::Result<Vec<usize>> {
98    if config.lhs_contracting_dims.len() != config.rhs_contracting_dims.len() {
99        return Err(invalid_argument(
100            op,
101            "contracting_dims",
102            "lhs/rhs contracting dim counts differ",
103        ));
104    }
105    if config.lhs_batch_dims.len() != config.rhs_batch_dims.len() {
106        return Err(invalid_argument(
107            op,
108            "batch_dims",
109            "lhs/rhs batch dim counts differ",
110        ));
111    }
112
113    let lhs_rank = lhs_shape.len();
114    let rhs_rank = rhs_shape.len();
115    validate_axis_list(
116        op,
117        "lhs_contracting",
118        &config.lhs_contracting_dims,
119        lhs_rank,
120    )?;
121    validate_axis_list(
122        op,
123        "rhs_contracting",
124        &config.rhs_contracting_dims,
125        rhs_rank,
126    )?;
127    validate_axis_list(op, "lhs_batch", &config.lhs_batch_dims, lhs_rank)?;
128    validate_axis_list(op, "rhs_batch", &config.rhs_batch_dims, rhs_rank)?;
129    validate_role_disjoint(
130        op,
131        "lhs_contracting",
132        &config.lhs_contracting_dims,
133        "lhs_batch",
134        &config.lhs_batch_dims,
135    )?;
136    validate_role_disjoint(
137        op,
138        "rhs_contracting",
139        &config.rhs_contracting_dims,
140        "rhs_batch",
141        &config.rhs_batch_dims,
142    )?;
143
144    for (&lhs_axis, &rhs_axis) in config
145        .lhs_contracting_dims
146        .iter()
147        .zip(&config.rhs_contracting_dims)
148    {
149        if lhs_shape[lhs_axis] != rhs_shape[rhs_axis] {
150            return Err(validation(
151                op,
152                ShapeMismatch::ContractedDimensions {
153                    lhs_axis,
154                    lhs_size: lhs_shape[lhs_axis],
155                    rhs_axis,
156                    rhs_size: rhs_shape[rhs_axis],
157                }
158                .into(),
159            ));
160        }
161    }
162    for (&lhs_axis, &rhs_axis) in config.lhs_batch_dims.iter().zip(&config.rhs_batch_dims) {
163        if lhs_shape[lhs_axis] != rhs_shape[rhs_axis] {
164            return Err(validation(
165                op,
166                ShapeMismatch::ContractedDimensions {
167                    lhs_axis,
168                    lhs_size: lhs_shape[lhs_axis],
169                    rhs_axis,
170                    rhs_size: rhs_shape[rhs_axis],
171                }
172                .into(),
173            ));
174        }
175    }
176
177    let lhs_free = (0..lhs_rank)
178        .filter(|axis| {
179            !config.lhs_contracting_dims.contains(axis) && !config.lhs_batch_dims.contains(axis)
180        })
181        .map(|axis| lhs_shape[axis]);
182    let rhs_free = (0..rhs_rank)
183        .filter(|axis| {
184            !config.rhs_contracting_dims.contains(axis) && !config.rhs_batch_dims.contains(axis)
185        })
186        .map(|axis| rhs_shape[axis]);
187    let batch = config.lhs_batch_dims.iter().map(|&axis| lhs_shape[axis]);
188
189    Ok(lhs_free.chain(rhs_free).chain(batch).collect())
190}
191
192/// Validate output dtype and shape for dot-general read-into dispatch.
193#[doc(hidden)]
194pub fn validate_dot_general_read_into(
195    lhs: &TensorRead<'_>,
196    rhs: &TensorRead<'_>,
197    config: &DotGeneralConfig,
198    out: &TensorWrite<'_>,
199    op: &'static str,
200) -> crate::Result<Vec<usize>> {
201    if lhs.dtype() != rhs.dtype() {
202        return Err(validation(
203            op,
204            ValidationError::DTypeMismatch {
205                expected: crate::core_dtype(lhs.dtype()),
206                actual: crate::core_dtype(rhs.dtype()),
207            },
208        ));
209    }
210    if lhs.dtype() != out.dtype() {
211        return Err(validation(
212            op,
213            ValidationError::DTypeMismatch {
214                expected: crate::core_dtype(lhs.dtype()),
215                actual: crate::core_dtype(out.dtype()),
216            },
217        ));
218    }
219    let expected = dot_general_output_shape(lhs.shape(), rhs.shape(), config, op)?;
220    if out.shape() != expected.as_slice() {
221        return Err(validation(
222            op,
223            ShapeMismatch::ExpectedActual {
224                expected: expected.clone().into(),
225                actual: out.shape().to_vec().into(),
226            }
227            .into(),
228        ));
229    }
230    Ok(expected)
231}
232
/// Scalar coefficient accepted by contraction accumulation backends.
///
/// `ContractionScalar` is intentionally narrower than [`crate::TensorScalar`]:
/// dot-general accumulation is only defined for floating and complex tensor
/// dtypes.
///
/// # Examples
///
/// ```rust
/// use tenferro_tensor::{ContractionScalar, DType};
///
/// let alpha = ContractionScalar::F64(2.0);
/// assert_eq!(alpha.dtype(), DType::F64);
/// ```
#[derive(Clone, Copy, Debug, PartialEq)]
pub enum ContractionScalar {
    /// `f32` coefficient; reports `DType::F32`.
    F32(f32),
    /// `f64` coefficient; reports `DType::F64`.
    F64(f64),
    /// `Complex32` coefficient; reports `DType::C32`.
    C32(Complex32),
    /// `Complex64` coefficient; reports `DType::C64`.
    C64(Complex64),
}
254
255impl ContractionScalar {
256    /// Return this scalar's tensor dtype.
257    ///
258    /// # Examples
259    ///
260    /// ```rust
261    /// use tenferro_tensor::{ContractionScalar, DType};
262    ///
263    /// assert_eq!(ContractionScalar::F32(1.0).dtype(), DType::F32);
264    /// ```
265    pub fn dtype(self) -> DType {
266        match self {
267            Self::F32(_) => DType::F32,
268            Self::F64(_) => DType::F64,
269            Self::C32(_) => DType::C32,
270            Self::C64(_) => DType::C64,
271        }
272    }
273
274    /// Return the multiplicative identity for a supported contraction dtype.
275    ///
276    /// # Examples
277    ///
278    /// ```rust
279    /// use tenferro_tensor::{ContractionScalar, DType};
280    ///
281    /// assert_eq!(ContractionScalar::one(DType::F64).unwrap(), ContractionScalar::F64(1.0));
282    /// assert!(ContractionScalar::one(DType::I32).is_err());
283    /// ```
284    /// # Errors
285    ///
286    /// Returns [`crate::Error::Validation`] with a typed `ValidationError` source
287    /// for invalid shapes, ranks, axes, dtypes, or output metadata. It returns
288    /// [`crate::Error::BackendFailure`] or [`crate::Error::BackendSource`] when
289    /// backend execution or storage access cannot provide the requested result.
290    pub fn one(dtype: DType) -> crate::Result<Self> {
291        match dtype {
292            DType::F32 => Ok(Self::F32(1.0)),
293            DType::F64 => Ok(Self::F64(1.0)),
294            DType::C32 => Ok(Self::C32(Complex32::new(1.0, 0.0))),
295            DType::C64 => Ok(Self::C64(Complex64::new(1.0, 0.0))),
296            DType::I32 | DType::I64 | DType::Bool => Err(validation(
297                "dot_general",
298                ValidationError::DTypeMismatch {
299                    expected: crate::core_dtype(dtype),
300                    actual: crate::core_dtype(DType::F32),
301                },
302            )),
303        }
304    }
305
306    /// Return the additive identity for a supported contraction dtype.
307    ///
308    /// # Examples
309    ///
310    /// ```rust
311    /// use tenferro_tensor::{ContractionScalar, DType};
312    ///
313    /// assert_eq!(ContractionScalar::zero(DType::F64).unwrap(), ContractionScalar::F64(0.0));
314    /// ```
315    /// # Errors
316    ///
317    /// Returns [`crate::Error::Validation`] with a typed `ValidationError` source
318    /// for invalid shapes, ranks, axes, dtypes, or output metadata. It returns
319    /// [`crate::Error::BackendFailure`] or [`crate::Error::BackendSource`] when
320    /// backend execution or storage access cannot provide the requested result.
321    pub fn zero(dtype: DType) -> crate::Result<Self> {
322        match dtype {
323            DType::F32 => Ok(Self::F32(0.0)),
324            DType::F64 => Ok(Self::F64(0.0)),
325            DType::C32 => Ok(Self::C32(Complex32::new(0.0, 0.0))),
326            DType::C64 => Ok(Self::C64(Complex64::new(0.0, 0.0))),
327            DType::I32 | DType::I64 | DType::Bool => Err(validation(
328                "dot_general",
329                ValidationError::DTypeMismatch {
330                    expected: crate::core_dtype(dtype),
331                    actual: crate::core_dtype(DType::F32),
332                },
333            )),
334        }
335    }
336}
337
/// Output-update semantics for dot-general accumulation.
///
/// This keeps contraction axes in [`DotGeneralConfig`] and output update
/// semantics here, so cached and non-cached backend traits can share the same
/// accumulation contract.
///
/// # Examples
///
/// ```rust
/// use tenferro_tensor::{ContractionScalar, DotGeneralAccumulation, DType};
///
/// let accum = DotGeneralAccumulation::overwrite(DType::F64).unwrap();
/// assert_eq!(accum.alpha, ContractionScalar::F64(1.0));
/// assert_eq!(accum.beta, ContractionScalar::F64(0.0));
/// ```
#[derive(Clone, Copy, Debug, PartialEq)]
pub struct DotGeneralAccumulation {
    /// Conjugation flag for the left operand; the constructors in this file
    /// all initialise it to `false`.
    pub lhs_conj: bool,
    /// Conjugation flag for the right operand; the constructors in this file
    /// all initialise it to `false`.
    pub rhs_conj: bool,
    /// Coefficient `alpha` in `out = alpha * lhs dot rhs + beta * out`.
    pub alpha: ContractionScalar,
    /// Coefficient `beta` in `out = alpha * lhs dot rhs + beta * out`.
    pub beta: ContractionScalar,
}
360
/// One matrix multiply in a grouped GEMM over shared flat buffers.
///
/// Offsets are element offsets into the corresponding shared lhs, rhs, and
/// output buffers. Each job computes a column-major `rows x cols` output block
/// from a column-major `rows x contracted` lhs block and a column-major
/// `contracted x cols` rhs block.
///
/// Provider implementations receive these descriptors through the public
/// grouped-GEMM request accessor. The engine validates ranges and pairwise
/// output disjointness before provider entry.
///
/// The constructor and every accessor are `const fn`, so job tables can be
/// built in `const`/`static` contexts as well as at runtime.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
pub struct GroupedGemmJob {
    /// Element offset of the output block in the shared output buffer.
    out_offset: usize,
    /// Element offset of the lhs block in the shared lhs buffer.
    lhs_offset: usize,
    /// Element offset of the rhs block in the shared rhs buffer.
    rhs_offset: usize,
    /// Row count of the lhs and output blocks.
    rows: usize,
    /// Contracted extent: lhs column count and rhs row count.
    contracted: usize,
    /// Column count of the rhs and output blocks.
    cols: usize,
}

impl GroupedGemmJob {
    /// Construct a column-major grouped-GEMM job over shared flat buffers.
    ///
    /// No validation happens here; the values are stored as given.
    // NOTE(review): six parameters is below clippy's default
    // `too_many_arguments` threshold; the allow is kept in case a
    // project-level clippy configuration lowers it.
    #[allow(clippy::too_many_arguments)]
    pub const fn new(
        out_offset: usize,
        lhs_offset: usize,
        rhs_offset: usize,
        rows: usize,
        contracted: usize,
        cols: usize,
    ) -> Self {
        Self {
            out_offset,
            lhs_offset,
            rhs_offset,
            rows,
            contracted,
            cols,
        }
    }

    /// Return the output element offset.
    pub const fn out_offset(&self) -> usize {
        self.out_offset
    }

    /// Return the left-input element offset.
    pub const fn lhs_offset(&self) -> usize {
        self.lhs_offset
    }

    /// Return the right-input element offset.
    pub const fn rhs_offset(&self) -> usize {
        self.rhs_offset
    }

    /// Return the output row count.
    pub const fn rows(&self) -> usize {
        self.rows
    }

    /// Return the contracted dimension.
    pub const fn contracted(&self) -> usize {
        self.contracted
    }

    /// Return the output column count.
    pub const fn cols(&self) -> usize {
        self.cols
    }
}
432
/// Shared scalar/update metadata for grouped GEMM execution.
///
/// Pairs a borrowed list of job descriptors with the single accumulation
/// setting stored alongside them.
#[doc(hidden)]
#[derive(Clone, Copy, Debug, PartialEq)]
pub struct GroupedGemmConfig<'a> {
    // Borrowed job descriptors; the config does not own them.
    jobs: &'a [GroupedGemmJob],
    // Scaling/conjugation settings stored once for the whole job list.
    accumulation: DotGeneralAccumulation,
}
440
impl<'a> GroupedGemmConfig<'a> {
    /// Bundle borrowed job descriptors with their shared accumulation
    /// settings. Nothing is validated here.
    pub fn new(jobs: &'a [GroupedGemmJob], accumulation: DotGeneralAccumulation) -> Self {
        Self { jobs, accumulation }
    }

    /// Return the job descriptors with the original `'a` lifetime, not the
    /// lifetime of the `&self` borrow.
    pub fn jobs(&self) -> &'a [GroupedGemmJob] {
        self.jobs
    }

    /// Return a copy of the accumulation settings.
    pub fn accumulation(&self) -> DotGeneralAccumulation {
        self.accumulation
    }
}
454
455impl DotGeneralAccumulation {
456    /// Return overwrite semantics, `out = lhs dot rhs`, for `dtype`.
457    /// # Errors
458    ///
459    /// Returns [`crate::Error::Validation`] with a typed `ValidationError` source
460    /// for invalid shapes, ranks, axes, dtypes, or output metadata. It returns
461    /// [`crate::Error::BackendFailure`] or [`crate::Error::BackendSource`] when
462    /// backend execution or storage access cannot provide the requested result.
463    pub fn overwrite(dtype: DType) -> crate::Result<Self> {
464        Ok(Self {
465            lhs_conj: false,
466            rhs_conj: false,
467            alpha: ContractionScalar::one(dtype)?,
468            beta: ContractionScalar::zero(dtype)?,
469        })
470    }
471
472    /// Return additive update semantics, `out += lhs dot rhs`, for `dtype`.
473    ///
474    /// # Examples
475    ///
476    /// ```rust
477    /// use tenferro_tensor::{ContractionScalar, DType, DotGeneralAccumulation};
478    ///
479    /// let accum = DotGeneralAccumulation::add_to(DType::F64)?;
480    /// assert_eq!(accum.alpha, ContractionScalar::F64(1.0));
481    /// assert_eq!(accum.beta, ContractionScalar::F64(1.0));
482    /// # Ok::<(), tenferro_tensor::Error>(())
483    /// ```
484    /// # Errors
485    ///
486    /// Returns [`crate::Error::Validation`] with a typed `ValidationError` source
487    /// for invalid shapes, ranks, axes, dtypes, or output metadata. It returns
488    /// [`crate::Error::BackendFailure`] or [`crate::Error::BackendSource`] when
489    /// backend execution or storage access cannot provide the requested result.
490    pub fn add_to(dtype: DType) -> crate::Result<Self> {
491        Ok(Self {
492            lhs_conj: false,
493            rhs_conj: false,
494            alpha: ContractionScalar::one(dtype)?,
495            beta: ContractionScalar::one(dtype)?,
496        })
497    }
498
499    /// Return scaled update semantics, `out = alpha * lhs dot rhs + beta * out`.
500    ///
501    /// # Examples
502    ///
503    /// ```rust
504    /// use tenferro_tensor::{ContractionScalar, DotGeneralAccumulation};
505    ///
506    /// let accum = DotGeneralAccumulation::scaled(
507    ///     ContractionScalar::F32(0.5),
508    ///     ContractionScalar::F32(2.0),
509    /// )?;
510    /// assert_eq!(accum.alpha, ContractionScalar::F32(0.5));
511    /// # Ok::<(), tenferro_tensor::Error>(())
512    /// ```
513    /// # Errors
514    ///
515    /// Returns [`crate::Error::Validation`] with a typed `ValidationError` source
516    /// for invalid shapes, ranks, axes, dtypes, or output metadata. It returns
517    /// [`crate::Error::BackendFailure`] or [`crate::Error::BackendSource`] when
518    /// backend execution or storage access cannot provide the requested result.
519    pub fn scaled(alpha: ContractionScalar, beta: ContractionScalar) -> crate::Result<Self> {
520        if alpha.dtype() != beta.dtype() {
521            return Err(validation(
522                "dot_general",
523                ValidationError::DTypeMismatch {
524                    expected: crate::core_dtype(alpha.dtype()),
525                    actual: crate::core_dtype(beta.dtype()),
526                },
527            ));
528        }
529        Ok(Self {
530            lhs_conj: false,
531            rhs_conj: false,
532            alpha,
533            beta,
534        })
535    }
536
537    fn validate_for_dtype(self, dtype: DType) -> crate::Result<()> {
538        for scalar in [self.alpha, self.beta] {
539            if scalar.dtype() != dtype {
540                return Err(validation(
541                    "dot_general",
542                    ValidationError::DTypeMismatch {
543                        expected: crate::core_dtype(scalar.dtype()),
544                        actual: crate::core_dtype(dtype),
545                    },
546                ));
547            }
548        }
549        Ok(())
550    }
551}
552
553#[doc(hidden)]
554pub fn validate_dot_general_accumulation(
555    lhs: &TensorRead<'_>,
556    rhs: &TensorRead<'_>,
557    config: &DotGeneralConfig,
558    accumulation: DotGeneralAccumulation,
559    out: &TensorWrite<'_>,
560    op: &'static str,
561) -> crate::Result<Vec<usize>> {
562    let shape = validate_dot_general_read_into(lhs, rhs, config, out, op)?;
563    accumulation.validate_for_dtype(lhs.dtype())?;
564    Ok(shape)
565}
566
567#[doc(hidden)]
568pub fn dot_general_accum_via_temp<B: TensorDot + ?Sized>(
569    backend: &mut B,
570    lhs: TensorRead<'_>,
571    rhs: TensorRead<'_>,
572    config: &DotGeneralConfig,
573    accumulation: DotGeneralAccumulation,
574    mut out: TensorWrite<'_>,
575) -> crate::Result<()> {
576    validate_dot_general_accumulation(&lhs, &rhs, config, accumulation, &out, "dot_general")?;
577    let dot = backend.dot_general_with_conj_read(
578        lhs,
579        rhs,
580        config,
581        accumulation.lhs_conj,
582        accumulation.rhs_conj,
583    )?;
584    accumulate_dot_result_into(&dot, accumulation, &mut out)
585}
586
587fn grouped_checked_product(
588    op: &'static str,
589    role: &'static str,
590    dims: &[usize],
591) -> crate::Result<usize> {
592    dims.iter().try_fold(1usize, |acc, &dim| {
593        acc.checked_mul(dim).ok_or_else(|| {
594            invalid_argument(
595                op,
596                role,
597                format!("logical element count overflows usize for shape {dims:?}"),
598            )
599        })
600    })
601}
602
603fn checked_gemm_span(
604    op: &'static str,
605    role: &'static str,
606    offset: usize,
607    rows: usize,
608    cols: usize,
609) -> crate::Result<Option<std::ops::Range<usize>>> {
610    let len = rows.checked_mul(cols).ok_or_else(|| {
611        invalid_argument(
612            op,
613            role,
614            format!("matrix element count overflows usize: rows={rows} cols={cols}"),
615        )
616    })?;
617    if len == 0 {
618        return Ok(None);
619    }
620    let end = offset.checked_add(len).ok_or_else(|| {
621        invalid_argument(
622            op,
623            role,
624            format!("matrix range overflows usize: offset={offset} len={len}"),
625        )
626    })?;
627    Ok(Some(offset..end))
628}
629
630fn validate_grouped_gemm_range(
631    op: &'static str,
632    role: &'static str,
633    len: usize,
634    range: Option<std::ops::Range<usize>>,
635) -> crate::Result<()> {
636    let Some(range) = range else {
637        return Ok(());
638    };
639    if range.end > len {
640        return Err(invalid_argument(
641            op,
642            role,
643            format!(
644                "matrix range {}..{} exceeds shared buffer logical length {len}",
645                range.start, range.end
646            ),
647        ));
648    }
649    Ok(())
650}
651
652#[doc(hidden)]
653pub fn validate_grouped_gemm(
654    lhs: &TensorRead<'_>,
655    rhs: &TensorRead<'_>,
656    out: &TensorWrite<'_>,
657    config: &GroupedGemmConfig<'_>,
658    op: &'static str,
659) -> crate::Result<()> {
660    if lhs.dtype() != rhs.dtype() {
661        return Err(validation(
662            op,
663            ValidationError::DTypeMismatch {
664                expected: crate::core_dtype(lhs.dtype()),
665                actual: crate::core_dtype(rhs.dtype()),
666            },
667        ));
668    }
669    if lhs.dtype() != out.dtype() {
670        return Err(validation(
671            op,
672            ValidationError::DTypeMismatch {
673                expected: crate::core_dtype(lhs.dtype()),
674                actual: crate::core_dtype(out.dtype()),
675            },
676        ));
677    }
678    config.accumulation.validate_for_dtype(lhs.dtype())?;
679
680    let lhs_len = grouped_checked_product(op, "lhs", lhs.shape())?;
681    let rhs_len = grouped_checked_product(op, "rhs", rhs.shape())?;
682    let out_len = grouped_checked_product(op, "out", out.shape())?;
683    // Grouped GEMM job count is runtime-controlled and can be large. Keep the
684    // validation ranges in a reserved Vec, not SmallVec, so arbitrary batches
685    // avoid inline-capacity tuning and can be sorted for O(n log n) overlap
686    // validation.
687    let mut out_ranges = Vec::<(usize, std::ops::Range<usize>)>::with_capacity(config.jobs.len());
688    for (idx, job) in config.jobs.iter().enumerate() {
689        validate_grouped_gemm_range(
690            op,
691            "lhs",
692            lhs_len,
693            checked_gemm_span(op, "lhs", job.lhs_offset, job.rows, job.contracted)?,
694        )?;
695        validate_grouped_gemm_range(
696            op,
697            "rhs",
698            rhs_len,
699            checked_gemm_span(op, "rhs", job.rhs_offset, job.contracted, job.cols)?,
700        )?;
701        let out_range = checked_gemm_span(op, "out", job.out_offset, job.rows, job.cols)?;
702        validate_grouped_gemm_range(op, "out", out_len, out_range.clone())?;
703        if let Some(out_range) = out_range {
704            out_ranges.push((idx, out_range));
705        }
706    }
707    out_ranges.sort_unstable_by_key(|(_, range)| range.start);
708    for pair in out_ranges.windows(2) {
709        let (prev_idx, previous) = &pair[0];
710        let (idx, current) = &pair[1];
711        if previous.end > current.start {
712            return Err(invalid_argument(
713                op,
714                "jobs",
715                format!(
716                    "grouped GEMM output range for job {idx} overlaps job {prev_idx} range {}..{}",
717                    previous.start, previous.end
718                ),
719            ));
720        }
721    }
722    Ok(())
723}
724
725fn add_element_offsets(
726    op: &'static str,
727    base: isize,
728    offset: usize,
729    role: &'static str,
730) -> crate::Result<isize> {
731    let offset = isize::try_from(offset).map_err(|_| {
732        invalid_argument(op, role, format!("offset {offset} does not fit in isize"))
733    })?;
734    base.checked_add(offset).ok_or_else(|| {
735        invalid_argument(
736            op,
737            role,
738            format!("offset overflows isize: base={base} offset={offset}"),
739        )
740    })
741}
742
743fn dim_stride(op: &'static str, dim: usize, role: &'static str) -> crate::Result<isize> {
744    isize::try_from(dim).map_err(|_| {
745        invalid_argument(
746            op,
747            role,
748            format!("leading dimension {dim} does not fit in isize"),
749        )
750    })
751}
752
/// Borrow the host storage of `tensor` together with its base element offset.
///
/// A host-backed tensor yields its data with a base offset of `0`. A
/// backend-held buffer is rejected with a runtime-state error for `op`.
fn typed_read_storage<'a, T>(
    tensor: &'a TypedTensor<T>,
    op: &'static str,
) -> crate::Result<(&'a [T], isize)> {
    match tensor.buffer() {
        // The offset returned for host tensors is always 0.
        Buffer::Host(data) => Ok((data, 0)),
        Buffer::Backend(_) => Err(crate::Error::runtime_state(
            op,
            "grouped GEMM default path requires host-backed tensor storage",
        )),
    }
}
765
/// Build the dot-general configuration used for every job of the sequential
/// grouped-GEMM fallback: contract lhs axis 1 with rhs axis 0, with no batch
/// axes.
fn grouped_gemm_default_config() -> DotGeneralConfig {
    // DotGeneralConfig owns Vec fields, so this rank-2 fallback config follows
    // that API boundary rather than introducing SmallVec locally.
    DotGeneralConfig {
        lhs_contracting_dims: vec![1],
        rhs_contracting_dims: vec![0],
        lhs_batch_dims: Vec::new(),
        rhs_batch_dims: Vec::new(),
    }
}
776
/// Type-level mapping from an element type `T` to the matching variants of
/// the dtype-erased view enums.
///
/// Implementors are marker types; the generic grouped-GEMM loop uses them to
/// wrap typed views without matching on the dtype at runtime.
trait GroupedGemmDType<T> {
    /// Wrap a typed read-only view in the corresponding `TensorView` variant.
    fn wrap_read(view: TypedTensorView<'_, T>) -> TensorView<'_>;
    /// Wrap a typed mutable view in the corresponding `TensorViewMut` variant.
    fn wrap_write(view: TypedTensorViewMut<'_, T>) -> TensorViewMut<'_>;
}
781
// Marker types, one per dtype handled by the grouped-GEMM fallback. They
// carry no data and exist only as `GroupedGemmDType` implementors.
struct GroupedF32;
struct GroupedF64;
struct GroupedC32;
struct GroupedC64;
786
787impl GroupedGemmDType<f32> for GroupedF32 {
788    fn wrap_read(view: TypedTensorView<'_, f32>) -> TensorView<'_> {
789        TensorView::F32(view)
790    }
791
792    fn wrap_write(view: TypedTensorViewMut<'_, f32>) -> TensorViewMut<'_> {
793        TensorViewMut::F32(view)
794    }
795}
796
797impl GroupedGemmDType<f64> for GroupedF64 {
798    fn wrap_read(view: TypedTensorView<'_, f64>) -> TensorView<'_> {
799        TensorView::F64(view)
800    }
801
802    fn wrap_write(view: TypedTensorViewMut<'_, f64>) -> TensorViewMut<'_> {
803        TensorViewMut::F64(view)
804    }
805}
806
807impl GroupedGemmDType<Complex32> for GroupedC32 {
808    fn wrap_read(view: TypedTensorView<'_, Complex32>) -> TensorView<'_> {
809        TensorView::C32(view)
810    }
811
812    fn wrap_write(view: TypedTensorViewMut<'_, Complex32>) -> TensorViewMut<'_> {
813        TensorViewMut::C32(view)
814    }
815}
816
817impl GroupedGemmDType<Complex64> for GroupedC64 {
818    fn wrap_read(view: TypedTensorView<'_, Complex64>) -> TensorView<'_> {
819        TensorView::C64(view)
820    }
821
822    fn wrap_write(view: TypedTensorViewMut<'_, Complex64>) -> TensorViewMut<'_> {
823        TensorViewMut::C64(view)
824    }
825}
826
/// Run every job of `config` one after another through the backend's
/// accumulating dot-general entry point.
///
/// For each job, rank-2 views are built over the shared buffers:
/// lhs `[rows, contracted]` with strides `[1, rows]`, rhs `[contracted, cols]`
/// with strides `[1, contracted]`, and out `[rows, cols]` with strides
/// `[1, rows]`. Each view starts at the job's element offset added to the
/// buffer's base offset (`lhs_base`, `rhs_base`, or `out_view.offset()`).
/// The views are wrapped through `V` and passed to
/// `dot_general_read_into_accum` with the rank-2 config from
/// `grouped_gemm_default_config` and the shared accumulation settings.
///
/// The first failing offset conversion, view construction, or backend call
/// aborts the loop and is returned; jobs already executed are not undone.
#[allow(clippy::too_many_arguments)]
fn grouped_gemm_default_loop<B, T, V>(
    backend: &mut B,
    lhs_data: &[T],
    lhs_base: isize,
    rhs_data: &[T],
    rhs_base: isize,
    out_view: &mut TypedTensorViewMut<'_, T>,
    config: &GroupedGemmConfig<'_>,
) -> crate::Result<()>
where
    B: TensorDot + ?Sized,
    T: 'static,
    V: GroupedGemmDType<T>,
{
    let op = "grouped_gemm";
    // The same contraction config is reused for every job.
    let dot_config = grouped_gemm_default_config();
    for job in config.jobs {
        let lhs_offset = add_element_offsets(op, lhs_base, job.lhs_offset, "lhs")?;
        let rhs_offset = add_element_offsets(op, rhs_base, job.rhs_offset, "rhs")?;
        let out_offset = add_element_offsets(op, out_view.offset(), job.out_offset, "out")?;
        // The stride of axis 1 is the extent of axis 0 for each matrix.
        let lhs_rows = dim_stride(op, job.rows, "lhs")?;
        let rhs_rows = dim_stride(op, job.contracted, "rhs")?;
        let out_rows = dim_stride(op, job.rows, "out")?;
        // TypedTensorView constructors own Vec shape/stride metadata. These
        // fallback rank-2 views are short-lived, but SmallVec is not usable
        // without changing the view API.
        let lhs_matrix = TypedTensorView::from_slice(
            vec![job.rows, job.contracted],
            vec![1, lhs_rows],
            lhs_offset,
            lhs_data,
        )?;
        let rhs_matrix = TypedTensorView::from_slice(
            vec![job.contracted, job.cols],
            vec![1, rhs_rows],
            rhs_offset,
            rhs_data,
        )?;
        // The output storage is re-borrowed for each job.
        let out_storage = out_view.host_storage_mut()?;
        let out_matrix = TypedTensorViewMut::from_slice(
            vec![job.rows, job.cols],
            vec![1, out_rows],
            out_offset,
            out_storage,
        )?;
        backend.dot_general_read_into_accum(
            TensorRead::from_view(V::wrap_read(lhs_matrix)),
            TensorRead::from_view(V::wrap_read(rhs_matrix)),
            &dot_config,
            config.accumulation,
            TensorWrite::from_view(V::wrap_write(out_matrix)),
        )?;
    }
    Ok(())
}
883
/// Run a grouped GEMM by dispatching on dtype and storage kind to the
/// sequential default loop.
///
/// The arguments are first checked by `validate_grouped_gemm`. Dispatch then
/// tries `F32`, `F64`, `C32`, and `C64` in that order. For each dtype, every
/// combination of owned tensor / borrowed view for `lhs` and `rhs`, and owned
/// tensor / mutable view for `out`, is forwarded to
/// `grouped_gemm_default_loop` together with each operand's storage and base
/// offset.
///
/// # Errors
///
/// Propagates errors from validation, from resolving operand storage
/// (`typed_read_storage` for owned tensors, `host_storage` for views), and
/// from the loop itself. When no dtype arm matches, returns a
/// `ValidationError::DTypeMismatch` built from the dtype of `lhs` (expected)
/// and the dtype of `out` (actual).
#[doc(hidden)]
pub fn grouped_gemm_via_sequential<B>(
    backend: &mut B,
    lhs: TensorRead<'_>,
    rhs: TensorRead<'_>,
    config: &GroupedGemmConfig<'_>,
    mut out: TensorWrite<'_>,
) -> crate::Result<()>
where
    B: TensorDot + ?Sized,
{
    validate_grouped_gemm(&lhs, &rhs, &out, config, "grouped_gemm")?;
    // One expansion per dtype. A matching arm returns from the enclosing
    // function; a non-matching triple falls through (`_ => {}`) so the next
    // dtype can be tried.
    macro_rules! dispatch {
        ($variant:ident, $wrapper:ty) => {
            match (&lhs, &rhs, &mut out) {
                // --- Owned output: wrapped in a mutable view before the loop. ---
                (
                    TensorRead::Tensor(Tensor::$variant(a)),
                    TensorRead::Tensor(Tensor::$variant(b)),
                    TensorWrite::Tensor(Tensor::$variant(c)),
                ) => {
                    let (a_data, a_base) = typed_read_storage(a, "grouped_gemm")?;
                    let (b_data, b_base) = typed_read_storage(b, "grouped_gemm")?;
                    let mut c_view = c.as_view_mut();
                    return grouped_gemm_default_loop::<_, _, $wrapper>(
                        backend,
                        a_data,
                        a_base,
                        b_data,
                        b_base,
                        &mut c_view,
                        config,
                    );
                }
                (
                    TensorRead::Tensor(Tensor::$variant(a)),
                    TensorRead::View(TensorView::$variant(b)),
                    TensorWrite::Tensor(Tensor::$variant(c)),
                ) => {
                    let (a_data, a_base) = typed_read_storage(a, "grouped_gemm")?;
                    let mut c_view = c.as_view_mut();
                    // Views supply their host storage and offset directly.
                    return grouped_gemm_default_loop::<_, _, $wrapper>(
                        backend,
                        a_data,
                        a_base,
                        b.host_storage()?,
                        b.offset(),
                        &mut c_view,
                        config,
                    );
                }
                (
                    TensorRead::View(TensorView::$variant(a)),
                    TensorRead::Tensor(Tensor::$variant(b)),
                    TensorWrite::Tensor(Tensor::$variant(c)),
                ) => {
                    let (b_data, b_base) = typed_read_storage(b, "grouped_gemm")?;
                    let mut c_view = c.as_view_mut();
                    return grouped_gemm_default_loop::<_, _, $wrapper>(
                        backend,
                        a.host_storage()?,
                        a.offset(),
                        b_data,
                        b_base,
                        &mut c_view,
                        config,
                    );
                }
                (
                    TensorRead::View(TensorView::$variant(a)),
                    TensorRead::View(TensorView::$variant(b)),
                    TensorWrite::Tensor(Tensor::$variant(c)),
                ) => {
                    let mut c_view = c.as_view_mut();
                    return grouped_gemm_default_loop::<_, _, $wrapper>(
                        backend,
                        a.host_storage()?,
                        a.offset(),
                        b.host_storage()?,
                        b.offset(),
                        &mut c_view,
                        config,
                    );
                }
                // --- View output: the caller's mutable view is passed through. ---
                (
                    TensorRead::Tensor(Tensor::$variant(a)),
                    TensorRead::Tensor(Tensor::$variant(b)),
                    TensorWrite::View(TensorViewMut::$variant(c)),
                ) => {
                    let (a_data, a_base) = typed_read_storage(a, "grouped_gemm")?;
                    let (b_data, b_base) = typed_read_storage(b, "grouped_gemm")?;
                    return grouped_gemm_default_loop::<_, _, $wrapper>(
                        backend, a_data, a_base, b_data, b_base, c, config,
                    );
                }
                (
                    TensorRead::Tensor(Tensor::$variant(a)),
                    TensorRead::View(TensorView::$variant(b)),
                    TensorWrite::View(TensorViewMut::$variant(c)),
                ) => {
                    let (a_data, a_base) = typed_read_storage(a, "grouped_gemm")?;
                    return grouped_gemm_default_loop::<_, _, $wrapper>(
                        backend,
                        a_data,
                        a_base,
                        b.host_storage()?,
                        b.offset(),
                        c,
                        config,
                    );
                }
                (
                    TensorRead::View(TensorView::$variant(a)),
                    TensorRead::Tensor(Tensor::$variant(b)),
                    TensorWrite::View(TensorViewMut::$variant(c)),
                ) => {
                    let (b_data, b_base) = typed_read_storage(b, "grouped_gemm")?;
                    return grouped_gemm_default_loop::<_, _, $wrapper>(
                        backend,
                        a.host_storage()?,
                        a.offset(),
                        b_data,
                        b_base,
                        c,
                        config,
                    );
                }
                (
                    TensorRead::View(TensorView::$variant(a)),
                    TensorRead::View(TensorView::$variant(b)),
                    TensorWrite::View(TensorViewMut::$variant(c)),
                ) => {
                    return grouped_gemm_default_loop::<_, _, $wrapper>(
                        backend,
                        a.host_storage()?,
                        a.offset(),
                        b.host_storage()?,
                        b.offset(),
                        c,
                        config,
                    );
                }
                // Operands are not all of this dtype; try the next one.
                _ => {}
            }
        };
    }

    dispatch!(F32, GroupedF32);
    dispatch!(F64, GroupedF64);
    dispatch!(C32, GroupedC32);
    dispatch!(C64, GroupedC64);
    // No dtype arm matched all three operands.
    // NOTE(review): the error only compares lhs against out; an rhs-only
    // mismatch would be reported with identical expected/actual dtypes unless
    // `validate_grouped_gemm` already rejects it — confirm.
    Err(validation(
        "grouped_gemm",
        ValidationError::DTypeMismatch {
            expected: crate::core_dtype(lhs.dtype()),
            actual: crate::core_dtype(out.dtype()),
        },
    ))
}
1042
1043fn grouped_gemm_default<B>(
1044    backend: &mut B,
1045    lhs: TensorRead<'_>,
1046    rhs: TensorRead<'_>,
1047    config: &GroupedGemmConfig<'_>,
1048    out: TensorWrite<'_>,
1049) -> crate::Result<()>
1050where
1051    B: TensorDot + ?Sized,
1052{
1053    grouped_gemm_via_sequential(backend, lhs, rhs, config, out)
1054}
1055
1056#[doc(hidden)]
1057pub fn accumulate_dot_result_into(
1058    dot: &Tensor,
1059    accumulation: DotGeneralAccumulation,
1060    out: &mut TensorWrite<'_>,
1061) -> crate::Result<()> {
1062    macro_rules! dispatch {
1063        ($variant:ident, $ty:ty) => {
1064            if let (
1065                Tensor::$variant(dot),
1066                ContractionScalar::$variant(alpha),
1067                ContractionScalar::$variant(beta),
1068            ) = (dot, accumulation.alpha, accumulation.beta)
1069            {
1070                match out {
1071                    TensorWrite::Tensor(Tensor::$variant(out)) => {
1072                        let mut out = out.as_view_mut();
1073                        accumulate_typed(dot.as_slice()?, alpha, beta, &mut out)?;
1074                        return Ok(());
1075                    }
1076                    TensorWrite::View(crate::TensorViewMut::$variant(out)) => {
1077                        accumulate_typed(dot.as_slice()?, alpha, beta, out)?;
1078                        return Ok(());
1079                    }
1080                    _ => {}
1081                }
1082            }
1083        };
1084    }
1085
1086    dispatch!(F32, f32);
1087    dispatch!(F64, f64);
1088    dispatch!(C32, Complex32);
1089    dispatch!(C64, Complex64);
1090
1091    Err(validation(
1092        "dot_general",
1093        ValidationError::DTypeMismatch {
1094            expected: crate::core_dtype(accumulation.alpha.dtype()),
1095            actual: crate::core_dtype(dot.dtype()),
1096        },
1097    ))
1098}
1099
1100fn accumulate_typed<T>(
1101    dot: &[T],
1102    alpha: T,
1103    beta: T,
1104    out: &mut TypedTensorViewMut<'_, T>,
1105) -> crate::Result<()>
1106where
1107    T: Copy
1108        + PartialEq
1109        + std::ops::Add<Output = T>
1110        + std::ops::Mul<Output = T>
1111        + num_traits::Zero
1112        + 'static,
1113{
1114    let beta_is_zero = beta == T::zero();
1115    if let Some(output) = compact_host_accumulation_slice(out, dot.len())? {
1116        for (output, dot_value) in output.iter_mut().zip(dot.iter().copied()) {
1117            // INVARIANT: beta == 0 follows BLAS GEMM semantics and does not read
1118            // the existing output element; beta != 0 requires an initialized
1119            // TensorWrite target and performs a read-modify-write update.
1120            *output = if beta_is_zero {
1121                alpha * dot_value
1122            } else {
1123                alpha * dot_value + beta * *output
1124            };
1125        }
1126        return Ok(());
1127    }
1128
1129    for (linear, dot_value) in dot.iter().copied().enumerate() {
1130        let indices = flat_to_multi_for_shape(out.shape(), linear);
1131        let output = out.get_mut(&indices).ok_or_else(|| {
1132            invalid_argument(
1133                "dot_general",
1134                "output",
1135                format!("index {indices:?} is outside accumulation target"),
1136            )
1137        })?;
1138        // INVARIANT: beta == 0 follows BLAS GEMM semantics and does not read
1139        // the existing output element; beta != 0 requires an initialized
1140        // TensorWrite target and performs a read-modify-write update.
1141        *output = if beta_is_zero {
1142            alpha * dot_value
1143        } else {
1144            alpha * dot_value + beta * *output
1145        };
1146    }
1147    Ok(())
1148}
1149
1150fn compact_host_accumulation_slice<'a, T: 'static>(
1151    out: &'a mut TypedTensorViewMut<'_, T>,
1152    expected_len: usize,
1153) -> crate::Result<Option<&'a mut [T]>> {
1154    if out.backend_buffer().is_some()
1155        || out.n_elements() != expected_len
1156        || !out.is_col_major_contiguous()?
1157    {
1158        return Ok(None);
1159    }
1160
1161    let start = usize::try_from(out.offset()).map_err(|_| {
1162        invalid_argument("dot_general", "output", "compact output offset is negative")
1163    })?;
1164    let end = start
1165        .checked_add(expected_len)
1166        .ok_or_else(|| validation("dot_general", ValidationError::IntegerOverflow))?;
1167    out.host_storage_mut()?
1168        .get_mut(start..end)
1169        .map(Some)
1170        .ok_or_else(|| {
1171            invalid_argument(
1172                "dot_general",
1173                "output",
1174                "compact output is outside its backing storage",
1175            )
1176        })
1177}
1178
/// Expand a linear index into one coordinate per dimension of `shape`, with
/// the first dimension varying fastest.
///
/// A zero-sized dimension yields coordinate `0` and leaves the remaining
/// linear index untouched. Any part of `linear` beyond the product of the
/// dimensions is discarded.
fn flat_to_multi_for_shape(shape: &[usize], mut linear: usize) -> Vec<usize> {
    shape
        .iter()
        .map(|&extent| {
            if extent == 0 {
                return 0;
            }
            let coordinate = linear % extent;
            linear /= extent;
            coordinate
        })
        .collect()
}
1191
/// Canonical elementwise fusion plan shared between segmented execution and backends.
///
/// Values are built through `ElementwiseFusionPlan::new` or
/// `ElementwiseFusionPlan::with_input_views`; all fields are private and read
/// through the accessor methods of the same name.
#[doc(hidden)]
#[derive(Clone, Debug, Hash, PartialEq, Eq)]
pub struct ElementwiseFusionPlan {
    // Scalar dtype expected by the plan.
    dtype: crate::DType,
    // Number of input tensors; the constructors set it to `input_views.len()`.
    input_count: usize,
    // Keep view metadata in Vecs. A/B benchmarking on the broadcast_mul
    // path showed SmallVec made this metadata path about 6-7% slower.
    input_views: Vec<ElementwiseFusionInputView>,
    // Value ids selected as fusion outputs.
    outputs: Vec<usize>,
    // Fused elementwise instruction sequence.
    ops: Vec<ElementwiseFusionInst>,
}
1204
/// Metadata-only view applied to one backend fusion input.
#[doc(hidden)]
#[derive(Clone, Debug, Hash, PartialEq, Eq)]
pub enum ElementwiseFusionInputView {
    /// The input carries no extra view metadata.
    Identity,
    /// The input is viewed through a `BroadcastInDim`.
    ///
    /// NOTE(review): presumably `shape` is the broadcast result shape and
    /// `dims` maps each input axis to a result axis — confirm against the
    /// backend that consumes this metadata.
    BroadcastInDim {
        // Vec is intentional here; see ElementwiseFusionPlan::input_views.
        shape: Vec<usize>,
        dims: Vec<usize>,
    },
}
1216
/// One node in a canonical elementwise fusion plan.
///
/// Built with `ElementwiseFusionInst::new`; both fields are private and read
/// through the `op()` and `inputs()` accessors.
#[doc(hidden)]
#[derive(Clone, Debug, Hash, PartialEq, Eq)]
pub struct ElementwiseFusionInst {
    // Elementwise op executed by this instruction.
    op: ElementwiseFusionOp,
    // Value ids this instruction reads as inputs.
    inputs: Vec<usize>,
}
1224
// NOTE(review): presumably this macro expands to the `ElementwiseFusionOp`
// definition used by `ElementwiseFusionInst` — confirm in `tenferro_core_ops`.
tenferro_core_ops::define_elementwise_fusion_op!();
1226
1227impl ElementwiseFusionPlan {
1228    /// Build a backend elementwise fusion plan.
1229    ///
1230    /// # Examples
1231    ///
1232    /// ```rust
1233    /// use tenferro_tensor::backend::{
1234    ///     ElementwiseFusionInst, ElementwiseFusionOp, ElementwiseFusionPlan,
1235    /// };
1236    /// use tenferro_tensor::DType;
1237    ///
1238    /// let plan = ElementwiseFusionPlan::new(
1239    ///     DType::F64,
1240    ///     2,
1241    ///     vec![2],
1242    ///     vec![ElementwiseFusionInst::new(ElementwiseFusionOp::Add, vec![0, 1])],
1243    /// );
1244    /// assert_eq!(plan.input_count(), 2);
1245    /// ```
1246    pub fn new(
1247        dtype: crate::DType,
1248        input_count: usize,
1249        outputs: Vec<usize>,
1250        ops: Vec<ElementwiseFusionInst>,
1251    ) -> Self {
1252        Self::with_input_views(
1253            dtype,
1254            vec![ElementwiseFusionInputView::Identity; input_count],
1255            outputs,
1256            ops,
1257        )
1258    }
1259
1260    /// Build a backend elementwise fusion plan with input view metadata.
1261    ///
1262    /// # Examples
1263    ///
1264    /// ```rust
1265    /// use tenferro_tensor::backend::{
1266    ///     ElementwiseFusionInputView, ElementwiseFusionInst, ElementwiseFusionOp,
1267    ///     ElementwiseFusionPlan,
1268    /// };
1269    /// use tenferro_tensor::DType;
1270    ///
1271    /// let plan = ElementwiseFusionPlan::with_input_views(
1272    ///     DType::F64,
1273    ///     vec![ElementwiseFusionInputView::broadcast_in_dim(vec![2, 3], vec![0])],
1274    ///     vec![1],
1275    ///     vec![ElementwiseFusionInst::new(ElementwiseFusionOp::Negate, vec![0])],
1276    /// );
1277    /// assert_eq!(plan.input_count(), 1);
1278    /// ```
1279    pub fn with_input_views(
1280        dtype: crate::DType,
1281        input_views: impl IntoIterator<Item = ElementwiseFusionInputView>,
1282        outputs: Vec<usize>,
1283        ops: Vec<ElementwiseFusionInst>,
1284    ) -> Self {
1285        let input_views = input_views.into_iter().collect::<Vec<_>>();
1286        let input_count = input_views.len();
1287        Self {
1288            dtype,
1289            input_count,
1290            input_views,
1291            outputs,
1292            ops,
1293        }
1294    }
1295
1296    /// Return the scalar dtype expected by this fusion plan.
1297    ///
1298    /// # Examples
1299    ///
1300    /// ```rust
1301    /// use tenferro_tensor::backend::ElementwiseFusionPlan;
1302    /// use tenferro_tensor::DType;
1303    ///
1304    /// let plan = ElementwiseFusionPlan::new(DType::F32, 0, Vec::new(), Vec::new());
1305    /// assert_eq!(plan.dtype(), DType::F32);
1306    /// ```
1307    pub fn dtype(&self) -> crate::DType {
1308        self.dtype
1309    }
1310
1311    /// Return the number of input tensors expected by this plan.
1312    ///
1313    /// # Examples
1314    ///
1315    /// ```rust
1316    /// use tenferro_tensor::backend::ElementwiseFusionPlan;
1317    /// use tenferro_tensor::DType;
1318    ///
1319    /// let plan = ElementwiseFusionPlan::new(DType::F64, 3, Vec::new(), Vec::new());
1320    /// assert_eq!(plan.input_count(), 3);
1321    /// ```
1322    pub fn input_count(&self) -> usize {
1323        self.input_count
1324    }
1325
1326    /// Return metadata views applied to fusion inputs before executing ops.
1327    ///
1328    /// # Examples
1329    ///
1330    /// ```rust
1331    /// use tenferro_tensor::backend::ElementwiseFusionPlan;
1332    /// use tenferro_tensor::DType;
1333    ///
1334    /// let plan = ElementwiseFusionPlan::new(DType::F64, 2, Vec::new(), Vec::new());
1335    /// assert_eq!(plan.input_views().len(), 2);
1336    /// ```
1337    pub fn input_views(&self) -> &[ElementwiseFusionInputView] {
1338        &self.input_views
1339    }
1340
1341    /// Return the value ids selected as fusion outputs.
1342    ///
1343    /// # Examples
1344    ///
1345    /// ```rust
1346    /// use tenferro_tensor::backend::ElementwiseFusionPlan;
1347    /// use tenferro_tensor::DType;
1348    ///
1349    /// let plan = ElementwiseFusionPlan::new(DType::F64, 0, vec![0], Vec::new());
1350    /// assert_eq!(plan.outputs(), &[0]);
1351    /// ```
1352    pub fn outputs(&self) -> &[usize] {
1353        &self.outputs
1354    }
1355
1356    /// Return the fused elementwise instruction sequence.
1357    ///
1358    /// # Examples
1359    ///
1360    /// ```rust
1361    /// use tenferro_tensor::backend::{
1362    ///     ElementwiseFusionInst, ElementwiseFusionOp, ElementwiseFusionPlan,
1363    /// };
1364    /// use tenferro_tensor::DType;
1365    ///
1366    /// let inst = ElementwiseFusionInst::new(ElementwiseFusionOp::Negate, vec![0]);
1367    /// let plan = ElementwiseFusionPlan::new(DType::F64, 1, vec![1], vec![inst]);
1368    /// assert_eq!(plan.ops().len(), 1);
1369    /// ```
1370    pub fn ops(&self) -> &[ElementwiseFusionInst] {
1371        &self.ops
1372    }
1373}
1374
1375impl ElementwiseFusionInputView {
1376    /// Build metadata for a `BroadcastInDim` fusion input view.
1377    ///
1378    /// # Examples
1379    ///
1380    /// ```rust
1381    /// use tenferro_tensor::backend::ElementwiseFusionInputView;
1382    ///
1383    /// let view = ElementwiseFusionInputView::broadcast_in_dim(vec![2, 3], vec![0]);
1384    /// assert!(matches!(view, ElementwiseFusionInputView::BroadcastInDim { .. }));
1385    /// ```
1386    pub fn broadcast_in_dim(
1387        shape: impl IntoIterator<Item = usize>,
1388        dims: impl IntoIterator<Item = usize>,
1389    ) -> Self {
1390        Self::BroadcastInDim {
1391            shape: shape.into_iter().collect(),
1392            dims: dims.into_iter().collect(),
1393        }
1394    }
1395
1396    /// Return true when this fusion input is an identity view.
1397    ///
1398    /// # Examples
1399    ///
1400    /// ```rust
1401    /// use tenferro_tensor::backend::ElementwiseFusionInputView;
1402    ///
1403    /// assert!(ElementwiseFusionInputView::Identity.is_identity());
1404    /// ```
1405    pub fn is_identity(&self) -> bool {
1406        matches!(self, Self::Identity)
1407    }
1408}
1409
1410impl ElementwiseFusionInst {
1411    /// Build a backend elementwise fusion instruction.
1412    ///
1413    /// # Examples
1414    ///
1415    /// ```rust
1416    /// use tenferro_tensor::backend::{ElementwiseFusionInst, ElementwiseFusionOp};
1417    ///
1418    /// let inst = ElementwiseFusionInst::new(ElementwiseFusionOp::Add, vec![0, 1]);
1419    /// assert_eq!(inst.inputs(), &[0, 1]);
1420    /// ```
1421    pub fn new(op: ElementwiseFusionOp, inputs: Vec<usize>) -> Self {
1422        Self { op, inputs }
1423    }
1424
1425    /// Return the elementwise op executed by this instruction.
1426    ///
1427    /// # Examples
1428    ///
1429    /// ```rust
1430    /// use tenferro_tensor::backend::{ElementwiseFusionInst, ElementwiseFusionOp};
1431    ///
1432    /// let inst = ElementwiseFusionInst::new(ElementwiseFusionOp::Negate, vec![0]);
1433    /// assert_eq!(inst.op(), ElementwiseFusionOp::Negate);
1434    /// ```
1435    pub fn op(&self) -> ElementwiseFusionOp {
1436        self.op
1437    }
1438
1439    /// Return this instruction's input value ids.
1440    ///
1441    /// # Examples
1442    ///
1443    /// ```rust
1444    /// use tenferro_tensor::backend::{ElementwiseFusionInst, ElementwiseFusionOp};
1445    ///
1446    /// let inst = ElementwiseFusionInst::new(ElementwiseFusionOp::Multiply, vec![2, 0]);
1447    /// assert_eq!(inst.inputs(), &[2, 0]);
1448    /// ```
1449    pub fn inputs(&self) -> &[usize] {
1450        &self.inputs
1451    }
1452}
1453
/// Runtime operation selected by [`TensorElementwise::elementwise_read_into`].
///
/// Unary ops read one input and binary ops read two; see `arity`.
#[non_exhaustive]
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum ElementwiseReadOp {
    /// Binary addition.
    Add,
    /// Binary subtraction.
    Subtract,
    /// Binary multiplication.
    Multiply,
    /// Unary negation.
    Negate,
    /// Unary conjugation.
    Conj,
    /// Binary division.
    Divide,
}

impl ElementwiseReadOp {
    /// Short lowercase name of the op, used as the operation label in errors.
    fn label(self) -> &'static str {
        match self {
            // Binary ops.
            Self::Add => "add",
            Self::Subtract => "sub",
            Self::Multiply => "mul",
            Self::Divide => "div",
            // Unary ops.
            Self::Negate => "neg",
            Self::Conj => "conj",
        }
    }

    /// Number of inputs the op reads: 2 for binary ops, 1 for unary ops.
    fn arity(self) -> usize {
        match self {
            Self::Add | Self::Subtract | Self::Multiply | Self::Divide => 2,
            Self::Negate | Self::Conj => 1,
        }
    }
}
1491
/// Identity of the storage behind a tensor or view, used by
/// `storage_overlaps` to detect a destination that aliases an input.
#[derive(Clone, Copy, Debug)]
enum StorageIdentity {
    /// Host memory, identified by the address range of the storage slice:
    /// `start` is the slice's base address and `end` is `start` plus the
    /// slice's size in bytes (saturating).
    Host {
        start: usize,
        end: usize,
    },
    /// Backend-owned buffer, identified by whatever the buffer reports
    /// (`allocation_domain`, `allocation_id`, `backend_family`) plus the
    /// address of the shared buffer object itself.
    Backend {
        domain: Option<AllocationDomainId>,
        allocation: Option<AllocationId>,
        family: &'static str,
        object: usize,
    },
}
1505
1506fn host_storage_identity<T>(data: &[T]) -> StorageIdentity {
1507    let start = data.as_ptr() as usize;
1508    let bytes = std::mem::size_of_val(data);
1509    StorageIdentity::Host {
1510        start,
1511        end: start.saturating_add(bytes),
1512    }
1513}
1514
1515fn backend_storage_identity<T: 'static>(
1516    buffer: &std::sync::Arc<dyn crate::BackendBuffer<T>>,
1517) -> StorageIdentity {
1518    StorageIdentity::Backend {
1519        domain: buffer.allocation_domain(),
1520        allocation: buffer.allocation_id(),
1521        family: buffer.backend_family(),
1522        object: std::sync::Arc::as_ptr(buffer) as *const () as usize,
1523    }
1524}
1525
1526fn typed_tensor_storage_identity<T: 'static>(
1527    tensor: &TypedTensor<T>,
1528) -> crate::Result<StorageIdentity> {
1529    match tensor.buffer() {
1530        Buffer::Host(data) => Ok(host_storage_identity(data)),
1531        Buffer::Backend(buffer) => Ok(backend_storage_identity(buffer)),
1532    }
1533}
1534
1535fn typed_view_storage_identity<T: 'static>(
1536    view: &TypedTensorView<'_, T>,
1537) -> crate::Result<StorageIdentity> {
1538    match view.backend_buffer() {
1539        Some(buffer) => Ok(backend_storage_identity(buffer)),
1540        None => view.host_storage().map(host_storage_identity),
1541    }
1542}
1543
1544fn tensor_read_storage_identity(input: &TensorRead<'_>) -> crate::Result<StorageIdentity> {
1545    macro_rules! typed_identity {
1546        ($value:expr) => {
1547            match $value {
1548                Tensor::F32(value) => typed_tensor_storage_identity(value),
1549                Tensor::F64(value) => typed_tensor_storage_identity(value),
1550                Tensor::I32(value) => typed_tensor_storage_identity(value),
1551                Tensor::I64(value) => typed_tensor_storage_identity(value),
1552                Tensor::Bool(value) => typed_tensor_storage_identity(value),
1553                Tensor::C32(value) => typed_tensor_storage_identity(value),
1554                Tensor::C64(value) => typed_tensor_storage_identity(value),
1555            }
1556        };
1557    }
1558    macro_rules! view_identity {
1559        ($value:expr) => {
1560            match $value {
1561                TensorView::F32(value) => typed_view_storage_identity(value),
1562                TensorView::F64(value) => typed_view_storage_identity(value),
1563                TensorView::I32(value) => typed_view_storage_identity(value),
1564                TensorView::I64(value) => typed_view_storage_identity(value),
1565                TensorView::Bool(value) => typed_view_storage_identity(value),
1566                TensorView::C32(value) => typed_view_storage_identity(value),
1567                TensorView::C64(value) => typed_view_storage_identity(value),
1568            }
1569        };
1570    }
1571
1572    match input {
1573        TensorRead::Tensor(tensor) => typed_identity!(tensor),
1574        TensorRead::View(view) => view_identity!(view),
1575    }
1576}
1577
1578fn storage_overlaps(lhs: StorageIdentity, rhs: StorageIdentity) -> bool {
1579    match (lhs, rhs) {
1580        (
1581            StorageIdentity::Host {
1582                start: lhs_start,
1583                end: lhs_end,
1584            },
1585            StorageIdentity::Host {
1586                start: rhs_start,
1587                end: rhs_end,
1588            },
1589        ) => lhs_start < rhs_end && rhs_start < lhs_end,
1590        (
1591            StorageIdentity::Backend {
1592                domain: lhs_domain,
1593                allocation: lhs_allocation,
1594                family: lhs_family,
1595                object: lhs_object,
1596            },
1597            StorageIdentity::Backend {
1598                domain: rhs_domain,
1599                allocation: rhs_allocation,
1600                family: rhs_family,
1601                object: rhs_object,
1602            },
1603        ) => {
1604            lhs_object == rhs_object
1605                || matches!(
1606                    (lhs_domain, rhs_domain, lhs_allocation, rhs_allocation),
1607                    (Some(lhs_domain), Some(rhs_domain), Some(lhs), Some(rhs))
1608                        if lhs_domain == rhs_domain && lhs == rhs
1609                )
1610                || matches!(
1611                    (lhs_domain, rhs_domain, lhs_allocation, rhs_allocation),
1612                    (None, None, Some(lhs), Some(rhs)) if lhs_family == rhs_family && lhs == rhs
1613                )
1614        }
1615        _ => false,
1616    }
1617}
1618
1619fn validate_elementwise_output_disjoint(
1620    op: ElementwiseReadOp,
1621    inputs: &[TensorRead<'_>],
1622    out: &TensorWrite<'_>,
1623) -> crate::Result<()> {
1624    validate_read_into_destination(op.label(), inputs, out)
1625}
1626
1627/// Validate that a caller-owned destination does not overlap any read input.
1628///
1629/// The check is intentionally conservative for host views: two views backed by
1630/// the same host allocation are treated as overlapping because the allocation
1631/// identity is the only stable boundary contract available to erased backend
1632/// code. Backend allocations use their domain/allocation identity when the
1633/// provider exposes it.
1634///
1635/// # Errors
1636///
1637/// Returns `tenferro_tensor_core::ValidationError::InvalidArgument` when the
1638/// destination storage overlaps an input, or `Error::RuntimeState` when
1639/// storage identity cannot be established safely.
1640///
1641/// # Examples
1642///
1643/// ```rust
1644/// use tenferro_tensor::{Tensor, TensorRead, TensorWrite};
1645/// use tenferro_tensor::backend::validate_read_into_destination;
1646///
1647/// let input = Tensor::from_vec_col_major(vec![1], vec![1.0_f64])?;
1648/// let mut output = Tensor::from_vec_col_major(vec![1], vec![0.0_f64])?;
1649/// validate_read_into_destination(
1650///     "example",
1651///     &[TensorRead::from_tensor(&input)],
1652///     &TensorWrite::from_tensor(&mut output),
1653/// )?;
1654/// # Ok::<(), tenferro_tensor::Error>(())
1655/// ```
1656pub fn validate_read_into_destination(
1657    op: &'static str,
1658    inputs: &[TensorRead<'_>],
1659    out: &TensorWrite<'_>,
1660) -> crate::Result<()> {
1661    let output_identity = tensor_read_storage_identity(&out.as_read())?;
1662    for (index, input) in inputs.iter().enumerate() {
1663        if storage_overlaps(tensor_read_storage_identity(input)?, output_identity) {
1664            return Err(Error::invalid_argument(
1665                op,
1666                "out",
1667                format!("destination storage overlaps input {index}"),
1668            ));
1669        }
1670    }
1671    Ok(())
1672}
1673
1674fn read_is_host(input: &TensorRead<'_>) -> bool {
1675    match input {
1676        TensorRead::Tensor(tensor) => !tensor.is_backend_buffer(),
1677        TensorRead::View(view) => match view {
1678            TensorView::F32(view) => view.backend_buffer().is_none(),
1679            TensorView::F64(view) => view.backend_buffer().is_none(),
1680            TensorView::I32(view) => view.backend_buffer().is_none(),
1681            TensorView::I64(view) => view.backend_buffer().is_none(),
1682            TensorView::Bool(view) => view.backend_buffer().is_none(),
1683            TensorView::C32(view) => view.backend_buffer().is_none(),
1684            TensorView::C64(view) => view.backend_buffer().is_none(),
1685        },
1686    }
1687}
1688
1689fn write_is_host(out: &TensorWrite<'_>) -> bool {
1690    read_is_host(&out.as_read())
1691}
1692
1693fn one_shot_supports(op: ElementwiseReadOp, dtype: DType) -> bool {
1694    match op {
1695        ElementwiseReadOp::Conj => true,
1696        ElementwiseReadOp::Add
1697        | ElementwiseReadOp::Subtract
1698        | ElementwiseReadOp::Multiply
1699        | ElementwiseReadOp::Divide
1700        | ElementwiseReadOp::Negate => !matches!(dtype, DType::Bool),
1701    }
1702}
1703
1704fn one_shot_eligible(
1705    op: ElementwiseReadOp,
1706    inputs: &[TensorRead<'_>],
1707    out: &TensorWrite<'_>,
1708) -> bool {
1709    let dtype = out.dtype();
1710    write_is_host(out)
1711        && one_shot_supports(op, dtype)
1712        && inputs.iter().all(|input| {
1713            read_is_host(input) && input.dtype() == dtype && input.shape() == out.shape()
1714        })
1715}
1716
1717fn tensor_write_view(out: TensorWrite<'_>) -> TensorViewMut<'_> {
1718    match out {
1719        TensorWrite::Tensor(tensor) => match tensor {
1720            Tensor::F32(tensor) => TensorViewMut::F32(tensor.as_view_mut()),
1721            Tensor::F64(tensor) => TensorViewMut::F64(tensor.as_view_mut()),
1722            Tensor::I32(tensor) => TensorViewMut::I32(tensor.as_view_mut()),
1723            Tensor::I64(tensor) => TensorViewMut::I64(tensor.as_view_mut()),
1724            Tensor::Bool(tensor) => TensorViewMut::Bool(tensor.as_view_mut()),
1725            Tensor::C32(tensor) => TensorViewMut::C32(tensor.as_view_mut()),
1726            Tensor::C64(tensor) => TensorViewMut::C64(tensor.as_view_mut()),
1727        },
1728        TensorWrite::View(view) => view,
1729    }
1730}
1731
/// Base address of `data` as a non-null byte pointer.
///
/// Falls back to a dangling pointer should the slice pointer ever be null.
fn non_null_bytes<T>(data: &[T]) -> NonNull<u8> {
    let raw: *mut u8 = data.as_ptr().cast_mut().cast();
    match NonNull::new(raw) {
        Some(pointer) => pointer,
        None => NonNull::dangling(),
    }
}
1735
/// View a typed slice as its underlying bytes, sharing the slice's lifetime.
///
/// NOTE(review): assumes `T` has no padding bytes — confirm every caller
/// passes scalar tensor storage.
fn typed_bytes<T>(data: &[T]) -> &[u8] {
    let byte_len = std::mem::size_of_val(data);
    let base: *const u8 = data.as_ptr().cast();
    // SAFETY: `base` and `byte_len` cover exactly the memory of `data`, u8 has
    // alignment one, and the returned bytes keep the shared borrow of `data`.
    unsafe { std::slice::from_raw_parts(base, byte_len) }
}
1741
/// View a typed mutable slice as its underlying bytes, keeping the unique
/// borrow of the slice.
///
/// NOTE(review): assumes every byte pattern written through the result is a
/// valid `T` — confirm every caller passes scalar tensor storage.
fn typed_bytes_mut<T>(data: &mut [T]) -> &mut [u8] {
    let byte_len = std::mem::size_of_val(data);
    let base: *mut u8 = data.as_mut_ptr().cast();
    // SAFETY: `base` and `byte_len` cover exactly the memory of `data`, u8 has
    // alignment one, and the returned bytes keep the unique borrow of `data`.
    unsafe { std::slice::from_raw_parts_mut(base, byte_len) }
}
1748
1749fn erased_raw_strided_ptr<'a>(
1750    dtype: KernelDType,
1751    data: &'a [u8],
1752    dims: &'a [usize],
1753    strides: &'a [isize],
1754    offset: isize,
1755) -> strided_kernel::Result<ErasedRawStridedPtr<'a>> {
1756    // SAFETY: callers derive `data` from initialized typed host storage and
1757    // retain the backing borrow for the returned descriptor lifetime.
1758    unsafe {
1759        ErasedRawStridedPtr::from_raw_parts(
1760            dtype,
1761            non_null_bytes(data),
1762            data.len(),
1763            dims,
1764            strides,
1765            offset,
1766        )
1767    }
1768}
1769
1770fn erased_raw_strided_mut<'a>(
1771    dtype: KernelDType,
1772    data: &'a mut [u8],
1773    dims: &'a [usize],
1774    strides: &'a [isize],
1775    offset: isize,
1776) -> strided_kernel::Result<ErasedRawStridedMut<'a>> {
1777    let data_ptr = NonNull::new(data.as_mut_ptr()).unwrap_or_else(NonNull::dangling);
1778    // SAFETY: callers derive `data` from a uniquely borrowed initialized host
1779    // destination and retain that borrow for the returned descriptor lifetime.
1780    unsafe {
1781        ErasedRawStridedMut::from_raw_parts(dtype, data_ptr, data.len(), dims, strides, offset)
1782    }
1783}
1784
1785fn execute_one_shot_map<T: 'static>(
1786    dtype: KernelDType,
1787    op: ErasedMapOp,
1788    ctx: &ExecContext,
1789    input: TypedTensorView<'_, T>,
1790    mut out: TypedTensorViewMut<'_, T>,
1791) -> crate::Result<()> {
1792    let input_data = input.host_storage()?;
1793    // INVARIANT: dtype and layout come from the same validated typed view, and
1794    // its host storage remains borrowed until replay returns.
1795    // SAFETY: input_data supplies the pointer and exact byte length; the view
1796    // owns the matching shape, signed strides, and in-bounds offset.
1797    let input_descriptor = erased_raw_strided_ptr(
1798        dtype,
1799        typed_bytes(input_data),
1800        input.shape(),
1801        input.strides(),
1802        input.offset(),
1803    )
1804    .map_err(|error| Error::backend_source("elementwise_read_into", error))?;
1805
1806    let out_dims = SmallVec::<[usize; 8]>::from_slice(out.shape());
1807    let out_strides = SmallVec::<[isize; 8]>::from_slice(out.strides());
1808    let out_offset = out.offset();
1809    let out_data = out.host_storage_mut()?;
1810    // INVARIANT: the copied output layout describes this uniquely borrowed
1811    // host storage, already validated as disjoint from every input.
1812    let mut out_descriptor = erased_raw_strided_mut(
1813        dtype,
1814        typed_bytes_mut(out_data),
1815        &out_dims,
1816        &out_strides,
1817        out_offset,
1818    )
1819    .map_err(|error| Error::backend_source("elementwise_read_into", error))?;
1820    erased_map_into(dtype, op, ctx, &mut out_descriptor, &input_descriptor)
1821        .map_err(|error| Error::backend_source("elementwise_read_into", error))
1822}
1823
1824fn execute_one_shot_zip<T: 'static>(
1825    dtype: KernelDType,
1826    op: ErasedZipOp,
1827    ctx: &ExecContext,
1828    lhs: TypedTensorView<'_, T>,
1829    rhs: TypedTensorView<'_, T>,
1830    mut out: TypedTensorViewMut<'_, T>,
1831) -> crate::Result<()> {
1832    let lhs_data = lhs.host_storage()?;
1833    // INVARIANT: dtype and layout come from the same validated typed view, and
1834    // its host storage remains borrowed until replay returns.
1835    // SAFETY: lhs_data supplies the pointer and exact byte length; the view
1836    // owns the matching shape, signed strides, and in-bounds offset.
1837    let lhs_descriptor = erased_raw_strided_ptr(
1838        dtype,
1839        typed_bytes(lhs_data),
1840        lhs.shape(),
1841        lhs.strides(),
1842        lhs.offset(),
1843    )
1844    .map_err(|error| Error::backend_source("elementwise_read_into", error))?;
1845    let rhs_data = rhs.host_storage()?;
1846    // INVARIANT: dtype and layout come from the same validated typed view, and
1847    // its host storage remains borrowed until replay returns.
1848    // SAFETY: rhs_data supplies the pointer and exact byte length; the view
1849    // owns the matching shape, signed strides, and in-bounds offset.
1850    let rhs_descriptor = erased_raw_strided_ptr(
1851        dtype,
1852        typed_bytes(rhs_data),
1853        rhs.shape(),
1854        rhs.strides(),
1855        rhs.offset(),
1856    )
1857    .map_err(|error| Error::backend_source("elementwise_read_into", error))?;
1858
1859    let out_dims = SmallVec::<[usize; 8]>::from_slice(out.shape());
1860    let out_strides = SmallVec::<[isize; 8]>::from_slice(out.strides());
1861    let out_offset = out.offset();
1862    let out_data = out.host_storage_mut()?;
1863    // INVARIANT: the copied output layout describes this uniquely borrowed
1864    // host storage, already validated as disjoint from every input.
1865    let mut out_descriptor = erased_raw_strided_mut(
1866        dtype,
1867        typed_bytes_mut(out_data),
1868        &out_dims,
1869        &out_strides,
1870        out_offset,
1871    )
1872    .map_err(|error| Error::backend_source("elementwise_read_into", error))?;
1873    erased_zip_into(
1874        dtype,
1875        op,
1876        ctx,
1877        &mut out_descriptor,
1878        &lhs_descriptor,
1879        &rhs_descriptor,
1880    )
1881    .map_err(|error| Error::backend_source("elementwise_read_into", error))
1882}
1883
1884fn execute_one_shot_elementwise(
1885    op: ElementwiseReadOp,
1886    inputs: &[TensorRead<'_>],
1887    out: TensorWrite<'_>,
1888    ctx: &ExecContext,
1889) -> crate::Result<()> {
1890    let out = tensor_write_view(out);
1891    macro_rules! dispatch_map {
1892        ($map_op:expr) => {{
1893            let input = inputs[0].clone().tensor_view();
1894            match (input, out) {
1895                (TensorView::F32(input), TensorViewMut::F32(out)) => {
1896                    execute_one_shot_map(KernelDType::F32, $map_op, ctx, input, out)
1897                }
1898                (TensorView::F64(input), TensorViewMut::F64(out)) => {
1899                    execute_one_shot_map(KernelDType::F64, $map_op, ctx, input, out)
1900                }
1901                (TensorView::I32(input), TensorViewMut::I32(out)) => {
1902                    execute_one_shot_map(KernelDType::I32, $map_op, ctx, input, out)
1903                }
1904                (TensorView::I64(input), TensorViewMut::I64(out)) => {
1905                    execute_one_shot_map(KernelDType::I64, $map_op, ctx, input, out)
1906                }
1907                (TensorView::Bool(input), TensorViewMut::Bool(out)) => {
1908                    execute_one_shot_map(KernelDType::Bool, $map_op, ctx, input, out)
1909                }
1910                (TensorView::C32(input), TensorViewMut::C32(out)) => {
1911                    execute_one_shot_map(KernelDType::C32, $map_op, ctx, input, out)
1912                }
1913                (TensorView::C64(input), TensorViewMut::C64(out)) => {
1914                    execute_one_shot_map(KernelDType::C64, $map_op, ctx, input, out)
1915                }
1916                _ => unreachable!("one-shot eligibility validates matching dtypes"),
1917            }
1918        }};
1919    }
1920    macro_rules! dispatch_zip {
1921        ($zip_op:expr) => {{
1922            let lhs = inputs[0].clone().tensor_view();
1923            let rhs = inputs[1].clone().tensor_view();
1924            match (lhs, rhs, out) {
1925                (TensorView::F32(lhs), TensorView::F32(rhs), TensorViewMut::F32(out)) => {
1926                    execute_one_shot_zip(KernelDType::F32, $zip_op, ctx, lhs, rhs, out)
1927                }
1928                (TensorView::F64(lhs), TensorView::F64(rhs), TensorViewMut::F64(out)) => {
1929                    execute_one_shot_zip(KernelDType::F64, $zip_op, ctx, lhs, rhs, out)
1930                }
1931                (TensorView::I32(lhs), TensorView::I32(rhs), TensorViewMut::I32(out)) => {
1932                    execute_one_shot_zip(KernelDType::I32, $zip_op, ctx, lhs, rhs, out)
1933                }
1934                (TensorView::I64(lhs), TensorView::I64(rhs), TensorViewMut::I64(out)) => {
1935                    execute_one_shot_zip(KernelDType::I64, $zip_op, ctx, lhs, rhs, out)
1936                }
1937                (TensorView::C32(lhs), TensorView::C32(rhs), TensorViewMut::C32(out)) => {
1938                    execute_one_shot_zip(KernelDType::C32, $zip_op, ctx, lhs, rhs, out)
1939                }
1940                (TensorView::C64(lhs), TensorView::C64(rhs), TensorViewMut::C64(out)) => {
1941                    execute_one_shot_zip(KernelDType::C64, $zip_op, ctx, lhs, rhs, out)
1942                }
1943                _ => unreachable!("one-shot eligibility validates matching dtypes"),
1944            }
1945        }};
1946    }
1947
1948    match op {
1949        ElementwiseReadOp::Add => dispatch_zip!(ErasedZipOp::Add),
1950        ElementwiseReadOp::Subtract => dispatch_zip!(ErasedZipOp::Subtract),
1951        ElementwiseReadOp::Multiply => dispatch_zip!(ErasedZipOp::Multiply),
1952        ElementwiseReadOp::Divide => dispatch_zip!(ErasedZipOp::Divide),
1953        ElementwiseReadOp::Negate => dispatch_map!(ErasedMapOp::Negate),
1954        ElementwiseReadOp::Conj => dispatch_map!(ErasedMapOp::Conj),
1955    }
1956}
1957
1958/// Execute the shared elementwise-into path with an explicit replay context.
1959///
1960/// This is backend glue for implementations that own an execution context.
1961///
1962/// # Errors
1963///
1964/// Returns [`crate::Error::Validation`] when the input arity or tensor
1965/// metadata is invalid, or when the destination overlaps an input. Returns
1966/// [`crate::Error::BackendSource`] when an eligible strided replay fails.
1967/// Errors returned by `fallback` are preserved unchanged.
1968#[doc(hidden)]
1969pub fn elementwise_read_into_with_context(
1970    op: ElementwiseReadOp,
1971    inputs: &[TensorRead<'_>],
1972    out: TensorWrite<'_>,
1973    ctx: &ExecContext,
1974    fallback: impl FnOnce(&[TensorRead<'_>], TensorWrite<'_>) -> crate::Result<()>,
1975) -> crate::Result<()> {
1976    if inputs.len() != op.arity() {
1977        return Err(Error::invalid_argument(
1978            op.label(),
1979            "inputs",
1980            format!("expected {} inputs, got {}", op.arity(), inputs.len()),
1981        ));
1982    }
1983    validate_elementwise_output_disjoint(op, inputs, &out)?;
1984    if one_shot_eligible(op, inputs, &out) {
1985        execute_one_shot_elementwise(op, inputs, out, ctx)
1986    } else {
1987        fallback(inputs, out)
1988    }
1989}
1990
1991/// Elementwise tensor operations.
1992///
1993/// # Examples
1994///
1995/// ```rust
1996/// use tenferro_tensor::TensorElementwise;
1997///
1998/// fn accepts_elementwise<B: TensorElementwise>(_backend: &mut B) {}
1999/// ```
2000pub trait TensorElementwise: TensorStructural {
2001    /// Execute an elementwise operation into caller-owned storage.
2002    ///
2003    /// Backend implementations normally override this hook only to inject
2004    /// their explicit execution context and buffer policy. The default uses a
2005    /// serial host one-shot kernel and preserves the allocating fallback for
2006    /// device storage, dtype promotion, and broadcasting.
2007    ///
2008    /// # Errors
2009    ///
2010    /// Returns [`crate::Error::Validation`] when `inputs` has the wrong arity,
2011    /// tensor metadata is invalid, or the destination overlaps an input.
2012    /// Returns [`crate::Error::BackendSource`] when the strided kernel rejects
2013    /// an eligible host operation. Errors from the allocating backend fallback
2014    /// are preserved unchanged.
2015    fn elementwise_read_into(
2016        &mut self,
2017        op: ElementwiseReadOp,
2018        inputs: &[TensorRead<'_>],
2019        out: TensorWrite<'_>,
2020    ) -> crate::Result<()> {
2021        let ctx = ExecContext::serial();
2022        elementwise_read_into_with_context(op, inputs, out, &ctx, |inputs, out| {
2023            let result = match op {
2024                ElementwiseReadOp::Add => self.add_read(inputs[0].clone(), inputs[1].clone())?,
2025                ElementwiseReadOp::Subtract => {
2026                    self.sub_read(inputs[0].clone(), inputs[1].clone())?
2027                }
2028                ElementwiseReadOp::Multiply => {
2029                    self.mul_read(inputs[0].clone(), inputs[1].clone())?
2030                }
2031                ElementwiseReadOp::Negate => self.neg_read(inputs[0].clone())?,
2032                ElementwiseReadOp::Conj => self.conj_read(inputs[0].clone())?,
2033                ElementwiseReadOp::Divide => self.div_read(inputs[0].clone(), inputs[1].clone())?,
2034            };
2035            self.copy_read_into(TensorRead::from_tensor(&result), out)
2036        })
2037    }
2038
    /// Elementwise addition of two owned tensors, returning the result as a
    /// new tensor.
    ///
    /// Required backend hook; the provided `add_read` forwards here.
    ///
    /// # Errors
    ///
    /// Returns [`crate::Error::Validation`] with a typed `ValidationError` source
    /// for invalid shapes, ranks, axes, dtypes, or output metadata. It returns
    /// [`crate::Error::BackendFailure`] or [`crate::Error::BackendSource`] when
    /// backend execution or storage access cannot provide the requested result.
    fn add(&mut self, lhs: &Tensor, rhs: &Tensor) -> crate::Result<Tensor>;
2046
2047    /// Elementwise addition accepting either owned tensors or borrowed views.
2048    ///
2049    /// Backends that implement this method must not silently move data across
2050    /// devices. A backend that cannot consume views should return an explicit
2051    /// backend error rather than materializing or transferring implicitly.
2052    ///
2053    /// # Examples
2054    ///
2055    /// ```rust
2056    /// use tenferro_tensor::{Tensor, TensorElementwise, TensorRead};
2057    ///
2058    /// fn add_owned<B: TensorElementwise>(
2059    ///     backend: &mut B,
2060    ///     lhs: &Tensor,
2061    ///     rhs: &Tensor,
2062    /// ) -> tenferro_tensor::Result<Tensor> {
2063    ///     backend.add_read(TensorRead::from_tensor(lhs), TensorRead::from_tensor(rhs))
2064    /// }
2065    /// ```
2066    /// # Errors
2067    ///
2068    /// Returns [`crate::Error::Validation`] with a typed `ValidationError` source
2069    /// for invalid shapes, ranks, axes, dtypes, or output metadata. It returns
2070    /// [`crate::Error::BackendFailure`] or [`crate::Error::BackendSource`] when
2071    /// backend execution or storage access cannot provide the requested result.
2072    fn add_read(&mut self, lhs: TensorRead<'_>, rhs: TensorRead<'_>) -> crate::Result<Tensor> {
2073        self.add(read_tensor("add", lhs)?, read_tensor("add", rhs)?)
2074    }
2075
2076    /// Overwrite caller-provided output with elementwise addition.
2077    ///
2078    /// `_into` methods never accumulate into the previous output value.
2079    ///
2080    /// # Examples
2081    ///
2082    /// ```rust
2083    /// use tenferro_tensor::{Tensor, TensorElementwise, TensorWrite};
2084    ///
2085    /// fn add_into<B: TensorElementwise>(
2086    ///     backend: &mut B,
2087    ///     lhs: &Tensor,
2088    ///     rhs: &Tensor,
2089    ///     mut out: Tensor,
2090    /// ) -> tenferro_tensor::Result<Tensor> {
2091    ///     backend.add_into(lhs, rhs, TensorWrite::from_tensor(&mut out))?;
2092    ///     Ok(out)
2093    /// }
2094    /// ```
2095    /// # Errors
2096    ///
2097    /// Returns [`crate::Error::Validation`] with a typed `ValidationError` source
2098    /// for invalid shapes, ranks, axes, dtypes, or output metadata. It returns
2099    /// [`crate::Error::BackendFailure`] or [`crate::Error::BackendSource`] when
2100    /// backend execution or storage access cannot provide the requested result.
2101    fn add_into(&mut self, lhs: &Tensor, rhs: &Tensor, out: TensorWrite<'_>) -> crate::Result<()> {
2102        self.add_read_into(
2103            TensorRead::from_tensor(lhs),
2104            TensorRead::from_tensor(rhs),
2105            out,
2106        )
2107    }
2108
2109    /// Overwrite caller-provided output with elementwise addition from reads.
2110    ///
2111    /// # Examples
2112    ///
2113    /// ```rust
2114    /// use tenferro_tensor::{TensorElementwise, TensorRead, TensorWrite};
2115    ///
2116    /// fn add_read_into<B: TensorElementwise>(
2117    ///     backend: &mut B,
2118    ///     lhs: TensorRead<'_>,
2119    ///     rhs: TensorRead<'_>,
2120    ///     out: TensorWrite<'_>,
2121    /// ) -> tenferro_tensor::Result<()> {
2122    ///     backend.add_read_into(lhs, rhs, out)
2123    /// }
2124    /// ```
2125    /// # Errors
2126    ///
2127    /// Returns [`crate::Error::Validation`] with a typed `ValidationError` source
2128    /// for invalid shapes, ranks, axes, dtypes, or output metadata. It returns
2129    /// [`crate::Error::BackendFailure`] or [`crate::Error::BackendSource`] when
2130    /// backend execution or storage access cannot provide the requested result.
2131    fn add_read_into(
2132        &mut self,
2133        lhs: TensorRead<'_>,
2134        rhs: TensorRead<'_>,
2135        out: TensorWrite<'_>,
2136    ) -> crate::Result<()> {
2137        self.elementwise_read_into(ElementwiseReadOp::Add, &[lhs, rhs], out)
2138    }
2139
    /// Elementwise subtraction of two owned tensors, returning the result as a
    /// new tensor.
    ///
    /// Required backend hook; the provided `sub_read` forwards here.
    ///
    /// # Errors
    ///
    /// Returns [`crate::Error::Validation`] with a typed `ValidationError` source
    /// for invalid shapes, ranks, axes, dtypes, or output metadata. It returns
    /// [`crate::Error::BackendFailure`] or [`crate::Error::BackendSource`] when
    /// backend execution or storage access cannot provide the requested result.
    fn sub(&mut self, lhs: &Tensor, rhs: &Tensor) -> crate::Result<Tensor>;
2147
2148    /// Elementwise subtraction accepting either owned tensors or borrowed views.
2149    ///
2150    /// # Examples
2151    ///
2152    /// ```rust
2153    /// use tenferro_tensor::{Tensor, TensorElementwise, TensorRead};
2154    ///
2155    /// fn sub_owned<B: TensorElementwise>(
2156    ///     backend: &mut B,
2157    ///     lhs: &Tensor,
2158    ///     rhs: &Tensor,
2159    /// ) -> tenferro_tensor::Result<Tensor> {
2160    ///     backend.sub_read(TensorRead::from_tensor(lhs), TensorRead::from_tensor(rhs))
2161    /// }
2162    /// ```
2163    /// # Errors
2164    ///
2165    /// Returns [`crate::Error::Validation`] with a typed `ValidationError` source
2166    /// for invalid shapes, ranks, axes, dtypes, or output metadata. It returns
2167    /// [`crate::Error::BackendFailure`] or [`crate::Error::BackendSource`] when
2168    /// backend execution or storage access cannot provide the requested result.
2169    fn sub_read(&mut self, lhs: TensorRead<'_>, rhs: TensorRead<'_>) -> crate::Result<Tensor> {
2170        self.sub(read_tensor("sub", lhs)?, read_tensor("sub", rhs)?)
2171    }
2172
2173    /// Overwrite caller-provided output with elementwise subtraction.
2174    /// # Errors
2175    ///
2176    /// Returns [`crate::Error::Validation`] with a typed `ValidationError` source
2177    /// for invalid shapes, ranks, axes, dtypes, or output metadata. It returns
2178    /// [`crate::Error::BackendFailure`] or [`crate::Error::BackendSource`] when
2179    /// backend execution or storage access cannot provide the requested result.
2180    fn sub_into(&mut self, lhs: &Tensor, rhs: &Tensor, out: TensorWrite<'_>) -> crate::Result<()> {
2181        self.sub_read_into(
2182            TensorRead::from_tensor(lhs),
2183            TensorRead::from_tensor(rhs),
2184            out,
2185        )
2186    }
2187
2188    /// Overwrite caller-provided output with elementwise subtraction from reads.
2189    /// # Errors
2190    ///
2191    /// Returns [`crate::Error::Validation`] with a typed `ValidationError` source
2192    /// for invalid shapes, ranks, axes, dtypes, or output metadata. It returns
2193    /// [`crate::Error::BackendFailure`] or [`crate::Error::BackendSource`] when
2194    /// backend execution or storage access cannot provide the requested result.
2195    fn sub_read_into(
2196        &mut self,
2197        lhs: TensorRead<'_>,
2198        rhs: TensorRead<'_>,
2199        out: TensorWrite<'_>,
2200    ) -> crate::Result<()> {
2201        self.elementwise_read_into(ElementwiseReadOp::Subtract, &[lhs, rhs], out)
2202    }
2203
    /// Elementwise multiplication of two owned tensors, returning the result
    /// as a new tensor.
    ///
    /// Required backend hook; the provided `mul_read` forwards here.
    ///
    /// # Errors
    ///
    /// Returns [`crate::Error::Validation`] with a typed `ValidationError` source
    /// for invalid shapes, ranks, axes, dtypes, or output metadata. It returns
    /// [`crate::Error::BackendFailure`] or [`crate::Error::BackendSource`] when
    /// backend execution or storage access cannot provide the requested result.
    fn mul(&mut self, lhs: &Tensor, rhs: &Tensor) -> crate::Result<Tensor>;
2211    /// # Errors
2212    ///
2213    /// Returns [`crate::Error::Validation`] with a typed `ValidationError` source
2214    /// for invalid shapes, ranks, axes, dtypes, or output metadata. It returns
2215    /// [`crate::Error::BackendFailure`] or [`crate::Error::BackendSource`] when
2216    /// backend execution or storage access cannot provide the requested result.
2217    fn mul_read(&mut self, lhs: TensorRead<'_>, rhs: TensorRead<'_>) -> crate::Result<Tensor> {
2218        self.mul(read_tensor("mul", lhs)?, read_tensor("mul", rhs)?)
2219    }
2220
2221    /// Overwrite caller-provided output with elementwise multiplication.
2222    /// # Errors
2223    ///
2224    /// Returns [`crate::Error::Validation`] with a typed `ValidationError` source
2225    /// for invalid shapes, ranks, axes, dtypes, or output metadata. It returns
2226    /// [`crate::Error::BackendFailure`] or [`crate::Error::BackendSource`] when
2227    /// backend execution or storage access cannot provide the requested result.
2228    fn mul_into(&mut self, lhs: &Tensor, rhs: &Tensor, out: TensorWrite<'_>) -> crate::Result<()> {
2229        self.mul_read_into(
2230            TensorRead::from_tensor(lhs),
2231            TensorRead::from_tensor(rhs),
2232            out,
2233        )
2234    }
2235
2236    /// Overwrite caller-provided output with elementwise multiplication from reads.
2237    /// # Errors
2238    ///
2239    /// Returns [`crate::Error::Validation`] with a typed `ValidationError` source
2240    /// for invalid shapes, ranks, axes, dtypes, or output metadata. It returns
2241    /// [`crate::Error::BackendFailure`] or [`crate::Error::BackendSource`] when
2242    /// backend execution or storage access cannot provide the requested result.
2243    fn mul_read_into(
2244        &mut self,
2245        lhs: TensorRead<'_>,
2246        rhs: TensorRead<'_>,
2247        out: TensorWrite<'_>,
2248    ) -> crate::Result<()> {
2249        self.elementwise_read_into(ElementwiseReadOp::Multiply, &[lhs, rhs], out)
2250    }
2251
    /// Elementwise negation of an owned tensor, returning the result as a new
    /// tensor.
    ///
    /// Required backend hook; the provided `neg_read` forwards here.
    ///
    /// # Errors
    ///
    /// Returns [`crate::Error::Validation`] with a typed `ValidationError` source
    /// for invalid shapes, ranks, axes, dtypes, or output metadata. It returns
    /// [`crate::Error::BackendFailure`] or [`crate::Error::BackendSource`] when
    /// backend execution or storage access cannot provide the requested result.
    fn neg(&mut self, input: &Tensor) -> crate::Result<Tensor>;
2259    /// # Errors
2260    ///
2261    /// Returns [`crate::Error::Validation`] with a typed `ValidationError` source
2262    /// for invalid shapes, ranks, axes, dtypes, or output metadata. It returns
2263    /// [`crate::Error::BackendFailure`] or [`crate::Error::BackendSource`] when
2264    /// backend execution or storage access cannot provide the requested result.
2265    fn neg_read(&mut self, input: TensorRead<'_>) -> crate::Result<Tensor> {
2266        self.neg(read_tensor("neg", input)?)
2267    }
2268
2269    /// Overwrite caller-provided output with elementwise negation.
2270    /// # Errors
2271    ///
2272    /// Returns [`crate::Error::Validation`] with a typed `ValidationError` source
2273    /// for invalid shapes, ranks, axes, dtypes, or output metadata. It returns
2274    /// [`crate::Error::BackendFailure`] or [`crate::Error::BackendSource`] when
2275    /// backend execution or storage access cannot provide the requested result.
2276    fn neg_into(&mut self, input: &Tensor, out: TensorWrite<'_>) -> crate::Result<()> {
2277        self.neg_read_into(TensorRead::from_tensor(input), out)
2278    }
2279
2280    /// Overwrite caller-provided output with elementwise negation from a read.
2281    /// # Errors
2282    ///
2283    /// Returns [`crate::Error::Validation`] with a typed `ValidationError` source
2284    /// for invalid shapes, ranks, axes, dtypes, or output metadata. It returns
2285    /// [`crate::Error::BackendFailure`] or [`crate::Error::BackendSource`] when
2286    /// backend execution or storage access cannot provide the requested result.
2287    fn neg_read_into(&mut self, input: TensorRead<'_>, out: TensorWrite<'_>) -> crate::Result<()> {
2288        self.elementwise_read_into(ElementwiseReadOp::Negate, &[input], out)
2289    }
2290
    /// Elementwise conjugation of an owned tensor, returning the result as a
    /// new tensor.
    ///
    /// Required backend hook; the provided `conj_read` forwards here.
    ///
    /// # Errors
    ///
    /// Returns [`crate::Error::Validation`] with a typed `ValidationError` source
    /// for invalid shapes, ranks, axes, dtypes, or output metadata. It returns
    /// [`crate::Error::BackendFailure`] or [`crate::Error::BackendSource`] when
    /// backend execution or storage access cannot provide the requested result.
    fn conj(&mut self, input: &Tensor) -> crate::Result<Tensor>;
2298    /// # Errors
2299    ///
2300    /// Returns [`crate::Error::Validation`] with a typed `ValidationError` source
2301    /// for invalid shapes, ranks, axes, dtypes, or output metadata. It returns
2302    /// [`crate::Error::BackendFailure`] or [`crate::Error::BackendSource`] when
2303    /// backend execution or storage access cannot provide the requested result.
2304    fn conj_read(&mut self, input: TensorRead<'_>) -> crate::Result<Tensor> {
2305        self.conj(read_tensor("conj", input)?)
2306    }
2307
2308    /// Overwrite caller-provided output with elementwise conjugation.
2309    /// # Errors
2310    ///
2311    /// Returns [`crate::Error::Validation`] with a typed `ValidationError` source
2312    /// for invalid shapes, ranks, axes, dtypes, or output metadata. It returns
2313    /// [`crate::Error::BackendFailure`] or [`crate::Error::BackendSource`] when
2314    /// backend execution or storage access cannot provide the requested result.
2315    fn conj_into(&mut self, input: &Tensor, out: TensorWrite<'_>) -> crate::Result<()> {
2316        self.conj_read_into(TensorRead::from_tensor(input), out)
2317    }
2318
2319    /// Overwrite caller-provided output with elementwise conjugation from a read.
2320    /// # Errors
2321    ///
2322    /// Returns [`crate::Error::Validation`] with a typed `ValidationError` source
2323    /// for invalid shapes, ranks, axes, dtypes, or output metadata. It returns
2324    /// [`crate::Error::BackendFailure`] or [`crate::Error::BackendSource`] when
2325    /// backend execution or storage access cannot provide the requested result.
2326    fn conj_read_into(&mut self, input: TensorRead<'_>, out: TensorWrite<'_>) -> crate::Result<()> {
2327        self.elementwise_read_into(ElementwiseReadOp::Conj, &[input], out)
2328    }
2329
    /// Elementwise division of two owned tensors, returning the result as a
    /// new tensor.
    ///
    /// Required backend hook; the provided `div_read` forwards here.
    ///
    /// # Errors
    ///
    /// Returns [`crate::Error::Validation`] with a typed `ValidationError` source
    /// for invalid shapes, ranks, axes, dtypes, or output metadata. It returns
    /// [`crate::Error::BackendFailure`] or [`crate::Error::BackendSource`] when
    /// backend execution or storage access cannot provide the requested result.
    fn div(&mut self, lhs: &Tensor, rhs: &Tensor) -> crate::Result<Tensor>;
2337    /// # Errors
2338    ///
2339    /// Returns [`crate::Error::Validation`] with a typed `ValidationError` source
2340    /// for invalid shapes, ranks, axes, dtypes, or output metadata. It returns
2341    /// [`crate::Error::BackendFailure`] or [`crate::Error::BackendSource`] when
2342    /// backend execution or storage access cannot provide the requested result.
2343    fn div_read(&mut self, lhs: TensorRead<'_>, rhs: TensorRead<'_>) -> crate::Result<Tensor> {
2344        self.div(read_tensor("div", lhs)?, read_tensor("div", rhs)?)
2345    }
2346
2347    /// Overwrite caller-provided output with elementwise division.
2348    /// # Errors
2349    ///
2350    /// Returns [`crate::Error::Validation`] with a typed `ValidationError` source
2351    /// for invalid shapes, ranks, axes, dtypes, or output metadata. It returns
2352    /// [`crate::Error::BackendFailure`] or [`crate::Error::BackendSource`] when
2353    /// backend execution or storage access cannot provide the requested result.
2354    fn div_into(&mut self, lhs: &Tensor, rhs: &Tensor, out: TensorWrite<'_>) -> crate::Result<()> {
2355        self.div_read_into(
2356            TensorRead::from_tensor(lhs),
2357            TensorRead::from_tensor(rhs),
2358            out,
2359        )
2360    }
2361
2362    /// Overwrite caller-provided output with elementwise division from reads.
2363    /// # Errors
2364    ///
2365    /// Returns [`crate::Error::Validation`] with a typed `ValidationError` source
2366    /// for invalid shapes, ranks, axes, dtypes, or output metadata. It returns
2367    /// [`crate::Error::BackendFailure`] or [`crate::Error::BackendSource`] when
2368    /// backend execution or storage access cannot provide the requested result.
2369    fn div_read_into(
2370        &mut self,
2371        lhs: TensorRead<'_>,
2372        rhs: TensorRead<'_>,
2373        out: TensorWrite<'_>,
2374    ) -> crate::Result<()> {
2375        self.elementwise_read_into(ElementwiseReadOp::Divide, &[lhs, rhs], out)
2376    }
2377
2378    /// Elementwise remainder.
2379    ///
2380    /// The default is an explicit unsupported error so backend implementors can
2381    /// opt in without silent fallback.
2382    ///
2383    /// # Examples
2384    ///
2385    /// ```rust
2386    /// use tenferro_tensor::{Tensor, TensorElementwise};
2387    ///
2388    /// fn rem_owned<B: TensorElementwise>(
2389    ///     backend: &mut B,
2390    ///     lhs: &Tensor,
2391    ///     rhs: &Tensor,
2392    /// ) -> tenferro_tensor::Result<Tensor> {
2393    ///     backend.rem(lhs, rhs)
2394    /// }
2395    /// ```
2396    /// # Errors
2397    ///
2398    /// Returns [`crate::Error::Validation`] with a typed `ValidationError` source
2399    /// for invalid shapes, ranks, axes, dtypes, or output metadata. It returns
2400    /// [`crate::Error::BackendFailure`] or [`crate::Error::BackendSource`] when
2401    /// backend execution or storage access cannot provide the requested result.
2402    fn rem(&mut self, lhs: &Tensor, _rhs: &Tensor) -> crate::Result<Tensor> {
2403        Err(crate::Error::unsupported(
2404            "rem",
2405            format!("backend does not implement rem for dtype {:?}", lhs.dtype()),
2406        ))
2407    }
2408
2409    /// Elementwise remainder accepting owned tensors or borrowed views.
2410    ///
2411    /// # Examples
2412    ///
2413    /// ```rust
2414    /// use tenferro_tensor::{Tensor, TensorElementwise, TensorRead};
2415    ///
2416    /// fn rem_read<B: TensorElementwise>(
2417    ///     backend: &mut B,
2418    ///     lhs: &Tensor,
2419    ///     rhs: &Tensor,
2420    /// ) -> tenferro_tensor::Result<Tensor> {
2421    ///     backend.rem_read(TensorRead::from_tensor(lhs), TensorRead::from_tensor(rhs))
2422    /// }
2423    /// ```
2424    /// # Errors
2425    ///
2426    /// Returns [`crate::Error::Validation`] with a typed `ValidationError` source
2427    /// for invalid shapes, ranks, axes, dtypes, or output metadata. It returns
2428    /// [`crate::Error::BackendFailure`] or [`crate::Error::BackendSource`] when
2429    /// backend execution or storage access cannot provide the requested result.
2430    fn rem_read(&mut self, lhs: TensorRead<'_>, rhs: TensorRead<'_>) -> crate::Result<Tensor> {
2431        self.rem(read_tensor("rem", lhs)?, read_tensor("rem", rhs)?)
2432    }
2433
    /// Required backend hook for the elementwise `abs` operation on an owned
    /// tensor.
    ///
    /// There is no default body; the provided `abs_read` forwards here.
    ///
    /// # Errors
    ///
    /// Returns [`crate::Error::Validation`] with a typed `ValidationError` source
    /// for invalid shapes, ranks, axes, dtypes, or output metadata. It returns
    /// [`crate::Error::BackendFailure`] or [`crate::Error::BackendSource`] when
    /// backend execution or storage access cannot provide the requested result.
    fn abs(&mut self, input: &Tensor) -> crate::Result<Tensor>;
2441    /// # Errors
2442    ///
2443    /// Returns [`crate::Error::Validation`] with a typed `ValidationError` source
2444    /// for invalid shapes, ranks, axes, dtypes, or output metadata. It returns
2445    /// [`crate::Error::BackendFailure`] or [`crate::Error::BackendSource`] when
2446    /// backend execution or storage access cannot provide the requested result.
2447    fn abs_read(&mut self, input: TensorRead<'_>) -> crate::Result<Tensor> {
2448        self.abs(read_tensor("abs", input)?)
2449    }
2450
    /// Required backend hook for the elementwise `sign` operation on an owned
    /// tensor.
    ///
    /// There is no default body; the provided `sign_read` forwards here.
    ///
    /// # Errors
    ///
    /// Returns [`crate::Error::Validation`] with a typed `ValidationError` source
    /// for invalid shapes, ranks, axes, dtypes, or output metadata. It returns
    /// [`crate::Error::BackendFailure`] or [`crate::Error::BackendSource`] when
    /// backend execution or storage access cannot provide the requested result.
    fn sign(&mut self, input: &Tensor) -> crate::Result<Tensor>;
2458    /// # Errors
2459    ///
2460    /// Returns [`crate::Error::Validation`] with a typed `ValidationError` source
2461    /// for invalid shapes, ranks, axes, dtypes, or output metadata. It returns
2462    /// [`crate::Error::BackendFailure`] or [`crate::Error::BackendSource`] when
2463    /// backend execution or storage access cannot provide the requested result.
2464    fn sign_read(&mut self, input: TensorRead<'_>) -> crate::Result<Tensor> {
2465        self.sign(read_tensor("sign", input)?)
2466    }
2467
    /// Required backend entry point for the `maximum` operation on two owned
    /// tensors.
    ///
    /// There is no default implementation; `maximum_read` forwards to this
    /// method once both operands have been resolved to a [`Tensor`].
    ///
    /// # Errors
    ///
    /// Returns [`crate::Error::Validation`] with a typed `ValidationError` source
    /// for invalid shapes, ranks, axes, dtypes, or output metadata. It returns
    /// [`crate::Error::BackendFailure`] or [`crate::Error::BackendSource`] when
    /// backend execution or storage access cannot provide the requested result.
    fn maximum(&mut self, lhs: &Tensor, rhs: &Tensor) -> crate::Result<Tensor>;
2475    /// # Errors
2476    ///
2477    /// Returns [`crate::Error::Validation`] with a typed `ValidationError` source
2478    /// for invalid shapes, ranks, axes, dtypes, or output metadata. It returns
2479    /// [`crate::Error::BackendFailure`] or [`crate::Error::BackendSource`] when
2480    /// backend execution or storage access cannot provide the requested result.
2481    fn maximum_read(&mut self, lhs: TensorRead<'_>, rhs: TensorRead<'_>) -> crate::Result<Tensor> {
2482        self.maximum(read_tensor("maximum", lhs)?, read_tensor("maximum", rhs)?)
2483    }
2484
    /// Required backend entry point for the `minimum` operation on two owned
    /// tensors.
    ///
    /// There is no default implementation; `minimum_read` forwards to this
    /// method once both operands have been resolved to a [`Tensor`].
    ///
    /// # Errors
    ///
    /// Returns [`crate::Error::Validation`] with a typed `ValidationError` source
    /// for invalid shapes, ranks, axes, dtypes, or output metadata. It returns
    /// [`crate::Error::BackendFailure`] or [`crate::Error::BackendSource`] when
    /// backend execution or storage access cannot provide the requested result.
    fn minimum(&mut self, lhs: &Tensor, rhs: &Tensor) -> crate::Result<Tensor>;
2492    /// # Errors
2493    ///
2494    /// Returns [`crate::Error::Validation`] with a typed `ValidationError` source
2495    /// for invalid shapes, ranks, axes, dtypes, or output metadata. It returns
2496    /// [`crate::Error::BackendFailure`] or [`crate::Error::BackendSource`] when
2497    /// backend execution or storage access cannot provide the requested result.
2498    fn minimum_read(&mut self, lhs: TensorRead<'_>, rhs: TensorRead<'_>) -> crate::Result<Tensor> {
2499        self.minimum(read_tensor("minimum", lhs)?, read_tensor("minimum", rhs)?)
2500    }
2501
    /// Required backend entry point for the `compare` operation on two owned
    /// tensors, parameterized by the [`CompareDir`] passed as `dir`.
    ///
    /// There is no default implementation; `compare_read` forwards to this
    /// method once both operands have been resolved to a [`Tensor`].
    ///
    /// # Errors
    ///
    /// Returns [`crate::Error::Validation`] with a typed `ValidationError` source
    /// for invalid shapes, ranks, axes, dtypes, or output metadata. It returns
    /// [`crate::Error::BackendFailure`] or [`crate::Error::BackendSource`] when
    /// backend execution or storage access cannot provide the requested result.
    fn compare(&mut self, lhs: &Tensor, rhs: &Tensor, dir: &CompareDir) -> crate::Result<Tensor>;
2509    /// # Errors
2510    ///
2511    /// Returns [`crate::Error::Validation`] with a typed `ValidationError` source
2512    /// for invalid shapes, ranks, axes, dtypes, or output metadata. It returns
2513    /// [`crate::Error::BackendFailure`] or [`crate::Error::BackendSource`] when
2514    /// backend execution or storage access cannot provide the requested result.
2515    fn compare_read(
2516        &mut self,
2517        lhs: TensorRead<'_>,
2518        rhs: TensorRead<'_>,
2519        dir: &CompareDir,
2520    ) -> crate::Result<Tensor> {
2521        self.compare(
2522            read_tensor("compare", lhs)?,
2523            read_tensor("compare", rhs)?,
2524            dir,
2525        )
2526    }
2527
    /// Required backend entry point for the `select` operation on three owned
    /// tensors: `pred`, `on_true`, and `on_false`.
    ///
    /// There is no default implementation; `select_read` forwards to this
    /// method once all three operands have been resolved to a [`Tensor`].
    ///
    /// # Errors
    ///
    /// Returns [`crate::Error::Validation`] with a typed `ValidationError` source
    /// for invalid shapes, ranks, axes, dtypes, or output metadata. It returns
    /// [`crate::Error::BackendFailure`] or [`crate::Error::BackendSource`] when
    /// backend execution or storage access cannot provide the requested result.
    fn select(
        &mut self,
        pred: &Tensor,
        on_true: &Tensor,
        on_false: &Tensor,
    ) -> crate::Result<Tensor>;
2540    /// # Errors
2541    ///
2542    /// Returns [`crate::Error::Validation`] with a typed `ValidationError` source
2543    /// for invalid shapes, ranks, axes, dtypes, or output metadata. It returns
2544    /// [`crate::Error::BackendFailure`] or [`crate::Error::BackendSource`] when
2545    /// backend execution or storage access cannot provide the requested result.
2546    fn select_read(
2547        &mut self,
2548        pred: TensorRead<'_>,
2549        on_true: TensorRead<'_>,
2550        on_false: TensorRead<'_>,
2551    ) -> crate::Result<Tensor> {
2552        self.select(
2553            read_tensor("select", pred)?,
2554            read_tensor("select", on_true)?,
2555            read_tensor("select", on_false)?,
2556        )
2557    }
2558
    /// Required backend entry point for the `clamp` operation on three owned
    /// tensors: `input`, `lower`, and `upper`.
    ///
    /// There is no default implementation; `clamp_read` forwards to this
    /// method once all three operands have been resolved to a [`Tensor`].
    ///
    /// # Errors
    ///
    /// Returns [`crate::Error::Validation`] with a typed `ValidationError` source
    /// for invalid shapes, ranks, axes, dtypes, or output metadata. It returns
    /// [`crate::Error::BackendFailure`] or [`crate::Error::BackendSource`] when
    /// backend execution or storage access cannot provide the requested result.
    fn clamp(&mut self, input: &Tensor, lower: &Tensor, upper: &Tensor) -> crate::Result<Tensor>;
2566    /// # Errors
2567    ///
2568    /// Returns [`crate::Error::Validation`] with a typed `ValidationError` source
2569    /// for invalid shapes, ranks, axes, dtypes, or output metadata. It returns
2570    /// [`crate::Error::BackendFailure`] or [`crate::Error::BackendSource`] when
2571    /// backend execution or storage access cannot provide the requested result.
2572    fn clamp_read(
2573        &mut self,
2574        input: TensorRead<'_>,
2575        lower: TensorRead<'_>,
2576        upper: TensorRead<'_>,
2577    ) -> crate::Result<Tensor> {
2578        self.clamp(
2579            read_tensor("clamp", input)?,
2580            read_tensor("clamp", lower)?,
2581            read_tensor("clamp", upper)?,
2582        )
2583    }
2584}
2585
2586/// Analytic unary and binary tensor operations.
2587///
2588/// # Examples
2589///
2590/// ```rust
2591/// use tenferro_tensor::TensorAnalytic;
2592///
2593/// fn accepts_analytic<B: TensorAnalytic>(_backend: &mut B) {}
2594/// ```
2595pub trait TensorAnalytic {
2596    /// # Errors
2597    ///
2598    /// Returns [`crate::Error::Validation`] with a typed `ValidationError` source
2599    /// for invalid shapes, ranks, axes, dtypes, or output metadata. It returns
2600    /// [`crate::Error::BackendFailure`] or [`crate::Error::BackendSource`] when
2601    /// backend execution or storage access cannot provide the requested result.
2602    fn exp(&mut self, input: &Tensor) -> crate::Result<Tensor>;
2603    /// # Errors
2604    ///
2605    /// Returns [`crate::Error::Validation`] with a typed `ValidationError` source
2606    /// for invalid shapes, ranks, axes, dtypes, or output metadata. It returns
2607    /// [`crate::Error::BackendFailure`] or [`crate::Error::BackendSource`] when
2608    /// backend execution or storage access cannot provide the requested result.
2609    fn exp_read(&mut self, input: TensorRead<'_>) -> crate::Result<Tensor> {
2610        self.exp(read_tensor("exp", input)?)
2611    }
2612
2613    /// # Errors
2614    ///
2615    /// Returns [`crate::Error::Validation`] with a typed `ValidationError` source
2616    /// for invalid shapes, ranks, axes, dtypes, or output metadata. It returns
2617    /// [`crate::Error::BackendFailure`] or [`crate::Error::BackendSource`] when
2618    /// backend execution or storage access cannot provide the requested result.
2619    fn log(&mut self, input: &Tensor) -> crate::Result<Tensor>;
2620    /// # Errors
2621    ///
2622    /// Returns [`crate::Error::Validation`] with a typed `ValidationError` source
2623    /// for invalid shapes, ranks, axes, dtypes, or output metadata. It returns
2624    /// [`crate::Error::BackendFailure`] or [`crate::Error::BackendSource`] when
2625    /// backend execution or storage access cannot provide the requested result.
2626    fn log_read(&mut self, input: TensorRead<'_>) -> crate::Result<Tensor> {
2627        self.log(read_tensor("log", input)?)
2628    }
2629
2630    /// # Errors
2631    ///
2632    /// Returns [`crate::Error::Validation`] with a typed `ValidationError` source
2633    /// for invalid shapes, ranks, axes, dtypes, or output metadata. It returns
2634    /// [`crate::Error::BackendFailure`] or [`crate::Error::BackendSource`] when
2635    /// backend execution or storage access cannot provide the requested result.
2636    fn sin(&mut self, input: &Tensor) -> crate::Result<Tensor>;
2637    /// # Errors
2638    ///
2639    /// Returns [`crate::Error::Validation`] with a typed `ValidationError` source
2640    /// for invalid shapes, ranks, axes, dtypes, or output metadata. It returns
2641    /// [`crate::Error::BackendFailure`] or [`crate::Error::BackendSource`] when
2642    /// backend execution or storage access cannot provide the requested result.
2643    fn sin_read(&mut self, input: TensorRead<'_>) -> crate::Result<Tensor> {
2644        self.sin(read_tensor("sin", input)?)
2645    }
2646
2647    /// # Errors
2648    ///
2649    /// Returns [`crate::Error::Validation`] with a typed `ValidationError` source
2650    /// for invalid shapes, ranks, axes, dtypes, or output metadata. It returns
2651    /// [`crate::Error::BackendFailure`] or [`crate::Error::BackendSource`] when
2652    /// backend execution or storage access cannot provide the requested result.
2653    fn cos(&mut self, input: &Tensor) -> crate::Result<Tensor>;
2654    /// # Errors
2655    ///
2656    /// Returns [`crate::Error::Validation`] with a typed `ValidationError` source
2657    /// for invalid shapes, ranks, axes, dtypes, or output metadata. It returns
2658    /// [`crate::Error::BackendFailure`] or [`crate::Error::BackendSource`] when
2659    /// backend execution or storage access cannot provide the requested result.
2660    fn cos_read(&mut self, input: TensorRead<'_>) -> crate::Result<Tensor> {
2661        self.cos(read_tensor("cos", input)?)
2662    }
2663
2664    /// # Errors
2665    ///
2666    /// Returns [`crate::Error::Validation`] with a typed `ValidationError` source
2667    /// for invalid shapes, ranks, axes, dtypes, or output metadata. It returns
2668    /// [`crate::Error::BackendFailure`] or [`crate::Error::BackendSource`] when
2669    /// backend execution or storage access cannot provide the requested result.
2670    fn tanh(&mut self, input: &Tensor) -> crate::Result<Tensor>;
2671    /// # Errors
2672    ///
2673    /// Returns [`crate::Error::Validation`] with a typed `ValidationError` source
2674    /// for invalid shapes, ranks, axes, dtypes, or output metadata. It returns
2675    /// [`crate::Error::BackendFailure`] or [`crate::Error::BackendSource`] when
2676    /// backend execution or storage access cannot provide the requested result.
2677    fn tanh_read(&mut self, input: TensorRead<'_>) -> crate::Result<Tensor> {
2678        self.tanh(read_tensor("tanh", input)?)
2679    }
2680
2681    /// # Errors
2682    ///
2683    /// Returns [`crate::Error::Validation`] with a typed `ValidationError` source
2684    /// for invalid shapes, ranks, axes, dtypes, or output metadata. It returns
2685    /// [`crate::Error::BackendFailure`] or [`crate::Error::BackendSource`] when
2686    /// backend execution or storage access cannot provide the requested result.
2687    fn sqrt(&mut self, input: &Tensor) -> crate::Result<Tensor>;
2688    /// # Errors
2689    ///
2690    /// Returns [`crate::Error::Validation`] with a typed `ValidationError` source
2691    /// for invalid shapes, ranks, axes, dtypes, or output metadata. It returns
2692    /// [`crate::Error::BackendFailure`] or [`crate::Error::BackendSource`] when
2693    /// backend execution or storage access cannot provide the requested result.
2694    fn sqrt_read(&mut self, input: TensorRead<'_>) -> crate::Result<Tensor> {
2695        self.sqrt(read_tensor("sqrt", input)?)
2696    }
2697
2698    /// # Errors
2699    ///
2700    /// Returns [`crate::Error::Validation`] with a typed `ValidationError` source
2701    /// for invalid shapes, ranks, axes, dtypes, or output metadata. It returns
2702    /// [`crate::Error::BackendFailure`] or [`crate::Error::BackendSource`] when
2703    /// backend execution or storage access cannot provide the requested result.
2704    fn rsqrt(&mut self, input: &Tensor) -> crate::Result<Tensor>;
2705    /// # Errors
2706    ///
2707    /// Returns [`crate::Error::Validation`] with a typed `ValidationError` source
2708    /// for invalid shapes, ranks, axes, dtypes, or output metadata. It returns
2709    /// [`crate::Error::BackendFailure`] or [`crate::Error::BackendSource`] when
2710    /// backend execution or storage access cannot provide the requested result.
2711    fn rsqrt_read(&mut self, input: TensorRead<'_>) -> crate::Result<Tensor> {
2712        self.rsqrt(read_tensor("rsqrt", input)?)
2713    }
2714
2715    /// # Errors
2716    ///
2717    /// Returns [`crate::Error::Validation`] with a typed `ValidationError` source
2718    /// for invalid shapes, ranks, axes, dtypes, or output metadata. It returns
2719    /// [`crate::Error::BackendFailure`] or [`crate::Error::BackendSource`] when
2720    /// backend execution or storage access cannot provide the requested result.
2721    fn pow(&mut self, lhs: &Tensor, rhs: &Tensor) -> crate::Result<Tensor>;
2722    /// # Errors
2723    ///
2724    /// Returns [`crate::Error::Validation`] with a typed `ValidationError` source
2725    /// for invalid shapes, ranks, axes, dtypes, or output metadata. It returns
2726    /// [`crate::Error::BackendFailure`] or [`crate::Error::BackendSource`] when
2727    /// backend execution or storage access cannot provide the requested result.
2728    fn pow_read(&mut self, lhs: TensorRead<'_>, rhs: TensorRead<'_>) -> crate::Result<Tensor> {
2729        self.pow(read_tensor("pow", lhs)?, read_tensor("pow", rhs)?)
2730    }
2731
2732    /// # Errors
2733    ///
2734    /// Returns [`crate::Error::Validation`] with a typed `ValidationError` source
2735    /// for invalid shapes, ranks, axes, dtypes, or output metadata. It returns
2736    /// [`crate::Error::BackendFailure`] or [`crate::Error::BackendSource`] when
2737    /// backend execution or storage access cannot provide the requested result.
2738    fn expm1(&mut self, input: &Tensor) -> crate::Result<Tensor>;
2739    /// # Errors
2740    ///
2741    /// Returns [`crate::Error::Validation`] with a typed `ValidationError` source
2742    /// for invalid shapes, ranks, axes, dtypes, or output metadata. It returns
2743    /// [`crate::Error::BackendFailure`] or [`crate::Error::BackendSource`] when
2744    /// backend execution or storage access cannot provide the requested result.
2745    fn expm1_read(&mut self, input: TensorRead<'_>) -> crate::Result<Tensor> {
2746        self.expm1(read_tensor("expm1", input)?)
2747    }
2748
2749    /// # Errors
2750    ///
2751    /// Returns [`crate::Error::Validation`] with a typed `ValidationError` source
2752    /// for invalid shapes, ranks, axes, dtypes, or output metadata. It returns
2753    /// [`crate::Error::BackendFailure`] or [`crate::Error::BackendSource`] when
2754    /// backend execution or storage access cannot provide the requested result.
2755    fn log1p(&mut self, input: &Tensor) -> crate::Result<Tensor>;
2756    /// # Errors
2757    ///
2758    /// Returns [`crate::Error::Validation`] with a typed `ValidationError` source
2759    /// for invalid shapes, ranks, axes, dtypes, or output metadata. It returns
2760    /// [`crate::Error::BackendFailure`] or [`crate::Error::BackendSource`] when
2761    /// backend execution or storage access cannot provide the requested result.
2762    fn log1p_read(&mut self, input: TensorRead<'_>) -> crate::Result<Tensor> {
2763        self.log1p(read_tensor("log1p", input)?)
2764    }
2765}
2766
2767/// Shape, layout, and dtype transformation operations.
2768///
2769/// # Examples
2770///
2771/// ```rust
2772/// use tenferro_tensor::TensorStructural;
2773///
2774/// fn accepts_structural<B: TensorStructural>(_backend: &mut B) {}
2775/// ```
2776pub trait TensorStructural {
2777    /// Materialize an owned tensor or borrowed view into fresh compact storage.
2778    ///
2779    /// The result has the input's shape and dtype, uses compact column-major
2780    /// layout, and remains in the input's placement. This operation is a
2781    /// same-placement canonicalization boundary, never an implicit host/device
2782    /// transfer. The conservative default accepts only compact host-owned
2783    /// tensors and clones them; it rejects views, backend buffers, and device
2784    /// placement because only an owning backend can materialize those safely.
2785    ///
2786    /// Backend overrides may accept strided views. CUDA accepts numeric and
2787    /// complex views on its active device, including arbitrary valid strides,
2788    /// but currently reports an explicit unsupported-dtype error for `Bool`.
2789    ///
2790    /// # Examples
2791    ///
2792    /// ```rust
2793    /// use tenferro_tensor::{DType, Tensor, TensorRead, TensorStructural};
2794    ///
2795    /// struct HostDefaults;
2796    /// impl TensorStructural for HostDefaults {
2797    ///     fn transpose(&mut self, _: &Tensor, _: &[usize]) -> tenferro_tensor::Result<Tensor> { unimplemented!() }
2798    ///     fn reshape(&mut self, _: &Tensor, _: &[usize]) -> tenferro_tensor::Result<Tensor> { unimplemented!() }
2799    ///     fn broadcast_in_dim(&mut self, _: &Tensor, _: &[usize], _: &[usize]) -> tenferro_tensor::Result<Tensor> { unimplemented!() }
2800    ///     fn cast(&mut self, _: &Tensor, _: DType) -> tenferro_tensor::Result<Tensor> { unimplemented!() }
2801    ///     fn extract_diagonal(&mut self, _: &Tensor, _: usize, _: usize) -> tenferro_tensor::Result<Tensor> { unimplemented!() }
2802    ///     fn embed_diagonal(&mut self, _: &Tensor, _: usize, _: usize) -> tenferro_tensor::Result<Tensor> { unimplemented!() }
2803    ///     fn tril(&mut self, _: &Tensor, _: i64) -> tenferro_tensor::Result<Tensor> { unimplemented!() }
2804    ///     fn triu(&mut self, _: &Tensor, _: i64) -> tenferro_tensor::Result<Tensor> { unimplemented!() }
2805    /// }
2806    ///
2807    /// let input = Tensor::from_vec_col_major(vec![2], vec![1_i32, 2])?;
2808    /// let mut backend = HostDefaults;
2809    /// let structural: &mut dyn TensorStructural = &mut backend;
2810    /// let output = structural.to_contiguous_read(TensorRead::from_tensor(&input))?;
2811    /// assert_eq!(output.shape(), &[2]);
2812    /// assert_eq!(output.as_slice::<i32>()?, &[1, 2]);
2813    /// # Ok::<(), tenferro_tensor::Error>(())
2814    /// ```
2815    /// # Errors
2816    ///
2817    /// Returns [`crate::Error::Validation`] with a typed `ValidationError` source
2818    /// for invalid shapes, ranks, axes, dtypes, or output metadata. It returns
2819    /// [`crate::Error::BackendFailure`] or [`crate::Error::BackendSource`] when
2820    /// backend execution or storage access cannot provide the requested result.
2821    fn to_contiguous_read(&mut self, input: TensorRead<'_>) -> crate::Result<Tensor> {
2822        let input = read_tensor("to_contiguous_read", input)?;
2823        if input.is_backend_buffer()
2824            || !matches!(
2825                input.placement().memory_kind,
2826                crate::MemoryKind::PinnedHost | crate::MemoryKind::UnpinnedHost
2827            )
2828        {
2829            return Err(crate::Error::runtime_state(
2830                "to_contiguous_read",
2831                "default materialization accepts only host-owned tensors; use the storage's owning backend",
2832            ));
2833        }
2834        Ok(input.clone())
2835    }
2836
2837    /// Overwrite caller-provided storage from a readable tensor or view.
2838    ///
2839    /// Source and destination must have identical dtype and shape and belong to
2840    /// the executing backend's placement. The destination is not resized, and
2841    /// every logical destination element is overwritten without reading its old
2842    /// value. Source and destination allocations must not alias. Implementations
2843    /// must not materialize through host memory or perform an implicit transfer.
2844    ///
2845    /// CPU accepts arbitrary valid source and destination strides and performs
2846    /// no tensor allocation. CUDA currently accepts only a compact column-major
2847    /// source with offset zero covering its full allocation; CUDA destinations
2848    /// may be arbitrary valid non-overlapping views. CUDA rejects aliased
2849    /// allocations and currently reports an explicit unsupported-dtype error
2850    /// for `Bool`. The conservative default is explicitly unsupported.
2851    ///
2852    /// # Examples
2853    ///
2854    /// ```rust
2855    /// use tenferro_tensor::{DType, Tensor, TensorRead, TensorStructural, TensorWrite};
2856    ///
2857    /// struct ConservativeDefaults;
2858    /// impl TensorStructural for ConservativeDefaults {
2859    ///     fn transpose(&mut self, _: &Tensor, _: &[usize]) -> tenferro_tensor::Result<Tensor> { unimplemented!() }
2860    ///     fn reshape(&mut self, _: &Tensor, _: &[usize]) -> tenferro_tensor::Result<Tensor> { unimplemented!() }
2861    ///     fn broadcast_in_dim(&mut self, _: &Tensor, _: &[usize], _: &[usize]) -> tenferro_tensor::Result<Tensor> { unimplemented!() }
2862    ///     fn cast(&mut self, _: &Tensor, _: DType) -> tenferro_tensor::Result<Tensor> { unimplemented!() }
2863    ///     fn extract_diagonal(&mut self, _: &Tensor, _: usize, _: usize) -> tenferro_tensor::Result<Tensor> { unimplemented!() }
2864    ///     fn embed_diagonal(&mut self, _: &Tensor, _: usize, _: usize) -> tenferro_tensor::Result<Tensor> { unimplemented!() }
2865    ///     fn tril(&mut self, _: &Tensor, _: i64) -> tenferro_tensor::Result<Tensor> { unimplemented!() }
2866    ///     fn triu(&mut self, _: &Tensor, _: i64) -> tenferro_tensor::Result<Tensor> { unimplemented!() }
2867    /// }
2868    ///
2869    /// let src = Tensor::from_vec_col_major(vec![2], vec![1_i32, 2])?;
2870    /// let mut dst = Tensor::from_vec_col_major(vec![2], vec![0_i32, 0])?;
2871    /// let mut backend = ConservativeDefaults;
2872    /// let structural: &mut dyn TensorStructural = &mut backend;
2873    /// let error = structural.copy_read_into(
2874    ///     TensorRead::from_tensor(&src),
2875    ///     TensorWrite::from_tensor(&mut dst),
2876    /// ).unwrap_err();
2877    /// assert!(error.to_string().contains("unsupported"));
2878    /// assert_eq!(dst.as_slice::<i32>()?, &[0, 0]);
2879    /// # Ok::<(), tenferro_tensor::Error>(())
2880    /// ```
2881    /// # Errors
2882    ///
2883    /// Returns [`crate::Error::Validation`] with a typed `ValidationError` source
2884    /// for invalid shapes, ranks, axes, dtypes, or output metadata. It returns
2885    /// [`crate::Error::BackendFailure`] or [`crate::Error::BackendSource`] when
2886    /// backend execution or storage access cannot provide the requested result.
2887    fn copy_read_into(&mut self, _src: TensorRead<'_>, _dst: TensorWrite<'_>) -> crate::Result<()> {
2888        Err(crate::Error::unsupported(
2889            "copy_read_into",
2890            "backend-owned runtime copy is unsupported by this backend",
2891        ))
2892    }
2893
2894    /// # Errors
2895    ///
2896    /// Returns [`crate::Error::Validation`] with a typed `ValidationError` source
2897    /// for invalid shapes, ranks, axes, dtypes, or output metadata. It returns
2898    /// [`crate::Error::BackendFailure`] or [`crate::Error::BackendSource`] when
2899    /// backend execution or storage access cannot provide the requested result.
2900    fn transpose(&mut self, input: &Tensor, perm: &[usize]) -> crate::Result<Tensor>;
2901    /// # Errors
2902    ///
2903    /// Returns [`crate::Error::Validation`] with a typed `ValidationError` source
2904    /// for invalid shapes, ranks, axes, dtypes, or output metadata. It returns
2905    /// [`crate::Error::BackendFailure`] or [`crate::Error::BackendSource`] when
2906    /// backend execution or storage access cannot provide the requested result.
2907    fn transpose_read(&mut self, input: TensorRead<'_>, perm: &[usize]) -> crate::Result<Tensor> {
2908        self.transpose(read_tensor("transpose", input)?, perm)
2909    }
2910
2911    /// # Errors
2912    ///
2913    /// Returns [`crate::Error::Validation`] with a typed `ValidationError` source
2914    /// for invalid shapes, ranks, axes, dtypes, or output metadata. It returns
2915    /// [`crate::Error::BackendFailure`] or [`crate::Error::BackendSource`] when
2916    /// backend execution or storage access cannot provide the requested result.
2917    fn reshape(&mut self, input: &Tensor, shape: &[usize]) -> crate::Result<Tensor>;
2918    /// # Errors
2919    ///
2920    /// Returns [`crate::Error::Validation`] with a typed `ValidationError` source
2921    /// for invalid shapes, ranks, axes, dtypes, or output metadata. It returns
2922    /// [`crate::Error::BackendFailure`] or [`crate::Error::BackendSource`] when
2923    /// backend execution or storage access cannot provide the requested result.
2924    fn reshape_read(&mut self, input: TensorRead<'_>, shape: &[usize]) -> crate::Result<Tensor> {
2925        self.reshape(read_tensor("reshape", input)?, shape)
2926    }
2927
2928    /// # Errors
2929    ///
2930    /// Returns [`crate::Error::Validation`] with a typed `ValidationError` source
2931    /// for invalid shapes, ranks, axes, dtypes, or output metadata. It returns
2932    /// [`crate::Error::BackendFailure`] or [`crate::Error::BackendSource`] when
2933    /// backend execution or storage access cannot provide the requested result.
2934    fn broadcast_in_dim(
2935        &mut self,
2936        input: &Tensor,
2937        shape: &[usize],
2938        dims: &[usize],
2939    ) -> crate::Result<Tensor>;
2940    /// # Errors
2941    ///
2942    /// Returns [`crate::Error::Validation`] with a typed `ValidationError` source
2943    /// for invalid shapes, ranks, axes, dtypes, or output metadata. It returns
2944    /// [`crate::Error::BackendFailure`] or [`crate::Error::BackendSource`] when
2945    /// backend execution or storage access cannot provide the requested result.
2946    fn broadcast_in_dim_read(
2947        &mut self,
2948        input: TensorRead<'_>,
2949        shape: &[usize],
2950        dims: &[usize],
2951    ) -> crate::Result<Tensor> {
2952        self.broadcast_in_dim(read_tensor("broadcast_in_dim", input)?, shape, dims)
2953    }
2954
2955    /// Cast a tensor to another dtype using explicit dtype projection.
2956    ///
2957    /// Backends may truncate, narrow precision, project complex values, or use
2958    /// boolean truthiness according to their documented cast support.
2959    ///
2960    /// # Examples
2961    ///
2962    /// ```rust
2963    /// use tenferro_tensor::{DType, Tensor, TensorStructural};
2964    ///
2965    /// fn cast_to_i32<B: TensorStructural>(
2966    ///     backend: &mut B,
2967    ///     input: &Tensor,
2968    /// ) -> tenferro_tensor::Result<Tensor> {
2969    ///     backend.cast(input, DType::I32)
2970    /// }
2971    /// ```
2972    /// # Errors
2973    ///
2974    /// Returns [`crate::Error::Validation`] with a typed `ValidationError` source
2975    /// for invalid shapes, ranks, axes, dtypes, or output metadata. It returns
2976    /// [`crate::Error::BackendFailure`] or [`crate::Error::BackendSource`] when
2977    /// backend execution or storage access cannot provide the requested result.
2978    fn cast(&mut self, input: &Tensor, to: crate::DType) -> crate::Result<Tensor>;
2979
2980    /// Convert a tensor to another dtype using checked dtype conversion.
2981    ///
2982    /// `convert` accepts only conversions allowed by tenferro's dtype-promotion
2983    /// lattice. Use [`TensorStructural::cast`] for explicit lossy projection.
2984    ///
2985    /// # Examples
2986    ///
2987    /// ```rust
2988    /// use tenferro_tensor::{DType, Tensor, TensorStructural};
2989    ///
2990    /// fn convert_to_f64<B: TensorStructural>(
2991    ///     backend: &mut B,
2992    ///     input: &Tensor,
2993    /// ) -> tenferro_tensor::Result<Tensor> {
2994    ///     backend.convert(input, DType::F64)
2995    /// }
2996    /// ```
2997    /// # Errors
2998    ///
2999    /// Returns [`crate::Error::Validation`] with a typed `ValidationError` source
3000    /// for invalid shapes, ranks, axes, dtypes, or output metadata. It returns
3001    /// [`crate::Error::BackendFailure`] or [`crate::Error::BackendSource`] when
3002    /// backend execution or storage access cannot provide the requested result.
3003    fn convert(&mut self, input: &Tensor, to: crate::DType) -> crate::Result<Tensor> {
3004        validate_convert_dtype("convert", input.dtype(), to)?;
3005        self.cast(input, to)
3006    }
3007
3008    /// # Errors
3009    ///
3010    /// Returns [`crate::Error::Validation`] with a typed `ValidationError` source
3011    /// for invalid shapes, ranks, axes, dtypes, or output metadata. It returns
3012    /// [`crate::Error::BackendFailure`] or [`crate::Error::BackendSource`] when
3013    /// backend execution or storage access cannot provide the requested result.
3014    fn extract_diagonal(
3015        &mut self,
3016        input: &Tensor,
3017        axis_a: usize,
3018        axis_b: usize,
3019    ) -> crate::Result<Tensor>;
3020    /// # Errors
3021    ///
3022    /// Returns [`crate::Error::Validation`] with a typed `ValidationError` source
3023    /// for invalid shapes, ranks, axes, dtypes, or output metadata. It returns
3024    /// [`crate::Error::BackendFailure`] or [`crate::Error::BackendSource`] when
3025    /// backend execution or storage access cannot provide the requested result.
3026    fn embed_diagonal(
3027        &mut self,
3028        input: &Tensor,
3029        axis_a: usize,
3030        axis_b: usize,
3031    ) -> crate::Result<Tensor>;
3032    /// # Errors
3033    ///
3034    /// Returns [`crate::Error::Validation`] with a typed `ValidationError` source
3035    /// for invalid shapes, ranks, axes, dtypes, or output metadata. It returns
3036    /// [`crate::Error::BackendFailure`] or [`crate::Error::BackendSource`] when
3037    /// backend execution or storage access cannot provide the requested result.
3038    fn tril(&mut self, input: &Tensor, k: i64) -> crate::Result<Tensor>;
3039    /// # Errors
3040    ///
3041    /// Returns [`crate::Error::Validation`] with a typed `ValidationError` source
3042    /// for invalid shapes, ranks, axes, dtypes, or output metadata. It returns
3043    /// [`crate::Error::BackendFailure`] or [`crate::Error::BackendSource`] when
3044    /// backend execution or storage access cannot provide the requested result.
3045    fn triu(&mut self, input: &Tensor, k: i64) -> crate::Result<Tensor>;
3046}
3047
3048/// Reduction operations.
3049///
3050/// Reducing over an axis whose extent is zero returns an error for every
3051/// reduction operation. Passing an empty `axes` slice is a no-op for the public
3052/// reductions and returns the input values unchanged. Internal mapped
3053/// reductions document their own empty-axis semantics.
3054///
3055/// # Examples
3056///
3057/// ```rust
3058/// use tenferro_tensor::TensorReduction;
3059///
3060/// fn accepts_reduction<B: TensorReduction>(_backend: &mut B) {}
3061/// ```
3062pub trait TensorReduction {
    /// Required backend entry point for the `reduce_sum` operation on an owned
    /// tensor, reducing over the listed `axes`.
    ///
    /// There is no default implementation; `reduce_sum_read` forwards to this
    /// method once its input has been resolved to a [`Tensor`].
    ///
    /// # Errors
    ///
    /// Returns [`crate::Error::Validation`] with a typed `ValidationError` source
    /// for invalid shapes, ranks, axes, dtypes, or output metadata. It returns
    /// [`crate::Error::BackendFailure`] or [`crate::Error::BackendSource`] when
    /// backend execution or storage access cannot provide the requested result.
    fn reduce_sum(&mut self, input: &Tensor, axes: &[usize]) -> crate::Result<Tensor>;
3070
3071    /// Sum elements across axes from an owned tensor or borrowed view.
3072    ///
3073    /// # Examples
3074    ///
3075    /// ```rust
3076    /// use tenferro_tensor::{Tensor, TensorRead, TensorReduction};
3077    ///
3078    /// fn sum_owned<B: TensorReduction>(
3079    ///     backend: &mut B,
3080    ///     input: &Tensor,
3081    /// ) -> tenferro_tensor::Result<Tensor> {
3082    ///     backend.reduce_sum_read(TensorRead::from_tensor(input), &[0])
3083    /// }
3084    /// ```
3085    /// # Errors
3086    ///
3087    /// Returns [`crate::Error::Validation`] with a typed `ValidationError` source
3088    /// for invalid shapes, ranks, axes, dtypes, or output metadata. It returns
3089    /// [`crate::Error::BackendFailure`] or [`crate::Error::BackendSource`] when
3090    /// backend execution or storage access cannot provide the requested result.
3091    fn reduce_sum_read(&mut self, input: TensorRead<'_>, axes: &[usize]) -> crate::Result<Tensor> {
3092        match input.as_tensor() {
3093            Some(input) => self.reduce_sum(input, axes),
3094            None => Err(crate::Error::unsupported(
3095                "reduce_sum",
3096                "backend does not accept borrowed tensor views at this execution boundary",
3097            )),
3098        }
3099    }
3100
3101    /// Sum elementwise squares across axes.
3102    ///
3103    /// This execution hook is used by composite operations that avoid a
3104    /// materialized square. Empty axes produce an elementwise square. Backends
3105    /// that support this optimized path must override the hook directly.
3106    ///
3107    /// # Errors
3108    ///
3109    /// Returns the typed validation, unsupported, runtime-state, or backend
3110    /// error produced by multiplication or reduction.
3111    #[doc(hidden)]
3112    fn reduce_sum_squares_read(
3113        &mut self,
3114        _input: TensorRead<'_>,
3115        _axes: &[usize],
3116    ) -> crate::Result<Tensor> {
3117        Err(crate::Error::unsupported(
3118            "reduce_sum_squares",
3119            "backend does not implement fused sum-of-squares reduction",
3120        ))
3121    }
3122
3123    /// # Errors
3124    ///
3125    /// Returns [`crate::Error::Validation`] with a typed `ValidationError` source
3126    /// for invalid shapes, ranks, axes, dtypes, or output metadata. It returns
3127    /// [`crate::Error::BackendFailure`] or [`crate::Error::BackendSource`] when
3128    /// backend execution or storage access cannot provide the requested result.
3129    fn reduce_prod(&mut self, input: &Tensor, axes: &[usize]) -> crate::Result<Tensor>;
3130
3131    /// Multiply elements across axes from an owned tensor or borrowed view.
3132    ///
3133    /// # Examples
3134    ///
3135    /// ```rust
3136    /// use tenferro_tensor::{Tensor, TensorRead, TensorReduction};
3137    ///
3138    /// fn prod_owned<B: TensorReduction>(
3139    ///     backend: &mut B,
3140    ///     input: &Tensor,
3141    /// ) -> tenferro_tensor::Result<Tensor> {
3142    ///     backend.reduce_prod_read(TensorRead::from_tensor(input), &[0])
3143    /// }
3144    /// ```
3145    /// # Errors
3146    ///
3147    /// Returns [`crate::Error::Validation`] with a typed `ValidationError` source
3148    /// for invalid shapes, ranks, axes, dtypes, or output metadata. It returns
3149    /// [`crate::Error::BackendFailure`] or [`crate::Error::BackendSource`] when
3150    /// backend execution or storage access cannot provide the requested result.
3151    fn reduce_prod_read(&mut self, input: TensorRead<'_>, axes: &[usize]) -> crate::Result<Tensor> {
3152        match input.as_tensor() {
3153            Some(input) => self.reduce_prod(input, axes),
3154            None => Err(crate::Error::unsupported(
3155                "reduce_prod",
3156                "backend does not accept borrowed tensor views at this execution boundary",
3157            )),
3158        }
3159    }
3160
3161    /// # Errors
3162    ///
3163    /// Returns [`crate::Error::Validation`] with a typed `ValidationError` source
3164    /// for invalid shapes, ranks, axes, dtypes, or output metadata. It returns
3165    /// [`crate::Error::BackendFailure`] or [`crate::Error::BackendSource`] when
3166    /// backend execution or storage access cannot provide the requested result.
3167    fn reduce_max(&mut self, input: &Tensor, axes: &[usize]) -> crate::Result<Tensor>;
3168
3169    /// Take maximum values across axes from an owned tensor or borrowed view.
3170    ///
3171    /// # Examples
3172    ///
3173    /// ```rust
3174    /// use tenferro_tensor::{Tensor, TensorRead, TensorReduction};
3175    ///
3176    /// fn max_owned<B: TensorReduction>(
3177    ///     backend: &mut B,
3178    ///     input: &Tensor,
3179    /// ) -> tenferro_tensor::Result<Tensor> {
3180    ///     backend.reduce_max_read(TensorRead::from_tensor(input), &[0])
3181    /// }
3182    /// ```
3183    /// # Errors
3184    ///
3185    /// Returns [`crate::Error::Validation`] with a typed `ValidationError` source
3186    /// for invalid shapes, ranks, axes, dtypes, or output metadata. It returns
3187    /// [`crate::Error::BackendFailure`] or [`crate::Error::BackendSource`] when
3188    /// backend execution or storage access cannot provide the requested result.
3189    fn reduce_max_read(&mut self, input: TensorRead<'_>, axes: &[usize]) -> crate::Result<Tensor> {
3190        match input.as_tensor() {
3191            Some(input) => self.reduce_max(input, axes),
3192            None => Err(crate::Error::unsupported(
3193                "reduce_max",
3194                "backend does not accept borrowed tensor views at this execution boundary",
3195            )),
3196        }
3197    }
3198
3199    /// # Errors
3200    ///
3201    /// Returns [`crate::Error::Validation`] with a typed `ValidationError` source
3202    /// for invalid shapes, ranks, axes, dtypes, or output metadata. It returns
3203    /// [`crate::Error::BackendFailure`] or [`crate::Error::BackendSource`] when
3204    /// backend execution or storage access cannot provide the requested result.
3205    fn reduce_min(&mut self, input: &Tensor, axes: &[usize]) -> crate::Result<Tensor>;
3206
3207    /// Take minimum values across axes from an owned tensor or borrowed view.
3208    ///
3209    /// # Examples
3210    ///
3211    /// ```rust
3212    /// use tenferro_tensor::{Tensor, TensorRead, TensorReduction};
3213    ///
3214    /// fn min_owned<B: TensorReduction>(
3215    ///     backend: &mut B,
3216    ///     input: &Tensor,
3217    /// ) -> tenferro_tensor::Result<Tensor> {
3218    ///     backend.reduce_min_read(TensorRead::from_tensor(input), &[0])
3219    /// }
3220    /// ```
3221    /// # Errors
3222    ///
3223    /// Returns [`crate::Error::Validation`] with a typed `ValidationError` source
3224    /// for invalid shapes, ranks, axes, dtypes, or output metadata. It returns
3225    /// [`crate::Error::BackendFailure`] or [`crate::Error::BackendSource`] when
3226    /// backend execution or storage access cannot provide the requested result.
3227    fn reduce_min_read(&mut self, input: TensorRead<'_>, axes: &[usize]) -> crate::Result<Tensor> {
3228        match input.as_tensor() {
3229            Some(input) => self.reduce_min(input, axes),
3230            None => Err(crate::Error::unsupported(
3231                "reduce_min",
3232                "backend does not accept borrowed tensor views at this execution boundary",
3233            )),
3234        }
3235    }
3236}
3237
3238/// Dot-general operations.
3239///
3240/// # Examples
3241///
3242/// ```rust
3243/// use tenferro_tensor::TensorDot;
3244///
3245/// fn accepts_dot<B: TensorDot>(_backend: &mut B) {}
3246/// ```
3247pub trait TensorDot: TensorElementwise {
3248    /// # Errors
3249    ///
3250    /// Returns [`crate::Error::Validation`] with a typed `ValidationError` source
3251    /// for invalid shapes, ranks, axes, dtypes, or output metadata. It returns
3252    /// [`crate::Error::BackendFailure`] or [`crate::Error::BackendSource`] when
3253    /// backend execution or storage access cannot provide the requested result.
3254    fn dot_general(
3255        &mut self,
3256        lhs: &Tensor,
3257        rhs: &Tensor,
3258        config: &DotGeneralConfig,
3259    ) -> crate::Result<Tensor>;
3260
3261    #[doc(hidden)]
3262    fn dot_general_read(
3263        &mut self,
3264        lhs: TensorRead<'_>,
3265        rhs: TensorRead<'_>,
3266        config: &DotGeneralConfig,
3267    ) -> crate::Result<Tensor> {
3268        match (lhs.as_tensor(), rhs.as_tensor()) {
3269            (Some(lhs), Some(rhs)) => self.dot_general(lhs, rhs, config),
3270            _ => {
3271                let lhs = self.to_contiguous_read(lhs)?;
3272                let rhs = self.to_contiguous_read(rhs)?;
3273                self.dot_general(&lhs, &rhs, config)
3274            }
3275        }
3276    }
3277
3278    /// Overwrite caller-provided output with dot-general from read inputs.
3279    ///
3280    /// This is the dot/GEMM spelling of `_into`: the previous output value is
3281    /// not read. Use [`TensorDot::dot_general_read_into_accum`] for explicit
3282    /// read-modify-write accumulation.
3283    ///
3284    /// # Examples
3285    ///
3286    /// ```rust
3287    /// use tenferro_tensor::{DotGeneralConfig, TensorDot, TensorRead, TensorWrite};
3288    ///
3289    /// fn dot_into<B: TensorDot>(
3290    ///     backend: &mut B,
3291    ///     lhs: TensorRead<'_>,
3292    ///     rhs: TensorRead<'_>,
3293    ///     config: &DotGeneralConfig,
3294    ///     out: TensorWrite<'_>,
3295    /// ) -> tenferro_tensor::Result<()> {
3296    ///     backend.dot_general_read_into(lhs, rhs, config, out)
3297    /// }
3298    /// ```
3299    /// # Errors
3300    ///
3301    /// Returns [`crate::Error::Validation`] with a typed `ValidationError` source
3302    /// for invalid shapes, ranks, axes, dtypes, or output metadata. It returns
3303    /// [`crate::Error::BackendFailure`] or [`crate::Error::BackendSource`] when
3304    /// backend execution or storage access cannot provide the requested result.
3305    fn dot_general_read_into(
3306        &mut self,
3307        lhs: TensorRead<'_>,
3308        rhs: TensorRead<'_>,
3309        config: &DotGeneralConfig,
3310        out: TensorWrite<'_>,
3311    ) -> crate::Result<()> {
3312        let accumulation = DotGeneralAccumulation::overwrite(lhs.dtype())?;
3313        self.dot_general_read_into_accum(lhs, rhs, config, accumulation, out)
3314    }
3315
3316    #[doc(hidden)]
3317    fn dot_general_with_conj(
3318        &mut self,
3319        lhs: &Tensor,
3320        rhs: &Tensor,
3321        config: &DotGeneralConfig,
3322        lhs_conj: bool,
3323        rhs_conj: bool,
3324    ) -> crate::Result<Tensor> {
3325        if !lhs_conj && !rhs_conj {
3326            return self.dot_general(lhs, rhs, config);
3327        }
3328
3329        let lhs_tmp;
3330        let lhs_ref = if lhs_conj {
3331            lhs_tmp = self.conj(lhs)?;
3332            &lhs_tmp
3333        } else {
3334            lhs
3335        };
3336        let rhs_tmp;
3337        let rhs_ref = if rhs_conj {
3338            rhs_tmp = self.conj(rhs)?;
3339            &rhs_tmp
3340        } else {
3341            rhs
3342        };
3343        self.dot_general(lhs_ref, rhs_ref, config)
3344    }
3345
3346    #[allow(clippy::too_many_arguments)]
3347    #[doc(hidden)]
3348    fn dot_general_with_conj_read(
3349        &mut self,
3350        lhs: TensorRead<'_>,
3351        rhs: TensorRead<'_>,
3352        config: &DotGeneralConfig,
3353        lhs_conj: bool,
3354        rhs_conj: bool,
3355    ) -> crate::Result<Tensor> {
3356        if !lhs_conj && !rhs_conj {
3357            return self.dot_general_read(lhs, rhs, config);
3358        }
3359
3360        let lhs_tmp;
3361        let lhs_ref = if let Some(tensor) = lhs.as_tensor() {
3362            tensor
3363        } else {
3364            lhs_tmp = self.to_contiguous_read(lhs)?;
3365            &lhs_tmp
3366        };
3367        let rhs_tmp;
3368        let rhs_ref = if let Some(tensor) = rhs.as_tensor() {
3369            tensor
3370        } else {
3371            rhs_tmp = self.to_contiguous_read(rhs)?;
3372            &rhs_tmp
3373        };
3374        self.dot_general_with_conj(lhs_ref, rhs_ref, config, lhs_conj, rhs_conj)
3375    }
3376
3377    /// Apply scaled dot-general accumulation into caller-provided output.
3378    ///
3379    /// This is explicitly read-modify-write when `accumulation.beta` is nonzero:
3380    /// `out = alpha * dot_general(lhs, rhs) + beta * out`.
3381    ///
3382    /// # Examples
3383    ///
3384    /// ```rust
3385    /// use tenferro_tensor::{
3386    ///     DotGeneralAccumulation, DotGeneralConfig, TensorDot, TensorRead, TensorWrite,
3387    /// };
3388    ///
3389    /// fn dot_add_to<B: TensorDot>(
3390    ///     backend: &mut B,
3391    ///     lhs: TensorRead<'_>,
3392    ///     rhs: TensorRead<'_>,
3393    ///     config: &DotGeneralConfig,
3394    ///     out: TensorWrite<'_>,
3395    /// ) -> tenferro_tensor::Result<()> {
3396    ///     let accumulation = DotGeneralAccumulation::add_to(lhs.dtype())?;
3397    ///     backend.dot_general_read_into_accum(lhs, rhs, config, accumulation, out)
3398    /// }
3399    /// ```
3400    /// # Errors
3401    ///
3402    /// Returns [`crate::Error::Validation`] with a typed `ValidationError` source
3403    /// for invalid shapes, ranks, axes, dtypes, or output metadata. It returns
3404    /// [`crate::Error::BackendFailure`] or [`crate::Error::BackendSource`] when
3405    /// backend execution or storage access cannot provide the requested result.
3406    fn dot_general_read_into_accum(
3407        &mut self,
3408        lhs: TensorRead<'_>,
3409        rhs: TensorRead<'_>,
3410        config: &DotGeneralConfig,
3411        accumulation: DotGeneralAccumulation,
3412        out: TensorWrite<'_>,
3413    ) -> crate::Result<()> {
3414        dot_general_accum_via_temp(self, lhs, rhs, config, accumulation, out)
3415    }
3416}
3417
3418/// Session-scoped cached dot-general operations.
3419///
3420/// # Examples
3421///
3422/// ```rust
3423/// use tenferro_tensor::BackendSession;
3424///
3425/// fn accepts_session_dot<S: BackendSession + ?Sized>(_session: &mut S) {}
3426/// ```
3427pub trait SessionCachedDot: TensorDot {
3428    #[doc(hidden)]
3429    fn dot_general_cached(
3430        &mut self,
3431        _cache_slot: Option<usize>,
3432        lhs: &Tensor,
3433        rhs: &Tensor,
3434        config: &DotGeneralConfig,
3435    ) -> crate::Result<Tensor> {
3436        self.dot_general(lhs, rhs, config)
3437    }
3438
3439    #[doc(hidden)]
3440    fn dot_general_read_cached(
3441        &mut self,
3442        cache_slot: Option<usize>,
3443        lhs: TensorRead<'_>,
3444        rhs: TensorRead<'_>,
3445        config: &DotGeneralConfig,
3446    ) -> crate::Result<Tensor> {
3447        match (lhs.as_tensor(), rhs.as_tensor()) {
3448            (Some(lhs), Some(rhs)) => self.dot_general_cached(cache_slot, lhs, rhs, config),
3449            _ => {
3450                let lhs = self.to_contiguous_read(lhs)?;
3451                let rhs = self.to_contiguous_read(rhs)?;
3452                self.dot_general_cached(cache_slot, &lhs, &rhs, config)
3453            }
3454        }
3455    }
3456
3457    // Mirrors the dot-general signature plus runtime-cache metadata.
3458    #[allow(clippy::too_many_arguments)]
3459    #[doc(hidden)]
3460    fn dot_general_with_conj_cached(
3461        &mut self,
3462        _cache_slot: Option<usize>,
3463        lhs: &Tensor,
3464        rhs: &Tensor,
3465        config: &DotGeneralConfig,
3466        lhs_conj: bool,
3467        rhs_conj: bool,
3468    ) -> crate::Result<Tensor> {
3469        self.dot_general_with_conj(lhs, rhs, config, lhs_conj, rhs_conj)
3470    }
3471
3472    // Mirrors the dot-general read signature plus runtime-cache metadata.
3473    #[allow(clippy::too_many_arguments)]
3474    #[doc(hidden)]
3475    fn dot_general_with_conj_read_cached(
3476        &mut self,
3477        cache_slot: Option<usize>,
3478        lhs: TensorRead<'_>,
3479        rhs: TensorRead<'_>,
3480        config: &DotGeneralConfig,
3481        lhs_conj: bool,
3482        rhs_conj: bool,
3483    ) -> crate::Result<Tensor> {
3484        if !lhs_conj && !rhs_conj {
3485            return self.dot_general_read_cached(cache_slot, lhs, rhs, config);
3486        }
3487
3488        let lhs_tmp;
3489        let lhs_ref = if let Some(tensor) = lhs.as_tensor() {
3490            tensor
3491        } else {
3492            lhs_tmp = self.to_contiguous_read(lhs)?;
3493            &lhs_tmp
3494        };
3495        let rhs_tmp;
3496        let rhs_ref = if let Some(tensor) = rhs.as_tensor() {
3497            tensor
3498        } else {
3499            rhs_tmp = self.to_contiguous_read(rhs)?;
3500            &rhs_tmp
3501        };
3502        self.dot_general_with_conj_cached(cache_slot, lhs_ref, rhs_ref, config, lhs_conj, rhs_conj)
3503    }
3504
3505    /// Apply session-cached scaled dot-general accumulation into output.
3506    ///
3507    /// The cache slot is session-local metadata; `accumulation` still controls
3508    /// overwrite versus read-modify-write semantics.
3509    ///
3510    /// # Examples
3511    ///
3512    /// ```rust
3513    /// use tenferro_tensor::{
3514    ///     DotGeneralAccumulation, DotGeneralConfig, SessionCachedDot, TensorRead, TensorWrite,
3515    /// };
3516    ///
3517    /// fn session_cached_dot_add_to<S: SessionCachedDot + ?Sized>(
3518    ///     session: &mut S,
3519    ///     lhs: TensorRead<'_>,
3520    ///     rhs: TensorRead<'_>,
3521    ///     config: &DotGeneralConfig,
3522    ///     out: TensorWrite<'_>,
3523    /// ) -> tenferro_tensor::Result<()> {
3524    ///     let accumulation = DotGeneralAccumulation::add_to(lhs.dtype())?;
3525    ///     session.dot_general_read_into_accum_cached(
3526    ///         Some(0),
3527    ///         lhs,
3528    ///         rhs,
3529    ///         config,
3530    ///         accumulation,
3531    ///         out,
3532    ///     )
3533    /// }
3534    /// ```
3535    /// # Errors
3536    ///
3537    /// Returns [`crate::Error::Validation`] with a typed `ValidationError` source
3538    /// for invalid shapes, ranks, axes, dtypes, or output metadata. It returns
3539    /// [`crate::Error::BackendFailure`] or [`crate::Error::BackendSource`] when
3540    /// backend execution or storage access cannot provide the requested result.
3541    fn dot_general_read_into_accum_cached(
3542        &mut self,
3543        _cache_slot: Option<usize>,
3544        lhs: TensorRead<'_>,
3545        rhs: TensorRead<'_>,
3546        config: &DotGeneralConfig,
3547        accumulation: DotGeneralAccumulation,
3548        out: TensorWrite<'_>,
3549    ) -> crate::Result<()> {
3550        self.dot_general_read_into_accum(lhs, rhs, config, accumulation, out)
3551    }
3552
3553    #[doc(hidden)]
3554    fn grouped_gemm_cached(
3555        &mut self,
3556        _cache_slot: Option<usize>,
3557        lhs: TensorRead<'_>,
3558        rhs: TensorRead<'_>,
3559        config: &GroupedGemmConfig<'_>,
3560        out: TensorWrite<'_>,
3561    ) -> crate::Result<()> {
3562        grouped_gemm_default(self, lhs, rhs, config, out)
3563    }
3564}
3565
3566/// Indexing, slicing, and padding operations.
3567///
3568/// # Examples
3569///
3570/// ```rust
3571/// use tenferro_tensor::TensorIndexing;
3572///
3573/// fn accepts_indexing<B: TensorIndexing>(_backend: &mut B) {}
3574/// ```
3575pub trait TensorIndexing {
3576    /// # Errors
3577    ///
3578    /// Returns [`crate::Error::Validation`] with a typed `ValidationError` source
3579    /// for invalid shapes, ranks, axes, dtypes, or output metadata. It returns
3580    /// [`crate::Error::BackendFailure`] or [`crate::Error::BackendSource`] when
3581    /// backend execution or storage access cannot provide the requested result.
3582    fn gather(
3583        &mut self,
3584        operand: &Tensor,
3585        start_indices: &Tensor,
3586        config: &GatherConfig,
3587    ) -> crate::Result<Tensor>;
3588    /// # Errors
3589    ///
3590    /// Returns [`crate::Error::Validation`] with a typed `ValidationError` source
3591    /// for invalid shapes, ranks, axes, dtypes, or output metadata. It returns
3592    /// [`crate::Error::BackendFailure`] or [`crate::Error::BackendSource`] when
3593    /// backend execution or storage access cannot provide the requested result.
3594    fn scatter(
3595        &mut self,
3596        operand: &Tensor,
3597        scatter_indices: &Tensor,
3598        updates: &Tensor,
3599        config: &ScatterConfig,
3600    ) -> crate::Result<Tensor>;
3601    /// # Errors
3602    ///
3603    /// Returns [`crate::Error::Validation`] with a typed `ValidationError` source
3604    /// for invalid shapes, ranks, axes, dtypes, or output metadata. It returns
3605    /// [`crate::Error::BackendFailure`] or [`crate::Error::BackendSource`] when
3606    /// backend execution or storage access cannot provide the requested result.
3607    fn slice(&mut self, input: &Tensor, config: &SliceConfig) -> crate::Result<Tensor>;
3608    /// # Errors
3609    ///
3610    /// Returns [`crate::Error::Validation`] with a typed `ValidationError` source
3611    /// for invalid shapes, ranks, axes, dtypes, or output metadata. It returns
3612    /// [`crate::Error::BackendFailure`] or [`crate::Error::BackendSource`] when
3613    /// backend execution or storage access cannot provide the requested result.
3614    fn dynamic_slice(
3615        &mut self,
3616        input: &Tensor,
3617        starts: &Tensor,
3618        slice_sizes: &[usize],
3619    ) -> crate::Result<Tensor>;
3620    /// # Errors
3621    ///
3622    /// Returns [`crate::Error::Validation`] with a typed `ValidationError` source
3623    /// for invalid shapes, ranks, axes, dtypes, or output metadata. It returns
3624    /// [`crate::Error::BackendFailure`] or [`crate::Error::BackendSource`] when
3625    /// backend execution or storage access cannot provide the requested result.
3626    fn dynamic_update_slice(
3627        &mut self,
3628        operand: &Tensor,
3629        update: &Tensor,
3630        starts: &Tensor,
3631    ) -> crate::Result<Tensor>;
3632    /// # Errors
3633    ///
3634    /// Returns [`crate::Error::Validation`] with a typed `ValidationError` source
3635    /// for invalid shapes, ranks, axes, dtypes, or output metadata. It returns
3636    /// [`crate::Error::BackendFailure`] or [`crate::Error::BackendSource`] when
3637    /// backend execution or storage access cannot provide the requested result.
3638    fn pad(&mut self, input: &Tensor, config: &PadConfig) -> crate::Result<Tensor>;
3639    /// # Errors
3640    ///
3641    /// Returns [`crate::Error::Validation`] with a typed `ValidationError` source
3642    /// for invalid shapes, ranks, axes, dtypes, or output metadata. It returns
3643    /// [`crate::Error::BackendFailure`] or [`crate::Error::BackendSource`] when
3644    /// backend execution or storage access cannot provide the requested result.
3645    fn concatenate(&mut self, inputs: &[&Tensor], axis: usize) -> crate::Result<Tensor>;
3646    /// # Errors
3647    ///
3648    /// Returns [`crate::Error::Validation`] with a typed `ValidationError` source
3649    /// for invalid shapes, ranks, axes, dtypes, or output metadata. It returns
3650    /// [`crate::Error::BackendFailure`] or [`crate::Error::BackendSource`] when
3651    /// backend execution or storage access cannot provide the requested result.
3652    fn reverse(&mut self, input: &Tensor, axes: &[usize]) -> crate::Result<Tensor>;
3653}
3654
3655/// Backend-owned canonicalization for typed tensor views.
3656///
3657/// Implementations must preserve the input placement family. CPU backends
3658/// canonicalize host views through explicit host copies and reject backend
3659/// buffers with a diagnostic that asks the caller to download first. GPU
3660/// backends canonicalize GPU-resident views on the same device and reject host
3661/// buffers with an upload hint.
3662///
3663/// [`TensorViewCanonicalization::copy_into`] requires source and destination
3664/// shapes, scalar dtypes, and placement families to match. The destination
3665/// view must be internally non-overlapping, and source and destination backing
3666/// allocations must not alias unless an implementation explicitly documents
3667/// and supports that case. Implementations may reject layouts their native
3668/// kernels cannot consume.
3669///
3670/// CUDA currently accepts only a compact column-major source view with offset
3671/// zero that covers its full allocation; arbitrary-stride destinations remain
3672/// supported. Canonicalization and copying are same-placement operations: they
3673/// must not perform hidden host/device transfers or silently materialize an
3674/// unsupported source layout.
3675///
3676/// This trait is intentionally separate from [`BackendSession`] so generic
3677/// typed methods do not change the object-safety contract of `dyn BackendSession`.
3678///
3679/// # Examples
3680///
3681/// ```rust
3682/// use tenferro_tensor::{DynRank, TensorViewCanonicalization, TypedTensor};
3683///
3684/// fn compact_i32<B: TensorViewCanonicalization<i32, DynRank>>(
3685///     backend: &mut B,
3686///     tensor: &TypedTensor<i32>,
3687/// ) -> tenferro_tensor::Result<TypedTensor<i32>> {
3688///     backend.to_contiguous(&tensor.as_view())
3689/// }
3690///
3691/// fn copy_i32<B: TensorViewCanonicalization<i32, DynRank>>(
3692///     backend: &mut B,
3693///     src: &TypedTensor<i32>,
3694///     dst: &mut TypedTensor<i32>,
3695/// ) -> tenferro_tensor::Result<()> {
3696///     backend.copy_into(&src.as_view(), &mut dst.as_view_mut())
3697/// }
3698/// ```
3699pub trait TensorViewCanonicalization<T: TensorScalar, R: TensorRank> {
3700    /// # Errors
3701    ///
3702    /// Returns [`crate::Error::Validation`] with a typed `ValidationError` source
3703    /// for invalid shapes, ranks, axes, dtypes, or output metadata. It returns
3704    /// [`crate::Error::BackendFailure`] or [`crate::Error::BackendSource`] when
3705    /// backend execution or storage access cannot provide the requested result.
3706    fn to_contiguous(
3707        &mut self,
3708        view: &TypedTensorView<'_, T, R>,
3709    ) -> crate::Result<TypedTensor<T, R>>;
3710
3711    /// # Errors
3712    ///
3713    /// Returns [`crate::Error::Validation`] with a typed `ValidationError` source
3714    /// for invalid shapes, ranks, axes, dtypes, or output metadata. It returns
3715    /// [`crate::Error::BackendFailure`] or [`crate::Error::BackendSource`] when
3716    /// backend execution or storage access cannot provide the requested result.
3717    fn copy_into(
3718        &mut self,
3719        src: &TypedTensorView<'_, T, R>,
3720        dst: &mut TypedTensorViewMut<'_, T, R>,
3721    ) -> crate::Result<()>;
3722}
3723
3724/// Optional elementwise fusion execution.
3725///
3726/// # Examples
3727///
3728/// ```rust
3729/// use tenferro_tensor::TensorFusion;
3730///
3731/// fn accepts_fusion<B: TensorFusion>(_backend: &mut B) {}
3732/// ```
3733pub trait TensorFusion {
3734    #[doc(hidden)]
3735    fn execute_elementwise_fusion(
3736        &mut self,
3737        _inputs: &[&Tensor],
3738        _plan: &ElementwiseFusionPlan,
3739    ) -> crate::Result<Option<Vec<Tensor>>> {
3740        Ok(None)
3741    }
3742
3743    #[doc(hidden)]
3744    #[allow(clippy::too_many_arguments)]
3745    fn execute_broadcast_multiply(
3746        &mut self,
3747        _lhs: TensorRead<'_>,
3748        _lhs_shape: &[usize],
3749        _lhs_dims: &[usize],
3750        _rhs: TensorRead<'_>,
3751        _rhs_shape: &[usize],
3752        _rhs_dims: &[usize],
3753    ) -> crate::Result<Option<Tensor>> {
3754        Ok(None)
3755    }
3756
3757    #[doc(hidden)]
3758    #[allow(clippy::too_many_arguments)]
3759    fn execute_broadcast_multiply_value(
3760        &mut self,
3761        lhs: TensorRead<'_>,
3762        lhs_shape: &[usize],
3763        lhs_dims: &[usize],
3764        rhs: TensorRead<'_>,
3765        rhs_shape: &[usize],
3766        rhs_dims: &[usize],
3767    ) -> crate::Result<Option<TensorValue>> {
3768        self.execute_broadcast_multiply(lhs, lhs_shape, lhs_dims, rhs, rhs_shape, rhs_dims)
3769            .map(|tensor| tensor.map(TensorValue::from_tensor))
3770    }
3771}
3772
3773/// Backend buffer lifecycle operations.
3774///
3775/// # Examples
3776///
3777/// ```rust
3778/// use tenferro_tensor::TensorBuffer;
3779///
3780/// fn accepts_buffer<B: TensorBuffer>(_backend: &mut B) {}
3781/// ```
3782pub trait TensorBuffer {
3783    fn reclaim_buffer(&mut self, _tensor: Tensor) {}
3784}
3785
3786/// Device transfer operations on backend boundaries.
3787///
3788/// # Examples
3789///
3790/// ```rust
3791/// use tenferro_tensor::TensorDeviceTransfer;
3792///
3793/// fn accepts_transfer<B: TensorDeviceTransfer>(_backend: &mut B) {}
3794/// ```
3795pub trait TensorDeviceTransfer {
3796    /// # Errors
3797    ///
3798    /// Returns [`crate::Error::Validation`] with a typed `ValidationError` source
3799    /// for invalid shapes, ranks, axes, dtypes, or output metadata. It returns
3800    /// [`crate::Error::BackendFailure`] or [`crate::Error::BackendSource`] when
3801    /// backend execution or storage access cannot provide the requested result.
3802    fn download_to_host(&mut self, tensor: &Tensor) -> crate::Result<Tensor> {
3803        Ok(tensor.clone())
3804    }
3805
3806    /// # Errors
3807    ///
3808    /// Returns [`crate::Error::Validation`] with a typed `ValidationError` source
3809    /// for invalid shapes, ranks, axes, dtypes, or output metadata. It returns
3810    /// [`crate::Error::BackendFailure`] or [`crate::Error::BackendSource`] when
3811    /// backend execution or storage access cannot provide the requested result.
3812    fn upload_host_tensor(&mut self, tensor: &Tensor) -> crate::Result<Tensor> {
3813        Ok(tensor.clone())
3814    }
3815}
3816
3817/// Runtime cache associated with a backend.
3818///
3819/// # Examples
3820///
3821/// ```rust
3822/// use tenferro_tensor::BackendRuntimeCache;
3823///
3824/// fn accepts_runtime_cache<B: BackendRuntimeCache>(_backend: &B) {}
3825/// ```
3826pub trait BackendRuntimeCache {
3827    #[doc(hidden)]
3828    type RuntimeCache: RuntimeCacheControl + Send + Sync + 'static;
3829}
3830
3831/// Backend-owned cached dot-general operations.
3832///
3833/// # Examples
3834///
3835/// ```rust
3836/// use tenferro_tensor::BackendCachedDot;
3837///
3838/// fn accepts_backend_cached_dot<B: BackendCachedDot>(_backend: &mut B) {}
3839/// ```
3840pub trait BackendCachedDot: BackendRuntimeCache + TensorDot {
3841    #[doc(hidden)]
3842    fn dot_general_cached(
3843        &mut self,
3844        _cache: &mut Self::RuntimeCache,
3845        _cache_slot: Option<usize>,
3846        lhs: &Tensor,
3847        rhs: &Tensor,
3848        config: &DotGeneralConfig,
3849    ) -> crate::Result<Tensor> {
3850        self.dot_general(lhs, rhs, config)
3851    }
3852
3853    #[doc(hidden)]
3854    fn dot_general_read_cached(
3855        &mut self,
3856        cache: &mut Self::RuntimeCache,
3857        cache_slot: Option<usize>,
3858        lhs: TensorRead<'_>,
3859        rhs: TensorRead<'_>,
3860        config: &DotGeneralConfig,
3861    ) -> crate::Result<Tensor> {
3862        match (lhs.as_tensor(), rhs.as_tensor()) {
3863            (Some(lhs), Some(rhs)) => self.dot_general_cached(cache, cache_slot, lhs, rhs, config),
3864            _ => {
3865                let lhs = self.to_contiguous_read(lhs)?;
3866                let rhs = self.to_contiguous_read(rhs)?;
3867                self.dot_general_cached(cache, cache_slot, &lhs, &rhs, config)
3868            }
3869        }
3870    }
3871
3872    // Mirrors the dot-general signature plus runtime-cache metadata.
3873    #[allow(clippy::too_many_arguments)]
3874    #[doc(hidden)]
3875    fn dot_general_with_conj_cached(
3876        &mut self,
3877        _cache: &mut Self::RuntimeCache,
3878        _cache_slot: Option<usize>,
3879        lhs: &Tensor,
3880        rhs: &Tensor,
3881        config: &DotGeneralConfig,
3882        lhs_conj: bool,
3883        rhs_conj: bool,
3884    ) -> crate::Result<Tensor> {
3885        self.dot_general_with_conj(lhs, rhs, config, lhs_conj, rhs_conj)
3886    }
3887
3888    // Mirrors the dot-general read signature plus runtime-cache metadata.
3889    #[allow(clippy::too_many_arguments)]
3890    #[doc(hidden)]
3891    fn dot_general_with_conj_read_cached(
3892        &mut self,
3893        cache: &mut Self::RuntimeCache,
3894        cache_slot: Option<usize>,
3895        lhs: TensorRead<'_>,
3896        rhs: TensorRead<'_>,
3897        config: &DotGeneralConfig,
3898        lhs_conj: bool,
3899        rhs_conj: bool,
3900    ) -> crate::Result<Tensor> {
3901        if !lhs_conj && !rhs_conj {
3902            return self.dot_general_read_cached(cache, cache_slot, lhs, rhs, config);
3903        }
3904
3905        let lhs_tmp;
3906        let lhs_ref = if let Some(tensor) = lhs.as_tensor() {
3907            tensor
3908        } else {
3909            lhs_tmp = self.to_contiguous_read(lhs)?;
3910            &lhs_tmp
3911        };
3912        let rhs_tmp;
3913        let rhs_ref = if let Some(tensor) = rhs.as_tensor() {
3914            tensor
3915        } else {
3916            rhs_tmp = self.to_contiguous_read(rhs)?;
3917            &rhs_tmp
3918        };
3919        self.dot_general_with_conj_cached(
3920            cache, cache_slot, lhs_ref, rhs_ref, config, lhs_conj, rhs_conj,
3921        )
3922    }
3923
3924    /// Apply cached scaled dot-general accumulation into caller-provided output.
3925    ///
3926    /// The cache slot identifies backend-local analysis metadata only; output
3927    /// semantics are still fully described by `accumulation`.
3928    ///
3929    /// # Examples
3930    ///
3931    /// ```rust
3932    /// use tenferro_tensor::{
3933    ///     BackendCachedDot, BackendRuntimeCache, DotGeneralAccumulation, DotGeneralConfig,
3934    ///     TensorRead, TensorWrite,
3935    /// };
3936    ///
3937    /// fn cached_dot_add_to<B: BackendCachedDot>(
3938    ///     backend: &mut B,
3939    ///     cache: &mut B::RuntimeCache,
3940    ///     lhs: TensorRead<'_>,
3941    ///     rhs: TensorRead<'_>,
3942    ///     config: &DotGeneralConfig,
3943    ///     out: TensorWrite<'_>,
3944    /// ) -> tenferro_tensor::Result<()>
3945    /// where
3946    ///     B: BackendRuntimeCache,
3947    /// {
3948    ///     let accumulation = DotGeneralAccumulation::add_to(lhs.dtype())?;
3949    ///     backend.dot_general_read_into_accum_cached(
3950    ///         cache,
3951    ///         Some(0),
3952    ///         lhs,
3953    ///         rhs,
3954    ///         config,
3955    ///         accumulation,
3956    ///         out,
3957    ///     )
3958    /// }
3959    /// ```
3960    #[allow(clippy::too_many_arguments)]
3961    /// # Errors
3962    ///
3963    /// Returns [`crate::Error::Validation`] with a typed `ValidationError` source
3964    /// for invalid shapes, ranks, axes, dtypes, or output metadata. It returns
3965    /// [`crate::Error::BackendFailure`] or [`crate::Error::BackendSource`] when
3966    /// backend execution or storage access cannot provide the requested result.
3967    fn dot_general_read_into_accum_cached(
3968        &mut self,
3969        _cache: &mut Self::RuntimeCache,
3970        _cache_slot: Option<usize>,
3971        lhs: TensorRead<'_>,
3972        rhs: TensorRead<'_>,
3973        config: &DotGeneralConfig,
3974        accumulation: DotGeneralAccumulation,
3975        out: TensorWrite<'_>,
3976    ) -> crate::Result<()> {
3977        self.dot_general_read_into_accum(lhs, rhs, config, accumulation, out)
3978    }
3979
3980    #[doc(hidden)]
3981    fn grouped_gemm_cached(
3982        &mut self,
3983        _cache: &mut Self::RuntimeCache,
3984        _cache_slot: Option<usize>,
3985        lhs: TensorRead<'_>,
3986        rhs: TensorRead<'_>,
3987        config: &GroupedGemmConfig<'_>,
3988        out: TensorWrite<'_>,
3989    ) -> crate::Result<()> {
3990        grouped_gemm_default(self, lhs, rhs, config, out)
3991    }
3992}
3993
3994/// Backend execution-session entry points.
3995///
3996/// # Examples
3997///
3998/// ```rust
3999/// use tenferro_tensor::BackendSessionHost;
4000///
4001/// fn accepts_session_host<B: BackendSessionHost>(_backend: &mut B) {}
4002/// ```
4003pub trait BackendSessionHost: BackendRuntimeCache {
4004    fn with_backend_session<R: Send>(
4005        &mut self,
4006        f: impl FnOnce(&mut dyn BackendSession) -> R + Send,
4007    ) -> R
4008    where
4009        Self: TensorBackend + Sized,
4010    {
4011        default_backend_session(self, f)
4012    }
4013
4014    #[doc(hidden)]
4015    fn with_backend_session_cached<R: Send>(
4016        &mut self,
4017        _cache: &mut Self::RuntimeCache,
4018        f: impl FnOnce(&mut dyn BackendSession) -> R + Send,
4019    ) -> R
4020    where
4021        Self: TensorBackend + Sized,
4022    {
4023        self.with_backend_session(f)
4024    }
4025}
4026
/// Operation capabilities shared by backends and backend sessions.
///
/// This trait declares no items of its own; it is shorthand for the full set
/// of operation traits listed as supertraits below.
#[doc(hidden)]
pub trait TensorBackendOps:
    TensorElementwise
    + TensorAnalytic
    + TensorStructural
    + TensorReduction
    + TensorIndexing
    + TensorDot
    + TensorFusion
    + TensorBuffer
{
}
4040
4041impl<T> TensorBackendOps for T where
4042    T: TensorElementwise
4043        + TensorAnalytic
4044        + TensorStructural
4045        + TensorReduction
4046        + TensorIndexing
4047        + TensorDot
4048        + TensorFusion
4049        + TensorBuffer
4050        + ?Sized
4051{
4052}
4053
/// Execution session surface for dense tensor backends.
///
/// All operations run within a backend-owned execution scope such as a CPU
/// thread policy or a GPU stream. Individual ops must not try to re-enter that
/// scope.
///
/// The tensor operations themselves come from the supertraits. The two items
/// declared here exist only so backend leaf crates can identify and recover
/// the concrete session type behind a `&mut dyn BackendSession`.
///
/// # Examples
///
/// ```rust
/// use tenferro_tensor::{BackendSessionHost, Tensor};
///
/// fn add_in_session<B: BackendSessionHost>(
///     backend: &mut B,
///     a: &Tensor,
///     b: &Tensor,
/// ) -> tenferro_tensor::Result<Tensor>
/// where
///     B: tenferro_tensor::TensorBackend,
/// {
///     backend.with_backend_session(|exec| exec.add(a, b))
/// }
/// ```
pub trait BackendSession: TensorBackendOps + SessionCachedDot + TensorDeviceTransfer {
    /// Build-local type identity for backend-extension session capability dispatch.
    ///
    /// The returned string is what callers compare before using the pointer
    /// from [`Self::session_data_mut`].
    #[doc(hidden)]
    fn session_type_name(&self) -> &'static str;

    /// Erased pointer used only by backend leaf crates for a checked session
    /// capability bridge. The pointer is borrowed for the lifetime of `self`.
    ///
    /// # Safety
    ///
    /// The implementation must return a pointer to the same value represented
    /// by `self`, and that pointer must remain valid and uniquely borrowed for
    /// the duration of the `&mut self` borrow. Backend leaf crates may use this
    /// contract to recover a concrete session capability after checking
    /// [`Self::session_type_name`].
    #[doc(hidden)]
    unsafe fn session_data_mut(&mut self) -> *mut ();
}
4094
4095impl<T> BackendSession for T
4096where
4097    T: TensorBackendOps + SessionCachedDot + TensorDeviceTransfer + Sized,
4098{
4099    fn session_type_name(&self) -> &'static str {
4100        std::any::type_name::<T>()
4101    }
4102
4103    unsafe fn session_data_mut(&mut self) -> *mut () {
4104        self as *mut T as *mut ()
4105    }
4106}
4107
/// Standard runtime backend over dynamic [`Tensor`] values.
///
/// This trait declares no items of its own. It names the combination of the
/// runtime-cache, operation, cached-dot, device-transfer, and session-host
/// traits listed as supertraits below.
///
/// # Examples
///
/// ```rust
/// use tenferro_tensor::TensorBackend;
///
/// fn accepts_backend<B: TensorBackend>(_backend: &mut B) {}
/// ```
pub trait TensorBackend:
    BackendRuntimeCache
    + TensorBackendOps
    + BackendCachedDot
    + TensorDeviceTransfer
    + BackendSessionHost
{
}
4125
4126impl<T> SessionCachedDot for T where T: TensorBackend + ?Sized {}
4127
4128/// Run a closure using the backend itself as a default execution session.
4129///
4130/// This is suitable for backends whose individual ops already manage their own
4131/// execution context.
4132///
4133/// # Examples
4134///
4135/// ```rust
4136/// use tenferro_tensor::{default_backend_session, TensorBackend};
4137///
4138/// fn run_with_default_session<B: TensorBackend>(backend: &mut B) -> usize {
4139///     default_backend_session(backend, |_exec| 1usize)
4140/// }
4141/// ```
4142pub fn default_backend_session<B: TensorBackend, R: Send>(
4143    backend: &mut B,
4144    f: impl FnOnce(&mut dyn BackendSession) -> R + Send,
4145) -> R {
4146    f(backend)
4147}
tenferro_tensor/backend.rs

tenferro_tensor/
backend.rs