tenferro_cpu/
elementwise.rs

1use std::ops::{Add, Div, Mul, Neg, Rem as StdRem, Sub};
2use std::sync::Arc;
3
4use num_complex::Complex;
5use num_traits::{One, Zero};
6use strided_kernel::{
7    batched_outer_product_into, broadcast_mul_into, fused_elementwise_into, map_into, mul_into,
8    reduce, zip_map2_into, zip_map3_into, FusedInst, FusedOp, FusedPlan, FusedScalar, StridedView,
9};
10
11use crate::buffer_pool::{BufferPool, PoolScalar};
12use crate::ConjElem;
13use tenferro_tensor::backend::{
14    ElementwiseFusionInputView, ElementwiseFusionOp, ElementwiseFusionPlan,
15};
16use tenferro_tensor::{
17    col_major_strides, CompareDir, DType, Tensor, TensorOwnedView, TensorRank, TensorRead,
18    TensorScalar, TensorValue, TensorView, TypedTensor, TypedTensorView,
19};
20
21use super::{
22    tensor_from_array, typed_array_uninit_from_pool, typed_host_data, typed_view,
23    typed_view_from_view,
24};
25
// Runs a fallible typed ternary kernel when all three tensors are `F32` or all
// three are `F64`, binding the typed payloads to the caller-chosen identifiers
// and re-wrapping the kernel output in the matching `Tensor` variant. Every
// other combination is reported as a backend failure for `$op`.
macro_rules! dispatch_ternary_result_with_pool {
    ($op:literal, $a:expr, $b:expr, $c:expr, |$x:ident, $y:ident, $z:ident| $body:expr) => {
        match ($a, $b, $c) {
            (Tensor::F64($x), Tensor::F64($y), Tensor::F64($z)) => {
                let typed = $body?;
                Ok(Tensor::F64(typed))
            }
            (Tensor::F32($x), Tensor::F32($y), Tensor::F32($z)) => {
                let typed = $body?;
                Ok(Tensor::F32(typed))
            }
            _ => Err(crate::Error::backend_failure($op, "dtype mismatch")),
        }
    };
}
35
36fn dtype_pair_error(op: &'static str, lhs: DType, rhs: DType) -> crate::Error {
37    if lhs == rhs {
38        crate::Error::backend_failure(op, format!("unsupported dtype {lhs:?}"))
39    } else {
40        crate::Error::DTypeMismatch { op, lhs, rhs }
41    }
42}
43
44fn tensor_pair_error(op: &'static str, lhs: &Tensor, rhs: &Tensor) -> crate::Error {
45    dtype_pair_error(op, lhs.dtype(), rhs.dtype())
46}
47
48fn read_pair_error(op: &'static str, lhs: TensorRead<'_>, rhs: TensorRead<'_>) -> crate::Error {
49    dtype_pair_error(op, lhs.dtype(), rhs.dtype())
50}
51
52fn is_complex_dtype(dtype: DType) -> bool {
53    matches!(dtype, DType::C32 | DType::C64)
54}
55
56fn ordered_complex_error(op: &'static str) -> crate::Error {
57    crate::Error::InvalidConfig {
58        op,
59        message: "complex tensors do not have a total order; compute abs/norm explicitly before ordered operations".into(),
60    }
61}
62
63fn reject_complex_ordered_dtypes(op: &'static str, dtypes: &[DType]) -> crate::Result<()> {
64    if dtypes.iter().copied().any(is_complex_dtype) {
65        return Err(ordered_complex_error(op));
66    }
67    Ok(())
68}
69
// Operation label attached to errors raised while validating elementwise fusion.
const ELEMENTWISE_FUSION_OP: &str = "execute_elementwise_fusion";
// NOTE(review): presumably the smallest element count for which fusion is
// attempted; it is not referenced in this part of the file — confirm at the use site.
const ELEMENTWISE_FUSION_MIN_ELEMENTS: usize = 16 * 1024;
72
73fn validate_elementwise_fusion_inputs(
74    inputs: &[&Tensor],
75    plan: &ElementwiseFusionPlan,
76) -> crate::Result<bool> {
77    if inputs.len() != plan.input_count() {
78        return Err(crate::Error::backend_failure(
79            ELEMENTWISE_FUSION_OP,
80            format!(
81                "plan expects {} inputs but backend received {}",
82                plan.input_count(),
83                inputs.len()
84            ),
85        ));
86    }
87    if plan.input_views().len() != plan.input_count() {
88        return Err(crate::Error::backend_failure(
89            ELEMENTWISE_FUSION_OP,
90            format!(
91                "plan has {} input views for {} inputs",
92                plan.input_views().len(),
93                plan.input_count()
94            ),
95        ));
96    }
97    if plan.outputs().is_empty() {
98        return Ok(false);
99    }
100    for input in inputs {
101        if input.dtype() != plan.dtype() {
102            return Err(crate::Error::DTypeMismatch {
103                op: ELEMENTWISE_FUSION_OP,
104                lhs: input.dtype(),
105                rhs: plan.dtype(),
106            });
107        }
108    }
109    Ok(true)
110}
111
112fn strided_fused_op(op: ElementwiseFusionOp) -> FusedOp {
113    match op {
114        ElementwiseFusionOp::Add => FusedOp::Add,
115        ElementwiseFusionOp::Multiply => FusedOp::Multiply,
116        ElementwiseFusionOp::Negate => FusedOp::Negate,
117        ElementwiseFusionOp::Conj => FusedOp::Conj,
118        ElementwiseFusionOp::Divide => FusedOp::Divide,
119        ElementwiseFusionOp::Abs => FusedOp::Abs,
120        ElementwiseFusionOp::Maximum => FusedOp::Maximum,
121        ElementwiseFusionOp::Minimum => FusedOp::Minimum,
122        ElementwiseFusionOp::Clamp => FusedOp::Clamp,
123        ElementwiseFusionOp::Exp => FusedOp::Exp,
124        ElementwiseFusionOp::Log => FusedOp::Log,
125        ElementwiseFusionOp::Sin => FusedOp::Sin,
126        ElementwiseFusionOp::Cos => FusedOp::Cos,
127        ElementwiseFusionOp::Tanh => FusedOp::Tanh,
128        ElementwiseFusionOp::Sqrt => FusedOp::Sqrt,
129        ElementwiseFusionOp::Rsqrt => FusedOp::Rsqrt,
130        ElementwiseFusionOp::Pow => FusedOp::Pow,
131        ElementwiseFusionOp::Expm1 => FusedOp::Expm1,
132        ElementwiseFusionOp::Log1p => FusedOp::Log1p,
133        ElementwiseFusionOp::Remainder => {
134            unreachable!("remainder must be filtered before CPU elementwise fusion")
135        }
136    }
137}
138
139fn plan_uses_unfused_op(plan: &ElementwiseFusionPlan) -> bool {
140    plan.ops()
141        .iter()
142        .any(|inst| inst.op() == ElementwiseFusionOp::Remainder)
143}
144
145fn plan_uses_ordered_op(plan: &ElementwiseFusionPlan) -> bool {
146    plan.ops().iter().any(|inst| {
147        matches!(
148            inst.op(),
149            ElementwiseFusionOp::Maximum
150                | ElementwiseFusionOp::Minimum
151                | ElementwiseFusionOp::Clamp
152        )
153    })
154}
155
156fn should_defer_to_broadcast_multiply_special_case(plan: &ElementwiseFusionPlan) -> bool {
157    !plan.input_views().iter().all(|view| view.is_identity())
158        && plan.ops().len() == 1
159        && plan.outputs() == [plan.input_count()]
160        && plan.ops()[0].op() == ElementwiseFusionOp::Multiply
161}
162
163fn strided_fused_plan(plan: &ElementwiseFusionPlan) -> FusedPlan {
164    FusedPlan {
165        input_count: plan.input_count(),
166        outputs: plan.outputs().to_vec(),
167        ops: plan
168            .ops()
169            .iter()
170            .map(|inst| FusedInst {
171                op: strided_fused_op(inst.op()),
172                inputs: inst.inputs().to_vec(),
173            })
174            .collect(),
175    }
176}
177
178pub(crate) trait Tier2Elem: Copy + Clone + One + Zero + Send + Sync {
179    fn abs_elem(self) -> Self;
180    fn sign_elem(self) -> Self;
181}
182
// Ordering lives in its own trait, apart from abs/sign, so that complex
// tensors can never silently fall back to ordering by magnitude. Callers are
// expected to compute abs/norm explicitly before ordered operations.
/// Element types that support elementwise minimum and maximum.
pub(crate) trait OrderedElem: Copy + Clone + Send + Sync {
    /// Smaller of `self` and `other`.
    fn min_elem(self, other: Self) -> Self;

    /// Larger of `self` and `other`.
    fn max_elem(self, other: Self) -> Self;
}
189
190pub(crate) trait CompareElem: Copy + Send + Sync {
191    fn compare_elem(self, other: Self, dir: &CompareDir) -> bool;
192}
193
194trait WrappingIntegerElem:
195    Copy + PoolScalar + TensorScalar + Zero + PartialEq + Eq + Send + Sync + 'static
196{
197    fn wrapping_add_elem(self, other: Self) -> Self;
198    fn wrapping_sub_elem(self, other: Self) -> Self;
199    fn wrapping_mul_elem(self, other: Self) -> Self;
200    fn wrapping_div_elem(self, other: Self) -> Self;
201    fn wrapping_rem_elem(self, other: Self) -> Self;
202    fn wrapping_neg_elem(self) -> Self;
203    fn wrapping_abs_elem(self) -> Self;
204    fn signum_elem(self) -> Self;
205}
206
// Implements `Tier2Elem`, `OrderedElem` and `CompareElem` for a real
// floating-point type.
macro_rules! impl_tier2_elem_real {
    ($ty:ty) => {
        impl Tier2Elem for $ty {
            fn abs_elem(self) -> Self {
                <$ty>::abs(self)
            }

            // Zero maps to zero; everything else (NaN included) goes through `signum`.
            fn sign_elem(self) -> Self {
                if self != Self::zero() {
                    self.signum()
                } else {
                    Self::zero()
                }
            }
        }

        impl OrderedElem for $ty {
            // An unordered pair means at least one operand is NaN, so NaN is
            // returned; on ties `self` wins.
            fn max_elem(self, other: Self) -> Self {
                match self.partial_cmp(&other) {
                    None => <$ty>::NAN,
                    Some(::std::cmp::Ordering::Less) => other,
                    Some(_) => self,
                }
            }

            // Same NaN and tie handling as `max_elem`.
            fn min_elem(self, other: Self) -> Self {
                match self.partial_cmp(&other) {
                    None => <$ty>::NAN,
                    Some(::std::cmp::Ordering::Greater) => other,
                    Some(_) => self,
                }
            }
        }

        impl CompareElem for $ty {
            fn compare_elem(self, other: Self, dir: &CompareDir) -> bool {
                match dir {
                    CompareDir::Lt => self < other,
                    CompareDir::Le => self <= other,
                    CompareDir::Eq => self == other,
                    CompareDir::Ge => self >= other,
                    CompareDir::Gt => self > other,
                }
            }
        }
    };
}
258
259macro_rules! impl_tier2_elem_complex {
260    ($real:ty) => {
261        impl Tier2Elem for Complex<$real> {
262            fn abs_elem(self) -> Self {
263                Self::new(self.norm(), <$real>::zero())
264            }
265
266            fn sign_elem(self) -> Self {
267                if self.is_zero() {
268                    Self::zero()
269                } else {
270                    self / self.abs_elem()
271                }
272            }
273        }
274    };
275}
276
277impl_tier2_elem_real!(f32);
278impl_tier2_elem_real!(f64);
279impl_tier2_elem_complex!(f32);
280impl_tier2_elem_complex!(f64);
281
282macro_rules! impl_compare_elem_ord {
283    ($ty:ty) => {
284        impl CompareElem for $ty {
285            fn compare_elem(self, other: Self, dir: &CompareDir) -> bool {
286                match dir {
287                    CompareDir::Eq => self == other,
288                    CompareDir::Lt => self < other,
289                    CompareDir::Le => self <= other,
290                    CompareDir::Gt => self > other,
291                    CompareDir::Ge => self >= other,
292                }
293            }
294        }
295    };
296}
297
298impl_compare_elem_ord!(i32);
299impl_compare_elem_ord!(i64);
300impl_compare_elem_ord!(bool);
301
302macro_rules! impl_ordered_elem_ord {
303    ($ty:ty) => {
304        impl OrderedElem for $ty {
305            fn max_elem(self, other: Self) -> Self {
306                self.max(other)
307            }
308
309            fn min_elem(self, other: Self) -> Self {
310                self.min(other)
311            }
312        }
313    };
314}
315
316impl_ordered_elem_ord!(i32);
317impl_ordered_elem_ord!(i64);
318
319macro_rules! impl_wrapping_integer_elem {
320    ($ty:ty) => {
321        impl WrappingIntegerElem for $ty {
322            fn wrapping_add_elem(self, other: Self) -> Self {
323                self.wrapping_add(other)
324            }
325
326            fn wrapping_sub_elem(self, other: Self) -> Self {
327                self.wrapping_sub(other)
328            }
329
330            fn wrapping_mul_elem(self, other: Self) -> Self {
331                self.wrapping_mul(other)
332            }
333
334            fn wrapping_div_elem(self, other: Self) -> Self {
335                self.wrapping_div(other)
336            }
337
338            fn wrapping_rem_elem(self, other: Self) -> Self {
339                self.wrapping_rem(other)
340            }
341
342            fn wrapping_neg_elem(self) -> Self {
343                self.wrapping_neg()
344            }
345
346            fn wrapping_abs_elem(self) -> Self {
347                self.wrapping_abs()
348            }
349
350            fn signum_elem(self) -> Self {
351                self.signum()
352            }
353        }
354    };
355}
356
357impl_wrapping_integer_elem!(i32);
358impl_wrapping_integer_elem!(i64);
359
360fn strided_view_contains<T>(
361    op: &'static str,
362    view: &StridedView<'_, T>,
363    pred: impl Fn(T) -> bool + Copy + Sync,
364) -> crate::Result<bool>
365where
366    T: Copy + Send + Sync,
367{
368    reduce(view, pred, |lhs, rhs| lhs || rhs, false)
369        .map_err(|err| crate::Error::backend_failure(op, err))
370}
371
372fn ensure_no_zero_divisor<T>(op: &'static str, rhs: &StridedView<'_, T>) -> crate::Result<()>
373where
374    T: WrappingIntegerElem,
375{
376    if strided_view_contains(op, rhs, |value| value == T::zero())? {
377        return Err(crate::Error::division_by_zero(op, T::dtype()));
378    }
379    Ok(())
380}
381
382fn complex_scalar_tensor<T>(scalar: T) -> crate::Result<TypedTensor<Complex<T>>>
383where
384    T: Copy + Clone + Zero,
385{
386    TypedTensor::from_vec_col_major(vec![], vec![Complex::new(scalar, T::zero())])
387}
388
389fn complex_scalar_tensor_from_tensor<T>(
390    input: &TypedTensor<T>,
391) -> crate::Result<TypedTensor<Complex<T>>>
392where
393    T: Copy + Clone + Zero,
394{
395    complex_scalar_tensor(typed_host_data("add", input)?[0])
396}
397
398fn complex_scalar_tensor_from_view<T, R>(
399    input: &TypedTensorView<'_, T, R>,
400) -> crate::Result<TypedTensor<Complex<T>>>
401where
402    T: Copy + Clone + Zero + 'static,
403    R: TensorRank,
404{
405    complex_scalar_tensor(typed_view_from_view("add", input)?.get(&[]))
406}
407
408fn with_local_pool<T>(f: impl FnOnce(&mut BufferPool) -> T) -> T {
409    let mut buffers = BufferPool::new();
410    f(&mut buffers)
411}
412
413/// Add two CPU tensors elementwise.
414///
415/// # Examples
416///
417/// ```
418/// use tenferro_cpu::add;
419/// use tenferro_tensor::Tensor;
420///
421/// let a = Tensor::from_vec_col_major(vec![2], vec![1.0_f64, 2.0])?;
422/// let b = Tensor::from_vec_col_major(vec![2], vec![3.0_f64, 4.0])?;
423/// let out = add(&a, &b)?;
424/// assert_eq!(out.as_slice::<f64>().unwrap(), &[4.0, 6.0]);
425/// # Ok::<(), tenferro_tensor::Error>(())
426/// ```
427pub fn add(lhs: &Tensor, rhs: &Tensor) -> crate::Result<Tensor> {
428    with_local_pool(|buffers| add_with_pool(buffers, lhs, rhs))
429}
430
431pub(crate) fn add_with_pool(
432    buffers: &mut BufferPool,
433    lhs: &Tensor,
434    rhs: &Tensor,
435) -> crate::Result<Tensor> {
436    match (lhs, rhs) {
437        (Tensor::F32(a), Tensor::F32(b)) => Ok(Tensor::F32(typed_add_with_pool(buffers, a, b)?)),
438        (Tensor::F64(a), Tensor::F64(b)) => Ok(Tensor::F64(typed_add_with_pool(buffers, a, b)?)),
439        (Tensor::I32(a), Tensor::I32(b)) => {
440            Ok(Tensor::I32(typed_wrapping_add_with_pool(buffers, a, b)?))
441        }
442        (Tensor::I64(a), Tensor::I64(b)) => {
443            Ok(Tensor::I64(typed_wrapping_add_with_pool(buffers, a, b)?))
444        }
445        (Tensor::C32(a), Tensor::C32(b)) => Ok(Tensor::C32(typed_add_with_pool(buffers, a, b)?)),
446        (Tensor::C64(a), Tensor::C64(b)) => Ok(Tensor::C64(typed_add_with_pool(buffers, a, b)?)),
447        (Tensor::F32(a), Tensor::C32(b)) if a.shape().is_empty() => {
448            let scalar = complex_scalar_tensor(typed_host_data("add", a)?[0])?;
449            Ok(Tensor::C32(typed_add_with_pool(buffers, &scalar, b)?))
450        }
451        (Tensor::C32(a), Tensor::F32(b)) if b.shape().is_empty() => {
452            let scalar = complex_scalar_tensor(typed_host_data("add", b)?[0])?;
453            Ok(Tensor::C32(typed_add_with_pool(buffers, a, &scalar)?))
454        }
455        (Tensor::F64(a), Tensor::C64(b)) if a.shape().is_empty() => {
456            let scalar = complex_scalar_tensor(typed_host_data("add", a)?[0])?;
457            Ok(Tensor::C64(typed_add_with_pool(buffers, &scalar, b)?))
458        }
459        (Tensor::C64(a), Tensor::F64(b)) if b.shape().is_empty() => {
460            let scalar = complex_scalar_tensor(typed_host_data("add", b)?[0])?;
461            Ok(Tensor::C64(typed_add_with_pool(buffers, a, &scalar)?))
462        }
463        _ => Err(tensor_pair_error("add", lhs, rhs)),
464    }
465}
466
467pub(crate) fn add_read_with_pool(
468    buffers: &mut BufferPool,
469    lhs: TensorRead<'_>,
470    rhs: TensorRead<'_>,
471) -> crate::Result<Tensor> {
472    if let (TensorRead::Tensor(lhs), TensorRead::Tensor(rhs)) = (&lhs, &rhs) {
473        return add_with_pool(buffers, lhs, rhs);
474    }
475
476    macro_rules! dispatch {
477        ($variant:ident, $func:ident) => {
478            match (&lhs, &rhs) {
479                (
480                    TensorRead::Tensor(Tensor::$variant(a)),
481                    TensorRead::View(TensorView::$variant(b)),
482                ) => {
483                    let a = a.as_view();
484                    return Ok(Tensor::$variant($func(buffers, &a, b)?));
485                }
486                (
487                    TensorRead::View(TensorView::$variant(a)),
488                    TensorRead::Tensor(Tensor::$variant(b)),
489                ) => {
490                    let b = b.as_view();
491                    return Ok(Tensor::$variant($func(buffers, a, &b)?));
492                }
493                (
494                    TensorRead::View(TensorView::$variant(a)),
495                    TensorRead::View(TensorView::$variant(b)),
496                ) => {
497                    return Ok(Tensor::$variant($func(buffers, a, b)?));
498                }
499                _ => {}
500            }
501        };
502    }
503
504    macro_rules! dispatch_real_complex_scalar {
505        ($real_variant:ident, $complex_variant:ident) => {
506            match (&lhs, &rhs) {
507                (
508                    TensorRead::Tensor(Tensor::$real_variant(real)),
509                    TensorRead::View(TensorView::$complex_variant(complex)),
510                ) if real.shape().is_empty() => {
511                    let scalar = complex_scalar_tensor_from_tensor(real)?;
512                    let scalar = scalar.as_view();
513                    return Ok(Tensor::$complex_variant(typed_add_view_with_pool(
514                        buffers, &scalar, complex,
515                    )?));
516                }
517                (
518                    TensorRead::View(TensorView::$real_variant(real)),
519                    TensorRead::Tensor(Tensor::$complex_variant(complex)),
520                ) if real.shape().is_empty() => {
521                    let scalar = complex_scalar_tensor_from_view(real)?;
522                    let scalar = scalar.as_view();
523                    let complex = complex.as_view();
524                    return Ok(Tensor::$complex_variant(typed_add_view_with_pool(
525                        buffers, &scalar, &complex,
526                    )?));
527                }
528                (
529                    TensorRead::View(TensorView::$real_variant(real)),
530                    TensorRead::View(TensorView::$complex_variant(complex)),
531                ) if real.shape().is_empty() => {
532                    let scalar = complex_scalar_tensor_from_view(real)?;
533                    let scalar = scalar.as_view();
534                    return Ok(Tensor::$complex_variant(typed_add_view_with_pool(
535                        buffers, &scalar, complex,
536                    )?));
537                }
538                (
539                    TensorRead::Tensor(Tensor::$complex_variant(complex)),
540                    TensorRead::View(TensorView::$real_variant(real)),
541                ) if real.shape().is_empty() => {
542                    let complex = complex.as_view();
543                    let scalar = complex_scalar_tensor_from_view(real)?;
544                    let scalar = scalar.as_view();
545                    return Ok(Tensor::$complex_variant(typed_add_view_with_pool(
546                        buffers, &complex, &scalar,
547                    )?));
548                }
549                (
550                    TensorRead::View(TensorView::$complex_variant(complex)),
551                    TensorRead::Tensor(Tensor::$real_variant(real)),
552                ) if real.shape().is_empty() => {
553                    let scalar = complex_scalar_tensor_from_tensor(real)?;
554                    let scalar = scalar.as_view();
555                    return Ok(Tensor::$complex_variant(typed_add_view_with_pool(
556                        buffers, complex, &scalar,
557                    )?));
558                }
559                (
560                    TensorRead::View(TensorView::$complex_variant(complex)),
561                    TensorRead::View(TensorView::$real_variant(real)),
562                ) if real.shape().is_empty() => {
563                    let scalar = complex_scalar_tensor_from_view(real)?;
564                    let scalar = scalar.as_view();
565                    return Ok(Tensor::$complex_variant(typed_add_view_with_pool(
566                        buffers, complex, &scalar,
567                    )?));
568                }
569                _ => {}
570            }
571        };
572    }
573
574    dispatch_real_complex_scalar!(F32, C32);
575    dispatch_real_complex_scalar!(F64, C64);
576
577    dispatch!(F32, typed_add_view_with_pool);
578    dispatch!(F64, typed_add_view_with_pool);
579    dispatch!(I32, typed_wrapping_add_view_with_pool);
580    dispatch!(I64, typed_wrapping_add_view_with_pool);
581    dispatch!(C32, typed_add_view_with_pool);
582    dispatch!(C64, typed_add_view_with_pool);
583
584    Err(read_pair_error("add", lhs, rhs))
585}
586
587/// Subtract two CPU tensors elementwise.
588///
589/// # Examples
590///
591/// ```
592/// use tenferro_cpu::sub;
593/// use tenferro_tensor::Tensor;
594///
595/// let a = Tensor::from_vec_col_major(vec![2], vec![5.0_f64, 2.0])?;
596/// let b = Tensor::from_vec_col_major(vec![2], vec![3.0_f64, 4.0])?;
597/// let out = sub(&a, &b)?;
598/// assert_eq!(out.as_slice::<f64>().unwrap(), &[2.0, -2.0]);
599/// # Ok::<(), tenferro_tensor::Error>(())
600/// ```
601pub fn sub(lhs: &Tensor, rhs: &Tensor) -> crate::Result<Tensor> {
602    with_local_pool(|buffers| sub_with_pool(buffers, lhs, rhs))
603}
604
605pub(crate) fn sub_with_pool(
606    buffers: &mut BufferPool,
607    lhs: &Tensor,
608    rhs: &Tensor,
609) -> crate::Result<Tensor> {
610    match (lhs, rhs) {
611        (Tensor::F32(a), Tensor::F32(b)) => Ok(Tensor::F32(typed_sub_with_pool(buffers, a, b)?)),
612        (Tensor::F64(a), Tensor::F64(b)) => Ok(Tensor::F64(typed_sub_with_pool(buffers, a, b)?)),
613        (Tensor::I32(a), Tensor::I32(b)) => {
614            Ok(Tensor::I32(typed_wrapping_sub_with_pool(buffers, a, b)?))
615        }
616        (Tensor::I64(a), Tensor::I64(b)) => {
617            Ok(Tensor::I64(typed_wrapping_sub_with_pool(buffers, a, b)?))
618        }
619        (Tensor::C32(a), Tensor::C32(b)) => Ok(Tensor::C32(typed_sub_with_pool(buffers, a, b)?)),
620        (Tensor::C64(a), Tensor::C64(b)) => Ok(Tensor::C64(typed_sub_with_pool(buffers, a, b)?)),
621        (Tensor::F32(a), Tensor::C32(b)) if a.shape().is_empty() => {
622            let scalar = complex_scalar_tensor(typed_host_data("sub", a)?[0])?;
623            Ok(Tensor::C32(typed_sub_with_pool(buffers, &scalar, b)?))
624        }
625        (Tensor::C32(a), Tensor::F32(b)) if b.shape().is_empty() => {
626            let scalar = complex_scalar_tensor(typed_host_data("sub", b)?[0])?;
627            Ok(Tensor::C32(typed_sub_with_pool(buffers, a, &scalar)?))
628        }
629        (Tensor::F64(a), Tensor::C64(b)) if a.shape().is_empty() => {
630            let scalar = complex_scalar_tensor(typed_host_data("sub", a)?[0])?;
631            Ok(Tensor::C64(typed_sub_with_pool(buffers, &scalar, b)?))
632        }
633        (Tensor::C64(a), Tensor::F64(b)) if b.shape().is_empty() => {
634            let scalar = complex_scalar_tensor(typed_host_data("sub", b)?[0])?;
635            Ok(Tensor::C64(typed_sub_with_pool(buffers, a, &scalar)?))
636        }
637        _ => Err(tensor_pair_error("sub", lhs, rhs)),
638    }
639}
640
641pub(crate) fn sub_read_with_pool(
642    buffers: &mut BufferPool,
643    lhs: TensorRead<'_>,
644    rhs: TensorRead<'_>,
645) -> crate::Result<Tensor> {
646    if let (TensorRead::Tensor(lhs), TensorRead::Tensor(rhs)) = (&lhs, &rhs) {
647        return sub_with_pool(buffers, lhs, rhs);
648    }
649
650    macro_rules! dispatch {
651        ($variant:ident, $func:ident) => {
652            match (&lhs, &rhs) {
653                (
654                    TensorRead::Tensor(Tensor::$variant(a)),
655                    TensorRead::View(TensorView::$variant(b)),
656                ) => {
657                    let a = a.as_view();
658                    return Ok(Tensor::$variant($func(buffers, &a, b)?));
659                }
660                (
661                    TensorRead::View(TensorView::$variant(a)),
662                    TensorRead::Tensor(Tensor::$variant(b)),
663                ) => {
664                    let b = b.as_view();
665                    return Ok(Tensor::$variant($func(buffers, a, &b)?));
666                }
667                (
668                    TensorRead::View(TensorView::$variant(a)),
669                    TensorRead::View(TensorView::$variant(b)),
670                ) => {
671                    return Ok(Tensor::$variant($func(buffers, a, b)?));
672                }
673                _ => {}
674            }
675        };
676    }
677
678    macro_rules! dispatch_real_complex_scalar {
679        ($real_variant:ident, $complex_variant:ident) => {
680            match (&lhs, &rhs) {
681                (
682                    TensorRead::Tensor(Tensor::$real_variant(real)),
683                    TensorRead::View(TensorView::$complex_variant(complex)),
684                ) if real.shape().is_empty() => {
685                    let scalar = complex_scalar_tensor_from_tensor(real)?;
686                    let scalar = scalar.as_view();
687                    return Ok(Tensor::$complex_variant(typed_sub_view_with_pool(
688                        buffers, &scalar, complex,
689                    )?));
690                }
691                (
692                    TensorRead::View(TensorView::$real_variant(real)),
693                    TensorRead::Tensor(Tensor::$complex_variant(complex)),
694                ) if real.shape().is_empty() => {
695                    let scalar = complex_scalar_tensor_from_view(real)?;
696                    let scalar = scalar.as_view();
697                    let complex = complex.as_view();
698                    return Ok(Tensor::$complex_variant(typed_sub_view_with_pool(
699                        buffers, &scalar, &complex,
700                    )?));
701                }
702                (
703                    TensorRead::View(TensorView::$real_variant(real)),
704                    TensorRead::View(TensorView::$complex_variant(complex)),
705                ) if real.shape().is_empty() => {
706                    let scalar = complex_scalar_tensor_from_view(real)?;
707                    let scalar = scalar.as_view();
708                    return Ok(Tensor::$complex_variant(typed_sub_view_with_pool(
709                        buffers, &scalar, complex,
710                    )?));
711                }
712                (
713                    TensorRead::Tensor(Tensor::$complex_variant(complex)),
714                    TensorRead::View(TensorView::$real_variant(real)),
715                ) if real.shape().is_empty() => {
716                    let complex = complex.as_view();
717                    let scalar = complex_scalar_tensor_from_view(real)?;
718                    let scalar = scalar.as_view();
719                    return Ok(Tensor::$complex_variant(typed_sub_view_with_pool(
720                        buffers, &complex, &scalar,
721                    )?));
722                }
723                (
724                    TensorRead::View(TensorView::$complex_variant(complex)),
725                    TensorRead::Tensor(Tensor::$real_variant(real)),
726                ) if real.shape().is_empty() => {
727                    let scalar = complex_scalar_tensor_from_tensor(real)?;
728                    let scalar = scalar.as_view();
729                    return Ok(Tensor::$complex_variant(typed_sub_view_with_pool(
730                        buffers, complex, &scalar,
731                    )?));
732                }
733                (
734                    TensorRead::View(TensorView::$complex_variant(complex)),
735                    TensorRead::View(TensorView::$real_variant(real)),
736                ) if real.shape().is_empty() => {
737                    let scalar = complex_scalar_tensor_from_view(real)?;
738                    let scalar = scalar.as_view();
739                    return Ok(Tensor::$complex_variant(typed_sub_view_with_pool(
740                        buffers, complex, &scalar,
741                    )?));
742                }
743                _ => {}
744            }
745        };
746    }
747
748    dispatch_real_complex_scalar!(F32, C32);
749    dispatch_real_complex_scalar!(F64, C64);
750
751    dispatch!(F32, typed_sub_view_with_pool);
752    dispatch!(F64, typed_sub_view_with_pool);
753    dispatch!(I32, typed_wrapping_sub_view_with_pool);
754    dispatch!(I64, typed_wrapping_sub_view_with_pool);
755    dispatch!(C32, typed_sub_view_with_pool);
756    dispatch!(C64, typed_sub_view_with_pool);
757
758    Err(read_pair_error("sub", lhs, rhs))
759}
760
761/// Multiply two CPU tensors elementwise.
762///
763/// # Examples
764///
765/// ```
766/// use tenferro_cpu::mul;
767/// use tenferro_tensor::Tensor;
768///
769/// let a = Tensor::from_vec_col_major(vec![2], vec![2.0_f64, 3.0])?;
770/// let b = Tensor::from_vec_col_major(vec![2], vec![4.0_f64, 5.0])?;
771/// let out = mul(&a, &b)?;
772/// assert_eq!(out.as_slice::<f64>().unwrap(), &[8.0, 15.0]);
773/// # Ok::<(), tenferro_tensor::Error>(())
774/// ```
775pub fn mul(lhs: &Tensor, rhs: &Tensor) -> crate::Result<Tensor> {
776    with_local_pool(|buffers| mul_with_pool(buffers, lhs, rhs))
777}
778
779fn binary_read_with_pool(
780    op: &'static str,
781    buffers: &mut BufferPool,
782    lhs: TensorRead<'_>,
783    rhs: TensorRead<'_>,
784    f: impl FnOnce(&mut BufferPool, &Tensor, &Tensor) -> crate::Result<Tensor>,
785) -> crate::Result<Tensor> {
786    if let (Some(lhs), Some(rhs)) = (lhs.as_tensor(), rhs.as_tensor()) {
787        return f(buffers, lhs, rhs);
788    }
789
790    Err(read_pair_error(op, lhs, rhs))
791}
792
793pub(crate) fn mul_with_pool(
794    buffers: &mut BufferPool,
795    lhs: &Tensor,
796    rhs: &Tensor,
797) -> crate::Result<Tensor> {
798    match (lhs, rhs) {
799        (Tensor::F32(a), Tensor::F32(b)) => Ok(Tensor::F32(typed_mul_with_pool(buffers, a, b)?)),
800        (Tensor::F64(a), Tensor::F64(b)) => Ok(Tensor::F64(typed_mul_with_pool(buffers, a, b)?)),
801        (Tensor::I32(a), Tensor::I32(b)) => {
802            Ok(Tensor::I32(typed_wrapping_mul_with_pool(buffers, a, b)?))
803        }
804        (Tensor::I64(a), Tensor::I64(b)) => {
805            Ok(Tensor::I64(typed_wrapping_mul_with_pool(buffers, a, b)?))
806        }
807        (Tensor::C32(a), Tensor::C32(b)) => Ok(Tensor::C32(typed_mul_with_pool(buffers, a, b)?)),
808        (Tensor::C64(a), Tensor::C64(b)) => Ok(Tensor::C64(typed_mul_with_pool(buffers, a, b)?)),
809        (Tensor::F32(a), Tensor::C32(b)) if a.shape().is_empty() => {
810            let scalar = complex_scalar_tensor(typed_host_data("mul", a)?[0])?;
811            Ok(Tensor::C32(typed_mul_with_pool(buffers, &scalar, b)?))
812        }
813        (Tensor::C32(a), Tensor::F32(b)) if b.shape().is_empty() => {
814            let scalar = complex_scalar_tensor(typed_host_data("mul", b)?[0])?;
815            Ok(Tensor::C32(typed_mul_with_pool(buffers, a, &scalar)?))
816        }
817        (Tensor::F64(a), Tensor::C64(b)) if a.shape().is_empty() => {
818            let scalar = complex_scalar_tensor(typed_host_data("mul", a)?[0])?;
819            Ok(Tensor::C64(typed_mul_with_pool(buffers, &scalar, b)?))
820        }
821        (Tensor::C64(a), Tensor::F64(b)) if b.shape().is_empty() => {
822            let scalar = complex_scalar_tensor(typed_host_data("mul", b)?[0])?;
823            Ok(Tensor::C64(typed_mul_with_pool(buffers, a, &scalar)?))
824        }
825        _ => Err(tensor_pair_error("mul", lhs, rhs)),
826    }
827}
828
829pub(crate) fn mul_read_with_pool(
830    buffers: &mut BufferPool,
831    lhs: TensorRead<'_>,
832    rhs: TensorRead<'_>,
833) -> crate::Result<Tensor> {
834    if let (TensorRead::Tensor(lhs), TensorRead::Tensor(rhs)) = (&lhs, &rhs) {
835        return mul_with_pool(buffers, lhs, rhs);
836    }
837
838    macro_rules! dispatch {
839        ($variant:ident, $func:ident) => {
840            match (&lhs, &rhs) {
841                (
842                    TensorRead::Tensor(Tensor::$variant(a)),
843                    TensorRead::View(TensorView::$variant(b)),
844                ) => {
845                    let a = a.as_view();
846                    return Ok(Tensor::$variant($func(buffers, &a, b)?));
847                }
848                (
849                    TensorRead::View(TensorView::$variant(a)),
850                    TensorRead::Tensor(Tensor::$variant(b)),
851                ) => {
852                    let b = b.as_view();
853                    return Ok(Tensor::$variant($func(buffers, a, &b)?));
854                }
855                (
856                    TensorRead::View(TensorView::$variant(a)),
857                    TensorRead::View(TensorView::$variant(b)),
858                ) => {
859                    return Ok(Tensor::$variant($func(buffers, a, b)?));
860                }
861                _ => {}
862            }
863        };
864    }
865
866    macro_rules! dispatch_real_complex_scalar {
867        ($real_variant:ident, $complex_variant:ident) => {
868            match (&lhs, &rhs) {
869                (
870                    TensorRead::Tensor(Tensor::$real_variant(real)),
871                    TensorRead::View(TensorView::$complex_variant(complex)),
872                ) if real.shape().is_empty() => {
873                    let scalar = complex_scalar_tensor_from_tensor(real)?;
874                    let scalar = scalar.as_view();
875                    return Ok(Tensor::$complex_variant(typed_mul_view_with_pool(
876                        buffers, &scalar, complex,
877                    )?));
878                }
879                (
880                    TensorRead::View(TensorView::$real_variant(real)),
881                    TensorRead::Tensor(Tensor::$complex_variant(complex)),
882                ) if real.shape().is_empty() => {
883                    let scalar = complex_scalar_tensor_from_view(real)?;
884                    let scalar = scalar.as_view();
885                    let complex = complex.as_view();
886                    return Ok(Tensor::$complex_variant(typed_mul_view_with_pool(
887                        buffers, &scalar, &complex,
888                    )?));
889                }
890                (
891                    TensorRead::View(TensorView::$real_variant(real)),
892                    TensorRead::View(TensorView::$complex_variant(complex)),
893                ) if real.shape().is_empty() => {
894                    let scalar = complex_scalar_tensor_from_view(real)?;
895                    let scalar = scalar.as_view();
896                    return Ok(Tensor::$complex_variant(typed_mul_view_with_pool(
897                        buffers, &scalar, complex,
898                    )?));
899                }
900                (
901                    TensorRead::Tensor(Tensor::$complex_variant(complex)),
902                    TensorRead::View(TensorView::$real_variant(real)),
903                ) if real.shape().is_empty() => {
904                    let complex = complex.as_view();
905                    let scalar = complex_scalar_tensor_from_view(real)?;
906                    let scalar = scalar.as_view();
907                    return Ok(Tensor::$complex_variant(typed_mul_view_with_pool(
908                        buffers, &complex, &scalar,
909                    )?));
910                }
911                (
912                    TensorRead::View(TensorView::$complex_variant(complex)),
913                    TensorRead::Tensor(Tensor::$real_variant(real)),
914                ) if real.shape().is_empty() => {
915                    let scalar = complex_scalar_tensor_from_tensor(real)?;
916                    let scalar = scalar.as_view();
917                    return Ok(Tensor::$complex_variant(typed_mul_view_with_pool(
918                        buffers, complex, &scalar,
919                    )?));
920                }
921                (
922                    TensorRead::View(TensorView::$complex_variant(complex)),
923                    TensorRead::View(TensorView::$real_variant(real)),
924                ) if real.shape().is_empty() => {
925                    let scalar = complex_scalar_tensor_from_view(real)?;
926                    let scalar = scalar.as_view();
927                    return Ok(Tensor::$complex_variant(typed_mul_view_with_pool(
928                        buffers, complex, &scalar,
929                    )?));
930                }
931                _ => {}
932            }
933        };
934    }
935
936    dispatch_real_complex_scalar!(F32, C32);
937    dispatch_real_complex_scalar!(F64, C64);
938
939    dispatch!(F32, typed_mul_view_with_pool);
940    dispatch!(F64, typed_mul_view_with_pool);
941    dispatch!(I32, typed_wrapping_mul_view_with_pool);
942    dispatch!(I64, typed_wrapping_mul_view_with_pool);
943    dispatch!(C32, typed_mul_view_with_pool);
944    dispatch!(C64, typed_mul_view_with_pool);
945
946    binary_read_with_pool("mul", buffers, lhs, rhs, mul_with_pool)
947}
948
/// Typed, borrowed view of an operand, with one variant per supported element type.
///
/// `read_as_cpu_view` produces this from either variant of a `TensorRead`, so
/// downstream code can match on the element type alone.
enum CpuReadView<'a> {
    /// View over `f32` elements.
    F32(TypedTensorView<'a, f32>),
    /// View over `f64` elements.
    F64(TypedTensorView<'a, f64>),
    /// View over `i32` elements.
    I32(TypedTensorView<'a, i32>),
    /// View over `i64` elements.
    I64(TypedTensorView<'a, i64>),
    /// View over `bool` elements.
    Bool(TypedTensorView<'a, bool>),
    /// View over `Complex<f32>` elements.
    C32(TypedTensorView<'a, Complex<f32>>),
    /// View over `Complex<f64>` elements.
    C64(TypedTensorView<'a, Complex<f64>>),
}
958
959fn read_as_cpu_view(input: TensorRead<'_>) -> CpuReadView<'_> {
960    match input {
961        TensorRead::Tensor(Tensor::F32(tensor)) => CpuReadView::F32(tensor.as_view()),
962        TensorRead::Tensor(Tensor::F64(tensor)) => CpuReadView::F64(tensor.as_view()),
963        TensorRead::Tensor(Tensor::I32(tensor)) => CpuReadView::I32(tensor.as_view()),
964        TensorRead::Tensor(Tensor::I64(tensor)) => CpuReadView::I64(tensor.as_view()),
965        TensorRead::Tensor(Tensor::Bool(tensor)) => CpuReadView::Bool(tensor.as_view()),
966        TensorRead::Tensor(Tensor::C32(tensor)) => CpuReadView::C32(tensor.as_view()),
967        TensorRead::Tensor(Tensor::C64(tensor)) => CpuReadView::C64(tensor.as_view()),
968        TensorRead::View(TensorView::F32(view)) => CpuReadView::F32(view),
969        TensorRead::View(TensorView::F64(view)) => CpuReadView::F64(view),
970        TensorRead::View(TensorView::I32(view)) => CpuReadView::I32(view),
971        TensorRead::View(TensorView::I64(view)) => CpuReadView::I64(view),
972        TensorRead::View(TensorView::Bool(view)) => CpuReadView::Bool(view),
973        TensorRead::View(TensorView::C32(view)) => CpuReadView::C32(view),
974        TensorRead::View(TensorView::C64(view)) => CpuReadView::C64(view),
975    }
976}
977
978pub(crate) fn elementwise_fusion_with_pool(
979    buffers: &mut BufferPool,
980    inputs: &[&Tensor],
981    plan: &ElementwiseFusionPlan,
982) -> crate::Result<Option<Vec<Tensor>>> {
983    if !validate_elementwise_fusion_inputs(inputs, plan)? {
984        return Ok(None);
985    }
986    if inputs.is_empty() {
987        return Ok(None);
988    }
989    if plan_uses_unfused_op(plan) {
990        return Ok(None);
991    }
992
993    match plan.dtype() {
994        DType::F32 => {
995            let typed_inputs = inputs
996                .iter()
997                .map(|input| match input {
998                    Tensor::F32(tensor) => Ok(tensor),
999                    _ => Err(crate::Error::DTypeMismatch {
1000                        op: ELEMENTWISE_FUSION_OP,
1001                        lhs: input.dtype(),
1002                        rhs: plan.dtype(),
1003                    }),
1004                })
1005                .collect::<crate::Result<Vec<_>>>()?;
1006            typed_elementwise_fusion_with_pool(buffers, &typed_inputs, plan, Tensor::F32)
1007        }
1008        DType::F64 => {
1009            let typed_inputs = inputs
1010                .iter()
1011                .map(|input| match input {
1012                    Tensor::F64(tensor) => Ok(tensor),
1013                    _ => Err(crate::Error::DTypeMismatch {
1014                        op: ELEMENTWISE_FUSION_OP,
1015                        lhs: input.dtype(),
1016                        rhs: plan.dtype(),
1017                    }),
1018                })
1019                .collect::<crate::Result<Vec<_>>>()?;
1020            typed_elementwise_fusion_with_pool(buffers, &typed_inputs, plan, Tensor::F64)
1021        }
1022        DType::C32 => {
1023            if plan_uses_ordered_op(plan) {
1024                return Ok(None);
1025            }
1026            let typed_inputs = inputs
1027                .iter()
1028                .map(|input| match input {
1029                    Tensor::C32(tensor) => Ok(tensor),
1030                    _ => Err(crate::Error::DTypeMismatch {
1031                        op: ELEMENTWISE_FUSION_OP,
1032                        lhs: input.dtype(),
1033                        rhs: plan.dtype(),
1034                    }),
1035                })
1036                .collect::<crate::Result<Vec<_>>>()?;
1037            typed_elementwise_fusion_with_pool(buffers, &typed_inputs, plan, Tensor::C32)
1038        }
1039        DType::C64 => {
1040            if plan_uses_ordered_op(plan) {
1041                return Ok(None);
1042            }
1043            let typed_inputs = inputs
1044                .iter()
1045                .map(|input| match input {
1046                    Tensor::C64(tensor) => Ok(tensor),
1047                    _ => Err(crate::Error::DTypeMismatch {
1048                        op: ELEMENTWISE_FUSION_OP,
1049                        lhs: input.dtype(),
1050                        rhs: plan.dtype(),
1051                    }),
1052                })
1053                .collect::<crate::Result<Vec<_>>>()?;
1054            typed_elementwise_fusion_with_pool(buffers, &typed_inputs, plan, Tensor::C64)
1055        }
1056        DType::I32 | DType::I64 | DType::Bool => Ok(None),
1057    }
1058}
1059
1060fn typed_elementwise_fusion_with_pool<T>(
1061    buffers: &mut BufferPool,
1062    inputs: &[&TypedTensor<T>],
1063    plan: &ElementwiseFusionPlan,
1064    wrap: fn(TypedTensor<T>) -> Tensor,
1065) -> crate::Result<Option<Vec<Tensor>>>
1066where
1067    T: Copy + Clone + FusedScalar + PoolScalar,
1068{
1069    if should_defer_to_broadcast_multiply_special_case(plan) {
1070        return Ok(None);
1071    }
1072    if plan.input_views().iter().all(|view| view.is_identity()) {
1073        return typed_elementwise_fusion_identity_with_pool(buffers, inputs, plan, wrap);
1074    }
1075
1076    let input_views = inputs
1077        .iter()
1078        .zip(plan.input_views())
1079        .map(|(input, view)| typed_fusion_input_view(input, view))
1080        .collect::<crate::Result<Vec<_>>>()?;
1081    let shape = input_views[0].dims().to_vec();
1082    if input_views
1083        .iter()
1084        .skip(1)
1085        .any(|input| input.dims() != shape.as_slice())
1086    {
1087        return Ok(None);
1088    }
1089    let element_count =
1090        tenferro_tensor::validate::checked_shape_product(ELEMENTWISE_FUSION_OP, "shape", &shape)?;
1091    if element_count < ELEMENTWISE_FUSION_MIN_ELEMENTS {
1092        return Ok(None);
1093    }
1094
1095    run_typed_elementwise_fusion_with_views(buffers, &input_views, &shape, plan, wrap)
1096}
1097
1098fn typed_elementwise_fusion_identity_with_pool<T>(
1099    buffers: &mut BufferPool,
1100    inputs: &[&TypedTensor<T>],
1101    plan: &ElementwiseFusionPlan,
1102    wrap: fn(TypedTensor<T>) -> Tensor,
1103) -> crate::Result<Option<Vec<Tensor>>>
1104where
1105    T: Copy + Clone + FusedScalar + PoolScalar,
1106{
1107    let shape = inputs[0].shape();
1108    if inputs.iter().skip(1).any(|input| input.shape() != shape) {
1109        return Ok(None);
1110    }
1111    let element_count =
1112        tenferro_tensor::validate::checked_shape_product(ELEMENTWISE_FUSION_OP, "shape", shape)?;
1113    if element_count < ELEMENTWISE_FUSION_MIN_ELEMENTS {
1114        return Ok(None);
1115    }
1116
1117    let input_views = inputs
1118        .iter()
1119        .map(|input| typed_view(ELEMENTWISE_FUSION_OP, input))
1120        .collect::<crate::Result<Vec<_>>>()?;
1121    run_typed_elementwise_fusion_with_views(buffers, &input_views, shape, plan, wrap)
1122}
1123
1124fn run_typed_elementwise_fusion_with_views<T>(
1125    buffers: &mut BufferPool,
1126    input_views: &[StridedView<'_, T>],
1127    shape: &[usize],
1128    plan: &ElementwiseFusionPlan,
1129    wrap: fn(TypedTensor<T>) -> Tensor,
1130) -> crate::Result<Option<Vec<Tensor>>>
1131where
1132    T: Copy + Clone + FusedScalar + PoolScalar,
1133{
1134    if let Some(outputs) =
1135        try_typed_mul_add_specialization(buffers, input_views, shape, plan, wrap)?
1136    {
1137        return Ok(Some(outputs));
1138    }
1139
1140    let fused_plan = strided_fused_plan(plan);
1141    let mut output_arrays = Vec::with_capacity(plan.outputs().len());
1142    for _ in plan.outputs() {
1143        // SAFETY: fused_elementwise_into writes every destination element.
1144        output_arrays.push(unsafe { typed_array_uninit_from_pool(buffers, shape) }?);
1145    }
1146
1147    {
1148        let mut output_views = output_arrays
1149            .iter_mut()
1150            .map(|output| output.view_mut())
1151            .collect::<Vec<_>>();
1152        fused_elementwise_into(&mut output_views, input_views, &fused_plan)
1153            .map_err(|err| crate::Error::backend_failure(ELEMENTWISE_FUSION_OP, err))?;
1154    }
1155
1156    Ok(Some(
1157        output_arrays
1158            .into_iter()
1159            .map(|output| wrap(tensor_from_array(output)))
1160            .collect(),
1161    ))
1162}
1163
1164fn try_typed_mul_add_specialization<T>(
1165    buffers: &mut BufferPool,
1166    input_views: &[StridedView<'_, T>],
1167    shape: &[usize],
1168    plan: &ElementwiseFusionPlan,
1169    wrap: fn(TypedTensor<T>) -> Tensor,
1170) -> crate::Result<Option<Vec<Tensor>>>
1171where
1172    T: Copy + Clone + FusedScalar + PoolScalar,
1173{
1174    if plan.input_count() != 2
1175        || input_views.len() != 2
1176        || plan.outputs() != [3]
1177        || plan.ops().len() != 2
1178        || plan.ops()[0].op() != ElementwiseFusionOp::Multiply
1179        || plan.ops()[0].inputs() != [0, 1]
1180        || plan.ops()[1].op() != ElementwiseFusionOp::Add
1181    {
1182        return Ok(None);
1183    }
1184
1185    // SAFETY: zip_map2_into overwrites every output element.
1186    let mut out = unsafe { typed_array_uninit_from_pool(buffers, shape) }?;
1187    match plan.ops()[1].inputs() {
1188        [2, 0] | [0, 2] => zip_map2_into(
1189            &mut out.view_mut(),
1190            &input_views[0],
1191            &input_views[1],
1192            |a, b| a.fused_multiply(b).fused_add(a),
1193        ),
1194        [2, 1] | [1, 2] => zip_map2_into(
1195            &mut out.view_mut(),
1196            &input_views[0],
1197            &input_views[1],
1198            |a, b| a.fused_multiply(b).fused_add(b),
1199        ),
1200        _ => return Ok(None),
1201    }
1202    .map_err(|err| crate::Error::backend_failure(ELEMENTWISE_FUSION_OP, err))?;
1203
1204    Ok(Some(vec![wrap(tensor_from_array(out))]))
1205}
1206
1207fn typed_fusion_input_view<'a, T>(
1208    input: &'a TypedTensor<T>,
1209    view: &ElementwiseFusionInputView,
1210) -> crate::Result<StridedView<'a, T>>
1211where
1212    T: Copy,
1213{
1214    let base = typed_view(ELEMENTWISE_FUSION_OP, input)?;
1215    let ElementwiseFusionInputView::BroadcastInDim { shape, dims } = view else {
1216        return Ok(base);
1217    };
1218    if dims.len() != input.shape().len() {
1219        return Err(crate::Error::InvalidConfig {
1220            op: ELEMENTWISE_FUSION_OP,
1221            message: format!(
1222                "broadcast dims length {} does not match input rank {}",
1223                dims.len(),
1224                input.shape().len()
1225            ),
1226        });
1227    }
1228
1229    let mut strides = vec![0; shape.len()];
1230    let mut seen = Vec::with_capacity(shape.len());
1231    seen.resize(shape.len(), false);
1232    for (source_axis, &target_axis) in dims.iter().enumerate() {
1233        if target_axis >= shape.len() {
1234            return Err(crate::Error::AxisOutOfBounds {
1235                op: ELEMENTWISE_FUSION_OP,
1236                axis: target_axis,
1237                rank: shape.len(),
1238            });
1239        }
1240        if seen[target_axis] {
1241            return Err(crate::Error::DuplicateAxis {
1242                op: ELEMENTWISE_FUSION_OP,
1243                axis: target_axis,
1244                role: "broadcast dims",
1245            });
1246        }
1247        seen[target_axis] = true;
1248        let source_dim = input.shape()[source_axis];
1249        let target_dim = shape[target_axis];
1250        if source_dim != target_dim && source_dim != 1 {
1251            return Err(crate::Error::ShapeMismatch {
1252                op: ELEMENTWISE_FUSION_OP,
1253                lhs: shape.to_vec(),
1254                rhs: input.shape().to_vec(),
1255            });
1256        }
1257        if source_dim == target_dim {
1258            strides[target_axis] = base.strides()[source_axis];
1259        }
1260    }
1261
1262    StridedView::new(base.data(), shape, &strides, base.offset())
1263        .map_err(|err| crate::Error::backend_failure(ELEMENTWISE_FUSION_OP, err))
1264}
1265
1266fn typed_binary_view_with_pool<T, L, R>(
1267    op: &'static str,
1268    buffers: &mut BufferPool,
1269    lhs: &TypedTensorView<'_, T, L>,
1270    rhs: &TypedTensorView<'_, T, R>,
1271    f: impl Fn(T, T) -> T + Copy + Sync,
1272) -> crate::Result<TypedTensor<T>>
1273where
1274    T: Copy + PoolScalar + 'static,
1275    L: TensorRank,
1276    R: TensorRank,
1277{
1278    if lhs.shape() == rhs.shape() {
1279        // SAFETY: the following kernel overwrites every output element before any read.
1280        let mut out = unsafe { typed_array_uninit_from_pool(buffers, lhs.shape()) }?;
1281        zip_map2_into(
1282            &mut out.view_mut(),
1283            &typed_view_from_view(op, lhs)?,
1284            &typed_view_from_view(op, rhs)?,
1285            f,
1286        )
1287        .map_err(|err| crate::Error::backend_failure(op, err))?;
1288        Ok(tensor_from_array(out))
1289    } else if lhs.shape().is_empty() {
1290        let scalar = typed_view_from_view(op, lhs)?.get(&[]);
1291        // SAFETY: the following kernel overwrites every output element before any read.
1292        let mut out = unsafe { typed_array_uninit_from_pool(buffers, rhs.shape()) }?;
1293        map_into(&mut out.view_mut(), &typed_view_from_view(op, rhs)?, |x| {
1294            f(scalar, x)
1295        })
1296        .map_err(|err| crate::Error::backend_failure(op, err))?;
1297        Ok(tensor_from_array(out))
1298    } else if rhs.shape().is_empty() {
1299        let scalar = typed_view_from_view(op, rhs)?.get(&[]);
1300        // SAFETY: the following kernel overwrites every output element before any read.
1301        let mut out = unsafe { typed_array_uninit_from_pool(buffers, lhs.shape()) }?;
1302        map_into(&mut out.view_mut(), &typed_view_from_view(op, lhs)?, |x| {
1303            f(x, scalar)
1304        })
1305        .map_err(|err| crate::Error::backend_failure(op, err))?;
1306        Ok(tensor_from_array(out))
1307    } else {
1308        Err(crate::Error::ShapeMismatch {
1309            op,
1310            lhs: lhs.shape().to_vec(),
1311            rhs: rhs.shape().to_vec(),
1312        })
1313    }
1314}
1315
1316fn typed_unary_view_with_pool<T, R>(
1317    op: &'static str,
1318    buffers: &mut BufferPool,
1319    input: &TypedTensorView<'_, T, R>,
1320    f: impl Fn(T) -> T + Copy + Sync,
1321) -> crate::Result<TypedTensor<T>>
1322where
1323    T: Copy + PoolScalar + 'static,
1324    R: TensorRank,
1325{
1326    // SAFETY: the following kernel overwrites every output element before any read.
1327    let mut out = unsafe { typed_array_uninit_from_pool(buffers, input.shape()) }?;
1328    map_into(&mut out.view_mut(), &typed_view_from_view(op, input)?, f)
1329        .map_err(|err| crate::Error::backend_failure(op, err))?;
1330    Ok(tensor_from_array(out))
1331}
1332
1333fn typed_same_shape_binary_view_with_pool<T, O, L, R>(
1334    op: &'static str,
1335    buffers: &mut BufferPool,
1336    lhs: &TypedTensorView<'_, T, L>,
1337    rhs: &TypedTensorView<'_, T, R>,
1338    f: impl Fn(T, T) -> O + Copy + Sync,
1339) -> crate::Result<TypedTensor<O>>
1340where
1341    T: Copy + Send + Sync + 'static,
1342    O: Copy + PoolScalar,
1343    L: TensorRank,
1344    R: TensorRank,
1345{
1346    if lhs.shape() != rhs.shape() {
1347        return Err(crate::Error::ShapeMismatch {
1348            op,
1349            lhs: lhs.shape().to_vec(),
1350            rhs: rhs.shape().to_vec(),
1351        });
1352    }
1353    // SAFETY: the following kernel overwrites every output element before any read.
1354    let mut out = unsafe { typed_array_uninit_from_pool(buffers, lhs.shape()) }?;
1355    zip_map2_into(
1356        &mut out.view_mut(),
1357        &typed_view_from_view(op, lhs)?,
1358        &typed_view_from_view(op, rhs)?,
1359        f,
1360    )
1361    .map_err(|err| crate::Error::backend_failure(op, err))?;
1362    Ok(tensor_from_array(out))
1363}
1364
1365fn typed_select_view_with_pool<T, P, A, B>(
1366    buffers: &mut BufferPool,
1367    pred: &TypedTensorView<'_, bool, P>,
1368    on_true: &TypedTensorView<'_, T, A>,
1369    on_false: &TypedTensorView<'_, T, B>,
1370) -> crate::Result<TypedTensor<T>>
1371where
1372    T: Copy + PoolScalar + 'static,
1373    P: TensorRank,
1374    A: TensorRank,
1375    B: TensorRank,
1376{
1377    if pred.shape() != on_true.shape() {
1378        return Err(crate::Error::ShapeMismatch {
1379            op: "select",
1380            lhs: pred.shape().to_vec(),
1381            rhs: on_true.shape().to_vec(),
1382        });
1383    }
1384    if pred.shape() != on_false.shape() {
1385        return Err(crate::Error::ShapeMismatch {
1386            op: "select",
1387            lhs: pred.shape().to_vec(),
1388            rhs: on_false.shape().to_vec(),
1389        });
1390    }
1391    // SAFETY: the following kernel overwrites every output element before any read.
1392    let mut out = unsafe { typed_array_uninit_from_pool(buffers, pred.shape()) }?;
1393    zip_map3_into(
1394        &mut out.view_mut(),
1395        &typed_view_from_view("select", pred)?,
1396        &typed_view_from_view("select", on_true)?,
1397        &typed_view_from_view("select", on_false)?,
1398        |p, t, f| if p { t } else { f },
1399    )
1400    .map_err(|err| crate::Error::backend_failure("select", err))?;
1401    Ok(tensor_from_array(out))
1402}
1403
1404fn typed_clamp_view_with_pool<T, I, L, U>(
1405    buffers: &mut BufferPool,
1406    input: &TypedTensorView<'_, T, I>,
1407    lower: &TypedTensorView<'_, T, L>,
1408    upper: &TypedTensorView<'_, T, U>,
1409) -> crate::Result<TypedTensor<T>>
1410where
1411    T: OrderedElem + PoolScalar + 'static,
1412    I: TensorRank,
1413    L: TensorRank,
1414    U: TensorRank,
1415{
1416    if input.shape() != lower.shape() {
1417        return Err(crate::Error::ShapeMismatch {
1418            op: "clamp",
1419            lhs: input.shape().to_vec(),
1420            rhs: lower.shape().to_vec(),
1421        });
1422    }
1423    if input.shape() != upper.shape() {
1424        return Err(crate::Error::ShapeMismatch {
1425            op: "clamp",
1426            lhs: input.shape().to_vec(),
1427            rhs: upper.shape().to_vec(),
1428        });
1429    }
1430    // SAFETY: the following kernel overwrites every output element before any read.
1431    let mut out = unsafe { typed_array_uninit_from_pool(buffers, input.shape()) }?;
1432    zip_map3_into(
1433        &mut out.view_mut(),
1434        &typed_view_from_view("clamp", input)?,
1435        &typed_view_from_view("clamp", lower)?,
1436        &typed_view_from_view("clamp", upper)?,
1437        |x, lo, hi| hi.min_elem(lo.max_elem(x)),
1438    )
1439    .map_err(|err| crate::Error::backend_failure("clamp", err))?;
1440    Ok(tensor_from_array(out))
1441}
1442
/// Order in which the operands' free axes appear in the output, as detected by
/// `split_outer_product_plan`.
#[derive(Clone, Copy)]
enum SplitOuterProductLayout {
    /// Output axes run lhs-free, then rhs-free, then batch axes.
    LhsPrefix,
    /// Output axes run rhs-free, then lhs-free, then batch axes.
    RhsPrefix,
}
1448
/// Description of a broadcast multiply that can be evaluated as a batched outer
/// product, produced by `split_outer_product_plan`.
struct SplitOuterProductPlan {
    /// Product of the free-axis extents of the operand whose free axes come
    /// first in the output (lhs for `LhsPrefix`, rhs for `RhsPrefix`).
    #[allow(dead_code)]
    rows: usize,
    /// Product of the free-axis extents of the other operand.
    #[allow(dead_code)]
    cols: usize,
    /// Product of the lhs extents along the batch axes.
    #[allow(dead_code)]
    batches: usize,
    /// Which operand's free axes lead in the output.
    layout: SplitOuterProductLayout,
    /// lhs source axes that map to output axes not used by rhs.
    lhs_free_axes: Vec<usize>,
    /// rhs source axes that map to output axes not used by lhs.
    rhs_free_axes: Vec<usize>,
    /// lhs source axes that map to output axes shared with rhs.
    lhs_batch_axes: Vec<usize>,
    /// rhs source axes that map to output axes shared with lhs.
    rhs_batch_axes: Vec<usize>,
}
1462
/// Classification of every output axis by which operand(s) map onto it, built
/// by `classify_outer_product_axes`. All lists are in ascending output-axis order.
struct OuterProductAxisPartition {
    /// Output axes covered only by lhs.
    lhs_free_output_axes: Vec<usize>,
    /// Output axes covered only by rhs.
    rhs_free_output_axes: Vec<usize>,
    /// Output axes covered by both operands.
    batch_output_axes: Vec<usize>,
    /// lhs source axes corresponding to `lhs_free_output_axes`, entry for entry.
    lhs_free_axes: Vec<usize>,
    /// rhs source axes corresponding to `rhs_free_output_axes`, entry for entry.
    rhs_free_axes: Vec<usize>,
    /// lhs source axes corresponding to `batch_output_axes`, entry for entry.
    lhs_batch_axes: Vec<usize>,
    /// rhs source axes corresponding to `batch_output_axes`, entry for entry.
    rhs_batch_axes: Vec<usize>,
}
1472
/// Checks that `source_shape` agrees with `output_shape` along the axes in `dims`.
///
/// Returns `true` only when `source_shape` and `dims` have the same length and,
/// for every position `i`, `dims[i]` is a valid index into `output_shape` whose
/// extent equals `source_shape[i]`.
fn shape_matches_dims(source_shape: &[usize], output_shape: &[usize], dims: &[usize]) -> bool {
    if source_shape.len() != dims.len() {
        return false;
    }
    for (&extent, &axis) in source_shape.iter().zip(dims) {
        match output_shape.get(axis) {
            Some(&target) if target == extent => {}
            // Axis out of range, or extents disagree.
            _ => return false,
        }
    }
    true
}
1480
/// Inverts a source-axis → output-axis mapping.
///
/// Entry `j` of the result is `Some(i)` when `dims[i] == j`, and `None` when no
/// source axis targets output axis `j`. Returns `None` when an entry of `dims`
/// is not below `output_rank` or when two source axes target the same output axis.
fn axes_by_output(dims: &[usize], output_rank: usize) -> Option<Vec<Option<usize>>> {
    let mut source_for_output: Vec<Option<usize>> = vec![None; output_rank];
    for (src_axis, &dst_axis) in dims.iter().enumerate() {
        let slot = source_for_output.get_mut(dst_axis)?;
        if slot.is_some() {
            // Output axis already claimed by an earlier source axis.
            return None;
        }
        *slot = Some(src_axis);
    }
    Some(source_for_output)
}
1491
1492fn axes_shape_product<T>(
1493    op: &'static str,
1494    view: &TypedTensorView<'_, T>,
1495    axes: &[usize],
1496) -> crate::Result<usize>
1497where
1498    T: 'static,
1499{
1500    axes.iter().try_fold(1usize, |acc, &axis| {
1501        acc.checked_mul(view.shape()[axis])
1502            .ok_or_else(|| crate::Error::backend_failure(op, "shape size overflows usize"))
1503    })
1504}
1505
1506fn classify_outer_product_axes(
1507    lhs_dims: &[usize],
1508    rhs_dims: &[usize],
1509    output_rank: usize,
1510) -> Option<OuterProductAxisPartition> {
1511    let lhs_axes_by_output = axes_by_output(lhs_dims, output_rank)?;
1512    let rhs_axes_by_output = axes_by_output(rhs_dims, output_rank)?;
1513
1514    let mut lhs_free_output_axes = Vec::new();
1515    let mut rhs_free_output_axes = Vec::new();
1516    let mut batch_output_axes = Vec::new();
1517    let mut lhs_free_axes = Vec::new();
1518    let mut rhs_free_axes = Vec::new();
1519    let mut lhs_batch_axes = Vec::new();
1520    let mut rhs_batch_axes = Vec::new();
1521
1522    for output_axis in 0..output_rank {
1523        match (
1524            lhs_axes_by_output[output_axis],
1525            rhs_axes_by_output[output_axis],
1526        ) {
1527            (Some(lhs_axis), Some(rhs_axis)) => {
1528                batch_output_axes.push(output_axis);
1529                lhs_batch_axes.push(lhs_axis);
1530                rhs_batch_axes.push(rhs_axis);
1531            }
1532            (Some(lhs_axis), None) => {
1533                lhs_free_output_axes.push(output_axis);
1534                lhs_free_axes.push(lhs_axis);
1535            }
1536            (None, Some(rhs_axis)) => {
1537                rhs_free_output_axes.push(output_axis);
1538                rhs_free_axes.push(rhs_axis);
1539            }
1540            (None, None) => return None,
1541        }
1542    }
1543
1544    Some(OuterProductAxisPartition {
1545        lhs_free_output_axes,
1546        rhs_free_output_axes,
1547        batch_output_axes,
1548        lhs_free_axes,
1549        rhs_free_axes,
1550        lhs_batch_axes,
1551        rhs_batch_axes,
1552    })
1553}
1554
/// Checks whether concatenating `groups` in order yields exactly
/// `0, 1, .., output_rank - 1`.
fn output_axes_match_partition(output_rank: usize, groups: &[&[usize]]) -> bool {
    let mut expected_axis = 0usize;
    for group in groups {
        for &axis in group.iter() {
            if axis != expected_axis {
                return false;
            }
            expected_axis += 1;
        }
    }
    // Every axis seen was in sequence; the sequence must also be complete.
    expected_axis == output_rank
}
1561
1562fn split_outer_product_plan<T>(
1563    lhs: &TypedTensorView<'_, T>,
1564    lhs_shape: &[usize],
1565    lhs_dims: &[usize],
1566    rhs: &TypedTensorView<'_, T>,
1567    rhs_shape: &[usize],
1568    rhs_dims: &[usize],
1569) -> crate::Result<Option<SplitOuterProductPlan>>
1570where
1571    T: 'static,
1572{
1573    let output_rank = lhs_shape.len();
1574    if lhs_shape != rhs_shape
1575        || !shape_matches_dims(lhs.shape(), lhs_shape, lhs_dims)
1576        || !shape_matches_dims(rhs.shape(), rhs_shape, rhs_dims)
1577        || lhs.backend_buffer().is_some()
1578        || rhs.backend_buffer().is_some()
1579        || lhs.offset() < 0
1580        || rhs.offset() < 0
1581        || lhs.strides().iter().any(|&stride| stride < 0)
1582        || rhs.strides().iter().any(|&stride| stride < 0)
1583    {
1584        return Ok(None);
1585    }
1586
1587    let Some(partition) = classify_outer_product_axes(lhs_dims, rhs_dims, output_rank) else {
1588        return Ok(None);
1589    };
1590
1591    let lhs_free_size = axes_shape_product("broadcast_multiply", lhs, &partition.lhs_free_axes)?;
1592    let rhs_free_size = axes_shape_product("broadcast_multiply", rhs, &partition.rhs_free_axes)?;
1593    if lhs_free_size <= 1 || rhs_free_size <= 1 {
1594        return Ok(None);
1595    }
1596    let batches = axes_shape_product("broadcast_multiply", lhs, &partition.lhs_batch_axes)?;
1597
1598    let lhs_prefix = output_axes_match_partition(
1599        output_rank,
1600        &[
1601            &partition.lhs_free_output_axes,
1602            &partition.rhs_free_output_axes,
1603            &partition.batch_output_axes,
1604        ],
1605    );
1606    if lhs_prefix {
1607        return Ok(Some(SplitOuterProductPlan {
1608            rows: lhs_free_size,
1609            cols: rhs_free_size,
1610            batches,
1611            layout: SplitOuterProductLayout::LhsPrefix,
1612            lhs_free_axes: partition.lhs_free_axes,
1613            rhs_free_axes: partition.rhs_free_axes,
1614            lhs_batch_axes: partition.lhs_batch_axes,
1615            rhs_batch_axes: partition.rhs_batch_axes,
1616        }));
1617    }
1618
1619    let rhs_prefix = output_axes_match_partition(
1620        output_rank,
1621        &[
1622            &partition.rhs_free_output_axes,
1623            &partition.lhs_free_output_axes,
1624            &partition.batch_output_axes,
1625        ],
1626    );
1627    if rhs_prefix {
1628        return Ok(Some(SplitOuterProductPlan {
1629            rows: rhs_free_size,
1630            cols: lhs_free_size,
1631            batches,
1632            layout: SplitOuterProductLayout::RhsPrefix,
1633            lhs_free_axes: partition.lhs_free_axes,
1634            rhs_free_axes: partition.rhs_free_axes,
1635            lhs_batch_axes: partition.lhs_batch_axes,
1636            rhs_batch_axes: partition.rhs_batch_axes,
1637        }));
1638    }
1639
1640    Ok(None)
1641}
1642
1643fn try_outer_product_with_pool<T>(
1644    buffers: &mut BufferPool,
1645    lhs: &TypedTensorView<'_, T>,
1646    lhs_shape: &[usize],
1647    lhs_dims: &[usize],
1648    rhs: &TypedTensorView<'_, T>,
1649    rhs_shape: &[usize],
1650    rhs_dims: &[usize],
1651) -> crate::Result<Option<TypedTensor<T>>>
1652where
1653    T: Copy + Clone + Mul<Output = T> + PoolScalar + 'static,
1654{
1655    let Some(plan) = split_outer_product_plan(lhs, lhs_shape, lhs_dims, rhs, rhs_shape, rhs_dims)?
1656    else {
1657        return Ok(None);
1658    };
1659
1660    // SAFETY: every element in the column-major output is assigned below.
1661    let mut out = unsafe { typed_array_uninit_from_pool(buffers, lhs_shape) }?;
1662    let lhs_view = typed_view_from_view("broadcast_multiply", lhs)?;
1663    let rhs_view = typed_view_from_view("broadcast_multiply", rhs)?;
1664    match plan.layout {
1665        SplitOuterProductLayout::LhsPrefix => {
1666            let lhs_perm: Vec<_> = plan
1667                .lhs_free_axes
1668                .iter()
1669                .chain(plan.lhs_batch_axes.iter())
1670                .copied()
1671                .collect();
1672            let rhs_perm: Vec<_> = plan
1673                .rhs_free_axes
1674                .iter()
1675                .chain(plan.rhs_batch_axes.iter())
1676                .copied()
1677                .collect();
1678            let lhs_outer = lhs_view
1679                .permute(&lhs_perm)
1680                .map_err(|err| crate::Error::backend_failure("broadcast_multiply", err))?;
1681            let rhs_outer = rhs_view
1682                .permute(&rhs_perm)
1683                .map_err(|err| crate::Error::backend_failure("broadcast_multiply", err))?;
1684            batched_outer_product_into(
1685                &mut out.view_mut(),
1686                &lhs_outer,
1687                &rhs_outer,
1688                plan.lhs_free_axes.len(),
1689                plan.rhs_free_axes.len(),
1690            )
1691            .map_err(|err| crate::Error::backend_failure("broadcast_multiply", err))?;
1692        }
1693        SplitOuterProductLayout::RhsPrefix => {
1694            let lhs_perm: Vec<_> = plan
1695                .lhs_free_axes
1696                .iter()
1697                .chain(plan.lhs_batch_axes.iter())
1698                .copied()
1699                .collect();
1700            let rhs_perm: Vec<_> = plan
1701                .rhs_free_axes
1702                .iter()
1703                .chain(plan.rhs_batch_axes.iter())
1704                .copied()
1705                .collect();
1706            let lhs_outer = lhs_view
1707                .permute(&lhs_perm)
1708                .map_err(|err| crate::Error::backend_failure("broadcast_multiply", err))?;
1709            let rhs_outer = rhs_view
1710                .permute(&rhs_perm)
1711                .map_err(|err| crate::Error::backend_failure("broadcast_multiply", err))?;
1712            batched_outer_product_into(
1713                &mut out.view_mut(),
1714                &rhs_outer,
1715                &lhs_outer,
1716                plan.rhs_free_axes.len(),
1717                plan.lhs_free_axes.len(),
1718            )
1719            .map_err(|err| crate::Error::backend_failure("broadcast_multiply", err))?;
1720        }
1721    }
1722    Ok(Some(tensor_from_array(out)))
1723}
1724
/// Result of the lazy outer-product path: a materialized product buffer plus
/// the logical layout under which it is exposed as a strided view.
struct LazyOuterProduct<T> {
    /// Buffer filled by `batched_outer_product_into`, stored in the physical
    /// axis order chosen by `try_lazy_outer_product_with_pool`.
    base: TypedTensor<T>,
    /// Logical output shape (a copy of the caller-supplied `lhs_shape`).
    shape: Vec<usize>,
    /// One stride per logical output axis, as computed by
    /// `lazy_outer_product_strides`.
    strides: Vec<isize>,
}
1730
1731fn axes_by_physical_stride<T>(view: &TypedTensorView<'_, T>, axes: &[usize]) -> Vec<usize>
1732where
1733    T: 'static,
1734{
1735    let mut sorted = axes.to_vec();
1736    sorted.sort_by(|&lhs_axis, &rhs_axis| {
1737        view.strides()[lhs_axis]
1738            .cmp(&view.strides()[rhs_axis])
1739            .then_with(|| lhs_axis.cmp(&rhs_axis))
1740    });
1741    sorted
1742}
1743
1744fn append_axis_shapes<T>(shape: &mut Vec<usize>, view: &TypedTensorView<'_, T>, axes: &[usize])
1745where
1746    T: 'static,
1747{
1748    shape.extend(axes.iter().map(|&axis| view.shape()[axis]));
1749}
1750
1751fn set_lazy_stride(
1752    logical_strides: &mut [Option<isize>],
1753    output_axis: usize,
1754    stride: isize,
1755) -> crate::Result<()> {
1756    let rank = logical_strides.len();
1757    let slot = logical_strides
1758        .get_mut(output_axis)
1759        .ok_or(crate::Error::AxisOutOfBounds {
1760            op: "broadcast_multiply",
1761            axis: output_axis,
1762            rank,
1763        })?;
1764    if slot.replace(stride).is_some() {
1765        return Err(crate::Error::DuplicateAxis {
1766            op: "broadcast_multiply",
1767            axis: output_axis,
1768            role: "lazy output layout",
1769        });
1770    }
1771    Ok(())
1772}
1773
/// Inputs for `lazy_outer_product_strides`, which maps the axes of a
/// physically stored outer product back onto logical output axes.
struct LazyOuterProductStrideSpec<'a> {
    /// Logical output shape; only its length (the output rank) is consulted.
    output_shape: &'a [usize],
    /// Shape of the physical base buffer; column-major strides are derived
    /// from it.
    base_shape: &'a [usize],
    /// Operand axes that occupy the first base axes, in base order.
    leading_axes: &'a [usize],
    /// Maps an axis of the leading operand to its logical output axis.
    leading_dims: &'a [usize],
    /// Operand axes that follow the leading axes in the base buffer.
    trailing_axes: &'a [usize],
    /// Maps an axis of the trailing operand to its logical output axis.
    trailing_dims: &'a [usize],
    /// Batch axes of the lhs operand, paired positionally with
    /// `rhs_batch_axes`; they occupy the last base axes.
    lhs_batch_axes: &'a [usize],
    /// Batch axes of the rhs operand, paired positionally with
    /// `lhs_batch_axes`.
    rhs_batch_axes: &'a [usize],
    /// Maps an lhs axis to its logical output axis (used for batch axes).
    lhs_dims: &'a [usize],
    /// Maps an rhs axis to its logical output axis (used to check that each
    /// batch pair targets the same output axis).
    rhs_dims: &'a [usize],
}
1786
1787fn lazy_outer_product_strides(spec: LazyOuterProductStrideSpec<'_>) -> crate::Result<Vec<isize>> {
1788    let base_strides = col_major_strides(spec.base_shape)?;
1789    let mut logical_strides = vec![None; spec.output_shape.len()];
1790    let mut base_axis = 0usize;
1791
1792    for &axis in spec.leading_axes {
1793        set_lazy_stride(
1794            &mut logical_strides,
1795            spec.leading_dims[axis],
1796            base_strides[base_axis],
1797        )?;
1798        base_axis += 1;
1799    }
1800    for &axis in spec.trailing_axes {
1801        set_lazy_stride(
1802            &mut logical_strides,
1803            spec.trailing_dims[axis],
1804            base_strides[base_axis],
1805        )?;
1806        base_axis += 1;
1807    }
1808    for (&lhs_axis, &rhs_axis) in spec.lhs_batch_axes.iter().zip(spec.rhs_batch_axes.iter()) {
1809        let output_axis = spec.lhs_dims[lhs_axis];
1810        if spec.rhs_dims[rhs_axis] != output_axis {
1811            return Err(crate::Error::backend_failure(
1812                "broadcast_multiply",
1813                "batch axes disagree while building lazy outer-product layout",
1814            ));
1815        }
1816        set_lazy_stride(&mut logical_strides, output_axis, base_strides[base_axis])?;
1817        base_axis += 1;
1818    }
1819
1820    logical_strides
1821        .into_iter()
1822        .collect::<Option<Vec<_>>>()
1823        .ok_or_else(|| {
1824            crate::Error::backend_failure(
1825                "broadcast_multiply",
1826                "lazy outer-product layout did not cover every output axis",
1827            )
1828        })
1829}
1830
1831fn lazy_outer_product_value(
1832    tensor: Tensor,
1833    shape: Vec<usize>,
1834    strides: Vec<isize>,
1835) -> crate::Result<TensorValue> {
1836    Ok(TensorValue::View(TensorOwnedView::from_parts(
1837        Arc::new(tensor),
1838        shape,
1839        strides,
1840        0,
1841    )?))
1842}
1843
1844fn try_lazy_outer_product_with_pool<T>(
1845    buffers: &mut BufferPool,
1846    lhs: &TypedTensorView<'_, T>,
1847    lhs_shape: &[usize],
1848    lhs_dims: &[usize],
1849    rhs: &TypedTensorView<'_, T>,
1850    rhs_shape: &[usize],
1851    rhs_dims: &[usize],
1852) -> crate::Result<Option<LazyOuterProduct<T>>>
1853where
1854    T: Copy + Clone + Mul<Output = T> + PoolScalar + 'static,
1855{
1856    let Some(plan) = split_outer_product_plan(lhs, lhs_shape, lhs_dims, rhs, rhs_shape, rhs_dims)?
1857    else {
1858        return Ok(None);
1859    };
1860
1861    let lhs_free_axes = axes_by_physical_stride(lhs, &plan.lhs_free_axes);
1862    let rhs_free_axes = axes_by_physical_stride(rhs, &plan.rhs_free_axes);
1863    if lhs_free_axes == plan.lhs_free_axes && rhs_free_axes == plan.rhs_free_axes {
1864        return Ok(None);
1865    }
1866
1867    let lhs_view = typed_view_from_view("broadcast_multiply", lhs)?;
1868    let rhs_view = typed_view_from_view("broadcast_multiply", rhs)?;
1869
1870    match plan.layout {
1871        SplitOuterProductLayout::LhsPrefix => {
1872            let lhs_perm: Vec<_> = lhs_free_axes
1873                .iter()
1874                .chain(plan.lhs_batch_axes.iter())
1875                .copied()
1876                .collect();
1877            let rhs_perm: Vec<_> = rhs_free_axes
1878                .iter()
1879                .chain(plan.rhs_batch_axes.iter())
1880                .copied()
1881                .collect();
1882            let lhs_outer = lhs_view
1883                .permute(&lhs_perm)
1884                .map_err(|err| crate::Error::backend_failure("broadcast_multiply", err))?;
1885            let rhs_outer = rhs_view
1886                .permute(&rhs_perm)
1887                .map_err(|err| crate::Error::backend_failure("broadcast_multiply", err))?;
1888
1889            let mut base_shape = Vec::with_capacity(lhs_shape.len());
1890            append_axis_shapes(&mut base_shape, lhs, &lhs_free_axes);
1891            append_axis_shapes(&mut base_shape, rhs, &rhs_free_axes);
1892            append_axis_shapes(&mut base_shape, lhs, &plan.lhs_batch_axes);
1893            let strides = lazy_outer_product_strides(LazyOuterProductStrideSpec {
1894                output_shape: lhs_shape,
1895                base_shape: &base_shape,
1896                leading_axes: &lhs_free_axes,
1897                leading_dims: lhs_dims,
1898                trailing_axes: &rhs_free_axes,
1899                trailing_dims: rhs_dims,
1900                lhs_batch_axes: &plan.lhs_batch_axes,
1901                rhs_batch_axes: &plan.rhs_batch_axes,
1902                lhs_dims,
1903                rhs_dims,
1904            })?;
1905
1906            // SAFETY: every element in the physical base output is assigned below.
1907            let mut base = unsafe { typed_array_uninit_from_pool(buffers, &base_shape) }?;
1908            batched_outer_product_into(
1909                &mut base.view_mut(),
1910                &lhs_outer,
1911                &rhs_outer,
1912                lhs_free_axes.len(),
1913                rhs_free_axes.len(),
1914            )
1915            .map_err(|err| crate::Error::backend_failure("broadcast_multiply", err))?;
1916            Ok(Some(LazyOuterProduct {
1917                base: tensor_from_array(base),
1918                shape: lhs_shape.to_vec(),
1919                strides,
1920            }))
1921        }
1922        SplitOuterProductLayout::RhsPrefix => {
1923            let lhs_perm: Vec<_> = lhs_free_axes
1924                .iter()
1925                .chain(plan.lhs_batch_axes.iter())
1926                .copied()
1927                .collect();
1928            let rhs_perm: Vec<_> = rhs_free_axes
1929                .iter()
1930                .chain(plan.rhs_batch_axes.iter())
1931                .copied()
1932                .collect();
1933            let lhs_outer = lhs_view
1934                .permute(&lhs_perm)
1935                .map_err(|err| crate::Error::backend_failure("broadcast_multiply", err))?;
1936            let rhs_outer = rhs_view
1937                .permute(&rhs_perm)
1938                .map_err(|err| crate::Error::backend_failure("broadcast_multiply", err))?;
1939
1940            let mut base_shape = Vec::with_capacity(lhs_shape.len());
1941            append_axis_shapes(&mut base_shape, rhs, &rhs_free_axes);
1942            append_axis_shapes(&mut base_shape, lhs, &lhs_free_axes);
1943            append_axis_shapes(&mut base_shape, lhs, &plan.lhs_batch_axes);
1944            let strides = lazy_outer_product_strides(LazyOuterProductStrideSpec {
1945                output_shape: lhs_shape,
1946                base_shape: &base_shape,
1947                leading_axes: &rhs_free_axes,
1948                leading_dims: rhs_dims,
1949                trailing_axes: &lhs_free_axes,
1950                trailing_dims: lhs_dims,
1951                lhs_batch_axes: &plan.lhs_batch_axes,
1952                rhs_batch_axes: &plan.rhs_batch_axes,
1953                lhs_dims,
1954                rhs_dims,
1955            })?;
1956
1957            // SAFETY: every element in the physical base output is assigned below.
1958            let mut base = unsafe { typed_array_uninit_from_pool(buffers, &base_shape) }?;
1959            batched_outer_product_into(
1960                &mut base.view_mut(),
1961                &rhs_outer,
1962                &lhs_outer,
1963                rhs_free_axes.len(),
1964                lhs_free_axes.len(),
1965            )
1966            .map_err(|err| crate::Error::backend_failure("broadcast_multiply", err))?;
1967            Ok(Some(LazyOuterProduct {
1968                base: tensor_from_array(base),
1969                shape: lhs_shape.to_vec(),
1970                strides,
1971            }))
1972        }
1973    }
1974}
1975
1976#[allow(clippy::too_many_arguments)]
1977fn typed_broadcast_mul_view_with_pool<T, L, R>(
1978    buffers: &mut BufferPool,
1979    lhs: &TypedTensorView<'_, T, L>,
1980    lhs_shape: &[usize],
1981    lhs_dims: &[usize],
1982    rhs: &TypedTensorView<'_, T, R>,
1983    rhs_shape: &[usize],
1984    rhs_dims: &[usize],
1985) -> crate::Result<TypedTensor<T>>
1986where
1987    T: Copy + Clone + Zero + Mul<Output = T> + PoolScalar + 'static,
1988    L: TensorRank,
1989    R: TensorRank,
1990{
1991    if lhs_shape != rhs_shape {
1992        return Err(crate::Error::ShapeMismatch {
1993            op: "broadcast_multiply",
1994            lhs: lhs_shape.to_vec(),
1995            rhs: rhs_shape.to_vec(),
1996        });
1997    }
1998    let output_rank = lhs_shape.len();
1999    let lhs_is_scalar = lhs.shape().is_empty() && lhs_dims.is_empty();
2000    let rhs_is_scalar = rhs.shape().is_empty() && rhs_dims.is_empty();
2001    let lhs_is_full_output =
2002        lhs.shape() == lhs_shape && lhs_dims.iter().copied().eq(0..output_rank);
2003    let rhs_is_full_output =
2004        rhs.shape() == rhs_shape && rhs_dims.iter().copied().eq(0..output_rank);
2005    if lhs_is_scalar && rhs_is_scalar {
2006        let lhs_scalar = typed_view_from_view("broadcast_multiply", lhs)?.get(&[]);
2007        let rhs_scalar = typed_view_from_view("broadcast_multiply", rhs)?.get(&[]);
2008        return filled_broadcast_multiply_tensor(buffers, lhs_shape, lhs_scalar * rhs_scalar);
2009    }
2010    if lhs_is_scalar && rhs_is_full_output {
2011        let scalar = typed_view_from_view("broadcast_multiply", lhs)?.get(&[]);
2012        // SAFETY: map_into overwrites every output element.
2013        let mut out = unsafe { typed_array_uninit_from_pool(buffers, lhs_shape) }?;
2014        map_into(
2015            &mut out.view_mut(),
2016            &typed_view_from_view("broadcast_multiply", rhs)?,
2017            |x| scalar * x,
2018        )
2019        .map_err(|err| crate::Error::backend_failure("broadcast_multiply", err))?;
2020        return Ok(tensor_from_array(out));
2021    }
2022    if rhs_is_scalar && lhs_is_full_output {
2023        let scalar = typed_view_from_view("broadcast_multiply", rhs)?.get(&[]);
2024        // SAFETY: map_into overwrites every output element.
2025        let mut out = unsafe { typed_array_uninit_from_pool(buffers, lhs_shape) }?;
2026        map_into(
2027            &mut out.view_mut(),
2028            &typed_view_from_view("broadcast_multiply", lhs)?,
2029            |x| x * scalar,
2030        )
2031        .map_err(|err| crate::Error::backend_failure("broadcast_multiply", err))?;
2032        return Ok(tensor_from_array(out));
2033    }
2034
2035    // SAFETY: broadcast_mul_into overwrites every output element.
2036    let mut out = unsafe { typed_array_uninit_from_pool(buffers, lhs_shape) }?;
2037    let lhs_view = typed_view_from_view("broadcast_multiply", lhs)?;
2038    let rhs_view = typed_view_from_view("broadcast_multiply", rhs)?;
2039    broadcast_mul_into(
2040        &mut out.view_mut(),
2041        &lhs_view,
2042        lhs_dims,
2043        &rhs_view,
2044        rhs_dims,
2045    )
2046    .map_err(|err| crate::Error::backend_failure("broadcast_multiply", err))?;
2047    Ok(tensor_from_array(out))
2048}
2049
2050fn filled_broadcast_multiply_tensor<T>(
2051    buffers: &mut BufferPool,
2052    shape: &[usize],
2053    fill: T,
2054) -> crate::Result<TypedTensor<T>>
2055where
2056    T: Copy + Clone + PoolScalar + 'static,
2057{
2058    let len = shape.iter().try_fold(1usize, |acc, &dim| {
2059        acc.checked_mul(dim).ok_or_else(|| {
2060            crate::Error::backend_failure("broadcast_multiply", "output shape size overflows usize")
2061        })
2062    })?;
2063    // SAFETY: every pooled element is initialized with `fill` before tensor construction.
2064    let mut data = unsafe { T::pool_acquire(buffers, len) };
2065    data.fill(fill);
2066    TypedTensor::from_vec_col_major(shape.to_vec(), data)
2067}
2068
2069#[allow(clippy::too_many_arguments)]
2070pub(crate) fn broadcast_multiply_read_with_pool(
2071    buffers: &mut BufferPool,
2072    lhs: TensorRead<'_>,
2073    lhs_shape: &[usize],
2074    lhs_dims: &[usize],
2075    rhs: TensorRead<'_>,
2076    rhs_shape: &[usize],
2077    rhs_dims: &[usize],
2078) -> crate::Result<Option<Tensor>> {
2079    let lhs = read_as_cpu_view(lhs);
2080    let rhs = read_as_cpu_view(rhs);
2081
2082    macro_rules! dispatch {
2083        ($variant:ident, $lhs:expr, $rhs:expr) => {{
2084            if let Some(out) = try_outer_product_with_pool(
2085                buffers, &$lhs, lhs_shape, lhs_dims, &$rhs, rhs_shape, rhs_dims,
2086            )? {
2087                return Ok(Some(Tensor::$variant(out)));
2088            }
2089            Ok(Some(Tensor::$variant(typed_broadcast_mul_view_with_pool(
2090                buffers, &$lhs, lhs_shape, lhs_dims, &$rhs, rhs_shape, rhs_dims,
2091            )?)))
2092        }};
2093    }
2094
2095    match (lhs, rhs) {
2096        (CpuReadView::F32(lhs), CpuReadView::F32(rhs)) => dispatch!(F32, lhs, rhs),
2097        (CpuReadView::F64(lhs), CpuReadView::F64(rhs)) => dispatch!(F64, lhs, rhs),
2098        (CpuReadView::I32(lhs), CpuReadView::I32(rhs)) => dispatch!(I32, lhs, rhs),
2099        (CpuReadView::I64(lhs), CpuReadView::I64(rhs)) => dispatch!(I64, lhs, rhs),
2100        (CpuReadView::C32(lhs), CpuReadView::C32(rhs)) => dispatch!(C32, lhs, rhs),
2101        (CpuReadView::C64(lhs), CpuReadView::C64(rhs)) => dispatch!(C64, lhs, rhs),
2102        _ => Ok(None),
2103    }
2104}
2105
2106#[allow(clippy::too_many_arguments)]
2107pub(crate) fn broadcast_multiply_value_with_pool(
2108    buffers: &mut BufferPool,
2109    lhs: TensorRead<'_>,
2110    lhs_shape: &[usize],
2111    lhs_dims: &[usize],
2112    rhs: TensorRead<'_>,
2113    rhs_shape: &[usize],
2114    rhs_dims: &[usize],
2115) -> crate::Result<Option<TensorValue>> {
2116    let lhs_view = read_as_cpu_view(lhs.clone());
2117    let rhs_view = read_as_cpu_view(rhs.clone());
2118
2119    macro_rules! dispatch_lazy {
2120        ($variant:ident, $lhs:expr, $rhs:expr) => {{
2121            if let Some(out) = try_lazy_outer_product_with_pool(
2122                buffers, &$lhs, lhs_shape, lhs_dims, &$rhs, rhs_shape, rhs_dims,
2123            )? {
2124                return Ok(Some(lazy_outer_product_value(
2125                    Tensor::$variant(out.base),
2126                    out.shape,
2127                    out.strides,
2128                )?));
2129            }
2130        }};
2131    }
2132
2133    match (lhs_view, rhs_view) {
2134        (CpuReadView::F32(lhs_view), CpuReadView::F32(rhs_view)) => {
2135            dispatch_lazy!(F32, lhs_view, rhs_view);
2136        }
2137        (CpuReadView::F64(lhs_view), CpuReadView::F64(rhs_view)) => {
2138            dispatch_lazy!(F64, lhs_view, rhs_view);
2139        }
2140        (CpuReadView::I32(lhs_view), CpuReadView::I32(rhs_view)) => {
2141            dispatch_lazy!(I32, lhs_view, rhs_view);
2142        }
2143        (CpuReadView::I64(lhs_view), CpuReadView::I64(rhs_view)) => {
2144            dispatch_lazy!(I64, lhs_view, rhs_view);
2145        }
2146        (CpuReadView::C32(lhs_view), CpuReadView::C32(rhs_view)) => {
2147            dispatch_lazy!(C32, lhs_view, rhs_view);
2148        }
2149        (CpuReadView::C64(lhs_view), CpuReadView::C64(rhs_view)) => {
2150            dispatch_lazy!(C64, lhs_view, rhs_view);
2151        }
2152        _ => {}
2153    }
2154
2155    broadcast_multiply_read_with_pool(buffers, lhs, lhs_shape, lhs_dims, rhs, rhs_shape, rhs_dims)
2156        .map(|tensor| tensor.map(TensorValue::from_tensor))
2157}
2158
2159/// Divide two CPU tensors elementwise.
2160///
2161/// # Examples
2162///
2163/// ```
2164/// use tenferro_cpu::div;
2165/// use tenferro_tensor::Tensor;
2166///
2167/// let a = Tensor::from_vec_col_major(vec![2], vec![8.0_f64, 15.0])?;
2168/// let b = Tensor::from_vec_col_major(vec![2], vec![2.0_f64, 5.0])?;
2169/// let out = div(&a, &b)?;
2170/// assert_eq!(out.as_slice::<f64>().unwrap(), &[4.0, 3.0]);
2171/// # Ok::<(), tenferro_tensor::Error>(())
2172/// ```
2173pub fn div(lhs: &Tensor, rhs: &Tensor) -> crate::Result<Tensor> {
2174    with_local_pool(|buffers| div_with_pool(buffers, lhs, rhs))
2175}
2176
2177pub(crate) fn div_with_pool(
2178    buffers: &mut BufferPool,
2179    lhs: &Tensor,
2180    rhs: &Tensor,
2181) -> crate::Result<Tensor> {
2182    match (lhs, rhs) {
2183        (Tensor::F32(a), Tensor::F32(b)) => Ok(Tensor::F32(typed_div_with_pool(buffers, a, b)?)),
2184        (Tensor::F64(a), Tensor::F64(b)) => Ok(Tensor::F64(typed_div_with_pool(buffers, a, b)?)),
2185        (Tensor::I32(a), Tensor::I32(b)) => {
2186            Ok(Tensor::I32(typed_integer_div_with_pool(buffers, a, b)?))
2187        }
2188        (Tensor::I64(a), Tensor::I64(b)) => {
2189            Ok(Tensor::I64(typed_integer_div_with_pool(buffers, a, b)?))
2190        }
2191        (Tensor::C32(a), Tensor::C32(b)) => Ok(Tensor::C32(typed_div_with_pool(buffers, a, b)?)),
2192        (Tensor::C64(a), Tensor::C64(b)) => Ok(Tensor::C64(typed_div_with_pool(buffers, a, b)?)),
2193        (Tensor::F32(a), Tensor::C32(b)) if a.shape().is_empty() => {
2194            let scalar = complex_scalar_tensor(typed_host_data("div", a)?[0])?;
2195            Ok(Tensor::C32(typed_div_with_pool(buffers, &scalar, b)?))
2196        }
2197        (Tensor::C32(a), Tensor::F32(b)) if b.shape().is_empty() => {
2198            let scalar = complex_scalar_tensor(typed_host_data("div", b)?[0])?;
2199            Ok(Tensor::C32(typed_div_with_pool(buffers, a, &scalar)?))
2200        }
2201        (Tensor::F64(a), Tensor::C64(b)) if a.shape().is_empty() => {
2202            let scalar = complex_scalar_tensor(typed_host_data("div", a)?[0])?;
2203            Ok(Tensor::C64(typed_div_with_pool(buffers, &scalar, b)?))
2204        }
2205        (Tensor::C64(a), Tensor::F64(b)) if b.shape().is_empty() => {
2206            let scalar = complex_scalar_tensor(typed_host_data("div", b)?[0])?;
2207            Ok(Tensor::C64(typed_div_with_pool(buffers, a, &scalar)?))
2208        }
2209        _ => Err(crate::Error::DTypeMismatch {
2210            op: "div",
2211            lhs: lhs.dtype(),
2212            rhs: rhs.dtype(),
2213        }),
2214    }
2215}
2216
2217pub(crate) fn div_read_with_pool(
2218    buffers: &mut BufferPool,
2219    lhs: TensorRead<'_>,
2220    rhs: TensorRead<'_>,
2221) -> crate::Result<Tensor> {
2222    let lhs_dtype = lhs.dtype();
2223    let rhs_dtype = rhs.dtype();
2224    match (read_as_cpu_view(lhs), read_as_cpu_view(rhs)) {
2225        (CpuReadView::F32(a), CpuReadView::F32(b)) => Ok(Tensor::F32(typed_binary_view_with_pool(
2226            "div",
2227            buffers,
2228            &a,
2229            &b,
2230            |x, y| x / y,
2231        )?)),
2232        (CpuReadView::F64(a), CpuReadView::F64(b)) => Ok(Tensor::F64(typed_binary_view_with_pool(
2233            "div",
2234            buffers,
2235            &a,
2236            &b,
2237            |x, y| x / y,
2238        )?)),
2239        (CpuReadView::I32(a), CpuReadView::I32(b)) => Ok(Tensor::I32(
2240            typed_integer_div_view_with_pool(buffers, &a, &b)?,
2241        )),
2242        (CpuReadView::I64(a), CpuReadView::I64(b)) => Ok(Tensor::I64(
2243            typed_integer_div_view_with_pool(buffers, &a, &b)?,
2244        )),
2245        (CpuReadView::C32(a), CpuReadView::C32(b)) => Ok(Tensor::C32(typed_binary_view_with_pool(
2246            "div",
2247            buffers,
2248            &a,
2249            &b,
2250            |x, y| x / y,
2251        )?)),
2252        (CpuReadView::C64(a), CpuReadView::C64(b)) => Ok(Tensor::C64(typed_binary_view_with_pool(
2253            "div",
2254            buffers,
2255            &a,
2256            &b,
2257            |x, y| x / y,
2258        )?)),
2259        (CpuReadView::F32(real), CpuReadView::C32(complex)) if real.shape().is_empty() => {
2260            let scalar = complex_scalar_tensor_from_view(&real)?;
2261            let scalar = scalar.as_view();
2262            Ok(Tensor::C32(typed_binary_view_with_pool(
2263                "div",
2264                buffers,
2265                &scalar,
2266                &complex,
2267                |x, y| x / y,
2268            )?))
2269        }
2270        (CpuReadView::C32(complex), CpuReadView::F32(real)) if real.shape().is_empty() => {
2271            let scalar = complex_scalar_tensor_from_view(&real)?;
2272            let scalar = scalar.as_view();
2273            Ok(Tensor::C32(typed_binary_view_with_pool(
2274                "div",
2275                buffers,
2276                &complex,
2277                &scalar,
2278                |x, y| x / y,
2279            )?))
2280        }
2281        (CpuReadView::F64(real), CpuReadView::C64(complex)) if real.shape().is_empty() => {
2282            let scalar = complex_scalar_tensor_from_view(&real)?;
2283            let scalar = scalar.as_view();
2284            Ok(Tensor::C64(typed_binary_view_with_pool(
2285                "div",
2286                buffers,
2287                &scalar,
2288                &complex,
2289                |x, y| x / y,
2290            )?))
2291        }
2292        (CpuReadView::C64(complex), CpuReadView::F64(real)) if real.shape().is_empty() => {
2293            let scalar = complex_scalar_tensor_from_view(&real)?;
2294            let scalar = scalar.as_view();
2295            Ok(Tensor::C64(typed_binary_view_with_pool(
2296                "div",
2297                buffers,
2298                &complex,
2299                &scalar,
2300                |x, y| x / y,
2301            )?))
2302        }
2303        _ => Err(crate::Error::DTypeMismatch {
2304            op: "div",
2305            lhs: lhs_dtype,
2306            rhs: rhs_dtype,
2307        }),
2308    }
2309}
2310
2311/// Compute elementwise remainders on CPU tensors.
2312///
2313/// Integer remainders use wrapping two's-complement arithmetic for the
2314/// `MIN % -1` edge and return a structured error on zero divisors.
2315///
2316/// # Examples
2317///
2318/// ```
2319/// use tenferro_cpu::rem;
2320/// use tenferro_tensor::Tensor;
2321///
2322/// let a = Tensor::from_vec_col_major(vec![2], vec![7_i32, -7])?;
2323/// let b = Tensor::from_vec_col_major(vec![2], vec![3_i32, 3])?;
2324/// let out = rem(&a, &b)?;
2325/// assert_eq!(out.as_slice::<i32>().unwrap(), &[1, -1]);
2326/// # Ok::<(), tenferro_tensor::Error>(())
2327/// ```
2328pub fn rem(lhs: &Tensor, rhs: &Tensor) -> crate::Result<Tensor> {
2329    with_local_pool(|buffers| rem_with_pool(buffers, lhs, rhs))
2330}
2331
2332pub(crate) fn rem_with_pool(
2333    buffers: &mut BufferPool,
2334    lhs: &Tensor,
2335    rhs: &Tensor,
2336) -> crate::Result<Tensor> {
2337    match (lhs, rhs) {
2338        (Tensor::F32(a), Tensor::F32(b)) => Ok(Tensor::F32(typed_rem_with_pool(buffers, a, b)?)),
2339        (Tensor::F64(a), Tensor::F64(b)) => Ok(Tensor::F64(typed_rem_with_pool(buffers, a, b)?)),
2340        (Tensor::I32(a), Tensor::I32(b)) => {
2341            Ok(Tensor::I32(typed_integer_rem_with_pool(buffers, a, b)?))
2342        }
2343        (Tensor::I64(a), Tensor::I64(b)) => {
2344            Ok(Tensor::I64(typed_integer_rem_with_pool(buffers, a, b)?))
2345        }
2346        _ => Err(tensor_pair_error("rem", lhs, rhs)),
2347    }
2348}
2349
2350pub(crate) fn rem_read_with_pool(
2351    buffers: &mut BufferPool,
2352    lhs: TensorRead<'_>,
2353    rhs: TensorRead<'_>,
2354) -> crate::Result<Tensor> {
2355    let lhs_dtype = lhs.dtype();
2356    let rhs_dtype = rhs.dtype();
2357    match (read_as_cpu_view(lhs), read_as_cpu_view(rhs)) {
2358        (CpuReadView::F32(a), CpuReadView::F32(b)) => Ok(Tensor::F32(typed_binary_view_with_pool(
2359            "rem",
2360            buffers,
2361            &a,
2362            &b,
2363            |x, y| x % y,
2364        )?)),
2365        (CpuReadView::F64(a), CpuReadView::F64(b)) => Ok(Tensor::F64(typed_binary_view_with_pool(
2366            "rem",
2367            buffers,
2368            &a,
2369            &b,
2370            |x, y| x % y,
2371        )?)),
2372        (CpuReadView::I32(a), CpuReadView::I32(b)) => Ok(Tensor::I32(
2373            typed_integer_rem_view_with_pool(buffers, &a, &b)?,
2374        )),
2375        (CpuReadView::I64(a), CpuReadView::I64(b)) => Ok(Tensor::I64(
2376            typed_integer_rem_view_with_pool(buffers, &a, &b)?,
2377        )),
2378        _ => Err(dtype_pair_error("rem", lhs_dtype, rhs_dtype)),
2379    }
2380}
2381
2382/// Negate a CPU tensor elementwise.
2383///
2384/// # Examples
2385///
2386/// ```
2387/// use tenferro_cpu::neg;
2388/// use tenferro_tensor::Tensor;
2389///
2390/// let input = Tensor::from_vec_col_major(vec![2], vec![1.0_f64, -2.0])?;
2391/// let out = neg(&input)?;
2392/// assert_eq!(out.as_slice::<f64>().unwrap(), &[-1.0, 2.0]);
2393/// # Ok::<(), tenferro_tensor::Error>(())
2394/// ```
2395pub fn neg(input: &Tensor) -> crate::Result<Tensor> {
2396    with_local_pool(|buffers| neg_with_pool(buffers, input))
2397}
2398
2399pub(crate) fn neg_with_pool(buffers: &mut BufferPool, input: &Tensor) -> crate::Result<Tensor> {
2400    match input {
2401        Tensor::F32(t) => Ok(Tensor::F32(typed_neg_with_pool(buffers, t)?)),
2402        Tensor::F64(t) => Ok(Tensor::F64(typed_neg_with_pool(buffers, t)?)),
2403        Tensor::I32(t) => Ok(Tensor::I32(typed_wrapping_neg_with_pool(buffers, t)?)),
2404        Tensor::I64(t) => Ok(Tensor::I64(typed_wrapping_neg_with_pool(buffers, t)?)),
2405        Tensor::Bool(_) => Err(crate::Error::backend_failure(
2406            "neg",
2407            format!("unsupported dtype {:?}", input.dtype()),
2408        )),
2409        Tensor::C32(t) => Ok(Tensor::C32(typed_neg_with_pool(buffers, t)?)),
2410        Tensor::C64(t) => Ok(Tensor::C64(typed_neg_with_pool(buffers, t)?)),
2411    }
2412}
2413
2414pub(crate) fn neg_read_with_pool(
2415    buffers: &mut BufferPool,
2416    input: TensorRead<'_>,
2417) -> crate::Result<Tensor> {
2418    let dtype = input.dtype();
2419    match read_as_cpu_view(input) {
2420        CpuReadView::F32(t) => Ok(Tensor::F32(typed_unary_view_with_pool(
2421            "neg",
2422            buffers,
2423            &t,
2424            |x| -x,
2425        )?)),
2426        CpuReadView::F64(t) => Ok(Tensor::F64(typed_unary_view_with_pool(
2427            "neg",
2428            buffers,
2429            &t,
2430            |x| -x,
2431        )?)),
2432        CpuReadView::I32(t) => Ok(Tensor::I32(typed_unary_view_with_pool(
2433            "neg",
2434            buffers,
2435            &t,
2436            |x| x.wrapping_neg_elem(),
2437        )?)),
2438        CpuReadView::I64(t) => Ok(Tensor::I64(typed_unary_view_with_pool(
2439            "neg",
2440            buffers,
2441            &t,
2442            |x| x.wrapping_neg_elem(),
2443        )?)),
2444        CpuReadView::C32(t) => Ok(Tensor::C32(typed_unary_view_with_pool(
2445            "neg",
2446            buffers,
2447            &t,
2448            |x| -x,
2449        )?)),
2450        CpuReadView::C64(t) => Ok(Tensor::C64(typed_unary_view_with_pool(
2451            "neg",
2452            buffers,
2453            &t,
2454            |x| -x,
2455        )?)),
2456        _ => Err(crate::Error::backend_failure(
2457            "neg",
2458            format!("unsupported dtype {dtype:?}"),
2459        )),
2460    }
2461}
2462
2463/// Conjugate a real or complex CPU tensor elementwise.
2464///
2465/// # Examples
2466///
2467/// ```
2468/// use num_complex::Complex64;
2469/// use tenferro_cpu::conj;
2470/// use tenferro_tensor::Tensor;
2471///
2472/// let input = Tensor::from_vec_col_major(vec![1], vec![Complex64::new(1.0, 2.0)])?;
2473/// let out = conj(&input)?;
2474/// assert_eq!(out.as_slice::<Complex64>().unwrap(), &[Complex64::new(1.0, -2.0)]);
2475/// # Ok::<(), tenferro_tensor::Error>(())
2476/// ```
2477pub fn conj(input: &Tensor) -> crate::Result<Tensor> {
2478    with_local_pool(|buffers| conj_with_pool(buffers, input))
2479}
2480
2481pub(crate) fn conj_with_pool(buffers: &mut BufferPool, input: &Tensor) -> crate::Result<Tensor> {
2482    match input {
2483        Tensor::F32(t) => Ok(Tensor::F32(typed_conj_with_pool(buffers, t)?)),
2484        Tensor::F64(t) => Ok(Tensor::F64(typed_conj_with_pool(buffers, t)?)),
2485        Tensor::I32(_) | Tensor::I64(_) | Tensor::Bool(_) => Err(crate::Error::backend_failure(
2486            "conj",
2487            format!("unsupported dtype {:?}", input.dtype()),
2488        )),
2489        Tensor::C32(t) => Ok(Tensor::C32(typed_conj_with_pool(buffers, t)?)),
2490        Tensor::C64(t) => Ok(Tensor::C64(typed_conj_with_pool(buffers, t)?)),
2491    }
2492}
2493
2494pub(crate) fn conj_read_with_pool(
2495    buffers: &mut BufferPool,
2496    input: TensorRead<'_>,
2497) -> crate::Result<Tensor> {
2498    let dtype = input.dtype();
2499    match read_as_cpu_view(input) {
2500        CpuReadView::F32(t) => Ok(Tensor::F32(typed_unary_view_with_pool(
2501            "conj",
2502            buffers,
2503            &t,
2504            |x| x.conj_elem(),
2505        )?)),
2506        CpuReadView::F64(t) => Ok(Tensor::F64(typed_unary_view_with_pool(
2507            "conj",
2508            buffers,
2509            &t,
2510            |x| x.conj_elem(),
2511        )?)),
2512        CpuReadView::C32(t) => Ok(Tensor::C32(typed_unary_view_with_pool(
2513            "conj",
2514            buffers,
2515            &t,
2516            |x| x.conj_elem(),
2517        )?)),
2518        CpuReadView::C64(t) => Ok(Tensor::C64(typed_unary_view_with_pool(
2519            "conj",
2520            buffers,
2521            &t,
2522            |x| x.conj_elem(),
2523        )?)),
2524        _ => Err(crate::Error::backend_failure(
2525            "conj",
2526            format!("unsupported dtype {dtype:?}"),
2527        )),
2528    }
2529}
2530
2531/// Compute elementwise absolute values.
2532///
2533/// Complex inputs return real magnitudes (`C32 -> F32`, `C64 -> F64`).
2534///
2535/// # Examples
2536///
2537/// ```
2538/// use tenferro_cpu::abs;
2539/// use tenferro_tensor::Tensor;
2540///
2541/// let input = Tensor::from_vec_col_major(vec![2], vec![-3.0_f64, 4.0])?;
2542/// let out = abs(&input)?;
2543/// assert_eq!(out.as_slice::<f64>().unwrap(), &[3.0, 4.0]);
2544/// # Ok::<(), tenferro_tensor::Error>(())
2545/// ```
2546pub fn abs(input: &Tensor) -> crate::Result<Tensor> {
2547    with_local_pool(|buffers| abs_with_pool(buffers, input))
2548}
2549
2550pub(crate) fn abs_with_pool(buffers: &mut BufferPool, input: &Tensor) -> crate::Result<Tensor> {
2551    match input {
2552        Tensor::F32(t) => Ok(Tensor::F32(typed_abs_with_pool(buffers, t)?)),
2553        Tensor::F64(t) => Ok(Tensor::F64(typed_abs_with_pool(buffers, t)?)),
2554        Tensor::I32(t) => Ok(Tensor::I32(typed_wrapping_abs_with_pool(buffers, t)?)),
2555        Tensor::I64(t) => Ok(Tensor::I64(typed_wrapping_abs_with_pool(buffers, t)?)),
2556        Tensor::Bool(_) => Err(crate::Error::backend_failure(
2557            "abs",
2558            format!("unsupported dtype {:?}", input.dtype()),
2559        )),
2560        Tensor::C32(t) => Ok(Tensor::F32(typed_complex_abs_with_pool(buffers, t)?)),
2561        Tensor::C64(t) => Ok(Tensor::F64(typed_complex_abs_with_pool(buffers, t)?)),
2562    }
2563}
2564
2565pub(crate) fn abs_read_with_pool(
2566    buffers: &mut BufferPool,
2567    input: TensorRead<'_>,
2568) -> crate::Result<Tensor> {
2569    let dtype = input.dtype();
2570    match read_as_cpu_view(input) {
2571        CpuReadView::F32(t) => Ok(Tensor::F32(typed_unary_view_with_pool(
2572            "abs",
2573            buffers,
2574            &t,
2575            |x| x.abs_elem(),
2576        )?)),
2577        CpuReadView::F64(t) => Ok(Tensor::F64(typed_unary_view_with_pool(
2578            "abs",
2579            buffers,
2580            &t,
2581            |x| x.abs_elem(),
2582        )?)),
2583        CpuReadView::I32(t) => Ok(Tensor::I32(typed_unary_view_with_pool(
2584            "abs",
2585            buffers,
2586            &t,
2587            |x| x.wrapping_abs_elem(),
2588        )?)),
2589        CpuReadView::I64(t) => Ok(Tensor::I64(typed_unary_view_with_pool(
2590            "abs",
2591            buffers,
2592            &t,
2593            |x| x.wrapping_abs_elem(),
2594        )?)),
2595        CpuReadView::C32(t) => Ok(Tensor::F32(typed_complex_abs_view_with_pool(buffers, &t)?)),
2596        CpuReadView::C64(t) => Ok(Tensor::F64(typed_complex_abs_view_with_pool(buffers, &t)?)),
2597        _ => Err(crate::Error::backend_failure(
2598            "abs",
2599            format!("unsupported dtype {dtype:?}"),
2600        )),
2601    }
2602}
2603
2604/// Compute elementwise signs.
2605///
2606/// # Examples
2607///
2608/// ```
2609/// use tenferro_cpu::sign;
2610/// use tenferro_tensor::Tensor;
2611///
2612/// let input = Tensor::from_vec_col_major(vec![3], vec![-2.0_f64, 0.0, 3.0])?;
2613/// let out = sign(&input)?;
2614/// assert_eq!(out.as_slice::<f64>().unwrap(), &[-1.0, 0.0, 1.0]);
2615/// # Ok::<(), tenferro_tensor::Error>(())
2616/// ```
2617pub fn sign(input: &Tensor) -> crate::Result<Tensor> {
2618    with_local_pool(|buffers| sign_with_pool(buffers, input))
2619}
2620
2621pub(crate) fn sign_with_pool(buffers: &mut BufferPool, input: &Tensor) -> crate::Result<Tensor> {
2622    match input {
2623        Tensor::F32(t) => Ok(Tensor::F32(typed_sign_with_pool(buffers, t)?)),
2624        Tensor::F64(t) => Ok(Tensor::F64(typed_sign_with_pool(buffers, t)?)),
2625        Tensor::I32(t) => Ok(Tensor::I32(typed_integer_sign_with_pool(buffers, t)?)),
2626        Tensor::I64(t) => Ok(Tensor::I64(typed_integer_sign_with_pool(buffers, t)?)),
2627        Tensor::Bool(_) => Err(crate::Error::backend_failure(
2628            "sign",
2629            format!("unsupported dtype {:?}", input.dtype()),
2630        )),
2631        Tensor::C32(t) => Ok(Tensor::C32(typed_sign_with_pool(buffers, t)?)),
2632        Tensor::C64(t) => Ok(Tensor::C64(typed_sign_with_pool(buffers, t)?)),
2633    }
2634}
2635
2636pub(crate) fn sign_read_with_pool(
2637    buffers: &mut BufferPool,
2638    input: TensorRead<'_>,
2639) -> crate::Result<Tensor> {
2640    let dtype = input.dtype();
2641    match read_as_cpu_view(input) {
2642        CpuReadView::F32(t) => Ok(Tensor::F32(typed_unary_view_with_pool(
2643            "sign",
2644            buffers,
2645            &t,
2646            |x| x.sign_elem(),
2647        )?)),
2648        CpuReadView::F64(t) => Ok(Tensor::F64(typed_unary_view_with_pool(
2649            "sign",
2650            buffers,
2651            &t,
2652            |x| x.sign_elem(),
2653        )?)),
2654        CpuReadView::I32(t) => Ok(Tensor::I32(typed_unary_view_with_pool(
2655            "sign",
2656            buffers,
2657            &t,
2658            |x| x.signum_elem(),
2659        )?)),
2660        CpuReadView::I64(t) => Ok(Tensor::I64(typed_unary_view_with_pool(
2661            "sign",
2662            buffers,
2663            &t,
2664            |x| x.signum_elem(),
2665        )?)),
2666        CpuReadView::C32(t) => Ok(Tensor::C32(typed_unary_view_with_pool(
2667            "sign",
2668            buffers,
2669            &t,
2670            |x| x.sign_elem(),
2671        )?)),
2672        CpuReadView::C64(t) => Ok(Tensor::C64(typed_unary_view_with_pool(
2673            "sign",
2674            buffers,
2675            &t,
2676            |x| x.sign_elem(),
2677        )?)),
2678        _ => Err(crate::Error::backend_failure(
2679            "sign",
2680            format!("unsupported dtype {dtype:?}"),
2681        )),
2682    }
2683}
2684
2685/// Compute elementwise maximum values.
2686///
2687/// # Examples
2688///
2689/// ```
2690/// use tenferro_cpu::maximum;
2691/// use tenferro_tensor::Tensor;
2692///
2693/// let a = Tensor::from_vec_col_major(vec![2], vec![1.0_f64, 5.0])?;
2694/// let b = Tensor::from_vec_col_major(vec![2], vec![3.0_f64, 4.0])?;
2695/// let out = maximum(&a, &b)?;
2696/// assert_eq!(out.as_slice::<f64>().unwrap(), &[3.0, 5.0]);
2697/// # Ok::<(), tenferro_tensor::Error>(())
2698/// ```
2699pub fn maximum(lhs: &Tensor, rhs: &Tensor) -> crate::Result<Tensor> {
2700    with_local_pool(|buffers| maximum_with_pool(buffers, lhs, rhs))
2701}
2702
2703pub(crate) fn maximum_with_pool(
2704    buffers: &mut BufferPool,
2705    lhs: &Tensor,
2706    rhs: &Tensor,
2707) -> crate::Result<Tensor> {
2708    reject_complex_ordered_dtypes("maximum", &[lhs.dtype(), rhs.dtype()])?;
2709
2710    match (lhs, rhs) {
2711        (Tensor::F32(a), Tensor::F32(b)) => {
2712            Ok(Tensor::F32(typed_maximum_with_pool(buffers, a, b)?))
2713        }
2714        (Tensor::F64(a), Tensor::F64(b)) => {
2715            Ok(Tensor::F64(typed_maximum_with_pool(buffers, a, b)?))
2716        }
2717        (Tensor::I32(a), Tensor::I32(b)) => {
2718            Ok(Tensor::I32(typed_maximum_with_pool(buffers, a, b)?))
2719        }
2720        (Tensor::I64(a), Tensor::I64(b)) => {
2721            Ok(Tensor::I64(typed_maximum_with_pool(buffers, a, b)?))
2722        }
2723        _ => Err(tensor_pair_error("maximum", lhs, rhs)),
2724    }
2725}
2726
2727pub(crate) fn maximum_read_with_pool(
2728    buffers: &mut BufferPool,
2729    lhs: TensorRead<'_>,
2730    rhs: TensorRead<'_>,
2731) -> crate::Result<Tensor> {
2732    let lhs_dtype = lhs.dtype();
2733    let rhs_dtype = rhs.dtype();
2734    reject_complex_ordered_dtypes("maximum", &[lhs_dtype, rhs_dtype])?;
2735
2736    match (read_as_cpu_view(lhs), read_as_cpu_view(rhs)) {
2737        (CpuReadView::F32(a), CpuReadView::F32(b)) => Ok(Tensor::F32(
2738            typed_same_shape_binary_view_with_pool("maximum", buffers, &a, &b, |x, y| {
2739                x.max_elem(y)
2740            })?,
2741        )),
2742        (CpuReadView::F64(a), CpuReadView::F64(b)) => Ok(Tensor::F64(
2743            typed_same_shape_binary_view_with_pool("maximum", buffers, &a, &b, |x, y| {
2744                x.max_elem(y)
2745            })?,
2746        )),
2747        (CpuReadView::I32(a), CpuReadView::I32(b)) => Ok(Tensor::I32(
2748            typed_same_shape_binary_view_with_pool("maximum", buffers, &a, &b, |x, y| {
2749                x.max_elem(y)
2750            })?,
2751        )),
2752        (CpuReadView::I64(a), CpuReadView::I64(b)) => Ok(Tensor::I64(
2753            typed_same_shape_binary_view_with_pool("maximum", buffers, &a, &b, |x, y| {
2754                x.max_elem(y)
2755            })?,
2756        )),
2757        _ => Err(dtype_pair_error("maximum", lhs_dtype, rhs_dtype)),
2758    }
2759}
2760
2761/// Compute elementwise minimum values.
2762///
2763/// # Examples
2764///
2765/// ```
2766/// use tenferro_cpu::minimum;
2767/// use tenferro_tensor::Tensor;
2768///
2769/// let a = Tensor::from_vec_col_major(vec![2], vec![1.0_f64, 5.0])?;
2770/// let b = Tensor::from_vec_col_major(vec![2], vec![3.0_f64, 4.0])?;
2771/// let out = minimum(&a, &b)?;
2772/// assert_eq!(out.as_slice::<f64>().unwrap(), &[1.0, 4.0]);
2773/// # Ok::<(), tenferro_tensor::Error>(())
2774/// ```
2775pub fn minimum(lhs: &Tensor, rhs: &Tensor) -> crate::Result<Tensor> {
2776    with_local_pool(|buffers| minimum_with_pool(buffers, lhs, rhs))
2777}
2778
2779pub(crate) fn minimum_with_pool(
2780    buffers: &mut BufferPool,
2781    lhs: &Tensor,
2782    rhs: &Tensor,
2783) -> crate::Result<Tensor> {
2784    reject_complex_ordered_dtypes("minimum", &[lhs.dtype(), rhs.dtype()])?;
2785
2786    match (lhs, rhs) {
2787        (Tensor::F32(a), Tensor::F32(b)) => {
2788            Ok(Tensor::F32(typed_minimum_with_pool(buffers, a, b)?))
2789        }
2790        (Tensor::F64(a), Tensor::F64(b)) => {
2791            Ok(Tensor::F64(typed_minimum_with_pool(buffers, a, b)?))
2792        }
2793        (Tensor::I32(a), Tensor::I32(b)) => {
2794            Ok(Tensor::I32(typed_minimum_with_pool(buffers, a, b)?))
2795        }
2796        (Tensor::I64(a), Tensor::I64(b)) => {
2797            Ok(Tensor::I64(typed_minimum_with_pool(buffers, a, b)?))
2798        }
2799        _ => Err(tensor_pair_error("minimum", lhs, rhs)),
2800    }
2801}
2802
2803pub(crate) fn minimum_read_with_pool(
2804    buffers: &mut BufferPool,
2805    lhs: TensorRead<'_>,
2806    rhs: TensorRead<'_>,
2807) -> crate::Result<Tensor> {
2808    let lhs_dtype = lhs.dtype();
2809    let rhs_dtype = rhs.dtype();
2810    reject_complex_ordered_dtypes("minimum", &[lhs_dtype, rhs_dtype])?;
2811
2812    match (read_as_cpu_view(lhs), read_as_cpu_view(rhs)) {
2813        (CpuReadView::F32(a), CpuReadView::F32(b)) => Ok(Tensor::F32(
2814            typed_same_shape_binary_view_with_pool("minimum", buffers, &a, &b, |x, y| {
2815                x.min_elem(y)
2816            })?,
2817        )),
2818        (CpuReadView::F64(a), CpuReadView::F64(b)) => Ok(Tensor::F64(
2819            typed_same_shape_binary_view_with_pool("minimum", buffers, &a, &b, |x, y| {
2820                x.min_elem(y)
2821            })?,
2822        )),
2823        (CpuReadView::I32(a), CpuReadView::I32(b)) => Ok(Tensor::I32(
2824            typed_same_shape_binary_view_with_pool("minimum", buffers, &a, &b, |x, y| {
2825                x.min_elem(y)
2826            })?,
2827        )),
2828        (CpuReadView::I64(a), CpuReadView::I64(b)) => Ok(Tensor::I64(
2829            typed_same_shape_binary_view_with_pool("minimum", buffers, &a, &b, |x, y| {
2830                x.min_elem(y)
2831            })?,
2832        )),
2833        _ => Err(dtype_pair_error("minimum", lhs_dtype, rhs_dtype)),
2834    }
2835}
2836
2837/// Compare two CPU tensors elementwise.
2838///
2839/// # Examples
2840///
2841/// ```
2842/// use tenferro_cpu::compare;
2843/// use tenferro_tensor::{CompareDir, Tensor};
2844///
2845/// let a = Tensor::from_vec_col_major(vec![2], vec![1.0_f64, 5.0])?;
2846/// let b = Tensor::from_vec_col_major(vec![2], vec![3.0_f64, 4.0])?;
2847/// let out = compare(&a, &b, &CompareDir::Gt)?;
2848/// assert_eq!(out.as_slice::<bool>().unwrap(), &[false, true]);
2849/// # Ok::<(), tenferro_tensor::Error>(())
2850/// ```
2851pub fn compare(lhs: &Tensor, rhs: &Tensor, dir: &CompareDir) -> crate::Result<Tensor> {
2852    with_local_pool(|buffers| compare_with_pool(buffers, lhs, rhs, dir))
2853}
2854
2855pub(crate) fn compare_with_pool(
2856    buffers: &mut BufferPool,
2857    lhs: &Tensor,
2858    rhs: &Tensor,
2859    dir: &CompareDir,
2860) -> crate::Result<Tensor> {
2861    reject_complex_ordered_dtypes("compare", &[lhs.dtype(), rhs.dtype()])?;
2862
2863    match (lhs, rhs) {
2864        (Tensor::F32(a), Tensor::F32(b)) => {
2865            Ok(Tensor::Bool(typed_compare_with_pool(buffers, a, b, dir)?))
2866        }
2867        (Tensor::F64(a), Tensor::F64(b)) => {
2868            Ok(Tensor::Bool(typed_compare_with_pool(buffers, a, b, dir)?))
2869        }
2870        (Tensor::I32(a), Tensor::I32(b)) => {
2871            Ok(Tensor::Bool(typed_compare_with_pool(buffers, a, b, dir)?))
2872        }
2873        (Tensor::I64(a), Tensor::I64(b)) => {
2874            Ok(Tensor::Bool(typed_compare_with_pool(buffers, a, b, dir)?))
2875        }
2876        (Tensor::Bool(a), Tensor::Bool(b)) => {
2877            Ok(Tensor::Bool(typed_compare_with_pool(buffers, a, b, dir)?))
2878        }
2879        _ => Err(crate::Error::DTypeMismatch {
2880            op: "compare",
2881            lhs: lhs.dtype(),
2882            rhs: rhs.dtype(),
2883        }),
2884    }
2885}
2886
2887pub(crate) fn compare_read_with_pool(
2888    buffers: &mut BufferPool,
2889    lhs: TensorRead<'_>,
2890    rhs: TensorRead<'_>,
2891    dir: &CompareDir,
2892) -> crate::Result<Tensor> {
2893    let lhs_dtype = lhs.dtype();
2894    let rhs_dtype = rhs.dtype();
2895    reject_complex_ordered_dtypes("compare", &[lhs_dtype, rhs_dtype])?;
2896
2897    match (read_as_cpu_view(lhs), read_as_cpu_view(rhs)) {
2898        (CpuReadView::F32(a), CpuReadView::F32(b)) => Ok(Tensor::Bool(
2899            typed_same_shape_binary_view_with_pool("compare", buffers, &a, &b, |x, y| {
2900                x.compare_elem(y, dir)
2901            })?,
2902        )),
2903        (CpuReadView::F64(a), CpuReadView::F64(b)) => Ok(Tensor::Bool(
2904            typed_same_shape_binary_view_with_pool("compare", buffers, &a, &b, |x, y| {
2905                x.compare_elem(y, dir)
2906            })?,
2907        )),
2908        (CpuReadView::I32(a), CpuReadView::I32(b)) => Ok(Tensor::Bool(
2909            typed_same_shape_binary_view_with_pool("compare", buffers, &a, &b, |x, y| {
2910                x.compare_elem(y, dir)
2911            })?,
2912        )),
2913        (CpuReadView::I64(a), CpuReadView::I64(b)) => Ok(Tensor::Bool(
2914            typed_same_shape_binary_view_with_pool("compare", buffers, &a, &b, |x, y| {
2915                x.compare_elem(y, dir)
2916            })?,
2917        )),
2918        (CpuReadView::Bool(a), CpuReadView::Bool(b)) => Ok(Tensor::Bool(
2919            typed_same_shape_binary_view_with_pool("compare", buffers, &a, &b, |x, y| {
2920                x.compare_elem(y, dir)
2921            })?,
2922        )),
2923        _ => Err(crate::Error::DTypeMismatch {
2924            op: "compare",
2925            lhs: lhs_dtype,
2926            rhs: rhs_dtype,
2927        }),
2928    }
2929}
2930
2931/// Select values from two tensors using a boolean predicate tensor.
2932///
2933/// # Examples
2934///
2935/// ```
2936/// use tenferro_cpu::select;
2937/// use tenferro_tensor::Tensor;
2938///
2939/// let pred = Tensor::from_vec_col_major(vec![2], vec![true, false])?;
2940/// let on_true = Tensor::from_vec_col_major(vec![2], vec![1.0_f64, 2.0])?;
2941/// let on_false = Tensor::from_vec_col_major(vec![2], vec![3.0_f64, 4.0])?;
2942/// let out = select(&pred, &on_true, &on_false)?;
2943/// assert_eq!(out.as_slice::<f64>().unwrap(), &[1.0, 4.0]);
2944/// # Ok::<(), tenferro_tensor::Error>(())
2945/// ```
2946pub fn select(pred: &Tensor, on_true: &Tensor, on_false: &Tensor) -> crate::Result<Tensor> {
2947    with_local_pool(|buffers| select_with_pool(buffers, pred, on_true, on_false))
2948}
2949
2950pub(crate) fn select_with_pool(
2951    buffers: &mut BufferPool,
2952    pred: &Tensor,
2953    on_true: &Tensor,
2954    on_false: &Tensor,
2955) -> crate::Result<Tensor> {
2956    match (pred, on_true, on_false) {
2957        (Tensor::Bool(p), Tensor::F32(t), Tensor::F32(f)) => {
2958            Ok(Tensor::F32(typed_select_with_pool(buffers, p, t, f)?))
2959        }
2960        (Tensor::Bool(p), Tensor::F64(t), Tensor::F64(f)) => {
2961            Ok(Tensor::F64(typed_select_with_pool(buffers, p, t, f)?))
2962        }
2963        (Tensor::Bool(p), Tensor::I32(t), Tensor::I32(f)) => {
2964            Ok(Tensor::I32(typed_select_with_pool(buffers, p, t, f)?))
2965        }
2966        (Tensor::Bool(p), Tensor::I64(t), Tensor::I64(f)) => {
2967            Ok(Tensor::I64(typed_select_with_pool(buffers, p, t, f)?))
2968        }
2969        (Tensor::Bool(p), Tensor::Bool(t), Tensor::Bool(f)) => {
2970            Ok(Tensor::Bool(typed_select_with_pool(buffers, p, t, f)?))
2971        }
2972        (Tensor::Bool(p), Tensor::C32(t), Tensor::C32(f)) => {
2973            Ok(Tensor::C32(typed_select_with_pool(buffers, p, t, f)?))
2974        }
2975        (Tensor::Bool(p), Tensor::C64(t), Tensor::C64(f)) => {
2976            Ok(Tensor::C64(typed_select_with_pool(buffers, p, t, f)?))
2977        }
2978        (Tensor::Bool(_), _, _) => Err(crate::Error::DTypeMismatch {
2979            op: "select",
2980            lhs: on_true.dtype(),
2981            rhs: on_false.dtype(),
2982        }),
2983        _ => Err(crate::Error::DTypeMismatch {
2984            op: "select",
2985            lhs: pred.dtype(),
2986            rhs: crate::DType::Bool,
2987        }),
2988    }
2989}
2990
2991pub(crate) fn select_read_with_pool(
2992    buffers: &mut BufferPool,
2993    pred: TensorRead<'_>,
2994    on_true: TensorRead<'_>,
2995    on_false: TensorRead<'_>,
2996) -> crate::Result<Tensor> {
2997    let pred_dtype = pred.dtype();
2998    let true_dtype = on_true.dtype();
2999    let false_dtype = on_false.dtype();
3000    match (
3001        read_as_cpu_view(pred),
3002        read_as_cpu_view(on_true),
3003        read_as_cpu_view(on_false),
3004    ) {
3005        (CpuReadView::Bool(p), CpuReadView::F32(t), CpuReadView::F32(f)) => Ok(Tensor::F32(
3006            typed_select_view_with_pool(buffers, &p, &t, &f)?,
3007        )),
3008        (CpuReadView::Bool(p), CpuReadView::F64(t), CpuReadView::F64(f)) => Ok(Tensor::F64(
3009            typed_select_view_with_pool(buffers, &p, &t, &f)?,
3010        )),
3011        (CpuReadView::Bool(p), CpuReadView::I32(t), CpuReadView::I32(f)) => Ok(Tensor::I32(
3012            typed_select_view_with_pool(buffers, &p, &t, &f)?,
3013        )),
3014        (CpuReadView::Bool(p), CpuReadView::I64(t), CpuReadView::I64(f)) => Ok(Tensor::I64(
3015            typed_select_view_with_pool(buffers, &p, &t, &f)?,
3016        )),
3017        (CpuReadView::Bool(p), CpuReadView::Bool(t), CpuReadView::Bool(f)) => Ok(Tensor::Bool(
3018            typed_select_view_with_pool(buffers, &p, &t, &f)?,
3019        )),
3020        (CpuReadView::Bool(p), CpuReadView::C32(t), CpuReadView::C32(f)) => Ok(Tensor::C32(
3021            typed_select_view_with_pool(buffers, &p, &t, &f)?,
3022        )),
3023        (CpuReadView::Bool(p), CpuReadView::C64(t), CpuReadView::C64(f)) => Ok(Tensor::C64(
3024            typed_select_view_with_pool(buffers, &p, &t, &f)?,
3025        )),
3026        (CpuReadView::Bool(_), _, _) => Err(crate::Error::DTypeMismatch {
3027            op: "select",
3028            lhs: true_dtype,
3029            rhs: false_dtype,
3030        }),
3031        _ => Err(crate::Error::DTypeMismatch {
3032            op: "select",
3033            lhs: pred_dtype,
3034            rhs: crate::DType::Bool,
3035        }),
3036    }
3037}
3038
3039/// Clamp CPU tensor values elementwise between lower and upper bounds.
3040///
3041/// # Examples
3042///
3043/// ```
3044/// use tenferro_cpu::clamp;
3045/// use tenferro_tensor::Tensor;
3046///
3047/// let input = Tensor::from_vec_col_major(vec![3], vec![-1.0_f64, 2.0, 8.0])?;
3048/// let lower = Tensor::from_vec_col_major(vec![3], vec![0.0_f64, 0.0, 0.0])?;
3049/// let upper = Tensor::from_vec_col_major(vec![3], vec![5.0_f64, 5.0, 5.0])?;
3050/// let out = clamp(&input, &lower, &upper)?;
3051/// assert_eq!(out.as_slice::<f64>().unwrap(), &[0.0, 2.0, 5.0]);
3052/// # Ok::<(), tenferro_tensor::Error>(())
3053/// ```
3054pub fn clamp(input: &Tensor, lower: &Tensor, upper: &Tensor) -> crate::Result<Tensor> {
3055    with_local_pool(|buffers| clamp_with_pool(buffers, input, lower, upper))
3056}
3057
3058pub(crate) fn clamp_with_pool(
3059    buffers: &mut BufferPool,
3060    input: &Tensor,
3061    lower: &Tensor,
3062    upper: &Tensor,
3063) -> crate::Result<Tensor> {
3064    reject_complex_ordered_dtypes("clamp", &[input.dtype(), lower.dtype(), upper.dtype()])?;
3065
3066    dispatch_ternary_result_with_pool!("clamp", input, lower, upper, |x, lo, hi| {
3067        typed_clamp_with_pool(buffers, x, lo, hi)
3068    })
3069}
3070
3071pub(crate) fn clamp_read_with_pool(
3072    buffers: &mut BufferPool,
3073    input: TensorRead<'_>,
3074    lower: TensorRead<'_>,
3075    upper: TensorRead<'_>,
3076) -> crate::Result<Tensor> {
3077    let input_dtype = input.dtype();
3078    let lower_dtype = lower.dtype();
3079    let upper_dtype = upper.dtype();
3080    reject_complex_ordered_dtypes("clamp", &[input_dtype, lower_dtype, upper_dtype])?;
3081
3082    match (
3083        read_as_cpu_view(input),
3084        read_as_cpu_view(lower),
3085        read_as_cpu_view(upper),
3086    ) {
3087        (CpuReadView::F32(input), CpuReadView::F32(lower), CpuReadView::F32(upper)) => Ok(
3088            Tensor::F32(typed_clamp_view_with_pool(buffers, &input, &lower, &upper)?),
3089        ),
3090        (CpuReadView::F64(input), CpuReadView::F64(lower), CpuReadView::F64(upper)) => Ok(
3091            Tensor::F64(typed_clamp_view_with_pool(buffers, &input, &lower, &upper)?),
3092        ),
3093        _ => Err(crate::Error::DTypeMismatch {
3094            op: "clamp",
3095            lhs: input_dtype,
3096            rhs: lower_dtype,
3097        }),
3098    }
3099}
3100
3101pub(crate) fn typed_add_with_pool<T>(
3102    buffers: &mut BufferPool,
3103    lhs: &TypedTensor<T>,
3104    rhs: &TypedTensor<T>,
3105) -> crate::Result<TypedTensor<T>>
3106where
3107    T: Copy + Clone + Zero + Add<Output = T> + PoolScalar,
3108{
3109    if lhs.shape() == rhs.shape() {
3110        // SAFETY: zip_map2_into overwrites every output element.
3111        let mut out = unsafe { typed_array_uninit_from_pool(buffers, lhs.shape()) }?;
3112        zip_map2_into(
3113            &mut out.view_mut(),
3114            &typed_view("add", lhs)?,
3115            &typed_view("add", rhs)?,
3116            |x, y| x + y,
3117        )
3118        .map_err(|err| crate::Error::backend_failure("add", err.to_string()))?;
3119        Ok(tensor_from_array(out))
3120    } else if lhs.shape().is_empty() {
3121        let scalar = typed_host_data("add", lhs)?[0];
3122        // SAFETY: map_into overwrites every output element.
3123        let mut out = unsafe { typed_array_uninit_from_pool(buffers, rhs.shape()) }?;
3124        map_into(&mut out.view_mut(), &typed_view("add", rhs)?, |x| {
3125            scalar + x
3126        })
3127        .map_err(|err| crate::Error::backend_failure("add", err.to_string()))?;
3128        Ok(tensor_from_array(out))
3129    } else if rhs.shape().is_empty() {
3130        let scalar = typed_host_data("add", rhs)?[0];
3131        // SAFETY: map_into overwrites every output element.
3132        let mut out = unsafe { typed_array_uninit_from_pool(buffers, lhs.shape()) }?;
3133        map_into(&mut out.view_mut(), &typed_view("add", lhs)?, |x| {
3134            x + scalar
3135        })
3136        .map_err(|err| crate::Error::backend_failure("add", err.to_string()))?;
3137        Ok(tensor_from_array(out))
3138    } else {
3139        Err(crate::Error::ShapeMismatch {
3140            op: "add",
3141            lhs: lhs.shape().to_vec(),
3142            rhs: rhs.shape().to_vec(),
3143        })
3144    }
3145}
3146
3147fn typed_binary_with_pool<T>(
3148    op: &'static str,
3149    buffers: &mut BufferPool,
3150    lhs: &TypedTensor<T>,
3151    rhs: &TypedTensor<T>,
3152    f: impl Fn(T, T) -> T + Copy + Sync,
3153) -> crate::Result<TypedTensor<T>>
3154where
3155    T: Copy + PoolScalar + 'static,
3156{
3157    if lhs.shape() == rhs.shape() {
3158        // SAFETY: zip_map2_into overwrites every output element.
3159        let mut out = unsafe { typed_array_uninit_from_pool(buffers, lhs.shape()) }?;
3160        zip_map2_into(
3161            &mut out.view_mut(),
3162            &typed_view(op, lhs)?,
3163            &typed_view(op, rhs)?,
3164            f,
3165        )
3166        .map_err(|err| crate::Error::backend_failure(op, err.to_string()))?;
3167        Ok(tensor_from_array(out))
3168    } else if lhs.shape().is_empty() {
3169        let scalar = typed_host_data(op, lhs)?[0];
3170        // SAFETY: map_into overwrites every output element.
3171        let mut out = unsafe { typed_array_uninit_from_pool(buffers, rhs.shape()) }?;
3172        map_into(&mut out.view_mut(), &typed_view(op, rhs)?, |x| f(scalar, x))
3173            .map_err(|err| crate::Error::backend_failure(op, err.to_string()))?;
3174        Ok(tensor_from_array(out))
3175    } else if rhs.shape().is_empty() {
3176        let scalar = typed_host_data(op, rhs)?[0];
3177        // SAFETY: map_into overwrites every output element.
3178        let mut out = unsafe { typed_array_uninit_from_pool(buffers, lhs.shape()) }?;
3179        map_into(&mut out.view_mut(), &typed_view(op, lhs)?, |x| f(x, scalar))
3180            .map_err(|err| crate::Error::backend_failure(op, err.to_string()))?;
3181        Ok(tensor_from_array(out))
3182    } else {
3183        Err(crate::Error::ShapeMismatch {
3184            op,
3185            lhs: lhs.shape().to_vec(),
3186            rhs: rhs.shape().to_vec(),
3187        })
3188    }
3189}
3190
3191fn typed_wrapping_add_with_pool<T>(
3192    buffers: &mut BufferPool,
3193    lhs: &TypedTensor<T>,
3194    rhs: &TypedTensor<T>,
3195) -> crate::Result<TypedTensor<T>>
3196where
3197    T: WrappingIntegerElem,
3198{
3199    typed_binary_with_pool("add", buffers, lhs, rhs, |x, y| x.wrapping_add_elem(y))
3200}
3201
3202fn typed_wrapping_add_view_with_pool<T, L, R>(
3203    buffers: &mut BufferPool,
3204    lhs: &TypedTensorView<'_, T, L>,
3205    rhs: &TypedTensorView<'_, T, R>,
3206) -> crate::Result<TypedTensor<T>>
3207where
3208    T: WrappingIntegerElem,
3209    L: TensorRank,
3210    R: TensorRank,
3211{
3212    typed_binary_view_with_pool("add", buffers, lhs, rhs, |x, y| x.wrapping_add_elem(y))
3213}
3214
3215pub(crate) fn typed_add_view_with_pool<T, L, R>(
3216    buffers: &mut BufferPool,
3217    lhs: &TypedTensorView<'_, T, L>,
3218    rhs: &TypedTensorView<'_, T, R>,
3219) -> crate::Result<TypedTensor<T>>
3220where
3221    T: Copy + Clone + Zero + Add<Output = T> + PoolScalar + 'static,
3222    L: TensorRank,
3223    R: TensorRank,
3224{
3225    if lhs.shape() == rhs.shape() {
3226        // SAFETY: zip_map2_into overwrites every output element.
3227        let mut out = unsafe { typed_array_uninit_from_pool(buffers, lhs.shape()) }?;
3228        zip_map2_into(
3229            &mut out.view_mut(),
3230            &typed_view_from_view("add", lhs)?,
3231            &typed_view_from_view("add", rhs)?,
3232            |x, y| x + y,
3233        )
3234        .map_err(|err| crate::Error::backend_failure("add", err.to_string()))?;
3235        Ok(tensor_from_array(out))
3236    } else if lhs.shape().is_empty() {
3237        let scalar = typed_view_from_view("add", lhs)?.get(&[]);
3238        // SAFETY: map_into overwrites every output element.
3239        let mut out = unsafe { typed_array_uninit_from_pool(buffers, rhs.shape()) }?;
3240        map_into(
3241            &mut out.view_mut(),
3242            &typed_view_from_view("add", rhs)?,
3243            |x| scalar + x,
3244        )
3245        .map_err(|err| crate::Error::backend_failure("add", err.to_string()))?;
3246        Ok(tensor_from_array(out))
3247    } else if rhs.shape().is_empty() {
3248        let scalar = typed_view_from_view("add", rhs)?.get(&[]);
3249        // SAFETY: map_into overwrites every output element.
3250        let mut out = unsafe { typed_array_uninit_from_pool(buffers, lhs.shape()) }?;
3251        map_into(
3252            &mut out.view_mut(),
3253            &typed_view_from_view("add", lhs)?,
3254            |x| x + scalar,
3255        )
3256        .map_err(|err| crate::Error::backend_failure("add", err.to_string()))?;
3257        Ok(tensor_from_array(out))
3258    } else {
3259        Err(crate::Error::ShapeMismatch {
3260            op: "add",
3261            lhs: lhs.shape().to_vec(),
3262            rhs: rhs.shape().to_vec(),
3263        })
3264    }
3265}
3266
3267pub(crate) fn typed_sub_with_pool<T>(
3268    buffers: &mut BufferPool,
3269    lhs: &TypedTensor<T>,
3270    rhs: &TypedTensor<T>,
3271) -> crate::Result<TypedTensor<T>>
3272where
3273    T: Copy + PoolScalar + Sub<Output = T> + 'static,
3274{
3275    typed_binary_with_pool("sub", buffers, lhs, rhs, |x, y| x - y)
3276}
3277
3278fn typed_wrapping_sub_with_pool<T>(
3279    buffers: &mut BufferPool,
3280    lhs: &TypedTensor<T>,
3281    rhs: &TypedTensor<T>,
3282) -> crate::Result<TypedTensor<T>>
3283where
3284    T: WrappingIntegerElem,
3285{
3286    typed_binary_with_pool("sub", buffers, lhs, rhs, |x, y| x.wrapping_sub_elem(y))
3287}
3288
3289fn typed_wrapping_sub_view_with_pool<T, L, R>(
3290    buffers: &mut BufferPool,
3291    lhs: &TypedTensorView<'_, T, L>,
3292    rhs: &TypedTensorView<'_, T, R>,
3293) -> crate::Result<TypedTensor<T>>
3294where
3295    T: WrappingIntegerElem,
3296    L: TensorRank,
3297    R: TensorRank,
3298{
3299    typed_binary_view_with_pool("sub", buffers, lhs, rhs, |x, y| x.wrapping_sub_elem(y))
3300}
3301
3302pub(crate) fn typed_sub_view_with_pool<T, L, R>(
3303    buffers: &mut BufferPool,
3304    lhs: &TypedTensorView<'_, T, L>,
3305    rhs: &TypedTensorView<'_, T, R>,
3306) -> crate::Result<TypedTensor<T>>
3307where
3308    T: Copy + PoolScalar + Sub<Output = T> + 'static,
3309    L: TensorRank,
3310    R: TensorRank,
3311{
3312    typed_binary_view_with_pool("sub", buffers, lhs, rhs, |x, y| x - y)
3313}
3314
3315pub(crate) fn typed_mul_with_pool<T>(
3316    buffers: &mut BufferPool,
3317    lhs: &TypedTensor<T>,
3318    rhs: &TypedTensor<T>,
3319) -> crate::Result<TypedTensor<T>>
3320where
3321    T: Copy + Clone + Zero + Mul<Output = T> + PoolScalar + 'static,
3322{
3323    if lhs.shape() == rhs.shape() {
3324        // SAFETY: mul_into overwrites every output element.
3325        let mut out = unsafe { typed_array_uninit_from_pool(buffers, lhs.shape()) }?;
3326        mul_into(
3327            &mut out.view_mut(),
3328            &typed_view("mul", lhs)?,
3329            &typed_view("mul", rhs)?,
3330        )
3331        .map_err(|err| crate::Error::backend_failure("mul", err))?;
3332        Ok(tensor_from_array(out))
3333    } else if lhs.shape().is_empty() {
3334        let scalar = typed_host_data("mul", lhs)?[0];
3335        // SAFETY: map_into overwrites every output element.
3336        let mut out = unsafe { typed_array_uninit_from_pool(buffers, rhs.shape()) }?;
3337        map_into(&mut out.view_mut(), &typed_view("mul", rhs)?, |x| {
3338            scalar * x
3339        })
3340        .map_err(|err| crate::Error::backend_failure("mul", err))?;
3341        Ok(tensor_from_array(out))
3342    } else if rhs.shape().is_empty() {
3343        let scalar = typed_host_data("mul", rhs)?[0];
3344        // SAFETY: map_into overwrites every output element.
3345        let mut out = unsafe { typed_array_uninit_from_pool(buffers, lhs.shape()) }?;
3346        map_into(&mut out.view_mut(), &typed_view("mul", lhs)?, |x| {
3347            x * scalar
3348        })
3349        .map_err(|err| crate::Error::backend_failure("mul", err))?;
3350        Ok(tensor_from_array(out))
3351    } else {
3352        Err(crate::Error::ShapeMismatch {
3353            op: "mul",
3354            lhs: lhs.shape().to_vec(),
3355            rhs: rhs.shape().to_vec(),
3356        })
3357    }
3358}
3359
3360fn typed_wrapping_mul_with_pool<T>(
3361    buffers: &mut BufferPool,
3362    lhs: &TypedTensor<T>,
3363    rhs: &TypedTensor<T>,
3364) -> crate::Result<TypedTensor<T>>
3365where
3366    T: WrappingIntegerElem,
3367{
3368    typed_binary_with_pool("mul", buffers, lhs, rhs, |x, y| x.wrapping_mul_elem(y))
3369}
3370
3371fn typed_wrapping_mul_view_with_pool<T, L, R>(
3372    buffers: &mut BufferPool,
3373    lhs: &TypedTensorView<'_, T, L>,
3374    rhs: &TypedTensorView<'_, T, R>,
3375) -> crate::Result<TypedTensor<T>>
3376where
3377    T: WrappingIntegerElem,
3378    L: TensorRank,
3379    R: TensorRank,
3380{
3381    typed_binary_view_with_pool("mul", buffers, lhs, rhs, |x, y| x.wrapping_mul_elem(y))
3382}
3383
3384pub(crate) fn typed_mul_view_with_pool<T, L, R>(
3385    buffers: &mut BufferPool,
3386    lhs: &TypedTensorView<'_, T, L>,
3387    rhs: &TypedTensorView<'_, T, R>,
3388) -> crate::Result<TypedTensor<T>>
3389where
3390    T: Copy + Clone + Zero + Mul<Output = T> + PoolScalar + 'static,
3391    L: TensorRank,
3392    R: TensorRank,
3393{
3394    if lhs.shape() == rhs.shape() {
3395        // SAFETY: mul_into overwrites every output element.
3396        let mut out = unsafe { typed_array_uninit_from_pool(buffers, lhs.shape()) }?;
3397        mul_into(
3398            &mut out.view_mut(),
3399            &typed_view_from_view("mul", lhs)?,
3400            &typed_view_from_view("mul", rhs)?,
3401        )
3402        .map_err(|err| crate::Error::backend_failure("mul", err))?;
3403        Ok(tensor_from_array(out))
3404    } else if lhs.shape().is_empty() {
3405        let scalar = typed_view_from_view("mul", lhs)?.get(&[]);
3406        // SAFETY: map_into overwrites every output element.
3407        let mut out = unsafe { typed_array_uninit_from_pool(buffers, rhs.shape()) }?;
3408        map_into(
3409            &mut out.view_mut(),
3410            &typed_view_from_view("mul", rhs)?,
3411            |x| scalar * x,
3412        )
3413        .map_err(|err| crate::Error::backend_failure("mul", err))?;
3414        Ok(tensor_from_array(out))
3415    } else if rhs.shape().is_empty() {
3416        let scalar = typed_view_from_view("mul", rhs)?.get(&[]);
3417        // SAFETY: map_into overwrites every output element.
3418        let mut out = unsafe { typed_array_uninit_from_pool(buffers, lhs.shape()) }?;
3419        map_into(
3420            &mut out.view_mut(),
3421            &typed_view_from_view("mul", lhs)?,
3422            |x| x * scalar,
3423        )
3424        .map_err(|err| crate::Error::backend_failure("mul", err))?;
3425        Ok(tensor_from_array(out))
3426    } else {
3427        Err(crate::Error::ShapeMismatch {
3428            op: "mul",
3429            lhs: lhs.shape().to_vec(),
3430            rhs: rhs.shape().to_vec(),
3431        })
3432    }
3433}
3434
3435pub(crate) fn typed_div_with_pool<T>(
3436    buffers: &mut BufferPool,
3437    lhs: &TypedTensor<T>,
3438    rhs: &TypedTensor<T>,
3439) -> crate::Result<TypedTensor<T>>
3440where
3441    T: Copy + Clone + Zero + Div<Output = T> + PoolScalar,
3442{
3443    if lhs.shape() == rhs.shape() {
3444        // SAFETY: zip_map2_into overwrites every output element.
3445        let mut out = unsafe { typed_array_uninit_from_pool(buffers, lhs.shape()) }?;
3446        zip_map2_into(
3447            &mut out.view_mut(),
3448            &typed_view("div", lhs)?,
3449            &typed_view("div", rhs)?,
3450            |x, y| x / y,
3451        )
3452        .map_err(|err| crate::Error::backend_failure("div", err))?;
3453        Ok(tensor_from_array(out))
3454    } else if lhs.shape().is_empty() {
3455        let scalar = typed_host_data("div", lhs)?[0];
3456        // SAFETY: map_into overwrites every output element.
3457        let mut out = unsafe { typed_array_uninit_from_pool(buffers, rhs.shape()) }?;
3458        map_into(&mut out.view_mut(), &typed_view("div", rhs)?, |x| {
3459            scalar / x
3460        })
3461        .map_err(|err| crate::Error::backend_failure("div", err))?;
3462        Ok(tensor_from_array(out))
3463    } else if rhs.shape().is_empty() {
3464        let scalar = typed_host_data("div", rhs)?[0];
3465        // SAFETY: map_into overwrites every output element.
3466        let mut out = unsafe { typed_array_uninit_from_pool(buffers, lhs.shape()) }?;
3467        map_into(&mut out.view_mut(), &typed_view("div", lhs)?, |x| {
3468            x / scalar
3469        })
3470        .map_err(|err| crate::Error::backend_failure("div", err))?;
3471        Ok(tensor_from_array(out))
3472    } else {
3473        Err(crate::Error::ShapeMismatch {
3474            op: "div",
3475            lhs: lhs.shape().to_vec(),
3476            rhs: rhs.shape().to_vec(),
3477        })
3478    }
3479}
3480
3481fn typed_integer_div_with_pool<T>(
3482    buffers: &mut BufferPool,
3483    lhs: &TypedTensor<T>,
3484    rhs: &TypedTensor<T>,
3485) -> crate::Result<TypedTensor<T>>
3486where
3487    T: WrappingIntegerElem,
3488{
3489    let rhs_view = typed_view("div", rhs)?;
3490    ensure_no_zero_divisor("div", &rhs_view)?;
3491    typed_binary_with_pool("div", buffers, lhs, rhs, |x, y| x.wrapping_div_elem(y))
3492}
3493
3494fn typed_integer_div_view_with_pool<T, L, R>(
3495    buffers: &mut BufferPool,
3496    lhs: &TypedTensorView<'_, T, L>,
3497    rhs: &TypedTensorView<'_, T, R>,
3498) -> crate::Result<TypedTensor<T>>
3499where
3500    T: WrappingIntegerElem,
3501    L: TensorRank,
3502    R: TensorRank,
3503{
3504    let rhs_view = typed_view_from_view("div", rhs)?;
3505    ensure_no_zero_divisor("div", &rhs_view)?;
3506    typed_binary_view_with_pool("div", buffers, lhs, rhs, |x, y| x.wrapping_div_elem(y))
3507}
3508
3509fn typed_rem_with_pool<T>(
3510    buffers: &mut BufferPool,
3511    lhs: &TypedTensor<T>,
3512    rhs: &TypedTensor<T>,
3513) -> crate::Result<TypedTensor<T>>
3514where
3515    T: Copy + Clone + Zero + StdRem<Output = T> + PoolScalar + 'static,
3516{
3517    typed_binary_with_pool("rem", buffers, lhs, rhs, |x, y| x % y)
3518}
3519
3520fn typed_integer_rem_with_pool<T>(
3521    buffers: &mut BufferPool,
3522    lhs: &TypedTensor<T>,
3523    rhs: &TypedTensor<T>,
3524) -> crate::Result<TypedTensor<T>>
3525where
3526    T: WrappingIntegerElem,
3527{
3528    let rhs_view = typed_view("rem", rhs)?;
3529    ensure_no_zero_divisor("rem", &rhs_view)?;
3530    typed_binary_with_pool("rem", buffers, lhs, rhs, |x, y| x.wrapping_rem_elem(y))
3531}
3532
3533fn typed_integer_rem_view_with_pool<T, L, R>(
3534    buffers: &mut BufferPool,
3535    lhs: &TypedTensorView<'_, T, L>,
3536    rhs: &TypedTensorView<'_, T, R>,
3537) -> crate::Result<TypedTensor<T>>
3538where
3539    T: WrappingIntegerElem,
3540    L: TensorRank,
3541    R: TensorRank,
3542{
3543    let rhs_view = typed_view_from_view("rem", rhs)?;
3544    ensure_no_zero_divisor("rem", &rhs_view)?;
3545    typed_binary_view_with_pool("rem", buffers, lhs, rhs, |x, y| x.wrapping_rem_elem(y))
3546}
3547
3548pub(crate) fn typed_neg_with_pool<T>(
3549    buffers: &mut BufferPool,
3550    input: &TypedTensor<T>,
3551) -> crate::Result<TypedTensor<T>>
3552where
3553    T: Copy + Clone + Zero + Neg<Output = T> + PoolScalar,
3554{
3555    // SAFETY: map_into overwrites every output element.
3556    let mut out = unsafe { typed_array_uninit_from_pool(buffers, input.shape()) }?;
3557    map_into(&mut out.view_mut(), &typed_view("neg", input)?, |x| -x)
3558        .map_err(|err| crate::Error::backend_failure("neg", err))?;
3559    Ok(tensor_from_array(out))
3560}
3561
3562fn typed_wrapping_unary_with_pool<T>(
3563    op: &'static str,
3564    buffers: &mut BufferPool,
3565    input: &TypedTensor<T>,
3566    f: impl Fn(T) -> T + Copy + Sync,
3567) -> crate::Result<TypedTensor<T>>
3568where
3569    T: WrappingIntegerElem,
3570{
3571    // SAFETY: map_into overwrites every output element.
3572    let mut out = unsafe { typed_array_uninit_from_pool(buffers, input.shape()) }?;
3573    map_into(&mut out.view_mut(), &typed_view(op, input)?, f)
3574        .map_err(|err| crate::Error::backend_failure(op, err))?;
3575    Ok(tensor_from_array(out))
3576}
3577
3578fn typed_wrapping_neg_with_pool<T>(
3579    buffers: &mut BufferPool,
3580    input: &TypedTensor<T>,
3581) -> crate::Result<TypedTensor<T>>
3582where
3583    T: WrappingIntegerElem,
3584{
3585    typed_wrapping_unary_with_pool("neg", buffers, input, |x| x.wrapping_neg_elem())
3586}
3587
3588fn typed_wrapping_abs_with_pool<T>(
3589    buffers: &mut BufferPool,
3590    input: &TypedTensor<T>,
3591) -> crate::Result<TypedTensor<T>>
3592where
3593    T: WrappingIntegerElem,
3594{
3595    typed_wrapping_unary_with_pool("abs", buffers, input, |x| x.wrapping_abs_elem())
3596}
3597
3598pub(crate) fn typed_conj_with_pool<T>(
3599    buffers: &mut BufferPool,
3600    input: &TypedTensor<T>,
3601) -> crate::Result<TypedTensor<T>>
3602where
3603    T: Copy + Clone + Zero + ConjElem + PoolScalar,
3604{
3605    // SAFETY: map_into overwrites every output element.
3606    let mut out = unsafe { typed_array_uninit_from_pool(buffers, input.shape()) }?;
3607    map_into(&mut out.view_mut(), &typed_view("conj", input)?, |x| {
3608        x.conj_elem()
3609    })
3610    .map_err(|err| crate::Error::backend_failure("conj", err))?;
3611    Ok(tensor_from_array(out))
3612}
3613
3614pub(crate) fn typed_abs_with_pool<T>(
3615    buffers: &mut BufferPool,
3616    input: &TypedTensor<T>,
3617) -> crate::Result<TypedTensor<T>>
3618where
3619    T: Tier2Elem + PoolScalar,
3620{
3621    // SAFETY: map_into overwrites every output element.
3622    let mut out = unsafe { typed_array_uninit_from_pool(buffers, input.shape()) }?;
3623    map_into(&mut out.view_mut(), &typed_view("abs", input)?, |x| {
3624        x.abs_elem()
3625    })
3626    .map_err(|err| crate::Error::backend_failure("abs", err))?;
3627    Ok(tensor_from_array(out))
3628}
3629
3630fn typed_complex_abs_with_pool<T>(
3631    buffers: &mut BufferPool,
3632    input: &TypedTensor<Complex<T>>,
3633) -> crate::Result<TypedTensor<T>>
3634where
3635    T: num_traits::Float + PoolScalar,
3636{
3637    // SAFETY: the following kernel overwrites every output element before any read.
3638    let mut out = unsafe { typed_array_uninit_from_pool(buffers, input.shape()) }?;
3639    map_into(&mut out.view_mut(), &typed_view("abs", input)?, |x| {
3640        x.norm()
3641    })
3642    .map_err(|err| crate::Error::backend_failure("abs", err))?;
3643    Ok(tensor_from_array(out))
3644}
3645
3646fn typed_complex_abs_view_with_pool<T, R>(
3647    buffers: &mut BufferPool,
3648    input: &TypedTensorView<'_, Complex<T>, R>,
3649) -> crate::Result<TypedTensor<T>>
3650where
3651    T: num_traits::Float + PoolScalar + 'static,
3652    R: TensorRank,
3653{
3654    // SAFETY: the following kernel overwrites every output element before any read.
3655    let mut out = unsafe { typed_array_uninit_from_pool(buffers, input.shape()) }?;
3656    map_into(
3657        &mut out.view_mut(),
3658        &typed_view_from_view("abs", input)?,
3659        |x| x.norm(),
3660    )
3661    .map_err(|err| crate::Error::backend_failure("abs", err))?;
3662    Ok(tensor_from_array(out))
3663}
3664
3665pub(crate) fn typed_sign_with_pool<T>(
3666    buffers: &mut BufferPool,
3667    input: &TypedTensor<T>,
3668) -> crate::Result<TypedTensor<T>>
3669where
3670    T: Tier2Elem + PoolScalar,
3671{
3672    // SAFETY: map_into overwrites every output element.
3673    let mut out = unsafe { typed_array_uninit_from_pool(buffers, input.shape()) }?;
3674    map_into(&mut out.view_mut(), &typed_view("sign", input)?, |x| {
3675        x.sign_elem()
3676    })
3677    .map_err(|err| crate::Error::backend_failure("sign", err))?;
3678    Ok(tensor_from_array(out))
3679}
3680
3681fn typed_integer_sign_with_pool<T>(
3682    buffers: &mut BufferPool,
3683    input: &TypedTensor<T>,
3684) -> crate::Result<TypedTensor<T>>
3685where
3686    T: WrappingIntegerElem,
3687{
3688    typed_wrapping_unary_with_pool("sign", buffers, input, |x| x.signum_elem())
3689}
3690
3691pub(crate) fn typed_maximum_with_pool<T>(
3692    buffers: &mut BufferPool,
3693    lhs: &TypedTensor<T>,
3694    rhs: &TypedTensor<T>,
3695) -> crate::Result<TypedTensor<T>>
3696where
3697    T: OrderedElem + PoolScalar,
3698{
3699    if lhs.shape() != rhs.shape() {
3700        return Err(crate::Error::ShapeMismatch {
3701            op: "maximum",
3702            lhs: lhs.shape().to_vec(),
3703            rhs: rhs.shape().to_vec(),
3704        });
3705    }
3706    // SAFETY: zip_map2_into overwrites every output element.
3707    let mut out = unsafe { typed_array_uninit_from_pool(buffers, lhs.shape()) }?;
3708    zip_map2_into(
3709        &mut out.view_mut(),
3710        &typed_view("maximum", lhs)?,
3711        &typed_view("maximum", rhs)?,
3712        |x, y| x.max_elem(y),
3713    )
3714    .map_err(|err| crate::Error::backend_failure("maximum", err))?;
3715    Ok(tensor_from_array(out))
3716}
3717
3718pub(crate) fn typed_minimum_with_pool<T>(
3719    buffers: &mut BufferPool,
3720    lhs: &TypedTensor<T>,
3721    rhs: &TypedTensor<T>,
3722) -> crate::Result<TypedTensor<T>>
3723where
3724    T: OrderedElem + PoolScalar,
3725{
3726    if lhs.shape() != rhs.shape() {
3727        return Err(crate::Error::ShapeMismatch {
3728            op: "minimum",
3729            lhs: lhs.shape().to_vec(),
3730            rhs: rhs.shape().to_vec(),
3731        });
3732    }
3733    // SAFETY: zip_map2_into overwrites every output element.
3734    let mut out = unsafe { typed_array_uninit_from_pool(buffers, lhs.shape()) }?;
3735    zip_map2_into(
3736        &mut out.view_mut(),
3737        &typed_view("minimum", lhs)?,
3738        &typed_view("minimum", rhs)?,
3739        |x, y| x.min_elem(y),
3740    )
3741    .map_err(|err| crate::Error::backend_failure("minimum", err))?;
3742    Ok(tensor_from_array(out))
3743}
3744
3745pub(crate) fn typed_compare_with_pool<T>(
3746    buffers: &mut BufferPool,
3747    lhs: &TypedTensor<T>,
3748    rhs: &TypedTensor<T>,
3749    dir: &CompareDir,
3750) -> crate::Result<TypedTensor<bool>>
3751where
3752    T: CompareElem,
3753{
3754    if lhs.shape() != rhs.shape() {
3755        return Err(crate::Error::ShapeMismatch {
3756            op: "compare",
3757            lhs: lhs.shape().to_vec(),
3758            rhs: rhs.shape().to_vec(),
3759        });
3760    }
3761    // SAFETY: zip_map2_into overwrites every output element.
3762    let mut out = unsafe { typed_array_uninit_from_pool(buffers, lhs.shape()) }?;
3763    zip_map2_into(
3764        &mut out.view_mut(),
3765        &typed_view("compare", lhs)?,
3766        &typed_view("compare", rhs)?,
3767        |x, y| x.compare_elem(y, dir),
3768    )
3769    .map_err(|err| crate::Error::backend_failure("compare", err))?;
3770    Ok(tensor_from_array(out))
3771}
3772
3773pub(crate) fn typed_select_with_pool<T>(
3774    buffers: &mut BufferPool,
3775    pred: &TypedTensor<bool>,
3776    on_true: &TypedTensor<T>,
3777    on_false: &TypedTensor<T>,
3778) -> crate::Result<TypedTensor<T>>
3779where
3780    T: Copy + PoolScalar,
3781{
3782    if pred.shape() != on_true.shape() {
3783        return Err(crate::Error::ShapeMismatch {
3784            op: "select",
3785            lhs: pred.shape().to_vec(),
3786            rhs: on_true.shape().to_vec(),
3787        });
3788    }
3789    if pred.shape() != on_false.shape() {
3790        return Err(crate::Error::ShapeMismatch {
3791            op: "select",
3792            lhs: pred.shape().to_vec(),
3793            rhs: on_false.shape().to_vec(),
3794        });
3795    }
3796    // SAFETY: zip_map3_into overwrites every output element.
3797    let mut out = unsafe { typed_array_uninit_from_pool(buffers, pred.shape()) }?;
3798    zip_map3_into(
3799        &mut out.view_mut(),
3800        &typed_view("select", pred)?,
3801        &typed_view("select", on_true)?,
3802        &typed_view("select", on_false)?,
3803        |p, t, f| if p { t } else { f },
3804    )
3805    .map_err(|err| crate::Error::backend_failure("select", err))?;
3806    Ok(tensor_from_array(out))
3807}
3808
3809pub(crate) fn typed_clamp_with_pool<T>(
3810    buffers: &mut BufferPool,
3811    input: &TypedTensor<T>,
3812    lower: &TypedTensor<T>,
3813    upper: &TypedTensor<T>,
3814) -> crate::Result<TypedTensor<T>>
3815where
3816    T: OrderedElem + PoolScalar,
3817{
3818    if input.shape() != lower.shape() {
3819        return Err(crate::Error::ShapeMismatch {
3820            op: "clamp",
3821            lhs: input.shape().to_vec(),
3822            rhs: lower.shape().to_vec(),
3823        });
3824    }
3825    if input.shape() != upper.shape() {
3826        return Err(crate::Error::ShapeMismatch {
3827            op: "clamp",
3828            lhs: input.shape().to_vec(),
3829            rhs: upper.shape().to_vec(),
3830        });
3831    }
3832    // SAFETY: zip_map3_into overwrites every output element.
3833    let mut out = unsafe { typed_array_uninit_from_pool(buffers, input.shape()) }?;
3834    zip_map3_into(
3835        &mut out.view_mut(),
3836        &typed_view("clamp", input)?,
3837        &typed_view("clamp", lower)?,
3838        &typed_view("clamp", upper)?,
3839        |x, lo, hi| hi.min_elem(lo.max_elem(x)),
3840    )
3841    .map_err(|err| crate::Error::backend_failure("clamp", err))?;
3842    Ok(tensor_from_array(out))
3843}
3844
// Unit tests for this module are declared here without an inline body, so they live in
// a separate `tests` module file; the module is only compiled when `cfg(test)` is set.
#[cfg(test)]
mod tests;
tenferro_cpu/elementwise.rs

tenferro_cpu/
elementwise.rs