tenferro_cpu/
backend.rs

1use std::cmp::Reverse;
2use std::collections::{BTreeMap, BTreeSet, HashMap};
3use std::env;
4use std::fmt;
5use std::sync::atomic::{AtomicUsize, Ordering};
6use std::sync::{Arc, Mutex, OnceLock};
7use std::thread;
8use std::time::{Duration, Instant};
9
10use crate::arbiter::{
11    inherited_or_new_execution_owner, with_execution_owner, ResourceArbiter, ResourceOwner,
12    ResourcePermit,
13};
14use crate::buffer_pool::{BufferPool, BufferPoolStats, PoolScalar};
15use crate::dot_runtime::{CpuProviderBundle, CpuProviderBundleInstallError};
16use crate::engine::{CpuEngine, EngineResources};
17use crate::indexed_plan_cache::{
18    IndexedPlanCache, IndexedPlanCacheLimits, DEFAULT_INDEXED_PLAN_CACHE_LIMITS,
19};
20use crate::placement::{
21    resolve_placement, resolve_placement_with_affinity, CpuEngineConstructionError,
22    ResolvedCpuExecution,
23};
24use crate::provider::{CpuExecutionContext, CpuOperationEntry, ParallelMode};
25use crate::{
26    discover_cpu_topology, CpuDomainId, CpuDomainOwnership, CpuExecutorAffinity,
27    CpuExecutorShutdown, CpuId, CpuPlacement, CpuPlacementError, CpuPlacementGuarantee, CpuSet,
28    CpuTopology, CpuTopologyError, ExternalCpuDomain, NumaNodeId, ResolvedCpuPlacement,
29};
30use crate::{
31    Buffer, CacheStats, Tensor, TensorRank, TensorRead, TensorScalar, TensorValue, TensorWrite,
32    TypedTensor, TypedTensorView, TypedTensorViewMut,
33};
34use tenferro_tensor::backend::{ElementwiseFusionPlan, GroupedGemmConfig};
35use tenferro_tensor::SharedTensorAllocationDomain;
36use tenferro_tensor::{
37    AllocationDomainId, BackendCachedDot, BackendRuntimeCache, BackendSession, BackendSessionHost,
38    DotGeneralAccumulation, ElementwiseReadOp, TensorAnalytic, TensorBackend, TensorBuffer,
39    TensorDeviceTransfer, TensorDot, TensorElementwise, TensorFusion, TensorIndexing,
40    TensorReduction, TensorStructural, TensorViewCanonicalization,
41};
42use tenferro_tensor::{
43    CompareDir, DotGeneralConfig, GatherConfig, PadConfig, ScatterConfig, SliceConfig,
44};
45
46use super::exec_session::CpuExecSession;
47use super::{
48    analytic, copy_tensor_read_into, elementwise, gemm, indexing, materialize_tensor_read,
49    reduction, structural, CpuContext,
50};
51
52pub(crate) fn tag_fresh_output(output: &mut Tensor, domain: CpuDomainId) {
53    macro_rules! tag {
54        ($tensor:expr) => {{
55            $tensor.set_cpu_affinity(Some(domain));
56        }};
57    }
58    match output {
59        Tensor::F32(tensor) => tag!(tensor),
60        Tensor::F64(tensor) => tag!(tensor),
61        Tensor::I32(tensor) => tag!(tensor),
62        Tensor::I64(tensor) => tag!(tensor),
63        Tensor::Bool(tensor) => tag!(tensor),
64        Tensor::C32(tensor) => tag!(tensor),
65        Tensor::C64(tensor) => tag!(tensor),
66    }
67}
68
69pub(crate) fn elementwise_read_into_fallback_with_pool(
70    buffers: &mut BufferPool,
71    op: ElementwiseReadOp,
72    inputs: &[TensorRead<'_>],
73    out: TensorWrite<'_>,
74) -> crate::Result<()> {
75    let result = match op {
76        ElementwiseReadOp::Add => {
77            elementwise::add_read_with_pool(buffers, inputs[0].clone(), inputs[1].clone())?
78        }
79        ElementwiseReadOp::Subtract => {
80            elementwise::sub_read_with_pool(buffers, inputs[0].clone(), inputs[1].clone())?
81        }
82        ElementwiseReadOp::Multiply => {
83            elementwise::mul_read_with_pool(buffers, inputs[0].clone(), inputs[1].clone())?
84        }
85        ElementwiseReadOp::Negate => elementwise::neg_read_with_pool(buffers, inputs[0].clone())?,
86        ElementwiseReadOp::Conj => elementwise::conj_read_with_pool(buffers, inputs[0].clone())?,
87        ElementwiseReadOp::Divide => {
88            elementwise::div_read_with_pool(buffers, inputs[0].clone(), inputs[1].clone())?
89        }
90        _ => {
91            return Err(crate::Error::unsupported(
92                "CpuBackend::elementwise_read_into",
93                format!("CPU backend does not implement {op:?}"),
94            ))
95        }
96    };
97    copy_tensor_read_into(
98        "CpuBackend::elementwise_read_into",
99        TensorRead::from_tensor(&result),
100        out,
101    )
102}
103
/// Output values that can be stamped with a CPU domain affinity.
///
/// Implemented in this file for [`Tensor`], [`TypedTensor`], and for `Option`
/// and `Vec` wrappers around any implementor.
pub(crate) trait FreshCpuOutput {
    /// Record `domain` as the CPU affinity of this output.
    fn tag_fresh(&mut self, domain: CpuDomainId);
}
107
/// Dynamically typed tensors delegate to [`tag_fresh_output`], which dispatches
/// over the dtype variants.
impl FreshCpuOutput for Tensor {
    fn tag_fresh(&mut self, domain: CpuDomainId) {
        tag_fresh_output(self, domain);
    }
}
113
/// Statically typed tensors set their CPU affinity directly, for any element
/// type `T` and any rank `R`.
impl<T, R: TensorRank> FreshCpuOutput for TypedTensor<T, R> {
    fn tag_fresh(&mut self, domain: CpuDomainId) {
        self.set_cpu_affinity(Some(domain));
    }
}
119
120impl<T: FreshCpuOutput> FreshCpuOutput for Option<T> {
121    fn tag_fresh(&mut self, domain: CpuDomainId) {
122        if let Some(output) = self {
123            output.tag_fresh(domain);
124        }
125    }
126}
127
128impl<T: FreshCpuOutput> FreshCpuOutput for Vec<T> {
129    fn tag_fresh(&mut self, domain: CpuDomainId) {
130        for output in self {
131            output.tag_fresh(domain);
132        }
133    }
134}
135
/// Accumulated timing for one named profiling section.
#[derive(Debug, Default, Clone)]
struct CpuSessionProfileEntry {
    /// Number of samples recorded for the section.
    calls: usize,
    /// Sum of the elapsed time over all recorded samples.
    total_time: Duration,
}
141
/// Report whether CPU session profiling is switched on.
///
/// Profiling is on when `TENFERRO_PROFILE_CPU_SESSION` can be read from the
/// environment. The answer is computed on the first call and cached for the
/// rest of the process.
fn cpu_session_profile_enabled() -> bool {
    static ENABLED: OnceLock<bool> = OnceLock::new();
    let cached = ENABLED.get_or_init(|| {
        let lookup = env::var("TENFERRO_PROFILE_CPU_SESSION");
        lookup.is_ok()
    });
    *cached
}
146
/// Return the configured profile print interval, if any.
///
/// Reads `TENFERRO_PROFILE_CPU_SESSION_PRINT_EVERY` once and caches the result.
/// A missing variable, a value that does not parse as `usize`, or a value of
/// zero all yield `None`.
fn cpu_session_profile_print_every() -> Option<usize> {
    static PRINT_EVERY: OnceLock<Option<usize>> = OnceLock::new();
    let cached = PRINT_EVERY.get_or_init(|| {
        let raw = env::var("TENFERRO_PROFILE_CPU_SESSION_PRINT_EVERY").ok()?;
        match raw.parse::<usize>() {
            Ok(interval) if interval > 0 => Some(interval),
            _ => None,
        }
    });
    *cached
}
156
157fn cpu_session_profile_state() -> &'static Mutex<HashMap<&'static str, CpuSessionProfileEntry>> {
158    static STATE: OnceLock<Mutex<HashMap<&'static str, CpuSessionProfileEntry>>> = OnceLock::new();
159    STATE.get_or_init(|| Mutex::new(HashMap::new()))
160}
161
162fn record_cpu_session_profile(section: &'static str, elapsed: Duration) {
163    if !cpu_session_profile_enabled() {
164        return;
165    }
166    let Ok(mut state) = cpu_session_profile_state().lock() else {
167        return;
168    };
169    let entry = state.entry(section).or_default();
170    entry.calls += 1;
171    entry.total_time += elapsed;
172}
173
174fn profile_cpu_session_section<T>(section: &'static str, f: impl FnOnce() -> T) -> T {
175    if !cpu_session_profile_enabled() {
176        return f();
177    }
178    let started = Instant::now();
179    let result = f();
180    record_cpu_session_profile(section, started.elapsed());
181    result
182}
183
184fn maybe_print_cpu_session_profile() {
185    let Some(print_every) = cpu_session_profile_print_every() else {
186        return;
187    };
188    let should_print = {
189        let Ok(state) = cpu_session_profile_state().lock() else {
190            return;
191        };
192        state
193            .get("with_backend_session_cached.total")
194            .is_some_and(|entry| entry.calls % print_every == 0)
195    };
196    if !should_print {
197        return;
198    }
199    let mut entries = {
200        let Ok(mut state) = cpu_session_profile_state().lock() else {
201            return;
202        };
203        let entries = state
204            .iter()
205            .map(|(section, entry)| (*section, entry.clone()))
206            .collect::<Vec<_>>();
207        state.clear();
208        entries
209    };
210    entries.sort_by_key(|(_, entry)| Reverse(entry.total_time));
211    eprintln!("=== tenferro CPU session profile ===");
212    for (section, entry) in entries {
213        eprintln!(
214            "{section}: calls={} total={:.6}ms per_call={:.3}us",
215            entry.calls,
216            entry.total_time.as_secs_f64() * 1.0e3,
217            entry.total_time.as_secs_f64() * 1.0e6 / entry.calls as f64,
218        );
219    }
220}
221
/// Scoped mutable borrow of a [`BufferPool`].
///
/// When the loan is dropped, its `Drop` implementation calls one of the pool's
/// `*_in_flight_retained` methods, chosen by whether the thread is panicking.
struct BufferPoolLoan<'a> {
    /// The borrowed pool.
    buffers: &'a mut BufferPool,
}
225
impl<'a> BufferPoolLoan<'a> {
    /// Start a loan over `buffers`.
    fn new(buffers: &'a mut BufferPool) -> Self {
        Self { buffers }
    }

    /// Reborrow the loaned pool for the duration of the returned reference.
    fn get_mut(&mut self) -> &mut BufferPool {
        self.buffers
    }
}
235
236impl Drop for BufferPoolLoan<'_> {
237    fn drop(&mut self) {
238        if thread::panicking() {
239            self.buffers.replenish_in_flight_retained();
240        } else {
241            self.buffers.clear_in_flight_retained();
242        }
243    }
244}
245
/// CPU provider selected by a [`CpuBackend`] instance.
///
/// CPU provider features are additive at compile time; this runtime selector
/// chooses which compiled provider an individual backend uses for provider-owned
/// kernels such as GEMM.
///
/// The type is a plain `Copy` value that can be compared and hashed.
///
/// # Examples
///
/// ```
/// use tenferro_cpu::CpuBackendKind;
///
/// let kind = CpuBackendKind::default_compiled();
/// assert!(matches!(kind, CpuBackendKind::Faer | CpuBackendKind::Blas));
/// ```
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
pub enum CpuBackendKind {
    /// faer-backed CPU kernels.
    Faer,
    /// BLAS/LAPACK-backed CPU kernels.
    Blas,
}
267
impl CpuBackendKind {
    /// Return the default compiled CPU provider.
    ///
    /// BLAS is preferred when both BLAS and faer are compiled in because an
    /// application that links a BLAS/LAPACK provider normally expects
    /// provider-backed kernels to use it by default.
    ///
    /// # Examples
    ///
    /// ```
    /// use tenferro_cpu::CpuBackendKind;
    ///
    /// let _kind = CpuBackendKind::default_compiled();
    /// ```
    pub fn default_compiled() -> Self {
        // At most one of the two blocks below survives `cfg` evaluation and
        // becomes the tail expression.
        // NOTE(review): with neither `cpu-blas` nor `cpu-faer` enabled the body
        // is empty and does not type-check; presumably that feature combination
        // is rejected elsewhere. Confirm.
        #[cfg(feature = "cpu-blas")]
        {
            Self::Blas
        }
        #[cfg(all(not(feature = "cpu-blas"), feature = "cpu-faer"))]
        {
            Self::Faer
        }
    }

    /// Return the lowercase provider name (`"faer"` or `"blas"`) used when
    /// formatting diagnostics.
    // Used by feature-specific diagnostics; some feature combinations leave
    // the formatter path inactive.
    #[allow(dead_code)]
    pub(crate) fn name(self) -> &'static str {
        match self {
            Self::Faer => "faer",
            Self::Blas => "blas",
        }
    }
}
303
/// Stable execution-ownership mode selected for a CPU backend handle.
///
/// The mode is a plain `Copy` value that can be compared and hashed.
///
/// # Examples
///
/// ```
/// use tenferro_cpu::{CpuBackend, CpuExecutionMode};
///
/// let mode = CpuBackend::new().execution_info().execution_mode();
/// assert!(matches!(
///     mode,
///     CpuExecutionMode::Managed
///         | CpuExecutionMode::ExternalManaged
///         | CpuExecutionMode::ProviderDefaultExclusive
///         | CpuExecutionMode::Compatibility
/// ));
/// ```
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
pub enum CpuExecutionMode {
    /// tenferro owns a pinned Rayon engine for the resolved CPU placement.
    Managed,
    /// The application supplied and owns the selected CPU domain executor.
    ExternalManaged,
    /// An external provider owns worker placement under a process-wide permit.
    ProviderDefaultExclusive,
    /// A legacy unpinned Rayon context is used because managed affinity is unavailable.
    Compatibility,
}
331
/// Failure to construct an externally managed CPU-domain registry.
///
/// Each variant describes one validation rule that the supplied descriptors
/// violated and carries the offending identity so callers can report it.
///
/// # Examples
///
/// ```
/// use tenferro_cpu::ExternalCpuDomainRegistryError;
///
/// let error = ExternalCpuDomainRegistryError::EmptyRegistry;
/// assert!(error.to_string().contains("at least one"));
/// ```
#[derive(Clone, Debug, Eq, PartialEq, thiserror::Error)]
pub enum ExternalCpuDomainRegistryError {
    /// No external domain descriptor was supplied.
    #[error("externally managed CPU registry must contain at least one domain")]
    EmptyRegistry,
    /// More than one descriptor used the same caller-stable domain ID.
    #[error("CPU domain ID {id:?} is registered more than once")]
    DuplicateDomainId {
        /// Duplicate caller-supplied identity.
        id: CpuDomainId,
    },
    /// More than one descriptor claimed the same placement identity.
    #[error("CPU placement {placement:?} is registered more than once")]
    DuplicatePlacementIdentity {
        /// Duplicate NUMA-node or all-allowed identity.
        placement: CpuPlacement,
    },
    /// A declared CPU is outside the process-allowed CPU set.
    #[error("CPU domain {domain:?} declares process-disallowed CPU {cpu}")]
    CpuOutsideAllowedSet {
        /// Domain containing the invalid CPU declaration.
        domain: CpuDomainId,
        /// CPU absent from the process affinity set.
        cpu: CpuId,
    },
    /// The selected default domain ID was not supplied.
    #[error("default CPU domain {default_domain:?} is not registered")]
    MissingDefaultDomain {
        /// Missing caller-selected default identity.
        default_domain: CpuDomainId,
    },
    /// An exact all-allowed declaration did not equal the process-allowed set.
    #[error(
        "exact all-allowed CPU domain {domain:?} declares {declared:?}, but the process allows {allowed:?}"
    )]
    ExactAllAllowedMismatch {
        /// Domain with the inconsistent all-allowed declaration.
        domain: CpuDomainId,
        /// CPUs declared by the external descriptor.
        declared: CpuSet,
        /// CPUs allowed by the current process affinity mask.
        allowed: CpuSet,
    },
}
386
/// Errors returned while constructing a [`CpuBackend`].
///
/// Placement failures remain typed so callers can distinguish topology
/// discovery failures from unsupported placement requests. Configuration and
/// provider-selection failures retain the existing tensor error contract.
///
/// `crate::Error` and [`ExternalCpuDomainRegistryError`] convert into this type
/// through the generated `From` impls (`#[from]`), so `?` works on both.
/// Placement failures have no `From` impl because the variant also records the
/// constructor name in `op`.
///
/// # Examples
///
/// ```
/// use tenferro_cpu::{CpuBackend, CpuBackendError};
///
/// let error = CpuBackend::with_threads(0).unwrap_err();
/// assert!(matches!(error, CpuBackendError::Tensor(_)));
/// ```
#[derive(Debug, thiserror::Error)]
pub enum CpuBackendError {
    /// CPU context configuration or provider selection failed.
    #[error(transparent)]
    Tensor(#[from] crate::Error),
    /// CPU placement resolution or engine construction failed.
    #[error("{op}: {source}")]
    Placement {
        /// Constructor that observed the placement failure.
        op: &'static str,
        /// Typed placement failure.
        #[source]
        source: CpuPlacementError,
    },
    /// Externally managed domain registry validation failed.
    #[error(transparent)]
    ExternalRegistry(#[from] ExternalCpuDomainRegistryError),
}
419
420impl CpuBackendError {
421    fn placement(op: &'static str, source: CpuPlacementError) -> Self {
422        Self::Placement { op, source }
423    }
424
425    /// Return the typed placement failure, when construction reached placement resolution.
426    ///
427    /// # Examples
428    ///
429    /// ```
430    /// use tenferro_cpu::{CpuBackend, CpuBackendError};
431    ///
432    /// let result: Result<CpuBackend, CpuBackendError> = CpuBackend::with_threads(1);
433    /// if let Err(error) = result {
434    ///     let _placement_failure = error.placement_error();
435    /// }
436    /// ```
437    pub fn placement_error(&self) -> Option<&CpuPlacementError> {
438        match self {
439            Self::Tensor(_) => None,
440            Self::Placement { source, .. } => Some(source),
441            Self::ExternalRegistry(_) => None,
442        }
443    }
444}
445
446impl From<CpuBackendError> for crate::Error {
447    fn from(error: CpuBackendError) -> Self {
448        match error {
449            CpuBackendError::Tensor(error) => error,
450            CpuBackendError::ExternalRegistry(source) => Self::extension(
451                "CpuBackend::from_external_managed_domains",
452                "cpu",
453                crate::ErrorKind::Validation(crate::ValidationKind::InvalidArgument),
454                source,
455            ),
456            CpuBackendError::Placement { op, source } => match source {
457                CpuPlacementError::TopologyDiscovery { .. }
458                | CpuPlacementError::ManagedAffinityUnavailable { .. }
459                | CpuPlacementError::NumaDiscoveryUnavailable { .. }
460                | CpuPlacementError::UnknownNumaNode { .. }
461                | CpuPlacementError::UnregisteredExternalPlacement { .. } => {
462                    Self::runtime_state_source(op, source)
463                }
464                CpuPlacementError::ExternalProviderAffinityUnmanaged { .. } => {
465                    Self::extension(op, "cpu", crate::ErrorKind::Unsupported, source)
466                }
467                CpuPlacementError::EngineConstruction { .. } => Self::backend_source(op, source),
468                CpuPlacementError::InternalState { .. } => {
469                    Self::extension(op, "cpu", crate::ErrorKind::Internal, source)
470                }
471            },
472        }
473    }
474}
475
/// Snapshot of the stable CPU execution contract and non-contractual provider diagnostics.
///
/// [`CpuBackendKind`] is the stable provider identity. The diagnostic string is
/// intended for logs and may change between builds or releases.
///
/// # Examples
///
/// ```
/// use tenferro_cpu::{CpuBackend, CpuPlacement};
///
/// let info = CpuBackend::new().execution_info();
/// assert_eq!(info.requested_placement(), CpuPlacement::Auto);
/// assert!(!info.provider_diagnostic().is_empty());
/// ```
#[derive(Clone, Debug, PartialEq, Eq)]
pub struct CpuExecutionInfo {
    /// Stable public provider identity.
    backend_kind: CpuBackendKind,
    /// Stable execution-ownership mode.
    execution_mode: CpuExecutionMode,
    /// Placement requested by the backend handle.
    requested_placement: CpuPlacement,
    /// Concrete managed placement or external placement declaration, if any.
    resolved_placement: Option<ResolvedCpuPlacement>,
    /// Process-visible topology used for placement resolution.
    topology: CpuTopology,
    /// Coordinator-stable identity of the selected CPU domain.
    domain_id: CpuDomainId,
    /// Resolved or caller-declared logical CPUs of the selected domain.
    domain_cpus: CpuSet,
    /// Worker count of the selected domain executor.
    worker_count: usize,
    /// Maximum number of participating threads requested for the domain.
    thread_budget: usize,
    /// Whether the selected placement is exact or advisory.
    placement_guarantee: CpuPlacementGuarantee,
    /// Whether tenferro or the application owns the selected domain.
    domain_ownership: CpuDomainOwnership,
    /// Worker-affinity claim of the selected executor.
    executor_affinity: CpuExecutorAffinity,
    /// Who owns shutdown of the selected executor.
    executor_shutdown: CpuExecutorShutdown,
    /// Human-readable provider description; diagnostic only.
    provider_diagnostic: &'static str,
}
507
// Read-only accessors over the snapshot. Each method returns the stored field
// unchanged: by value for the `Copy` fields, by reference for the others.
impl CpuExecutionInfo {
    /// Return the stable public provider identity.
    ///
    /// # Examples
    ///
    /// ```
    /// let info = tenferro_cpu::CpuBackend::new().execution_info();
    /// assert_eq!(info.backend_kind(), tenferro_cpu::CpuBackend::new().kind());
    /// ```
    pub fn backend_kind(&self) -> CpuBackendKind {
        self.backend_kind
    }

    /// Return the stable execution-ownership mode.
    ///
    /// # Examples
    ///
    /// ```
    /// let mode = tenferro_cpu::CpuBackend::new()
    ///     .execution_info()
    ///     .execution_mode();
    /// let _ = format!("{mode:?}");
    /// ```
    pub fn execution_mode(&self) -> CpuExecutionMode {
        self.execution_mode
    }

    /// Return the placement requested by this backend handle.
    ///
    /// # Examples
    ///
    /// ```
    /// let info = tenferro_cpu::CpuBackend::new().execution_info();
    /// assert_eq!(info.requested_placement(), tenferro_cpu::CpuPlacement::Auto);
    /// ```
    pub fn requested_placement(&self) -> CpuPlacement {
        self.requested_placement
    }

    /// Return the concrete managed placement or external placement declaration.
    ///
    /// Returns `None` when the snapshot holds no resolved placement.
    ///
    /// # Examples
    ///
    /// ```
    /// let backend = tenferro_cpu::CpuBackend::new();
    /// let _managed = backend.execution_info().resolved_placement();
    /// ```
    pub fn resolved_placement(&self) -> Option<&ResolvedCpuPlacement> {
        self.resolved_placement.as_ref()
    }

    /// Return the process-visible topology used for placement resolution.
    ///
    /// # Examples
    ///
    /// ```
    /// let info = tenferro_cpu::CpuBackend::new().execution_info();
    /// assert!(!info.topology().allowed_cpus().is_empty());
    /// ```
    pub fn topology(&self) -> &CpuTopology {
        &self.topology
    }

    /// Return the coordinator-stable identity of the selected CPU domain.
    ///
    /// # Examples
    ///
    /// ```
    /// let id = tenferro_cpu::CpuBackend::new().execution_info().domain_id();
    /// let _ = id.as_u64();
    /// ```
    pub fn domain_id(&self) -> CpuDomainId {
        self.domain_id
    }

    /// Return the resolved or caller-declared logical CPUs of the selected domain.
    ///
    /// # Examples
    ///
    /// ```
    /// let info = tenferro_cpu::CpuBackend::new().execution_info();
    /// assert!(!info.domain_cpus().is_empty());
    /// ```
    pub fn domain_cpus(&self) -> &CpuSet {
        &self.domain_cpus
    }

    /// Return the worker count of the selected domain executor.
    ///
    /// # Examples
    ///
    /// ```
    /// let info = tenferro_cpu::CpuBackend::new().execution_info();
    /// assert!(info.worker_count() >= 1);
    /// ```
    pub fn worker_count(&self) -> usize {
        self.worker_count
    }

    /// Return the maximum number of participating threads requested for this domain.
    ///
    /// This can be smaller than [`Self::worker_count`] for an externally
    /// supplied executor.
    ///
    /// # Examples
    ///
    /// ```
    /// let info = tenferro_cpu::CpuBackend::new().execution_info();
    /// assert!(info.thread_budget() >= 1);
    /// assert!(info.thread_budget() <= info.worker_count());
    /// ```
    pub fn thread_budget(&self) -> usize {
        self.thread_budget
    }

    /// Return whether the selected placement is exact or advisory.
    ///
    /// # Examples
    ///
    /// ```
    /// let guarantee = tenferro_cpu::CpuBackend::new()
    ///     .execution_info()
    ///     .placement_guarantee();
    /// let _ = format!("{guarantee:?}");
    /// ```
    pub fn placement_guarantee(&self) -> CpuPlacementGuarantee {
        self.placement_guarantee
    }

    /// Return whether tenferro or the application owns the selected domain.
    ///
    /// # Examples
    ///
    /// ```
    /// let ownership = tenferro_cpu::CpuBackend::new()
    ///     .execution_info()
    ///     .domain_ownership();
    /// let _ = format!("{ownership:?}");
    /// ```
    pub fn domain_ownership(&self) -> CpuDomainOwnership {
        self.domain_ownership
    }

    /// Return the selected executor's worker-affinity claim.
    ///
    /// # Examples
    ///
    /// ```
    /// let affinity = tenferro_cpu::CpuBackend::new()
    ///     .execution_info()
    ///     .executor_affinity();
    /// let _ = format!("{affinity:?}");
    /// ```
    pub fn executor_affinity(&self) -> CpuExecutorAffinity {
        self.executor_affinity
    }

    /// Return who owns shutdown of the selected executor.
    ///
    /// # Examples
    ///
    /// ```
    /// let shutdown = tenferro_cpu::CpuBackend::new()
    ///     .execution_info()
    ///     .executor_shutdown();
    /// let _ = format!("{shutdown:?}");
    /// ```
    pub fn executor_shutdown(&self) -> CpuExecutorShutdown {
        self.executor_shutdown
    }

    /// Return a human-readable provider description for logs.
    ///
    /// This string is diagnostic only and is not a provider identity contract.
    /// Use [`Self::backend_kind`] when a stable identity is needed.
    ///
    /// # Examples
    ///
    /// ```
    /// let diagnostic = tenferro_cpu::CpuBackend::new()
    ///     .execution_info()
    ///     .provider_diagnostic();
    /// assert!(!diagnostic.is_empty());
    /// ```
    pub fn provider_diagnostic(&self) -> &'static str {
        self.provider_diagnostic
    }
}
695
/// Build the diagnostic provider description for `kind` under the given domain
/// `ownership`.
///
/// Externally managed domains get a fixed per-kind string. Otherwise faer gets
/// a fixed string and BLAS gets a string chosen by the enabled BLAS features.
fn provider_diagnostic(kind: CpuBackendKind, ownership: CpuDomainOwnership) -> &'static str {
    // Externally managed executors are described the same way regardless of
    // which BLAS features were compiled in.
    if ownership == CpuDomainOwnership::ExternalManaged {
        return match kind {
            CpuBackendKind::Faer => "faer (externally managed CPU executor)",
            CpuBackendKind::Blas => "BLAS/LAPACK (externally managed CPU executor)",
        };
    }
    match kind {
        CpuBackendKind::Faer => "faer (tenferro-managed Rayon affinity)",
        CpuBackendKind::Blas => {
            // Each enabled feature contributes one `return`; when several are
            // enabled, the first in source order wins and the rest are
            // unreachable. The final `return` applies only when none of the
            // four features is enabled.
            #[cfg(feature = "blas-openblas")]
            return "OpenBLAS (external worker affinity)";
            #[cfg(feature = "blas-mkl")]
            return "Intel MKL (external worker affinity)";
            #[cfg(feature = "blas-accelerate")]
            return "Apple Accelerate (external worker affinity)";
            #[cfg(feature = "provider-inject")]
            return "runtime-injected BLAS/LAPACK (external worker affinity)";
            #[cfg(not(any(
                feature = "blas-openblas",
                feature = "blas-mkl",
                feature = "blas-accelerate",
                feature = "provider-inject"
            )))]
            return "linked BLAS/LAPACK provider (identity unknown; external worker affinity)";
        }
    }
}
724
/// Check that the provider `kind` was compiled into this build.
///
/// Returns `Ok(())` when the matching Cargo feature (`cpu-faer` for
/// [`CpuBackendKind::Faer`], `cpu-blas` for [`CpuBackendKind::Blas`]) is enabled.
///
/// # Errors
///
/// Returns an invalid-argument error attributed to `op` when the feature for
/// `kind` is disabled.
fn ensure_cpu_backend_kind_available(kind: CpuBackendKind, op: &'static str) -> crate::Result<()> {
    // `op` is only read in the feature-disabled branches; this keeps it "used"
    // when both features are enabled.
    let _ = op;
    match kind {
        CpuBackendKind::Faer => {
            #[cfg(feature = "cpu-faer")]
            {
                Ok(())
            }
            #[cfg(not(feature = "cpu-faer"))]
            {
                Err(crate::Error::invalid_argument(
                    op,
                    "configuration",
                    "CpuBackendKind::Faer requires the cpu-faer feature".to_string(),
                ))
            }
        }
        CpuBackendKind::Blas => {
            #[cfg(feature = "cpu-blas")]
            {
                Ok(())
            }
            #[cfg(not(feature = "cpu-blas"))]
            {
                Err(crate::Error::invalid_argument(
                    op,
                    "configuration",
                    "CpuBackendKind::Blas requires the cpu-blas feature".to_string(),
                ))
            }
        }
    }
}
758
759fn constructor_tensor_error(op: &'static str, error: crate::Error) -> CpuBackendError {
760    CpuBackendError::Tensor(match error {
761        crate::Error::Validation { source, .. } => crate::Error::validation(op, source),
762        error => error,
763    })
764}
765
766// Used by feature-disabled backend paths; a given feature build may compile no
767// direct call site for one provider.
768#[allow(dead_code)]
769pub(super) fn unavailable_cpu_backend_kind(kind: CpuBackendKind, op: &'static str) -> crate::Error {
770    crate::Error::invalid_argument(
771        op,
772        "configuration",
773        format!("CPU backend kind {} is not compiled in", kind.name()),
774    )
775}
776
/// Engines for tenferro-managed placements, created on first request by
/// `CpuBackendState::managed_engine_for`.
struct ManagedEngineRegistry {
    /// Per-NUMA-node engines built so far; an entry is inserted the first time
    /// a node is requested.
    node_engines: Mutex<BTreeMap<NumaNodeId, Arc<CpuEngine>>>,
    /// Domain ID assigned to each NUMA node, consulted when a node engine is built.
    node_domain_ids: BTreeMap<NumaNodeId, CpuDomainId>,
    /// Engine for the all-allowed placement, set at most once.
    all_allowed: OnceLock<Arc<CpuEngine>>,
    /// Serializes construction of the all-allowed engine so concurrent callers
    /// do not both build it.
    all_allowed_build: Mutex<()>,
    /// Engine handed out by `CpuBackendState::managed_base_engine`.
    base_engine: Arc<CpuEngine>,
    /// Thread budget passed to every engine this registry creates.
    thread_budget: usize,
}
785
/// Engines for application-supplied CPU domains.
///
/// NOTE(review): the code that fills and queries these fields is outside this
/// excerpt; the field notes below are inferred from names and types — confirm
/// against the constructor and lookup paths.
struct ExternalEngineRegistry {
    /// Engines keyed by domain ID.
    by_id: BTreeMap<CpuDomainId, Arc<CpuEngine>>,
    /// Engines keyed by NUMA node (presumably for node-placement requests).
    by_node: BTreeMap<NumaNodeId, Arc<CpuEngine>>,
    /// Engine for the all-allowed placement, when one exists (presumed).
    all_allowed: Option<Arc<CpuEngine>>,
    /// Domain ID selected as the default (presumably a key of `by_id`).
    default_domain: CpuDomainId,
}
792
/// Engine storage of a backend: managed or externally supplied.
enum CpuEngineRegistry {
    /// tenferro-managed engines; `CpuBackendState::managed_engine_for` creates
    /// them on demand.
    ManagedLazy(ManagedEngineRegistry),
    /// Engines for externally managed domains. The managed lookup paths reject
    /// this variant with an internal-state error.
    ExternalPrebuilt(ExternalEngineRegistry),
}
797
/// State behind a CPU backend: topology, engines, and shared configuration.
struct CpuBackendState {
    /// CPU topology associated with this backend.
    topology: CpuTopology,
    /// Managed or external engine registry.
    engines: CpuEngineRegistry,
    /// Resource arbiter associated with this backend (its use is outside this
    /// excerpt).
    arbiter: ResourceArbiter,
    /// Provider kind, also recorded in placement errors raised by this state.
    kind: CpuBackendKind,
    /// Buffer limit passed to newly created managed engines (read with
    /// `Ordering::Relaxed`).
    buffer_limit: AtomicUsize,
    /// Indexed-plan cache limits applied to newly created engines. This mutex is
    /// the outermost lock during lazy engine creation.
    indexed_plan_cache_limits: Mutex<IndexedPlanCacheLimits>,
}
806
807impl CpuBackendState {
    /// Return the managed engine for `placement`, creating it on first use.
    ///
    /// `requested` is only used to label errors.
    ///
    /// # Errors
    ///
    /// * `InternalState` when the cache-configuration lock is poisoned, when
    ///   the backend holds an external registry, when a NUMA node has no
    ///   registered domain ID, or when the new engine's resources lock is
    ///   poisoned.
    /// * `EngineConstruction` when `CpuEngine::new_managed` fails.
    fn managed_engine_for(
        &self,
        placement: &ResolvedCpuPlacement,
        requested: CpuPlacement,
    ) -> Result<Arc<CpuEngine>, CpuPlacementError> {
        // INVARIANT: cache configuration is the outermost lock for lazy engine
        // creation and limit updates. The shared order is configuration,
        // registry, then engine resources.
        let cache_configuration = self.indexed_plan_cache_limits.lock().map_err(|_| {
            CpuPlacementError::InternalState {
                requested,
                backend: self.kind,
                message: "CPU indexed-plan cache configuration lock is poisoned",
            }
        })?;
        // Copy the limits out; the guard itself stays alive until the function
        // returns, so it covers the whole lookup-or-create sequence.
        let cache_limits = *cache_configuration;
        let CpuEngineRegistry::ManagedLazy(registry) = &self.engines else {
            return Err(CpuPlacementError::InternalState {
                requested,
                backend: self.kind,
                message: "managed placement requested from an external engine registry",
            });
        };
        match placement {
            ResolvedCpuPlacement::NumaNode { id, .. } => {
                // A poisoned registry lock is recovered rather than reported.
                // The map lock is held across construction, so an engine is
                // built at most once per node.
                let mut engines = registry
                    .node_engines
                    .lock()
                    .unwrap_or_else(std::sync::PoisonError::into_inner);
                if let Some(engine) = engines.get(id) {
                    return Ok(Arc::clone(engine));
                }
                let Some(domain_id) = registry.node_domain_ids.get(id).copied() else {
                    return Err(CpuPlacementError::InternalState {
                        requested,
                        backend: self.kind,
                        message: "managed NUMA node has no coordinator-stable domain ID",
                    });
                };
                let engine = Arc::new(
                    CpuEngine::new_managed(
                        domain_id,
                        placement.clone(),
                        registry.thread_budget,
                        self.buffer_limit.load(Ordering::Relaxed),
                    )
                    .map_err(|error| {
                        CpuPlacementError::EngineConstruction {
                            requested,
                            backend: self.kind,
                            source: CpuEngineConstructionError::Context(error),
                        }
                    })?,
                );
                // Apply the cache limits before the engine is stored in the map.
                self.configure_new_indexed_plan_cache(&engine, requested, cache_limits)?;
                engines.insert(*id, Arc::clone(&engine));
                Ok(engine)
            }
            ResolvedCpuPlacement::AllAllowed { .. } => {
                // First check without the build lock.
                if let Some(engine) = registry.all_allowed.get() {
                    return Ok(Arc::clone(engine));
                }
                let _build = registry
                    .all_allowed_build
                    .lock()
                    .unwrap_or_else(std::sync::PoisonError::into_inner);
                // Re-check under the build lock: another thread may have
                // finished construction while this one waited.
                if let Some(engine) = registry.all_allowed.get() {
                    return Ok(Arc::clone(engine));
                }
                let engine = Arc::new(
                    CpuEngine::new_managed(
                        // The all-allowed engine is always built with domain ID 0.
                        CpuDomainId::new(0),
                        placement.clone(),
                        registry.thread_budget,
                        self.buffer_limit.load(Ordering::Relaxed),
                    )
                    .map_err(|error| {
                        CpuPlacementError::EngineConstruction {
                            requested,
                            backend: self.kind,
                            source: CpuEngineConstructionError::Context(error),
                        }
                    })?,
                );
                self.configure_new_indexed_plan_cache(&engine, requested, cache_limits)?;
                // The result of `set` is ignored: the cell was observed empty
                // under the build lock just above.
                let _ = registry.all_allowed.set(Arc::clone(&engine));
                Ok(engine)
            }
        }
    }
898
899    fn configure_new_indexed_plan_cache(
900        &self,
901        engine: &CpuEngine,
902        requested: CpuPlacement,
903        limits: IndexedPlanCacheLimits,
904    ) -> Result<(), CpuPlacementError> {
905        let mut resources =
906            engine
907                .resources
908                .lock()
909                .map_err(|_| CpuPlacementError::InternalState {
910                    requested,
911                    backend: self.kind,
912                    message: "new CPU engine indexed-plan cache lock is poisoned",
913                })?;
914        resources.indexed_plan_cache.set_limits(limits);
915        Ok(())
916    }
917
918    fn managed_base_engine(
919        &self,
920        requested: CpuPlacement,
921    ) -> Result<Arc<CpuEngine>, CpuPlacementError> {
922        match &self.engines {
923            CpuEngineRegistry::ManagedLazy(registry) => Ok(Arc::clone(&registry.base_engine)),
924            CpuEngineRegistry::ExternalPrebuilt(_) => Err(CpuPlacementError::InternalState {
925                requested,
926                backend: self.kind,
927                message: "managed compatibility placement requested from an external registry",
928            }),
929        }
930    }
931
932    fn external_engine_for(
933        &self,
934        requested: CpuPlacement,
935    ) -> Result<Arc<CpuEngine>, CpuPlacementError> {
936        let CpuEngineRegistry::ExternalPrebuilt(registry) = &self.engines else {
937            return Err(CpuPlacementError::InternalState {
938                requested,
939                backend: self.kind,
940                message: "external placement requested from a managed engine registry",
941            });
942        };
943        let engine = match requested {
944            CpuPlacement::Auto => registry.by_id.get(&registry.default_domain),
945            CpuPlacement::NumaNode(id) => registry.by_node.get(&id),
946            CpuPlacement::AllAllowed => registry.all_allowed.as_ref(),
947        };
948        engine
949            .cloned()
950            .ok_or(CpuPlacementError::UnregisteredExternalPlacement { requested })
951    }
952
953    fn is_external(&self) -> bool {
954        matches!(&self.engines, CpuEngineRegistry::ExternalPrebuilt(_))
955    }
956
957    fn initialized_engines(&self, op: &'static str) -> crate::Result<Vec<Arc<CpuEngine>>> {
958        let mut engines = match &self.engines {
959            CpuEngineRegistry::ManagedLazy(registry) => {
960                let mut engines = vec![Arc::clone(&registry.base_engine)];
961                if let Some(engine) = registry.all_allowed.get() {
962                    engines.push(Arc::clone(engine));
963                }
964                engines.extend(
965                    registry
966                        .node_engines
967                        .lock()
968                        .map_err(|_| poisoned_cpu_lock(op, "CPU engine registry"))?
969                        .values()
970                        .cloned(),
971                );
972                engines
973            }
974            CpuEngineRegistry::ExternalPrebuilt(registry) => {
975                registry.by_id.values().cloned().collect()
976            }
977        };
978        if engines.len() > 1 {
979            engines.sort_unstable_by_key(|engine| Arc::as_ptr(engine) as usize);
980            engines.dedup_by(|left, right| Arc::ptr_eq(left, right));
981        }
982        Ok(engines)
983    }
984}
985
986fn poisoned_cpu_lock(op: &'static str, lock: &'static str) -> crate::Error {
987    crate::Error::runtime_state(op, format!("{lock} lock poisoned"))
988}
989
990fn lock_engine_resources<'a>(
991    engine: &'a CpuEngine,
992    op: &'static str,
993) -> crate::Result<std::sync::MutexGuard<'a, EngineResources>> {
994    engine
995        .resources
996        .lock()
997        .map_err(|_| poisoned_cpu_lock(op, "CPU engine resources"))
998}
999
1000fn saturating_add_tensor_cache_stats(total: &mut CacheStats, value: CacheStats) {
1001    total.entries = total.entries.saturating_add(value.entries);
1002    total.retained_bytes = total.retained_bytes.saturating_add(value.retained_bytes);
1003    total.hits = total.hits.saturating_add(value.hits);
1004    total.misses = total.misses.saturating_add(value.misses);
1005    total.evictions = total.evictions.saturating_add(value.evictions);
1006    total.clears = total.clears.saturating_add(value.clears);
1007}
1008
/// A cheap cloneable handle to shared CPU execution coordination.
///
/// Clones share topology, execution engines, arbitration, and engine-owned
/// buffer resources.
///
/// # Examples
///
/// ```
/// use tenferro_cpu::CpuBackend;
///
/// let backend = CpuBackend::new();
/// let clone = backend.clone();
/// assert_eq!(backend.kind(), clone.kind());
/// ```
#[derive(Clone)]
pub struct CpuBackend {
    /// Opaque identity token; the constructors in this file assign a fresh
    /// token to each backend they build.
    runtime_identity: CpuRuntimeIdentity,
    /// Coordinator state behind an `Arc`, so clones of this handle refer to
    /// the same state.
    shared: Arc<CpuBackendState>,
    /// Placement that was requested for this handle.
    requested: CpuPlacement,
    /// Execution that the requested placement resolved to.
    resolved: ResolvedCpuExecution,
    /// Engine selected for this handle.
    engine: Arc<CpuEngine>,
    /// Provider bundle carried by this handle.
    provider_bundle: CpuProviderBundle,
    /// Optional shared tensor allocation domain; the constructors in this
    /// file initialize it to `None`.
    allocation_domain: Option<Arc<dyn SharedTensorAllocationDomain>>,
}
1033
/// Opaque identity for one CPU backend executable witness.
///
/// The token carries no backend, execution, storage, or mutation authority.
/// Cloning a token is cheap and preserves identity; separately constructed
/// backends and backends returned after immutable witness resources change use
/// distinct tokens.
///
/// # Examples
///
/// ```
/// use tenferro_cpu::CpuBackend;
///
/// let identity = CpuBackend::new().runtime_identity();
/// assert_eq!(identity, identity.clone());
/// ```
#[derive(Clone, Debug)]
pub struct CpuRuntimeIdentity {
    /// Shared allocation whose address is the identity; it carries no data.
    marker: Arc<()>,
}

impl CpuRuntimeIdentity {
    /// Mints a token backed by its own allocation, so it is unequal to every
    /// other live token that is not one of its clones.
    fn fresh() -> Self {
        let marker = Arc::new(());
        Self { marker }
    }
}

impl PartialEq for CpuRuntimeIdentity {
    /// Two tokens are equal only when they share the same marker allocation.
    fn eq(&self, other: &Self) -> bool {
        let mine = Arc::as_ptr(&self.marker);
        let theirs = Arc::as_ptr(&other.marker);
        std::ptr::eq(mine, theirs)
    }
}

impl Eq for CpuRuntimeIdentity {}
1069
1070fn resolve_discovered_topology(
1071    kind: CpuBackendKind,
1072    topology: Result<CpuTopology, CpuTopologyError>,
1073) -> Result<CpuTopology, CpuPlacementError> {
1074    topology.map_err(|source| CpuPlacementError::TopologyDiscovery {
1075        requested: CpuPlacement::Auto,
1076        backend: kind,
1077        source,
1078    })
1079}
1080
1081fn coordinator_node_domain_ids(topology: &CpuTopology) -> BTreeMap<NumaNodeId, CpuDomainId> {
1082    topology
1083        .nodes()
1084        .iter()
1085        .enumerate()
1086        .filter_map(|(index, node)| {
1087            u64::try_from(index)
1088                .ok()
1089                .and_then(|index| index.checked_add(1))
1090                .map(|id| (node.id(), CpuDomainId::new(id)))
1091        })
1092        .collect()
1093}
1094
1095impl fmt::Debug for CpuBackend {
1096    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
1097        f.debug_struct("CpuBackend")
1098            .field("kind", &self.kind())
1099            .field("provider_bundle", &self.provider_bundle)
1100            .field("requested_placement", &self.requested)
1101            .field("resolved_execution", &self.resolved)
1102            .field("engine_placement", &self.engine.placement())
1103            .field("num_threads", &self.num_threads())
1104            .field("allocation_domain", &self.allocation_domain())
1105            .field("buffer_pool_cache_stats", &self.buffer_pool_cache_stats())
1106            .field("buffer_pool_limit_bytes", &self.buffer_pool_limit_bytes())
1107            .finish_non_exhaustive()
1108    }
1109}
1110
1111impl CpuBackend {
1112    fn from_thread_budget_and_kind(
1113        thread_budget: usize,
1114        kind: CpuBackendKind,
1115        max_retained_capacity_bytes: usize,
1116    ) -> Result<Self, CpuPlacementError> {
1117        let topology = resolve_discovered_topology(kind, discover_cpu_topology())?;
1118        let resolved = resolve_placement(kind, CpuPlacement::Auto, &topology)?;
1119        #[cfg(not(any(target_os = "linux", target_os = "android")))]
1120        {
1121            let context = CpuContext::with_threads(thread_budget).map_err(|error| {
1122                CpuPlacementError::EngineConstruction {
1123                    requested: CpuPlacement::Auto,
1124                    backend: kind,
1125                    source: CpuEngineConstructionError::Tensor(error),
1126                }
1127            })?;
1128            Ok(Self::compatibility_with_topology(
1129                Arc::new(context),
1130                max_retained_capacity_bytes,
1131                kind,
1132                topology,
1133                resolved,
1134            ))
1135        }
1136        #[cfg(any(target_os = "linux", target_os = "android"))]
1137        {
1138            let engine_placement = ResolvedCpuPlacement::AllAllowed {
1139                cpus: topology.allowed_cpus().clone(),
1140            };
1141            let engine = Arc::new(
1142                CpuEngine::new_managed(
1143                    CpuDomainId::new(0),
1144                    engine_placement,
1145                    thread_budget,
1146                    max_retained_capacity_bytes,
1147                )
1148                .map_err(|error| CpuPlacementError::EngineConstruction {
1149                    requested: CpuPlacement::Auto,
1150                    backend: kind,
1151                    source: CpuEngineConstructionError::Context(error),
1152                })?,
1153            );
1154            let all_allowed = OnceLock::new();
1155            let _ = all_allowed.set(Arc::clone(&engine));
1156            Ok(Self {
1157                shared: Arc::new(CpuBackendState {
1158                    engines: CpuEngineRegistry::ManagedLazy(ManagedEngineRegistry {
1159                        node_engines: Mutex::new(BTreeMap::new()),
1160                        node_domain_ids: coordinator_node_domain_ids(&topology),
1161                        all_allowed,
1162                        all_allowed_build: Mutex::new(()),
1163                        base_engine: Arc::clone(&engine),
1164                        thread_budget,
1165                    }),
1166                    topology,
1167                    arbiter: ResourceArbiter::global(),
1168                    kind,
1169                    buffer_limit: AtomicUsize::new(max_retained_capacity_bytes),
1170                    indexed_plan_cache_limits: Mutex::new(DEFAULT_INDEXED_PLAN_CACHE_LIMITS),
1171                }),
1172                runtime_identity: CpuRuntimeIdentity::fresh(),
1173                requested: CpuPlacement::Auto,
1174                resolved,
1175                engine,
1176                provider_bundle: CpuProviderBundle::standard(kind, kind == CpuBackendKind::Blas),
1177                allocation_domain: None,
1178            })
1179        }
1180    }
1181
1182    fn compatibility(
1183        ctx: Arc<CpuContext>,
1184        max_retained_capacity_bytes: usize,
1185        kind: CpuBackendKind,
1186    ) -> Self {
1187        let topology = discover_cpu_topology().unwrap_or_else(|_| {
1188            let allowed = crate::process_cpu_affinity().unwrap_or_else(|| {
1189                CpuSet::new((0..crate::available_parallelism()).map(CpuId::new))
1190                    .unwrap_or_else(|_| CpuSet::singleton(CpuId::new(0)))
1191            });
1192            CpuTopology::all_allowed(allowed)
1193        });
1194        let resolved = if kind == CpuBackendKind::Blas {
1195            ResolvedCpuExecution::ProviderDefaultExclusive
1196        } else {
1197            ResolvedCpuExecution::Compatibility
1198        };
1199        Self::compatibility_with_topology(
1200            ctx,
1201            max_retained_capacity_bytes,
1202            kind,
1203            topology,
1204            resolved,
1205        )
1206    }
1207
1208    fn compatibility_with_topology(
1209        ctx: Arc<CpuContext>,
1210        max_retained_capacity_bytes: usize,
1211        kind: CpuBackendKind,
1212        topology: CpuTopology,
1213        resolved: ResolvedCpuExecution,
1214    ) -> Self {
1215        let placement = ResolvedCpuPlacement::AllAllowed {
1216            cpus: topology.allowed_cpus().clone(),
1217        };
1218        let base_engine = Arc::new(CpuEngine::from_context(
1219            CpuDomainId::new(0),
1220            placement,
1221            ctx,
1222            max_retained_capacity_bytes,
1223        ));
1224        Self {
1225            shared: Arc::new(CpuBackendState {
1226                engines: CpuEngineRegistry::ManagedLazy(ManagedEngineRegistry {
1227                    node_engines: Mutex::new(BTreeMap::new()),
1228                    node_domain_ids: coordinator_node_domain_ids(&topology),
1229                    all_allowed: OnceLock::new(),
1230                    all_allowed_build: Mutex::new(()),
1231                    base_engine: Arc::clone(&base_engine),
1232                    thread_budget: base_engine.domain().thread_budget().get(),
1233                }),
1234                topology,
1235                arbiter: ResourceArbiter::global(),
1236                kind,
1237                buffer_limit: AtomicUsize::new(max_retained_capacity_bytes),
1238                indexed_plan_cache_limits: Mutex::new(DEFAULT_INDEXED_PLAN_CACHE_LIMITS),
1239            }),
1240            runtime_identity: CpuRuntimeIdentity::fresh(),
1241            requested: CpuPlacement::Auto,
1242            resolved,
1243            engine: base_engine,
1244            provider_bundle: CpuProviderBundle::standard(kind, kind == CpuBackendKind::Blas),
1245            allocation_domain: None,
1246        }
1247    }
1248
1249    /// Create a CPU backend using the environment-driven CPU context.
1250    ///
1251    /// # Examples
1252    ///
1253    /// ```
1254    /// use tenferro_cpu::CpuBackend;
1255    ///
1256    /// let backend = CpuBackend::new();
1257    /// ```
1258    pub fn new() -> Self {
1259        let context = Arc::new(CpuContext::from_env());
1260        Self::from_thread_budget_and_kind(
1261            context.num_threads(),
1262            CpuBackendKind::default_compiled(),
1263            crate::buffer_pool::DEFAULT_MAX_RETAINED_CAPACITY_BYTES,
1264        )
1265        .unwrap_or_else(|error| {
1266            eprintln!(
1267                "tenferro_cpu: using the unpinned compatibility context after placement error: {error}"
1268            );
1269            Self::from_context(context)
1270        })
1271    }
1272
1273    /// Create one coordinator from caller-owned CPU domain executors.
1274    ///
1275    /// The descriptors are moved into prebuilt engines. `Auto` selects
1276    /// `default_domain`; explicit placement requests are registry-only and
1277    /// never construct a managed context or thread pool.
1278    ///
1279    /// # Examples
1280    ///
1281    /// ```
1282    /// use std::num::NonZeroUsize;
1283    /// use std::sync::Arc;
1284    /// use tenferro_cpu::{
1285    ///     discover_cpu_topology, CpuBackend, CpuBackendError, CpuContext,
1286    ///     CpuExecutionMode, CpuPlacementGuarantee, CpuProviderBundleInstallError,
1287    ///     ExternalCpuDomain, ResolvedCpuPlacement,
1288    /// };
1289    /// use tenferro_tensor::CpuDomainId;
1290    ///
1291    /// let topology = discover_cpu_topology()?;
1292    /// let id = CpuDomainId::new(7);
1293    /// let domain = ExternalCpuDomain::new(
1294    ///     id,
1295    ///     ResolvedCpuPlacement::AllAllowed {
1296    ///         cpus: topology.allowed_cpus().clone(),
1297    ///     },
1298    ///     Arc::new(CpuContext::with_threads(1)?),
1299    ///     NonZeroUsize::new(1).unwrap(),
1300    ///     CpuPlacementGuarantee::AdvisoryDeclared,
1301    /// )?;
1302    /// match CpuBackend::from_external_managed_domains(id, [domain]) {
1303    ///     Ok(backend) => assert_eq!(
1304    ///         backend.execution_info().execution_mode(),
1305    ///         CpuExecutionMode::ExternalManaged,
1306    ///     ),
1307    ///     Err(CpuBackendError::Tensor(error)) => assert!(
1308    ///         std::error::Error::source(&error)
1309    ///             .and_then(|source| source.downcast_ref::<CpuProviderBundleInstallError>())
1310    ///             .is_some(),
1311    ///         "an uncontrolled compiled provider must retain its typed source",
1312    ///     ),
1313    ///     Err(error) => return Err(error.into()),
1314    /// }
1315    /// # Ok::<(), Box<dyn std::error::Error>>(())
1316    /// ```
1317    ///
1318    /// # Errors
1319    ///
1320    /// Returns [`CpuBackendError::Placement`] when process topology discovery
1321    /// fails. Returns [`CpuBackendError::ExternalRegistry`] for an empty
1322    /// registry, duplicate domain or placement identity, a CPU outside the
1323    /// process-allowed set, a missing default domain, or an exact
1324    /// [`ResolvedCpuPlacement::AllAllowed`] declaration that differs from the
1325    /// process-allowed CPU set. Returns [`CpuBackendError::Tensor`] with a
1326    /// [`CpuProviderBundleInstallError`] source when the compiled standard
1327    /// provider cannot satisfy an external domain contract. Applications that
1328    /// supply controlled providers can use
1329    /// [`CpuBackend::from_external_managed_domains_with_provider_bundle`].
1330    pub fn from_external_managed_domains(
1331        default_domain: CpuDomainId,
1332        domains: impl IntoIterator<Item = ExternalCpuDomain>,
1333    ) -> Result<Self, CpuBackendError> {
1334        let op = "CpuBackend::from_external_managed_domains";
1335        let kind = CpuBackendKind::default_compiled();
1336        let topology = resolve_discovered_topology(kind, discover_cpu_topology())
1337            .map_err(|source| CpuBackendError::placement(op, source))?;
1338        Self::from_external_managed_domains_with_topology_arbiter_and_provider_bundle(
1339            default_domain,
1340            domains,
1341            topology,
1342            ResourceArbiter::global(),
1343            CpuProviderBundle::standard(kind, false),
1344        )
1345    }
1346
1347    /// Create one coordinator from caller-owned CPU domain executors and an
1348    /// immutable provider bundle.
1349    ///
1350    /// Domain registry construction and provider compatibility validation are
1351    /// atomic: no backend is returned unless `provider_bundle` satisfies every
1352    /// supplied domain. The bundle currently selects `dot_general` operation-
1353    /// family providers; linalg operation-family selection still follows the
1354    /// compiled [`CpuBackendKind`] and is not replaced by this API.
1355    ///
1356    /// # Examples
1357    ///
1358    /// ```
1359    /// use std::num::NonZeroUsize;
1360    /// use std::sync::Arc;
1361    /// use tenferro_cpu::{
1362    ///     discover_cpu_topology, CpuBackend, CpuBackendKind, CpuContext,
1363    ///     CpuExecutionMode, CpuPlacementGuarantee, CpuProviderBundle,
1364    ///     ExternalCpuDomain, ResolvedCpuPlacement,
1365    /// };
1366    /// use tenferro_tensor::CpuDomainId;
1367    ///
1368    /// let topology = discover_cpu_topology()?;
1369    /// let id = CpuDomainId::new(7);
1370    /// let domain = ExternalCpuDomain::new(
1371    ///     id,
1372    ///     ResolvedCpuPlacement::AllAllowed {
1373    ///         cpus: topology.allowed_cpus().clone(),
1374    ///     },
1375    ///     Arc::new(CpuContext::with_threads(1)?),
1376    ///     NonZeroUsize::new(1).unwrap(),
1377    ///     CpuPlacementGuarantee::AdvisoryDeclared,
1378    /// )?;
1379    /// let bundle = CpuProviderBundle::builder(CpuBackendKind::Faer).build()?;
1380    /// let backend = CpuBackend::from_external_managed_domains_with_provider_bundle(
1381    ///     id,
1382    ///     [domain],
1383    ///     bundle.clone(),
1384    /// )?;
1385    /// assert_eq!(
1386    ///     backend.execution_info().execution_mode(),
1387    ///     CpuExecutionMode::ExternalManaged,
1388    /// );
1389    /// assert!(backend.provider_bundle().shares_identity_with(&bundle));
1390    /// # Ok::<(), Box<dyn std::error::Error>>(())
1391    /// ```
1392    ///
1393    /// # Errors
1394    ///
1395    /// Returns the same topology and registry errors as
1396    /// [`CpuBackend::from_external_managed_domains`]. Provider incompatibility
1397    /// is returned as [`CpuBackendError::Tensor`]. Calling
1398    /// [`std::error::Error::source`] on that value yields the typed
1399    /// [`CpuProviderBundleInstallError`], whose own source is the rejected
1400    /// [`crate::CpuProviderDomainError`].
1401    pub fn from_external_managed_domains_with_provider_bundle(
1402        default_domain: CpuDomainId,
1403        domains: impl IntoIterator<Item = ExternalCpuDomain>,
1404        provider_bundle: CpuProviderBundle,
1405    ) -> Result<Self, CpuBackendError> {
1406        let op = "CpuBackend::from_external_managed_domains_with_provider_bundle";
1407        let kind = CpuBackendKind::default_compiled();
1408        let topology = resolve_discovered_topology(kind, discover_cpu_topology())
1409            .map_err(|source| CpuBackendError::placement(op, source))?;
1410        Self::from_external_managed_domains_with_topology_arbiter_and_provider_bundle(
1411            default_domain,
1412            domains,
1413            topology,
1414            ResourceArbiter::global(),
1415            provider_bundle,
1416        )
1417    }
1418
    /// Validates caller-supplied domains against `topology` and assembles an
    /// external-registry backend that uses `arbiter` and `provider_bundle`.
    ///
    /// Validation runs before any engine is built and stops at the first
    /// failure. Engines are then created with the default buffer-pool
    /// retention cap, the engine for `default_domain` becomes the handle's
    /// engine, and the provider bundle is validated against the domains last.
    ///
    /// # Errors
    ///
    /// Returns an error converted from [`ExternalCpuDomainRegistryError`] for
    /// an empty registry, a duplicate domain ID, a duplicate NUMA-node or
    /// all-allowed placement, an `ExactDeclared` all-allowed CPU set that
    /// differs from the topology's allowed CPUs, a CPU outside the allowed
    /// set, or a missing default domain. Returns [`CpuBackendError::Tensor`]
    /// when provider bundle validation fails.
    fn from_external_managed_domains_with_topology_arbiter_and_provider_bundle(
        default_domain: CpuDomainId,
        domains: impl IntoIterator<Item = ExternalCpuDomain>,
        topology: CpuTopology,
        arbiter: ResourceArbiter,
        provider_bundle: CpuProviderBundle,
    ) -> Result<Self, CpuBackendError> {
        // Materialize the iterator: the domains are inspected once for
        // validation and then consumed to build engines.
        let domains: Vec<_> = domains.into_iter().collect();
        if domains.is_empty() {
            return Err(ExternalCpuDomainRegistryError::EmptyRegistry.into());
        }

        // Validation pass: nothing is constructed until every domain passes.
        let mut domain_ids = BTreeSet::new();
        let mut node_ids = BTreeSet::new();
        let mut has_all_allowed = false;
        for domain in &domains {
            // `insert` returns false when the ID was already present.
            if !domain_ids.insert(domain.id()) {
                return Err(
                    ExternalCpuDomainRegistryError::DuplicateDomainId { id: domain.id() }.into(),
                );
            }
            match domain.placement() {
                // At most one domain may claim a given NUMA node.
                ResolvedCpuPlacement::NumaNode { id, .. } => {
                    if !node_ids.insert(*id) {
                        return Err(ExternalCpuDomainRegistryError::DuplicatePlacementIdentity {
                            placement: CpuPlacement::NumaNode(*id),
                        }
                        .into());
                    }
                }
                // At most one domain may claim the all-allowed placement.
                ResolvedCpuPlacement::AllAllowed { cpus } => {
                    if has_all_allowed {
                        return Err(ExternalCpuDomainRegistryError::DuplicatePlacementIdentity {
                            placement: CpuPlacement::AllAllowed,
                        }
                        .into());
                    }
                    has_all_allowed = true;
                    // Only an exact declaration has to match the topology's
                    // allowed CPU set.
                    if domain.placement_guarantee() == CpuPlacementGuarantee::ExactDeclared
                        && cpus != topology.allowed_cpus()
                    {
                        return Err(ExternalCpuDomainRegistryError::ExactAllAllowedMismatch {
                            domain: domain.id(),
                            declared: cpus.clone(),
                            allowed: topology.allowed_cpus().clone(),
                        }
                        .into());
                    }
                }
            }
            // Report the first CPU of this domain that lies outside the
            // topology's allowed set.
            if let Some(cpu) = domain
                .cpus()
                .as_slice()
                .iter()
                .copied()
                .find(|cpu| !topology.allowed_cpus().contains(*cpu))
            {
                return Err(ExternalCpuDomainRegistryError::CpuOutsideAllowedSet {
                    domain: domain.id(),
                    cpu,
                }
                .into());
            }
        }
        if !domain_ids.contains(&default_domain) {
            return Err(
                ExternalCpuDomainRegistryError::MissingDefaultDomain { default_domain }.into(),
            );
        }

        // Construction pass: each domain is moved into one engine, indexed by
        // domain ID and additionally by its placement.
        let buffer_limit = crate::buffer_pool::DEFAULT_MAX_RETAINED_CAPACITY_BYTES;
        let mut by_id = BTreeMap::new();
        let mut by_node = BTreeMap::new();
        let mut all_allowed = None;
        for domain in domains {
            let id = domain.id();
            // Cloned up front because `domain` is moved into the engine next.
            let placement = domain.placement().clone();
            let engine = Arc::new(CpuEngine::from_external(domain, buffer_limit));
            match placement {
                ResolvedCpuPlacement::NumaNode { id, .. } => {
                    by_node.insert(id, Arc::clone(&engine));
                }
                ResolvedCpuPlacement::AllAllowed { .. } => {
                    all_allowed = Some(Arc::clone(&engine));
                }
            }
            by_id.insert(id, engine);
        }
        // The default domain was checked above; this lookup keeps the failure
        // an error value rather than a panic.
        let Some(engine) = by_id.get(&default_domain).cloned() else {
            return Err(
                ExternalCpuDomainRegistryError::MissingDefaultDomain { default_domain }.into(),
            );
        };
        let resolved = ResolvedCpuExecution::ExternalManaged(engine.placement().clone());
        let kind = CpuBackendKind::default_compiled();
        let backend = Self {
            runtime_identity: CpuRuntimeIdentity::fresh(),
            shared: Arc::new(CpuBackendState {
                topology,
                engines: CpuEngineRegistry::ExternalPrebuilt(ExternalEngineRegistry {
                    by_id,
                    by_node,
                    all_allowed,
                    default_domain,
                }),
                arbiter,
                kind,
                buffer_limit: AtomicUsize::new(buffer_limit),
                indexed_plan_cache_limits: Mutex::new(DEFAULT_INDEXED_PLAN_CACHE_LIMITS),
            }),
            requested: CpuPlacement::Auto,
            resolved,
            engine,
            provider_bundle,
            allocation_domain: None,
        };
        // The bundle is validated on the assembled backend; on failure the
        // backend is dropped and only the error is returned.
        backend
            .validate_provider_bundle_for_domains(&backend.provider_bundle)
            .map_err(|source| {
                CpuBackendError::Tensor(crate::Error::backend_source(
                    "CpuBackend ExternalManaged provider validation",
                    source,
                ))
            })?;
        Ok(backend)
    }
1545
1546    /// Create a CPU backend using the selected compiled provider.
1547    ///
1548    /// # Examples
1549    ///
1550    /// ```
1551    /// use tenferro_cpu::{CpuBackend, CpuBackendKind};
1552    ///
1553    /// let backend = CpuBackend::with_kind(CpuBackendKind::default_compiled()).unwrap();
1554    /// assert_eq!(backend.kind(), CpuBackendKind::default_compiled());
1555    /// ```
1556    ///
1557    /// # Errors
1558    ///
1559    /// Returns [`CpuBackendError::Tensor`] when the provider is unavailable or
1560    /// its configuration is invalid, and [`CpuBackendError::Placement`] when
1561    /// CPU topology discovery or placement initialization fails.
1562    pub fn with_kind(kind: CpuBackendKind) -> Result<Self, CpuBackendError> {
1563        let op = "CpuBackend::with_kind";
1564        ensure_cpu_backend_kind_available(kind, op)
1565            .map_err(|error| constructor_tensor_error(op, error))?;
1566        let context = CpuContext::from_env();
1567        Self::from_thread_budget_and_kind(
1568            context.num_threads(),
1569            kind,
1570            crate::buffer_pool::DEFAULT_MAX_RETAINED_CAPACITY_BYTES,
1571        )
1572        .map_err(|error| CpuBackendError::placement(op, error))
1573    }
1574
1575    /// Try to create a CPU backend using `RAYON_NUM_THREADS`.
1576    ///
1577    /// # Examples
1578    ///
1579    /// ```
1580    /// use tenferro_cpu::CpuBackend;
1581    ///
1582    /// let backend = CpuBackend::try_new()
1583    ///     .unwrap_or_else(|_| CpuBackend::with_threads(1).unwrap());
1584    /// let _ = backend.num_threads();
1585    /// ```
1586    ///
1587    /// # Errors
1588    ///
1589    /// Returns [`CpuBackendError::Tensor`] when `RAYON_NUM_THREADS` is zero,
1590    /// malformed, or the compiled provider cannot be selected, and
1591    /// [`CpuBackendError::Placement`] when CPU topology or managed placement
1592    /// initialization is unavailable.
1593    pub fn try_new() -> Result<Self, CpuBackendError> {
1594        let op = "CpuBackend::try_new";
1595        let context =
1596            CpuContext::try_from_env().map_err(|error| constructor_tensor_error(op, error))?;
1597        Self::from_thread_budget_and_kind(
1598            context.num_threads(),
1599            CpuBackendKind::default_compiled(),
1600            crate::buffer_pool::DEFAULT_MAX_RETAINED_CAPACITY_BYTES,
1601        )
1602        .map_err(|error| CpuBackendError::placement(op, error))
1603    }
1604
1605    /// Create a CPU backend from an existing context.
1606    ///
1607    /// # Examples
1608    ///
1609    /// ```
1610    /// use std::sync::Arc;
1611    /// use tenferro_cpu::{CpuBackend, CpuContext};
1612    ///
1613    /// let ctx = Arc::new(CpuContext::with_threads(2).unwrap());
1614    /// let backend = CpuBackend::from_context(ctx);
1615    /// assert_eq!(backend.num_threads(), 2);
1616    /// ```
1617    pub fn from_context(ctx: Arc<CpuContext>) -> Self {
1618        Self::compatibility(
1619            ctx,
1620            crate::buffer_pool::DEFAULT_MAX_RETAINED_CAPACITY_BYTES,
1621            CpuBackendKind::default_compiled(),
1622        )
1623    }
1624
1625    /// Create a CPU backend from an existing context and buffer-pool retention cap.
1626    ///
1627    /// The cap is measured in retained vector capacity bytes. A cap of zero
1628    /// disables buffer retention.
1629    ///
1630    /// # Examples
1631    ///
1632    /// ```
1633    /// use std::sync::Arc;
1634    /// use tenferro_cpu::{CpuBackend, CpuContext};
1635    ///
1636    /// let ctx = Arc::new(CpuContext::with_threads(1).unwrap());
1637    /// let backend = CpuBackend::from_context_with_buffer_pool_limit(ctx, 0);
1638    /// assert_eq!(backend.buffer_pool_limit_bytes(), 0);
1639    /// ```
1640    pub fn from_context_with_buffer_pool_limit(
1641        ctx: Arc<CpuContext>,
1642        max_retained_capacity_bytes: usize,
1643    ) -> Self {
1644        Self::from_context_with_buffer_pool_limit_and_kind(
1645            ctx,
1646            max_retained_capacity_bytes,
1647            CpuBackendKind::default_compiled(),
1648        )
1649    }
1650
    // Internal variant of the context constructors that also takes the
    // provider kind. All three arguments are forwarded unchanged to
    // `Self::compatibility`.
    fn from_context_with_buffer_pool_limit_and_kind(
        ctx: Arc<CpuContext>,
        max_retained_capacity_bytes: usize,
        kind: CpuBackendKind,
    ) -> Self {
        Self::compatibility(ctx, max_retained_capacity_bytes, kind)
    }
1658
1659    /// Create a CPU backend with a custom thread count.
1660    ///
1661    /// # Examples
1662    ///
1663    /// ```
1664    /// use tenferro_cpu::CpuBackend;
1665    ///
1666    /// let backend = CpuBackend::with_threads(2).unwrap();
1667    /// assert_eq!(backend.num_threads(), 2);
1668    /// ```
1669    ///
1670    /// # Errors
1671    ///
1672    /// Returns [`CpuBackendError::Tensor`] with `ValidationError::InvalidArgument`
1673    /// when `num_threads` is zero or the context cannot be configured, and
1674    /// [`CpuBackendError::Placement`] when CPU topology or placement fails.
1675    pub fn with_threads(num_threads: usize) -> Result<Self, CpuBackendError> {
1676        let op = "CpuBackend::with_threads";
1677        let context = CpuContext::with_threads(num_threads)
1678            .map_err(|error| constructor_tensor_error(op, error))?;
1679        Self::from_thread_budget_and_kind(
1680            context.num_threads(),
1681            CpuBackendKind::default_compiled(),
1682            crate::buffer_pool::DEFAULT_MAX_RETAINED_CAPACITY_BYTES,
1683        )
1684        .map_err(|error| CpuBackendError::placement(op, error))
1685    }
1686
1687    /// Create a CPU backend with a custom thread count and provider.
1688    ///
1689    /// # Examples
1690    ///
1691    /// ```
1692    /// use tenferro_cpu::{CpuBackend, CpuBackendKind};
1693    ///
1694    /// let backend = CpuBackend::with_threads_and_kind(
1695    ///     1,
1696    ///     CpuBackendKind::default_compiled(),
1697    /// )?;
1698    /// assert_eq!(backend.num_threads(), 1);
1699    /// # Ok::<(), tenferro_tensor::Error>(())
1700    /// ```
1701    ///
1702    /// # Errors
1703    ///
1704    /// Returns [`CpuBackendError::Tensor`] with `ValidationError::InvalidArgument`
1705    /// when `num_threads` is zero or the provider is unavailable, and
1706    /// [`CpuBackendError::Placement`] when CPU topology or placement fails.
1707    pub fn with_threads_and_kind(
1708        num_threads: usize,
1709        kind: CpuBackendKind,
1710    ) -> Result<Self, CpuBackendError> {
1711        let op = "CpuBackend::with_threads_and_kind";
1712        ensure_cpu_backend_kind_available(kind, op)
1713            .map_err(|error| constructor_tensor_error(op, error))?;
1714        let context = CpuContext::with_threads(num_threads)
1715            .map_err(|error| constructor_tensor_error(op, error))?;
1716        Self::from_thread_budget_and_kind(
1717            context.num_threads(),
1718            kind,
1719            crate::buffer_pool::DEFAULT_MAX_RETAINED_CAPACITY_BYTES,
1720        )
1721        .map_err(|error| CpuBackendError::placement(op, error))
1722    }
1723
1724    /// Clone this backend coordinator with a specific CPU placement request.
1725    ///
1726    /// Managed explicit placement is supported for faer/native execution.
1727    /// Externally managed coordinators resolve explicit requests only to
1728    /// matching registered domains and never construct a fallback engine.
1729    ///
1730    /// # Examples
1731    ///
1732    /// ```
1733    /// use tenferro_cpu::{CpuBackend, CpuPlacement};
1734    ///
1735    /// let backend = CpuBackend::new();
1736    /// if backend.supports_placement(CpuPlacement::AllAllowed) {
1737    ///     let placed = backend.for_placement(CpuPlacement::AllAllowed)?;
1738    ///     assert_eq!(placed.placement(), CpuPlacement::AllAllowed);
1739    /// }
1740    /// # Ok::<(), tenferro_cpu::CpuPlacementError>(())
1741    /// ```
1742    ///
1743    /// # Errors
1744    ///
1745    /// Returns [`CpuPlacementError`] when the requested placement is not
1746    /// available for this backend or its affinity cannot be configured.
1747    pub fn for_placement(&self, requested: CpuPlacement) -> Result<Self, CpuPlacementError> {
1748        self.for_placement_with_affinity(
1749            requested,
1750            cfg!(any(target_os = "linux", target_os = "android")),
1751        )
1752    }
1753
1754    fn for_placement_with_affinity(
1755        &self,
1756        requested: CpuPlacement,
1757        managed_affinity_available: bool,
1758    ) -> Result<Self, CpuPlacementError> {
1759        if self.shared.is_external() {
1760            let engine = self.shared.external_engine_for(requested)?;
1761            return Ok(Self {
1762                runtime_identity: CpuRuntimeIdentity::fresh(),
1763                shared: Arc::clone(&self.shared),
1764                requested,
1765                resolved: ResolvedCpuExecution::ExternalManaged(engine.placement().clone()),
1766                engine,
1767                provider_bundle: self.provider_bundle.clone(),
1768                allocation_domain: self.allocation_domain.clone(),
1769            });
1770        }
1771        let resolved = resolve_placement_with_affinity(
1772            self.kind(),
1773            requested,
1774            &self.shared.topology,
1775            managed_affinity_available,
1776        )?;
1777        if requested == CpuPlacement::Auto && !managed_affinity_available {
1778            return Ok(Self {
1779                runtime_identity: CpuRuntimeIdentity::fresh(),
1780                shared: Arc::clone(&self.shared),
1781                requested,
1782                resolved,
1783                engine: self.shared.managed_base_engine(requested)?,
1784                provider_bundle: self.provider_bundle.clone(),
1785                allocation_domain: self.allocation_domain.clone(),
1786            });
1787        }
1788        let engine_placement = match &resolved {
1789            ResolvedCpuExecution::Managed(placement) => placement.clone(),
1790            ResolvedCpuExecution::ExternalManaged(_) => {
1791                return Err(CpuPlacementError::InternalState {
1792                    requested,
1793                    backend: self.kind(),
1794                    message: "managed resolver returned an external execution mode",
1795                });
1796            }
1797            ResolvedCpuExecution::ProviderDefaultExclusive => ResolvedCpuPlacement::AllAllowed {
1798                cpus: self.shared.topology.allowed_cpus().clone(),
1799            },
1800            ResolvedCpuExecution::Compatibility => {
1801                return Err(CpuPlacementError::InternalState {
1802                    requested,
1803                    backend: self.kind(),
1804                    message: "placement resolution returned an internal compatibility mode",
1805                });
1806            }
1807        };
1808        let engine = self
1809            .shared
1810            .managed_engine_for(&engine_placement, requested)?;
1811        Ok(Self {
1812            runtime_identity: CpuRuntimeIdentity::fresh(),
1813            shared: Arc::clone(&self.shared),
1814            requested,
1815            resolved,
1816            engine,
1817            provider_bundle: self.provider_bundle.clone(),
1818            allocation_domain: self.allocation_domain.clone(),
1819        })
1820    }
1821
1822    /// Return the placement requested by this handle.
1823    ///
1824    /// # Examples
1825    ///
1826    /// ```
1827    /// use tenferro_cpu::{CpuBackend, CpuPlacement};
1828    ///
1829    /// assert_eq!(CpuBackend::new().placement(), CpuPlacement::Auto);
1830    /// ```
    pub fn placement(&self) -> CpuPlacement {
        // This is the request as stored on the handle, not the resolved
        // placement; see `resolved_placement` for the latter.
        self.requested
    }
1834
1835    /// Return the concrete managed placement or external placement declaration.
1836    ///
1837    /// Provider-default-exclusive and compatibility contexts return `None`.
1838    ///
1839    /// # Examples
1840    ///
1841    /// ```
1842    /// use tenferro_cpu::{CpuBackend, CpuPlacement};
1843    ///
1844    /// let backend = CpuBackend::new();
1845    /// if backend.supports_placement(CpuPlacement::AllAllowed) {
1846    ///     assert!(backend
1847    ///         .for_placement(CpuPlacement::AllAllowed)?
1848    ///         .resolved_placement()
1849    ///         .is_some());
1850    /// }
1851    /// # Ok::<(), tenferro_cpu::CpuPlacementError>(())
1852    /// ```
1853    pub fn resolved_placement(&self) -> Option<&ResolvedCpuPlacement> {
1854        match &self.resolved {
1855            ResolvedCpuExecution::Managed(placement)
1856            | ResolvedCpuExecution::ExternalManaged(placement) => Some(placement),
1857            ResolvedCpuExecution::Compatibility
1858            | ResolvedCpuExecution::ProviderDefaultExclusive => None,
1859        }
1860    }
1861
1862    /// Return the process-visible topology shared by all coordinator clones.
1863    ///
1864    /// # Examples
1865    ///
1866    /// ```
1867    /// use tenferro_cpu::CpuBackend;
1868    ///
1869    /// assert!(!CpuBackend::new().topology().allowed_cpus().is_empty());
1870    /// ```
    pub fn topology(&self) -> &CpuTopology {
        // Borrowed from the shared coordinator state rather than from
        // per-handle fields.
        &self.shared.topology
    }
1874
1875    /// Report whether this coordinator can resolve a placement request.
1876    ///
1877    /// # Examples
1878    ///
1879    /// ```
1880    /// use tenferro_cpu::{CpuBackend, CpuPlacement};
1881    ///
1882    /// assert!(CpuBackend::new().supports_placement(CpuPlacement::Auto));
1883    /// ```
1884    pub fn supports_placement(&self, placement: CpuPlacement) -> bool {
1885        if self.shared.is_external() {
1886            self.shared.external_engine_for(placement).is_ok()
1887        } else {
1888            resolve_placement(self.kind(), placement, &self.shared.topology).is_ok()
1889        }
1890    }
1891
1892    /// Return a snapshot suitable for diagnostics and placement reporting.
1893    ///
1894    /// # Examples
1895    ///
1896    /// ```
1897    /// let backend = tenferro_cpu::CpuBackend::new();
1898    /// assert_eq!(backend.execution_info().backend_kind(), backend.kind());
1899    /// ```
1900    pub fn execution_info(&self) -> CpuExecutionInfo {
1901        let domain = self.engine.domain();
1902        let capabilities = domain.executor_capabilities();
1903        let (executor_affinity, executor_shutdown) =
1904            if domain.ownership() == CpuDomainOwnership::ExternalManaged {
1905                (
1906                    CpuExecutorAffinity::CallerDeclaredUnverified,
1907                    CpuExecutorShutdown::CallerOwned,
1908                )
1909            } else {
1910                (capabilities.affinity, capabilities.shutdown)
1911            };
1912        CpuExecutionInfo {
1913            backend_kind: self.kind(),
1914            execution_mode: match &self.resolved {
1915                ResolvedCpuExecution::Managed(_) => CpuExecutionMode::Managed,
1916                ResolvedCpuExecution::ExternalManaged(_) => CpuExecutionMode::ExternalManaged,
1917                ResolvedCpuExecution::ProviderDefaultExclusive => {
1918                    CpuExecutionMode::ProviderDefaultExclusive
1919                }
1920                ResolvedCpuExecution::Compatibility => CpuExecutionMode::Compatibility,
1921            },
1922            requested_placement: self.requested,
1923            resolved_placement: self.resolved_placement().cloned(),
1924            topology: self.shared.topology.clone(),
1925            domain_id: domain.id(),
1926            domain_cpus: domain.cpus().clone(),
1927            worker_count: capabilities.worker_count.get(),
1928            thread_budget: domain.thread_budget().get(),
1929            placement_guarantee: domain.placement_guarantee(),
1930            domain_ownership: domain.ownership(),
1931            executor_affinity,
1932            executor_shutdown,
1933            provider_diagnostic: provider_diagnostic(self.kind(), domain.ownership()),
1934        }
1935    }
1936
    #[cfg(all(
        test,
        feature = "cpu-faer",
        any(target_os = "linux", target_os = "android")
    ))]
    fn coordinator_id_for_test(&self) -> usize {
        // The address of the shared coordinator allocation serves as the id,
        // so two handles report the same value exactly when they hold the same
        // `shared` allocation.
        Arc::as_ptr(&self.shared) as usize
    }
1945
    #[cfg(test)]
    pub(crate) fn context_id_for_test(&self) -> usize {
        // The address of the domain's executor allocation serves as the id.
        // The pointer is cast through `*const ()` first so that only the
        // address is converted to an integer.
        Arc::as_ptr(self.engine.domain().executor()) as *const () as usize
    }
1950
1951    /// Return the runtime CPU provider selected by this backend.
1952    ///
1953    /// # Examples
1954    ///
1955    /// ```
1956    /// use tenferro_cpu::{CpuBackend, CpuBackendKind};
1957    ///
1958    /// let backend = CpuBackend::new();
1959    /// assert_eq!(backend.kind(), CpuBackendKind::default_compiled());
1960    /// ```
    pub fn kind(&self) -> CpuBackendKind {
        // The provider kind lives on the shared coordinator state, so every
        // handle holding the same `shared` reports the same kind.
        self.shared.kind
    }
1964
1965    /// Return the immutable CPU provider slots selected for this handle.
    pub fn provider_bundle(&self) -> &CpuProviderBundle {
        // The bundle is a per-handle field, not part of the shared state.
        &self.provider_bundle
    }
1969
1970    /// Return the opaque identity of this backend's executable witness.
1971    ///
1972    /// The identity has no access to backend execution or storage resources.
1973    /// Clones of this backend retain the identity, while separately constructed
1974    /// backends and backends returned after changing immutable witness resources
1975    /// receive a distinct identity.
    pub fn runtime_identity(&self) -> CpuRuntimeIdentity {
        // Returns a clone of the stored identity rather than a borrow.
        self.runtime_identity.clone()
    }
1979
1980    /// Return this backend with an immutable construction-time provider bundle.
1981    ///
1982    /// Existing clones retain their original bundle identity.
1983    ///
1984    /// # Examples
1985    ///
1986    /// ```
1987    /// use tenferro_cpu::{CpuBackend, CpuBackendKind, CpuProviderBundle};
1988    /// let bundle = CpuProviderBundle::builder(CpuBackendKind::Faer).build()?;
1989    /// let backend = CpuBackend::new().with_provider_bundle(bundle.clone())?;
1990    /// assert!(backend.provider_bundle().shares_identity_with(&bundle));
1991    /// # Ok::<(), Box<dyn std::error::Error>>(())
1992    /// ```
1993    ///
1994    /// # Errors
1995    ///
1996    /// Returns [`CpuProviderBundleInstallError::IncompatibleDomain`] if a
1997    /// provider cannot satisfy one of this backend's resource-domain
1998    /// contracts.
    pub fn with_provider_bundle(
        mut self,
        bundle: CpuProviderBundle,
    ) -> Result<Self, CpuProviderBundleInstallError> {
        // Validation runs first; on failure the function returns before the
        // bundle or the identity is replaced.
        self.validate_provider_bundle_for_domains(&bundle)?;
        self.provider_bundle = bundle;
        // Installing a bundle always issues a fresh runtime identity for the
        // returned handle.
        self.runtime_identity = CpuRuntimeIdentity::fresh();
        Ok(self)
    }
2008
2009    fn validate_provider_bundle_for_domains(
2010        &self,
2011        bundle: &CpuProviderBundle,
2012    ) -> Result<(), CpuProviderBundleInstallError> {
2013        let allowed = self.shared.topology.allowed_cpus();
2014        let validate_engine = |engine: &CpuEngine| {
2015            let domain = engine.domain();
2016            bundle.validate_for_domain(
2017                domain.id(),
2018                domain.thread_budget(),
2019                domain.placement_guarantee(),
2020                domain.cpus(),
2021                allowed,
2022            )
2023        };
2024
2025        match &self.shared.engines {
2026            CpuEngineRegistry::ExternalPrebuilt(registry) => {
2027                for engine in registry.by_id.values() {
2028                    validate_engine(engine)?;
2029                }
2030            }
2031            CpuEngineRegistry::ManagedLazy(registry) => {
2032                validate_engine(&registry.base_engine)?;
2033
2034                // A placed clone retains the installed bundle. Validate every
2035                // lazily constructible managed NUMA domain now rather than
2036                // allowing a later `for_placement` call to bypass the bundle
2037                // contract.
2038                #[cfg(any(target_os = "linux", target_os = "android"))]
2039                for node in self.shared.topology.nodes() {
2040                    let Some(domain_id) = registry.node_domain_ids.get(&node.id()).copied() else {
2041                        continue;
2042                    };
2043                    let budget =
2044                        std::num::NonZeroUsize::new(registry.thread_budget.min(node.cpus().len()))
2045                            .expect("usable topology nodes have non-empty CPU sets");
2046                    bundle.validate_for_domain(
2047                        domain_id,
2048                        budget,
2049                        CpuPlacementGuarantee::ExactDeclared,
2050                        node.cpus(),
2051                        allowed,
2052                    )?;
2053                }
2054            }
2055        }
2056        Ok(())
2057    }
2058
2059    /// Return the selected CPU domain's thread budget.
2060    ///
2061    /// # Examples
2062    ///
2063    /// ```
2064    /// use tenferro_cpu::CpuBackend;
2065    ///
2066    /// let backend = CpuBackend::with_threads(2).unwrap();
2067    /// assert_eq!(backend.num_threads(), 2);
2068    /// ```
2069    pub fn num_threads(&self) -> usize {
2070        self.engine.domain().thread_budget().get()
2071    }
2072
2073    /// Number of retained typed host buffers currently held by this backend.
2074    ///
2075    /// # Examples
2076    ///
2077    /// ```
2078    /// use tenferro_cpu::CpuBackend;
2079    ///
2080    /// let backend = CpuBackend::new();
2081    /// assert_eq!(backend.buffer_pool_len()?, 0);
2082    /// # Ok::<(), tenferro_tensor::Error>(())
2083    /// ```
2084    ///
2085    /// # Errors
2086    ///
2087    /// Returns [`crate::Error::RuntimeState`] when the engine registry or an
2088    /// initialized engine's resources lock is poisoned.
2089    pub fn buffer_pool_len(&self) -> crate::Result<usize> {
2090        self.shared
2091            .initialized_engines("CpuBackend::buffer_pool_len")?
2092            .iter()
2093            .try_fold(0, |total, engine| {
2094                Ok(total
2095                    + lock_engine_resources(engine, "CpuBackend::buffer_pool_len")?
2096                        .buffers
2097                        .len())
2098            })
2099    }
2100
2101    /// Snapshot reusable typed host buffers currently retained by this backend.
2102    ///
2103    /// # Examples
2104    ///
2105    /// ```
2106    /// use tenferro_cpu::CpuBackend;
2107    ///
2108    /// let backend = CpuBackend::new();
2109    /// let stats = backend.buffer_pool_stats()?;
2110    /// assert_eq!(stats.buffers, 0);
2111    /// assert_eq!(stats.capacity_bytes, 0);
2112    /// # Ok::<(), tenferro_tensor::Error>(())
2113    /// ```
2114    ///
2115    /// # Errors
2116    ///
2117    /// Returns [`crate::Error::RuntimeState`] when the engine registry or an
2118    /// initialized engine's resources lock is poisoned.
2119    pub fn buffer_pool_stats(&self) -> crate::Result<BufferPoolStats> {
2120        self.shared
2121            .initialized_engines("CpuBackend::buffer_pool_stats")?
2122            .iter()
2123            .try_fold(BufferPoolStats::default(), |mut total, engine| {
2124                let stats = lock_engine_resources(engine, "CpuBackend::buffer_pool_stats")?
2125                    .buffers
2126                    .stats();
2127                total.buffers += stats.buffers;
2128                total.capacity_bytes += stats.capacity_bytes;
2129                Ok(total)
2130            })
2131    }
2132
2133    /// Return cache-style stats for the CPU buffer pool.
2134    ///
2135    /// # Examples
2136    ///
2137    /// ```
2138    /// use tenferro_cpu::CpuBackend;
2139    ///
2140    /// let backend = CpuBackend::new();
2141    /// let stats = backend.buffer_pool_cache_stats()?;
2142    /// assert_eq!(stats.entries, 0);
2143    /// assert_eq!(stats.retained_bytes, 0);
2144    /// # Ok::<(), tenferro_tensor::Error>(())
2145    /// ```
2146    ///
2147    /// # Errors
2148    ///
2149    /// Returns [`crate::Error::RuntimeState`] when the engine registry or an
2150    /// initialized engine's resources lock is poisoned.
2151    pub fn buffer_pool_cache_stats(&self) -> crate::Result<CacheStats> {
2152        let stats = self.buffer_pool_stats()?;
2153        Ok(CacheStats {
2154            entries: stats.buffers,
2155            retained_bytes: stats.capacity_bytes,
2156            hits: 0,
2157            misses: 0,
2158            evictions: 0,
2159            clears: 0,
2160        })
2161    }
2162
2163    /// Return the limits applied to each CPU engine's indexed-plan cache.
2164    ///
2165    /// # Examples
2166    ///
2167    /// ```
2168    /// use tenferro_cpu::CpuBackend;
2169    ///
2170    /// let backend = CpuBackend::new();
2171    /// assert!(backend.indexed_plan_cache_limits()?.max_entries() > 0);
2172    /// # Ok::<(), tenferro_tensor::Error>(())
2173    /// ```
2174    ///
2175    /// # Errors
2176    ///
2177    /// Returns [`crate::Error::RuntimeState`] when the shared cache
2178    /// configuration lock is poisoned.
2179    pub fn indexed_plan_cache_limits(&self) -> crate::Result<IndexedPlanCacheLimits> {
2180        self.shared
2181            .indexed_plan_cache_limits
2182            .lock()
2183            .map(|limits| *limits)
2184            .map_err(|_| {
2185                poisoned_cpu_lock(
2186                    "CpuBackend::indexed_plan_cache_limits",
2187                    "CPU indexed-plan cache configuration",
2188                )
2189            })
2190    }
2191
2192    /// Update indexed-plan cache limits for current and future CPU engines.
2193    ///
2194    /// Shrinking either bound evicts least-recently-used plans immediately. A
2195    /// zero entry or byte bound disables retention.
2196    ///
2197    /// # Examples
2198    ///
2199    /// ```
2200    /// use tenferro_cpu::{CpuBackend, IndexedPlanCacheLimits};
2201    ///
2202    /// let mut backend = CpuBackend::new();
2203    /// backend.set_indexed_plan_cache_limits(IndexedPlanCacheLimits::new(8, 4096))?;
2204    /// assert_eq!(backend.indexed_plan_cache_limits()?.max_entries(), 8);
2205    /// # Ok::<(), tenferro_tensor::Error>(())
2206    /// ```
2207    ///
2208    /// # Errors
2209    ///
2210    /// Returns [`crate::Error::RuntimeState`] without changing the configured
2211    /// limits when an engine registry or resource lock is poisoned.
2212    pub fn set_indexed_plan_cache_limits(
2213        &mut self,
2214        limits: IndexedPlanCacheLimits,
2215    ) -> crate::Result<()> {
2216        // INVARIANT: keep the configuration guard while snapshotting the
2217        // registry and updating every initialized engine. Lazy creation takes
2218        // the same guard before any registry or resource lock.
2219        let mut configured_limits = self.shared.indexed_plan_cache_limits.lock().map_err(|_| {
2220            poisoned_cpu_lock(
2221                "CpuBackend::set_indexed_plan_cache_limits",
2222                "CPU indexed-plan cache configuration",
2223            )
2224        })?;
2225        let engines = self
2226            .shared
2227            .initialized_engines("CpuBackend::set_indexed_plan_cache_limits")?;
2228        let mut resources = engines
2229            .iter()
2230            .map(|engine| {
2231                lock_engine_resources(engine, "CpuBackend::set_indexed_plan_cache_limits")
2232            })
2233            .collect::<crate::Result<Vec<_>>>()?;
2234        *configured_limits = limits;
2235        for resource in &mut resources {
2236            resource.indexed_plan_cache.set_limits(limits);
2237        }
2238        Ok(())
2239    }
2240
2241    /// Snapshot aggregate indexed-plan cache statistics across initialized CPU engines.
2242    ///
2243    /// # Examples
2244    ///
2245    /// ```
2246    /// use tenferro_cpu::CpuBackend;
2247    ///
2248    /// let backend = CpuBackend::new();
2249    /// assert_eq!(backend.indexed_plan_cache_stats()?.entries, 0);
2250    /// # Ok::<(), tenferro_tensor::Error>(())
2251    /// ```
2252    ///
2253    /// # Errors
2254    ///
2255    /// Returns [`crate::Error::RuntimeState`] when an engine registry or
2256    /// resource lock is poisoned.
2257    pub fn indexed_plan_cache_stats(&self) -> crate::Result<CacheStats> {
2258        self.shared
2259            .initialized_engines("CpuBackend::indexed_plan_cache_stats")?
2260            .iter()
2261            .try_fold(CacheStats::default(), |mut total, engine| {
2262                let stats = lock_engine_resources(engine, "CpuBackend::indexed_plan_cache_stats")?
2263                    .indexed_plan_cache
2264                    .stats();
2265                saturating_add_tensor_cache_stats(&mut total, stats);
2266                Ok(total)
2267            })
2268    }
2269
2270    /// Clear indexed traversal plans retained by all initialized CPU engines.
2271    ///
2272    /// # Examples
2273    ///
2274    /// ```
2275    /// use tenferro_cpu::CpuBackend;
2276    ///
2277    /// let mut backend = CpuBackend::new();
2278    /// backend.clear_indexed_plan_cache()?;
2279    /// assert_eq!(backend.indexed_plan_cache_stats()?.entries, 0);
2280    /// # Ok::<(), tenferro_tensor::Error>(())
2281    /// ```
2282    ///
2283    /// # Errors
2284    ///
2285    /// Returns [`crate::Error::RuntimeState`] without clearing any engine when
2286    /// an engine registry or resource lock is poisoned.
2287    pub fn clear_indexed_plan_cache(&mut self) -> crate::Result<()> {
2288        let engines = self
2289            .shared
2290            .initialized_engines("CpuBackend::clear_indexed_plan_cache")?;
2291        let mut resources = engines
2292            .iter()
2293            .map(|engine| lock_engine_resources(engine, "CpuBackend::clear_indexed_plan_cache"))
2294            .collect::<crate::Result<Vec<_>>>()?;
2295        for resource in &mut resources {
2296            resource.indexed_plan_cache.clear();
2297        }
2298        Ok(())
2299    }
2300
2301    /// Current CPU buffer-pool retention limit in bytes.
2302    ///
2303    /// # Examples
2304    ///
2305    /// ```
2306    /// use std::sync::Arc;
2307    /// use tenferro_cpu::{CpuBackend, CpuContext};
2308    ///
2309    /// let backend = CpuBackend::from_context_with_buffer_pool_limit(
2310    ///     Arc::new(CpuContext::with_threads(1).unwrap()),
2311    ///     4096,
2312    /// );
2313    /// assert_eq!(backend.buffer_pool_limit_bytes(), 4096);
2314    /// ```
    pub fn buffer_pool_limit_bytes(&self) -> usize {
        // The limit is an atomic on the shared coordinator state, read here
        // with relaxed ordering and without taking any lock.
        self.shared.buffer_limit.load(Ordering::Relaxed)
    }
2318
2319    /// Update the CPU buffer-pool retention limit in bytes.
2320    ///
2321    /// Shrinking the limit evicts retained buffers immediately. A limit of zero
2322    /// disables buffer retention.
2323    ///
2324    /// # Examples
2325    ///
2326    /// ```
2327    /// use tenferro_cpu::CpuBackend;
2328    ///
2329    /// let mut backend = CpuBackend::new();
2330    /// backend.set_buffer_pool_limit_bytes(0)?;
2331    /// assert_eq!(backend.buffer_pool_limit_bytes(), 0);
2332    /// assert_eq!(backend.buffer_pool_len()?, 0);
2333    /// # Ok::<(), tenferro_tensor::Error>(())
2334    /// ```
2335    ///
2336    /// # Errors
2337    ///
2338    /// Returns [`crate::Error::RuntimeState`] without changing the configured
2339    /// limit when the engine registry or any initialized engine's resources
2340    /// lock is poisoned.
2341    pub fn set_buffer_pool_limit_bytes(
2342        &mut self,
2343        max_retained_capacity_bytes: usize,
2344    ) -> crate::Result<()> {
2345        let engines = self
2346            .shared
2347            .initialized_engines("CpuBackend::set_buffer_pool_limit_bytes")?;
2348        let mut resources = engines
2349            .iter()
2350            .map(|engine| lock_engine_resources(engine, "CpuBackend::set_buffer_pool_limit_bytes"))
2351            .collect::<crate::Result<Vec<_>>>()?;
2352        self.shared
2353            .buffer_limit
2354            .store(max_retained_capacity_bytes, Ordering::Relaxed);
2355        for resource in &mut resources {
2356            resource
2357                .buffers
2358                .set_max_retained_capacity_bytes(max_retained_capacity_bytes);
2359        }
2360        Ok(())
2361    }
2362
2363    /// Reset reusable typed host buffers currently retained by this backend.
2364    ///
2365    /// This releases pool-owned vectors to the process allocator. Operating
2366    /// system RSS may not fall immediately because allocators can retain freed
2367    /// pages for future allocations.
2368    ///
2369    /// # Examples
2370    ///
2371    /// ```
2372    /// use tenferro_cpu::CpuBackend;
2373    ///
2374    /// let mut backend = CpuBackend::new();
2375    /// backend.reset_buffer_pool()?;
2376    /// assert_eq!(backend.buffer_pool_len()?, 0);
2377    /// # Ok::<(), tenferro_tensor::Error>(())
2378    /// ```
2379    ///
2380    /// # Errors
2381    ///
2382    /// Returns [`crate::Error::RuntimeState`] without clearing any initialized
2383    /// engine when the engine registry or any engine's resources lock is
2384    /// poisoned.
2385    pub fn reset_buffer_pool(&mut self) -> crate::Result<()> {
2386        let engines = self
2387            .shared
2388            .initialized_engines("CpuBackend::reset_buffer_pool")?;
2389        let mut resources = engines
2390            .iter()
2391            .map(|engine| lock_engine_resources(engine, "CpuBackend::reset_buffer_pool"))
2392            .collect::<crate::Result<Vec<_>>>()?;
2393        for resource in &mut resources {
2394            resource.buffers.clear();
2395        }
2396        Ok(())
2397    }
2398
2399    pub(crate) fn runtime_cache_stats(
2400        &self,
2401    ) -> crate::Result<tenferro_runtime::runtime::CacheStats> {
2402        let resources = lock_engine_resources(&self.engine, "CpuBackend::runtime_cache_stats")?;
2403        let buffers = resources.buffers.cache_stats();
2404        let gemm = tenferro_tensor::RuntimeCacheControl::stats(&resources.gemm_analysis_cache);
2405        let indexed = resources.indexed_plan_cache.stats();
2406        Ok(tenferro_runtime::runtime::CacheStats {
2407            entries: buffers
2408                .entries
2409                .saturating_add(gemm.entries)
2410                .saturating_add(indexed.entries),
2411            retained_bytes: buffers
2412                .retained_bytes
2413                .saturating_add(gemm.retained_bytes)
2414                .saturating_add(indexed.retained_bytes),
2415            hits: indexed.hits,
2416            misses: indexed.misses,
2417            evictions: indexed.evictions,
2418            clears: indexed.clears,
2419        })
2420    }
2421
    pub(crate) fn clear_runtime_caches(&self) -> crate::Result<()> {
        // Only the engine held by this handle is touched. A single guard
        // covers the buffer pool, the GEMM analysis cache and the indexed-plan
        // cache while they are cleared in that order.
        let mut resources =
            lock_engine_resources(&self.engine, "CpuBackend::clear_runtime_caches")?;
        resources.buffers.clear();
        tenferro_tensor::RuntimeCacheControl::clear(&mut resources.gemm_analysis_cache);
        resources.indexed_plan_cache.clear();
        Ok(())
    }
2430
2431    /// Run a closure in this backend's CPU execution scope.
2432    ///
2433    /// # Examples
2434    ///
2435    /// ```
2436    /// use tenferro_cpu::CpuBackend;
2437    ///
2438    /// let backend = CpuBackend::with_threads(1).unwrap();
2439    /// let value = backend.install(|| 1 + 1);
2440    /// assert_eq!(value, 2);
2441    /// ```
2442    ///
2443    /// # Panics
2444    ///
2445    /// Panics when re-entered while another CPU backend execution is active on
2446    /// the current thread or managed Rayon scope. This includes direct nesting
2447    /// and backend calls from parallel child tasks; either could violate CPU or
2448    /// provider exclusivity. For an externally managed domain, it also panics
2449    /// with the executor's typed diagnostic when synchronous executor entry
2450    /// fails because this convenience method cannot return a `Result`.
2451    pub fn install<R: Send>(&self, op: impl FnOnce() -> R + Send) -> R {
2452        let owner = inherited_or_new_execution_owner();
2453        let permit = self.acquire_execution_permit(owner);
2454        let entry = CpuOperationEntry::new(self.engine.domain(), &permit);
2455        match entry.enter(ParallelMode::Sequential, |_| op()) {
2456            Ok(result) => result,
2457            Err(error) => panic!("CpuBackend::install executor failed: {error}"),
2458        }
2459    }
2460
2461    fn try_install<R: Send>(
2462        &self,
2463        op: impl FnOnce() -> crate::Result<R> + Send,
2464    ) -> crate::Result<R> {
2465        let owner = inherited_or_new_execution_owner();
2466        let permit = self.acquire_execution_permit(owner);
2467        let entry = CpuOperationEntry::new(self.engine.domain(), &permit);
2468        let mode = entry.preferred_engine_mode();
2469        entry
2470            .enter(mode, |context| context.with_native_parallelism(op))
2471            .map_err(|error| crate::Error::backend_source("CPU tensor execution", error))?
2472    }
2473
2474    fn try_install_with_context<R: Send>(
2475        &self,
2476        op: impl FnOnce(&CpuExecutionContext<'_>) -> crate::Result<R> + Send,
2477    ) -> crate::Result<R> {
2478        let owner = inherited_or_new_execution_owner();
2479        let permit = self.acquire_execution_permit(owner);
2480        let entry = CpuOperationEntry::new(self.engine.domain(), &permit);
2481        let mode = entry.preferred_engine_mode();
2482        entry
2483            .enter(mode, |context| {
2484                context.with_native_parallelism(|| op(context))
2485            })
2486            .map_err(|error| crate::Error::backend_source("CPU tensor execution", error))?
2487    }
2488
2489    fn try_install_fresh<R: FreshCpuOutput + Send>(
2490        &self,
2491        op: impl FnOnce() -> crate::Result<R> + Send,
2492    ) -> crate::Result<R> {
2493        let domain = self.engine.domain().id();
2494        let mut output = self.try_install(op)?;
2495        output.tag_fresh(domain);
2496        Ok(output)
2497    }
2498
2499    fn try_install_fresh_with_context<R: FreshCpuOutput + Send>(
2500        &self,
2501        op: impl FnOnce(&CpuExecutionContext<'_>) -> crate::Result<R> + Send,
2502    ) -> crate::Result<R> {
2503        let domain = self.engine.domain().id();
2504        let mut output = self.try_install_with_context(op)?;
2505        output.tag_fresh(domain);
2506        Ok(output)
2507    }
2508
2509    fn install_with_pool_unmarked<R: Send>(
2510        &mut self,
2511        op: impl FnOnce(&mut BufferPool) -> crate::Result<R> + Send,
2512    ) -> crate::Result<R> {
2513        let owner = inherited_or_new_execution_owner();
2514        let permit = self.acquire_execution_permit(owner);
2515        let entry = CpuOperationEntry::new(self.engine.domain(), &permit);
2516        let mode = entry.preferred_engine_mode();
2517        entry
2518            .enter(mode, |context| {
2519                context.with_native_parallelism(|| {
2520                    self.with_execution_resources(&permit, |resources| {
2521                        let mut buffers = BufferPoolLoan::new(&mut resources.buffers);
2522                        op(buffers.get_mut())
2523                    })
2524                })
2525            })
2526            .map_err(|error| crate::Error::backend_source("CPU tensor execution", error))?
2527    }
2528
2529    fn install_with_pool_context_unmarked<R: Send>(
2530        &mut self,
2531        op: impl FnOnce(&CpuExecutionContext<'_>, &mut BufferPool) -> crate::Result<R> + Send,
2532    ) -> crate::Result<R> {
2533        let owner = inherited_or_new_execution_owner();
2534        let permit = self.acquire_execution_permit(owner);
2535        let entry = CpuOperationEntry::new(self.engine.domain(), &permit);
2536        let mode = entry.preferred_engine_mode();
2537        entry
2538            .enter(mode, |context| {
2539                context.with_native_parallelism(|| {
2540                    self.with_execution_resources(&permit, |resources| {
2541                        let mut buffers = BufferPoolLoan::new(&mut resources.buffers);
2542                        op(context, buffers.get_mut())
2543                    })
2544                })
2545            })
2546            .map_err(|error| crate::Error::backend_source("CPU tensor execution", error))?
2547    }
2548
2549    fn install_with_indexed_pool_context_unmarked<R: Send>(
2550        &mut self,
2551        op: impl FnOnce(
2552                &CpuExecutionContext<'_>,
2553                &mut BufferPool,
2554                &mut IndexedPlanCache,
2555            ) -> crate::Result<R>
2556            + Send,
2557    ) -> crate::Result<R> {
2558        let owner = inherited_or_new_execution_owner();
2559        let permit = self.acquire_execution_permit(owner);
2560        let entry = CpuOperationEntry::new(self.engine.domain(), &permit);
2561        let mode = entry.preferred_engine_mode();
2562        entry
2563            .enter(mode, |context| {
2564                context.with_native_parallelism(|| {
2565                    self.with_execution_resources(&permit, |resources| {
2566                        let EngineResources {
2567                            buffers,
2568                            indexed_plan_cache,
2569                            ..
2570                        } = resources;
2571                        let mut buffers = BufferPoolLoan::new(buffers);
2572                        op(context, buffers.get_mut(), indexed_plan_cache)
2573                    })
2574                })
2575            })
2576            .map_err(|error| crate::Error::backend_source("CPU tensor execution", error))?
2577    }
2578
2579    fn install_with_pool<R: FreshCpuOutput + Send>(
2580        &mut self,
2581        op: impl FnOnce(&mut BufferPool) -> crate::Result<R> + Send,
2582    ) -> crate::Result<R> {
2583        let domain = self.engine.domain().id();
2584        let mut output = self.install_with_pool_unmarked(op)?;
2585        output.tag_fresh(domain);
2586        Ok(output)
2587    }
2588
2589    fn install_with_pool_context<R: FreshCpuOutput + Send>(
2590        &mut self,
2591        op: impl FnOnce(&CpuExecutionContext<'_>, &mut BufferPool) -> crate::Result<R> + Send,
2592    ) -> crate::Result<R> {
2593        let domain = self.engine.domain().id();
2594        let mut output = self.install_with_pool_context_unmarked(op)?;
2595        output.tag_fresh(domain);
2596        Ok(output)
2597    }
2598
2599    fn install_with_indexed_pool_context<R: FreshCpuOutput + Send>(
2600        &mut self,
2601        op: impl FnOnce(
2602                &CpuExecutionContext<'_>,
2603                &mut BufferPool,
2604                &mut IndexedPlanCache,
2605            ) -> crate::Result<R>
2606            + Send,
2607    ) -> crate::Result<R> {
2608        let domain = self.engine.domain().id();
2609        let mut output = self.install_with_indexed_pool_context_unmarked(op)?;
2610        output.tag_fresh(domain);
2611        Ok(output)
2612    }
2613
2614    /// Run an external linalg implementation with one borrowed execution
2615    /// context and this backend's buffer pool.
2616    ///
2617    /// This is exposed for operation-family crates that own their backend
2618    /// implementation while still sharing the CPU backend's allocation pool.
2619    ///
2620    /// # Examples
2621    ///
2622    /// ```
2623    /// use tenferro_cpu::CpuBackend;
2624    /// let mut backend = CpuBackend::new();
2625    /// backend.with_linalg_pool(|context, _pool| {
2626    ///     assert!(context.thread_budget().get() >= 1);
2627    ///     Ok(())
2628    /// })?;
2629    /// # Ok::<(), Box<dyn std::error::Error>>(())
2630    /// ```
2631    ///
2632    /// # Errors
2633    ///
2634    /// Returns [`crate::Error::BackendSource`] with a
2635    /// [`crate::CpuDomainExecutorError`] source when authoritative executor
2636    /// admission fails. Errors returned by the operation-family closure are
2637    /// propagated unchanged.
2638    #[doc(hidden)]
2639    pub fn with_linalg_pool<R: Send>(
2640        &mut self,
2641        op: impl FnOnce(&CpuExecutionContext<'_>, &mut BufferPool) -> crate::Result<R> + Send,
2642    ) -> crate::Result<R> {
2643        let owner = inherited_or_new_execution_owner();
2644        let permit = self.acquire_execution_permit(owner);
2645        let entry = CpuOperationEntry::new(self.engine.domain(), &permit);
2646        let mode = entry.preferred_linalg_mode(self.kind());
2647        entry
2648            .enter(mode, |context| {
2649                context.with_native_parallelism(|| {
2650                    self.with_execution_resources(&permit, |resources| {
2651                        let mut buffers = BufferPoolLoan::new(&mut resources.buffers);
2652                        op(context, buffers.get_mut())
2653                    })
2654                })
2655            })
2656            .map_err(|error| crate::Error::backend_source("CPU linalg execution", error))?
2657    }
2658
2659    fn with_execution_resources<R>(
2660        &self,
2661        permit: &ResourcePermit,
2662        op: impl FnOnce(&mut EngineResources) -> R,
2663    ) -> R {
2664        if permit.is_reentrant() {
2665            let mut resources =
2666                EngineResources::new(self.shared.buffer_limit.load(Ordering::Relaxed));
2667            return op(&mut resources);
2668        }
2669        let mut resources = self
2670            .engine
2671            .resources
2672            .lock()
2673            .unwrap_or_else(std::sync::PoisonError::into_inner);
2674        op(&mut resources)
2675    }
2676
2677    fn acquire_execution_permit(&self, owner: ResourceOwner) -> ResourcePermit {
2678        match &self.resolved {
2679            ResolvedCpuExecution::Managed(placement)
2680            | ResolvedCpuExecution::ExternalManaged(placement) => self
2681                .shared
2682                .arbiter
2683                .acquire_recovering(placement.cpus().clone(), owner),
2684            ResolvedCpuExecution::Compatibility => self
2685                .shared
2686                .arbiter
2687                .acquire_recovering(self.shared.topology.allowed_cpus().clone(), owner),
2688            ResolvedCpuExecution::ProviderDefaultExclusive => self
2689                .shared
2690                .arbiter
2691                .acquire_provider_exclusive_recovering(owner),
2692        }
2693    }
2694
2695    #[cfg(test)]
2696    fn try_acquire_execution_permit_for_test(
2697        &self,
2698    ) -> Result<Option<ResourcePermit>, crate::arbiter::ResourceArbiterError> {
2699        match &self.resolved {
2700            ResolvedCpuExecution::Managed(placement)
2701            | ResolvedCpuExecution::ExternalManaged(placement) => {
2702                self.shared.arbiter.try_acquire(placement.cpus().clone())
2703            }
2704            ResolvedCpuExecution::Compatibility => self
2705                .shared
2706                .arbiter
2707                .try_acquire(self.shared.topology.allowed_cpus().clone()),
2708            ResolvedCpuExecution::ProviderDefaultExclusive => {
2709                self.shared.arbiter.try_acquire_provider_exclusive()
2710            }
2711        }
2712    }
2713}
2714
/// Declares `gemm::GemmAnalysisCache` as the CPU backend's runtime cache
/// type, i.e. the type behind every `Self::RuntimeCache` parameter.
impl BackendRuntimeCache for CpuBackend {
    type RuntimeCache = gemm::GemmAnalysisCache;
}
2718
2719impl TensorElementwise for CpuBackend {
2720    fn elementwise_read_into(
2721        &mut self,
2722        op: ElementwiseReadOp,
2723        inputs: &[TensorRead<'_>],
2724        out: TensorWrite<'_>,
2725    ) -> crate::Result<()> {
2726        self.install_with_pool_context_unmarked(|context, buffers| {
2727            let exec_context = context.strided_exec_context();
2728            tenferro_tensor::backend::elementwise_read_into_with_context(
2729                op,
2730                inputs,
2731                out,
2732                &exec_context,
2733                |inputs, out| elementwise_read_into_fallback_with_pool(buffers, op, inputs, out),
2734            )
2735        })
2736    }
2737
2738    fn add(&mut self, lhs: &Tensor, rhs: &Tensor) -> crate::Result<Tensor> {
2739        self.install_with_pool(|buffers| elementwise::add_with_pool(buffers, lhs, rhs))
2740    }
2741
2742    fn add_read(&mut self, lhs: TensorRead<'_>, rhs: TensorRead<'_>) -> crate::Result<Tensor> {
2743        self.install_with_pool(|buffers| elementwise::add_read_with_pool(buffers, lhs, rhs))
2744    }
2745
2746    fn sub(&mut self, lhs: &Tensor, rhs: &Tensor) -> crate::Result<Tensor> {
2747        self.install_with_pool(|buffers| elementwise::sub_with_pool(buffers, lhs, rhs))
2748    }
2749
2750    fn sub_read(&mut self, lhs: TensorRead<'_>, rhs: TensorRead<'_>) -> crate::Result<Tensor> {
2751        self.install_with_pool(|buffers| elementwise::sub_read_with_pool(buffers, lhs, rhs))
2752    }
2753
2754    fn mul(&mut self, lhs: &Tensor, rhs: &Tensor) -> crate::Result<Tensor> {
2755        self.install_with_pool(|buffers| elementwise::mul_with_pool(buffers, lhs, rhs))
2756    }
2757
2758    fn mul_read(&mut self, lhs: TensorRead<'_>, rhs: TensorRead<'_>) -> crate::Result<Tensor> {
2759        self.install_with_pool(|buffers| elementwise::mul_read_with_pool(buffers, lhs, rhs))
2760    }
2761
2762    fn neg(&mut self, input: &Tensor) -> crate::Result<Tensor> {
2763        self.install_with_pool(|buffers| elementwise::neg_with_pool(buffers, input))
2764    }
2765
2766    fn neg_read(&mut self, input: TensorRead<'_>) -> crate::Result<Tensor> {
2767        self.install_with_pool(|buffers| elementwise::neg_read_with_pool(buffers, input))
2768    }
2769
2770    fn conj(&mut self, input: &Tensor) -> crate::Result<Tensor> {
2771        self.install_with_pool(|buffers| elementwise::conj_with_pool(buffers, input))
2772    }
2773
2774    fn conj_read(&mut self, input: TensorRead<'_>) -> crate::Result<Tensor> {
2775        self.install_with_pool(|buffers| elementwise::conj_read_with_pool(buffers, input))
2776    }
2777
2778    fn div(&mut self, lhs: &Tensor, rhs: &Tensor) -> crate::Result<Tensor> {
2779        self.install_with_pool(|buffers| elementwise::div_with_pool(buffers, lhs, rhs))
2780    }
2781
2782    fn div_read(&mut self, lhs: TensorRead<'_>, rhs: TensorRead<'_>) -> crate::Result<Tensor> {
2783        self.install_with_pool(|buffers| elementwise::div_read_with_pool(buffers, lhs, rhs))
2784    }
2785
2786    fn rem(&mut self, lhs: &Tensor, rhs: &Tensor) -> crate::Result<Tensor> {
2787        self.install_with_pool(|buffers| elementwise::rem_with_pool(buffers, lhs, rhs))
2788    }
2789
2790    fn rem_read(&mut self, lhs: TensorRead<'_>, rhs: TensorRead<'_>) -> crate::Result<Tensor> {
2791        self.install_with_pool(|buffers| elementwise::rem_read_with_pool(buffers, lhs, rhs))
2792    }
2793
2794    fn abs(&mut self, input: &Tensor) -> crate::Result<Tensor> {
2795        self.install_with_pool(|buffers| elementwise::abs_with_pool(buffers, input))
2796    }
2797
2798    fn abs_read(&mut self, input: TensorRead<'_>) -> crate::Result<Tensor> {
2799        self.install_with_pool(|buffers| elementwise::abs_read_with_pool(buffers, input))
2800    }
2801
2802    fn sign(&mut self, input: &Tensor) -> crate::Result<Tensor> {
2803        self.install_with_pool(|buffers| elementwise::sign_with_pool(buffers, input))
2804    }
2805
2806    fn sign_read(&mut self, input: TensorRead<'_>) -> crate::Result<Tensor> {
2807        self.install_with_pool(|buffers| elementwise::sign_read_with_pool(buffers, input))
2808    }
2809
2810    fn maximum(&mut self, lhs: &Tensor, rhs: &Tensor) -> crate::Result<Tensor> {
2811        self.install_with_pool(|buffers| elementwise::maximum_with_pool(buffers, lhs, rhs))
2812    }
2813
2814    fn maximum_read(&mut self, lhs: TensorRead<'_>, rhs: TensorRead<'_>) -> crate::Result<Tensor> {
2815        self.install_with_pool(|buffers| elementwise::maximum_read_with_pool(buffers, lhs, rhs))
2816    }
2817
2818    fn minimum(&mut self, lhs: &Tensor, rhs: &Tensor) -> crate::Result<Tensor> {
2819        self.install_with_pool(|buffers| elementwise::minimum_with_pool(buffers, lhs, rhs))
2820    }
2821
2822    fn minimum_read(&mut self, lhs: TensorRead<'_>, rhs: TensorRead<'_>) -> crate::Result<Tensor> {
2823        self.install_with_pool(|buffers| elementwise::minimum_read_with_pool(buffers, lhs, rhs))
2824    }
2825
2826    fn compare(&mut self, lhs: &Tensor, rhs: &Tensor, dir: &CompareDir) -> crate::Result<Tensor> {
2827        self.install_with_pool(|buffers| elementwise::compare_with_pool(buffers, lhs, rhs, dir))
2828    }
2829
2830    fn compare_read(
2831        &mut self,
2832        lhs: TensorRead<'_>,
2833        rhs: TensorRead<'_>,
2834        dir: &CompareDir,
2835    ) -> crate::Result<Tensor> {
2836        self.install_with_pool(|buffers| {
2837            elementwise::compare_read_with_pool(buffers, lhs, rhs, dir)
2838        })
2839    }
2840
2841    fn select(
2842        &mut self,
2843        pred: &Tensor,
2844        on_true: &Tensor,
2845        on_false: &Tensor,
2846    ) -> crate::Result<Tensor> {
2847        self.install_with_pool(|buffers| {
2848            elementwise::select_with_pool(buffers, pred, on_true, on_false)
2849        })
2850    }
2851
2852    fn select_read(
2853        &mut self,
2854        pred: TensorRead<'_>,
2855        on_true: TensorRead<'_>,
2856        on_false: TensorRead<'_>,
2857    ) -> crate::Result<Tensor> {
2858        self.install_with_pool(|buffers| {
2859            elementwise::select_read_with_pool(buffers, pred, on_true, on_false)
2860        })
2861    }
2862
2863    fn clamp(&mut self, input: &Tensor, lower: &Tensor, upper: &Tensor) -> crate::Result<Tensor> {
2864        self.install_with_pool(|buffers| elementwise::clamp_with_pool(buffers, input, lower, upper))
2865    }
2866
2867    fn clamp_read(
2868        &mut self,
2869        input: TensorRead<'_>,
2870        lower: TensorRead<'_>,
2871        upper: TensorRead<'_>,
2872    ) -> crate::Result<Tensor> {
2873        self.install_with_pool(|buffers| {
2874            elementwise::clamp_read_with_pool(buffers, input, lower, upper)
2875        })
2876    }
2877}
2878
2879impl TensorAnalytic for CpuBackend {
2880    fn exp(&mut self, input: &Tensor) -> crate::Result<Tensor> {
2881        self.install_with_pool(|buffers| analytic::exp_with_pool(buffers, input))
2882    }
2883
2884    fn exp_read(&mut self, input: TensorRead<'_>) -> crate::Result<Tensor> {
2885        self.install_with_pool(|buffers| analytic::exp_read_with_pool(buffers, input))
2886    }
2887
2888    fn log(&mut self, input: &Tensor) -> crate::Result<Tensor> {
2889        self.install_with_pool(|buffers| analytic::log_with_pool(buffers, input))
2890    }
2891
2892    fn log_read(&mut self, input: TensorRead<'_>) -> crate::Result<Tensor> {
2893        self.install_with_pool(|buffers| analytic::log_read_with_pool(buffers, input))
2894    }
2895
2896    fn sin(&mut self, input: &Tensor) -> crate::Result<Tensor> {
2897        self.install_with_pool(|buffers| analytic::sin_with_pool(buffers, input))
2898    }
2899
2900    fn sin_read(&mut self, input: TensorRead<'_>) -> crate::Result<Tensor> {
2901        self.install_with_pool(|buffers| analytic::sin_read_with_pool(buffers, input))
2902    }
2903
2904    fn cos(&mut self, input: &Tensor) -> crate::Result<Tensor> {
2905        self.install_with_pool(|buffers| analytic::cos_with_pool(buffers, input))
2906    }
2907
2908    fn cos_read(&mut self, input: TensorRead<'_>) -> crate::Result<Tensor> {
2909        self.install_with_pool(|buffers| analytic::cos_read_with_pool(buffers, input))
2910    }
2911
2912    fn tanh(&mut self, input: &Tensor) -> crate::Result<Tensor> {
2913        self.install_with_pool(|buffers| analytic::tanh_with_pool(buffers, input))
2914    }
2915
2916    fn tanh_read(&mut self, input: TensorRead<'_>) -> crate::Result<Tensor> {
2917        self.install_with_pool(|buffers| analytic::tanh_read_with_pool(buffers, input))
2918    }
2919
2920    fn sqrt(&mut self, input: &Tensor) -> crate::Result<Tensor> {
2921        self.install_with_pool(|buffers| analytic::sqrt_with_pool(buffers, input))
2922    }
2923
2924    fn sqrt_read(&mut self, input: TensorRead<'_>) -> crate::Result<Tensor> {
2925        self.install_with_pool(|buffers| analytic::sqrt_read_with_pool(buffers, input))
2926    }
2927
2928    fn rsqrt(&mut self, input: &Tensor) -> crate::Result<Tensor> {
2929        self.install_with_pool(|buffers| analytic::rsqrt_with_pool(buffers, input))
2930    }
2931
2932    fn rsqrt_read(&mut self, input: TensorRead<'_>) -> crate::Result<Tensor> {
2933        self.install_with_pool(|buffers| analytic::rsqrt_read_with_pool(buffers, input))
2934    }
2935
2936    fn pow(&mut self, lhs: &Tensor, rhs: &Tensor) -> crate::Result<Tensor> {
2937        self.install_with_pool(|buffers| analytic::pow_with_pool(buffers, lhs, rhs))
2938    }
2939
2940    fn pow_read(&mut self, lhs: TensorRead<'_>, rhs: TensorRead<'_>) -> crate::Result<Tensor> {
2941        self.install_with_pool(|buffers| analytic::pow_read_with_pool(buffers, lhs, rhs))
2942    }
2943
2944    fn expm1(&mut self, input: &Tensor) -> crate::Result<Tensor> {
2945        self.install_with_pool(|buffers| analytic::expm1_with_pool(buffers, input))
2946    }
2947
2948    fn expm1_read(&mut self, input: TensorRead<'_>) -> crate::Result<Tensor> {
2949        self.install_with_pool(|buffers| analytic::expm1_read_with_pool(buffers, input))
2950    }
2951
2952    fn log1p(&mut self, input: &Tensor) -> crate::Result<Tensor> {
2953        self.install_with_pool(|buffers| analytic::log1p_with_pool(buffers, input))
2954    }
2955
2956    fn log1p_read(&mut self, input: TensorRead<'_>) -> crate::Result<Tensor> {
2957        self.install_with_pool(|buffers| analytic::log1p_read_with_pool(buffers, input))
2958    }
2959}
2960
2961impl TensorStructural for CpuBackend {
2962    fn to_contiguous_read(&mut self, input: TensorRead<'_>) -> crate::Result<Tensor> {
2963        self.install_with_pool(|buffers| {
2964            materialize_tensor_read(buffers, "CpuBackend::to_contiguous_read", input)
2965        })
2966    }
2967
2968    fn copy_read_into(&mut self, src: TensorRead<'_>, dst: TensorWrite<'_>) -> crate::Result<()> {
2969        self.try_install(|| copy_tensor_read_into("CpuBackend::copy_read_into", src, dst))
2970    }
2971
2972    fn transpose(&mut self, input: &Tensor, perm: &[usize]) -> crate::Result<Tensor> {
2973        self.install_with_pool(|buffers| structural::transpose_with_pool(buffers, input, perm))
2974    }
2975
2976    fn transpose_read(&mut self, input: TensorRead<'_>, perm: &[usize]) -> crate::Result<Tensor> {
2977        self.install_with_pool(|buffers| structural::transpose_read_with_pool(buffers, input, perm))
2978    }
2979
2980    fn reshape(&mut self, input: &Tensor, shape: &[usize]) -> crate::Result<Tensor> {
2981        self.try_install(|| structural::reshape(input, shape))
2982    }
2983
2984    fn reshape_read(&mut self, input: TensorRead<'_>, shape: &[usize]) -> crate::Result<Tensor> {
2985        let materializes = matches!(&input, TensorRead::View(_));
2986        if materializes {
2987            self.install_with_pool(|buffers| {
2988                structural::reshape_read_with_pool(buffers, input, shape)
2989            })
2990        } else {
2991            self.install_with_pool_unmarked(|buffers| {
2992                structural::reshape_read_with_pool(buffers, input, shape)
2993            })
2994        }
2995    }
2996
2997    fn broadcast_in_dim(
2998        &mut self,
2999        input: &Tensor,
3000        shape: &[usize],
3001        dims: &[usize],
3002    ) -> crate::Result<Tensor> {
3003        self.install_with_pool(|buffers| {
3004            structural::broadcast_in_dim_with_pool(buffers, input, shape, dims)
3005        })
3006    }
3007
3008    fn broadcast_in_dim_read(
3009        &mut self,
3010        input: TensorRead<'_>,
3011        shape: &[usize],
3012        dims: &[usize],
3013    ) -> crate::Result<Tensor> {
3014        self.install_with_pool(|buffers| {
3015            structural::broadcast_in_dim_read_with_pool(buffers, input, shape, dims)
3016        })
3017    }
3018
3019    fn cast(&mut self, input: &Tensor, to: crate::DType) -> crate::Result<Tensor> {
3020        self.install_with_pool(|buffers| structural::cast_with_pool(buffers, input, to))
3021    }
3022
3023    fn extract_diagonal(
3024        &mut self,
3025        input: &Tensor,
3026        axis_a: usize,
3027        axis_b: usize,
3028    ) -> crate::Result<Tensor> {
3029        self.install_with_pool(|buffers| {
3030            structural::extract_diagonal_with_pool(buffers, input, axis_a, axis_b)
3031        })
3032    }
3033
3034    fn embed_diagonal(
3035        &mut self,
3036        input: &Tensor,
3037        axis_a: usize,
3038        axis_b: usize,
3039    ) -> crate::Result<Tensor> {
3040        self.install_with_pool(|buffers| {
3041            structural::embed_diagonal_with_pool(buffers, input, axis_a, axis_b)
3042        })
3043    }
3044
3045    fn tril(&mut self, input: &Tensor, k: i64) -> crate::Result<Tensor> {
3046        self.install_with_pool(|buffers| structural::tril_with_pool(buffers, input, k))
3047    }
3048
3049    fn triu(&mut self, input: &Tensor, k: i64) -> crate::Result<Tensor> {
3050        self.install_with_pool(|buffers| structural::triu_with_pool(buffers, input, k))
3051    }
3052}
3053
3054impl TensorReduction for CpuBackend {
3055    fn reduce_sum(&mut self, input: &Tensor, axes: &[usize]) -> crate::Result<Tensor> {
3056        self.try_install_fresh_with_context(|context| {
3057            let exec_context = context.strided_exec_context();
3058            reduction::reduce_sum(input, axes, &exec_context)
3059        })
3060    }
3061
3062    fn reduce_sum_read(&mut self, input: TensorRead<'_>, axes: &[usize]) -> crate::Result<Tensor> {
3063        self.install_with_pool_context(|context, buffers| {
3064            let exec_context = context.strided_exec_context();
3065            reduction::reduce_sum_read(buffers, input, axes, &exec_context)
3066        })
3067    }
3068
3069    fn reduce_sum_squares_read(
3070        &mut self,
3071        input: TensorRead<'_>,
3072        axes: &[usize],
3073    ) -> crate::Result<Tensor> {
3074        self.install_with_pool_context(|context, buffers| {
3075            let exec_context = context.strided_exec_context();
3076            reduction::reduce_sum_squares_read(buffers, input, axes, &exec_context)
3077        })
3078    }
3079
3080    fn reduce_prod(&mut self, input: &Tensor, axes: &[usize]) -> crate::Result<Tensor> {
3081        self.try_install_fresh_with_context(|context| {
3082            let exec_context = context.strided_exec_context();
3083            reduction::reduce_prod(input, axes, &exec_context)
3084        })
3085    }
3086
3087    fn reduce_prod_read(&mut self, input: TensorRead<'_>, axes: &[usize]) -> crate::Result<Tensor> {
3088        self.install_with_pool_context(|context, buffers| {
3089            let exec_context = context.strided_exec_context();
3090            reduction::reduce_prod_read(buffers, input, axes, &exec_context)
3091        })
3092    }
3093
3094    fn reduce_max(&mut self, input: &Tensor, axes: &[usize]) -> crate::Result<Tensor> {
3095        self.try_install_fresh(|| reduction::reduce_max(input, axes))
3096    }
3097
3098    fn reduce_max_read(&mut self, input: TensorRead<'_>, axes: &[usize]) -> crate::Result<Tensor> {
3099        self.install_with_pool(|buffers| reduction::reduce_max_read(buffers, input, axes))
3100    }
3101
3102    fn reduce_min(&mut self, input: &Tensor, axes: &[usize]) -> crate::Result<Tensor> {
3103        self.try_install_fresh(|| reduction::reduce_min(input, axes))
3104    }
3105
3106    fn reduce_min_read(&mut self, input: TensorRead<'_>, axes: &[usize]) -> crate::Result<Tensor> {
3107        self.install_with_pool(|buffers| reduction::reduce_min_read(buffers, input, axes))
3108    }
3109}
3110
3111impl TensorDot for CpuBackend {
3112    fn dot_general(
3113        &mut self,
3114        lhs: &Tensor,
3115        rhs: &Tensor,
3116        config: &DotGeneralConfig,
3117    ) -> crate::Result<Tensor> {
3118        self.run_backend_session_cached(None, move |session| session.dot_general(lhs, rhs, config))
3119    }
3120
3121    fn dot_general_read(
3122        &mut self,
3123        lhs: TensorRead<'_>,
3124        rhs: TensorRead<'_>,
3125        config: &DotGeneralConfig,
3126    ) -> crate::Result<Tensor> {
3127        self.run_backend_session_cached(None, move |session| {
3128            session.dot_general_read(lhs, rhs, config)
3129        })
3130    }
3131
3132    fn dot_general_read_into(
3133        &mut self,
3134        lhs: TensorRead<'_>,
3135        rhs: TensorRead<'_>,
3136        config: &DotGeneralConfig,
3137        out: TensorWrite<'_>,
3138    ) -> crate::Result<()> {
3139        self.run_backend_session_cached(None, move |session| {
3140            session.dot_general_read_into(lhs, rhs, config, out)
3141        })
3142    }
3143
3144    fn dot_general_read_into_accum(
3145        &mut self,
3146        lhs: TensorRead<'_>,
3147        rhs: TensorRead<'_>,
3148        config: &DotGeneralConfig,
3149        accumulation: DotGeneralAccumulation,
3150        out: TensorWrite<'_>,
3151    ) -> crate::Result<()> {
3152        self.run_backend_session_cached(None, move |session| {
3153            session.dot_general_read_into_accum(lhs, rhs, config, accumulation, out)
3154        })
3155    }
3156
3157    fn dot_general_with_conj(
3158        &mut self,
3159        lhs: &Tensor,
3160        rhs: &Tensor,
3161        config: &DotGeneralConfig,
3162        lhs_conj: bool,
3163        rhs_conj: bool,
3164    ) -> crate::Result<Tensor> {
3165        self.run_backend_session_cached(None, move |session| {
3166            session.dot_general_with_conj(lhs, rhs, config, lhs_conj, rhs_conj)
3167        })
3168    }
3169}
3170
3171impl BackendCachedDot for CpuBackend {
3172    fn dot_general_cached(
3173        &mut self,
3174        cache: &mut Self::RuntimeCache,
3175        cache_slot: Option<usize>,
3176        lhs: &Tensor,
3177        rhs: &Tensor,
3178        config: &DotGeneralConfig,
3179    ) -> crate::Result<Tensor> {
3180        self.run_backend_session_cached(Some(cache), move |session| {
3181            session.dot_general_cached(cache_slot, lhs, rhs, config)
3182        })
3183    }
3184
3185    fn dot_general_with_conj_cached(
3186        &mut self,
3187        cache: &mut Self::RuntimeCache,
3188        cache_slot: Option<usize>,
3189        lhs: &Tensor,
3190        rhs: &Tensor,
3191        config: &DotGeneralConfig,
3192        lhs_conj: bool,
3193        rhs_conj: bool,
3194    ) -> crate::Result<Tensor> {
3195        self.run_backend_session_cached(Some(cache), move |session| {
3196            session.dot_general_with_conj_cached(cache_slot, lhs, rhs, config, lhs_conj, rhs_conj)
3197        })
3198    }
3199
3200    fn dot_general_read_into_accum_cached(
3201        &mut self,
3202        cache: &mut Self::RuntimeCache,
3203        cache_slot: Option<usize>,
3204        lhs: TensorRead<'_>,
3205        rhs: TensorRead<'_>,
3206        config: &DotGeneralConfig,
3207        accumulation: DotGeneralAccumulation,
3208        out: TensorWrite<'_>,
3209    ) -> crate::Result<()> {
3210        self.run_backend_session_cached(Some(cache), move |session| {
3211            session.dot_general_read_into_accum_cached(
3212                cache_slot,
3213                lhs,
3214                rhs,
3215                config,
3216                accumulation,
3217                out,
3218            )
3219        })
3220    }
3221
3222    fn grouped_gemm_cached(
3223        &mut self,
3224        cache: &mut Self::RuntimeCache,
3225        cache_slot: Option<usize>,
3226        lhs: TensorRead<'_>,
3227        rhs: TensorRead<'_>,
3228        config: &GroupedGemmConfig<'_>,
3229        out: TensorWrite<'_>,
3230    ) -> crate::Result<()> {
3231        self.run_backend_session_cached(Some(cache), move |session| {
3232            session.grouped_gemm_cached(cache_slot, lhs, rhs, config, out)
3233        })
3234    }
3235}
3236
3237impl TensorIndexing for CpuBackend {
3238    fn gather(
3239        &mut self,
3240        operand: &Tensor,
3241        start_indices: &Tensor,
3242        config: &GatherConfig,
3243    ) -> crate::Result<Tensor> {
3244        self.install_with_indexed_pool_context(|context, buffers, cache| {
3245            let exec_context = context.strided_exec_context();
3246            indexing::gather_with_pool(
3247                buffers,
3248                cache,
3249                &exec_context,
3250                operand,
3251                start_indices,
3252                config,
3253            )
3254        })
3255    }
3256
3257    fn scatter(
3258        &mut self,
3259        operand: &Tensor,
3260        scatter_indices: &Tensor,
3261        updates: &Tensor,
3262        config: &ScatterConfig,
3263    ) -> crate::Result<Tensor> {
3264        self.install_with_indexed_pool_context(|context, buffers, cache| {
3265            let exec_context = context.strided_exec_context();
3266            indexing::scatter_with_pool(
3267                buffers,
3268                cache,
3269                &exec_context,
3270                operand,
3271                scatter_indices,
3272                updates,
3273                config,
3274            )
3275        })
3276    }
3277
3278    fn slice(&mut self, input: &Tensor, config: &SliceConfig) -> crate::Result<Tensor> {
3279        self.install_with_pool_context(|context, buffers| {
3280            let exec_context = context.strided_exec_context();
3281            indexing::try_slice_with_pool(buffers, &exec_context, input, config)
3282        })
3283    }
3284
3285    fn dynamic_slice(
3286        &mut self,
3287        input: &Tensor,
3288        starts: &Tensor,
3289        slice_sizes: &[usize],
3290    ) -> crate::Result<Tensor> {
3291        self.install_with_indexed_pool_context(|context, buffers, cache| {
3292            let exec_context = context.strided_exec_context();
3293            indexing::dynamic_slice_with_pool(
3294                buffers,
3295                cache,
3296                &exec_context,
3297                input,
3298                starts,
3299                slice_sizes,
3300            )
3301        })
3302    }
3303
3304    fn dynamic_update_slice(
3305        &mut self,
3306        operand: &Tensor,
3307        update: &Tensor,
3308        starts: &Tensor,
3309    ) -> crate::Result<Tensor> {
3310        self.install_with_indexed_pool_context(|context, buffers, cache| {
3311            let exec_context = context.strided_exec_context();
3312            indexing::dynamic_update_slice_with_pool(
3313                buffers,
3314                cache,
3315                &exec_context,
3316                operand,
3317                update,
3318                starts,
3319            )
3320        })
3321    }
3322
3323    fn pad(&mut self, input: &Tensor, config: &PadConfig) -> crate::Result<Tensor> {
3324        self.install_with_pool_context(|context, buffers| {
3325            let exec_context = context.strided_exec_context();
3326            indexing::try_pad_with_pool(buffers, &exec_context, input, config)
3327        })
3328    }
3329
3330    fn concatenate(&mut self, inputs: &[&Tensor], axis: usize) -> crate::Result<Tensor> {
3331        self.install_with_pool_context(|context, buffers| {
3332            let exec_context = context.strided_exec_context();
3333            indexing::try_concatenate_with_pool(buffers, &exec_context, inputs, axis)
3334        })
3335    }
3336
3337    fn reverse(&mut self, input: &Tensor, axes: &[usize]) -> crate::Result<Tensor> {
3338        self.install_with_pool_context(|context, buffers| {
3339            let exec_context = context.strided_exec_context();
3340            indexing::reverse_with_pool(buffers, &exec_context, input, axes)
3341        })
3342    }
3343}
3344
3345impl CpuBackend {
3346    /// Bind this backend handle to a shared-allocation domain.
3347    ///
3348    /// Host-only CPU behavior is unchanged. Operation crates can use the domain
3349    /// to require guarded access to matching managed allocations.
3350    ///
3351    /// # Examples
3352    ///
3353    /// ```rust
3354    /// use tenferro_cpu::CpuBackend;
3355    /// use std::sync::Arc;
3356    /// use tenferro_tensor::{AllocationDomainId, DType, SharedTensorAllocationDomain, Tensor};
3357    ///
3358    /// #[derive(Debug)]
3359    /// struct Domain(AllocationDomainId);
3360    /// impl SharedTensorAllocationDomain for Domain {
3361    ///     fn id(&self) -> AllocationDomainId { self.0 }
3362    ///     fn allocate(&self, _: DType, _: &[usize]) -> tenferro_tensor::Result<Tensor> {
3363    ///         Err(tenferro_tensor::Error::unsupported("example", "not implemented"))
3364    ///     }
3365    /// }
3366    /// let id = AllocationDomainId::fresh();
3367    /// let backend = CpuBackend::new().with_allocation_domain(Arc::new(Domain(id)));
3368    /// assert_eq!(backend.allocation_domain(), Some(id));
3369    /// ```
3370    pub fn with_allocation_domain(mut self, domain: Arc<dyn SharedTensorAllocationDomain>) -> Self {
3371        self.allocation_domain = Some(domain);
3372        self.runtime_identity = CpuRuntimeIdentity::fresh();
3373        self
3374    }
3375
3376    /// Return the configured shared-allocation domain.
3377    ///
3378    /// # Examples
3379    ///
3380    /// ```rust
3381    /// use tenferro_cpu::CpuBackend;
3382    ///
3383    /// assert_eq!(CpuBackend::new().allocation_domain(), None);
3384    /// ```
3385    pub fn allocation_domain(&self) -> Option<AllocationDomainId> {
3386        self.allocation_domain.as_ref().map(|domain| domain.id())
3387    }
3388
3389    /// Return the allocator for this backend's shared domain.
3390    ///
3391    /// # Examples
3392    ///
3393    /// ```rust
3394    /// use tenferro_cpu::CpuBackend;
3395    ///
3396    /// assert!(CpuBackend::new().shared_allocation_domain().is_none());
3397    /// ```
3398    pub fn shared_allocation_domain(&self) -> Option<&Arc<dyn SharedTensorAllocationDomain>> {
3399        self.allocation_domain.as_ref()
3400    }
3401
3402    fn run_backend_session_cached<R: Send>(
3403        &mut self,
3404        cache: Option<&mut gemm::GemmAnalysisCache>,
3405        f: impl FnOnce(&mut dyn BackendSession) -> R + Send,
3406    ) -> R {
3407        let providers = self.provider_bundle.clone();
3408        let owner = inherited_or_new_execution_owner();
3409        let permit = self.acquire_execution_permit(owner);
3410        let entry = CpuOperationEntry::new(self.engine.domain(), &permit);
3411        let enter_managed_session = entry.supports_infallible_session_entry()
3412            && !matches!(
3413                &self.resolved,
3414                ResolvedCpuExecution::ProviderDefaultExclusive
3415            );
3416        let run = |entered| {
3417            self.with_execution_resources(&permit, |resources| {
3418                let mut buffers = BufferPoolLoan::new(&mut resources.buffers);
3419                let cache = cache.unwrap_or(&mut resources.gemm_analysis_cache);
3420                let session_started = Instant::now();
3421                let mut session = CpuExecSession {
3422                    entry,
3423                    entered,
3424                    buffers: buffers.get_mut(),
3425                    gemm_analysis_cache: cache,
3426                    indexed_plan_cache: &mut resources.indexed_plan_cache,
3427                    providers: &providers,
3428                    backend_kind: self.kind(),
3429                    allocation_domain: self.allocation_domain.as_ref(),
3430                };
3431                record_cpu_session_profile(
3432                    "with_backend_session_cached.session_construct",
3433                    session_started.elapsed(),
3434                );
3435                let exec_started = Instant::now();
3436                let result = f(&mut session);
3437                record_cpu_session_profile(
3438                    "with_backend_session_cached.exec_body",
3439                    exec_started.elapsed(),
3440                );
3441                result
3442            })
3443        };
3444        if enter_managed_session {
3445            entry.enter_managed_session(|context| run(Some(context)))
3446        } else {
3447            with_execution_owner(owner, || run(None))
3448        }
3449    }
3450}
3451
3452impl BackendSessionHost for CpuBackend {
3453    fn with_backend_session<R: Send>(
3454        &mut self,
3455        f: impl FnOnce(&mut dyn BackendSession) -> R + Send,
3456    ) -> R {
3457        self.run_backend_session_cached(None, f)
3458    }
3459
3460    fn with_backend_session_cached<R: Send>(
3461        &mut self,
3462        cache: &mut Self::RuntimeCache,
3463        f: impl FnOnce(&mut dyn BackendSession) -> R + Send,
3464    ) -> R {
3465        if !cpu_session_profile_enabled() {
3466            return self.run_backend_session_cached(Some(cache), f);
3467        }
3468        let total_started = Instant::now();
3469        let result =
3470            profile_cpu_session_section("with_backend_session_cached.exec_session", || {
3471                self.run_backend_session_cached(Some(cache), f)
3472            });
3473        record_cpu_session_profile("with_backend_session_cached.total", total_started.elapsed());
3474        maybe_print_cpu_session_profile();
3475        result
3476    }
3477}
3478
3479impl TensorBuffer for CpuBackend {
3480    fn reclaim_buffer(&mut self, tensor: Tensor) {
3481        let owner = inherited_or_new_execution_owner();
3482        with_execution_owner(owner, || {
3483            let permit = self.acquire_execution_permit(owner);
3484            self.with_execution_resources(&permit, |resources| {
3485                let buffers = &mut resources.buffers;
3486                match tensor {
3487                    Tensor::F32(t) => reclaim_typed(buffers, t),
3488                    Tensor::F64(t) => reclaim_typed(buffers, t),
3489                    Tensor::I32(t) => reclaim_typed(buffers, t),
3490                    Tensor::I64(t) => reclaim_typed(buffers, t),
3491                    Tensor::Bool(t) => reclaim_typed(buffers, t),
3492                    Tensor::C32(t) => reclaim_typed(buffers, t),
3493                    Tensor::C64(t) => reclaim_typed(buffers, t),
3494                }
3495            })
3496        })
3497    }
3498}
3499
3500impl<T, R> TensorViewCanonicalization<T, R> for CpuBackend
3501where
3502    T: TensorScalar + PoolScalar,
3503    R: TensorRank,
3504    R::Shape: Send + Sync,
3505    R::Strides: Send + Sync,
3506{
3507    fn to_contiguous(
3508        &mut self,
3509        view: &TypedTensorView<'_, T, R>,
3510    ) -> crate::Result<TypedTensor<T, R>> {
3511        self.install_with_pool(|buffers| {
3512            structural::typed_materialize_view_with_pool(buffers, view, "CpuBackend::to_contiguous")
3513        })
3514    }
3515
3516    fn copy_into(
3517        &mut self,
3518        src: &TypedTensorView<'_, T, R>,
3519        dst: &mut TypedTensorViewMut<'_, T, R>,
3520    ) -> crate::Result<()> {
3521        self.try_install(|| structural::typed_copy_view_into(src, dst, "CpuBackend::copy_into"))
3522    }
3523}
3524
3525impl TensorFusion for CpuBackend {
3526    fn execute_elementwise_fusion(
3527        &mut self,
3528        inputs: &[&Tensor],
3529        plan: &ElementwiseFusionPlan,
3530    ) -> crate::Result<Option<Vec<Tensor>>> {
3531        self.install_with_pool_context(|context, buffers| {
3532            let exec_context = context.strided_exec_context();
3533            elementwise::elementwise_fusion_with_pool(buffers, &exec_context, inputs, plan)
3534        })
3535    }
3536
3537    fn execute_broadcast_multiply(
3538        &mut self,
3539        lhs: TensorRead<'_>,
3540        lhs_shape: &[usize],
3541        lhs_dims: &[usize],
3542        rhs: TensorRead<'_>,
3543        rhs_shape: &[usize],
3544        rhs_dims: &[usize],
3545    ) -> crate::Result<Option<Tensor>> {
3546        self.install_with_pool(|buffers| {
3547            elementwise::broadcast_multiply_read_with_pool(
3548                buffers, lhs, lhs_shape, lhs_dims, rhs, rhs_shape, rhs_dims,
3549            )
3550        })
3551    }
3552
3553    fn execute_broadcast_multiply_value(
3554        &mut self,
3555        lhs: TensorRead<'_>,
3556        lhs_shape: &[usize],
3557        lhs_dims: &[usize],
3558        rhs: TensorRead<'_>,
3559        rhs_shape: &[usize],
3560        rhs_dims: &[usize],
3561    ) -> crate::Result<Option<TensorValue>> {
3562        let domain = self.engine.domain().id();
3563        self.install_with_pool_unmarked(|buffers| {
3564            elementwise::broadcast_multiply_value_with_pool_and_tag(
3565                buffers,
3566                lhs,
3567                lhs_shape,
3568                lhs_dims,
3569                rhs,
3570                rhs_shape,
3571                rhs_dims,
3572                |tensor| tag_fresh_output(tensor, domain),
3573            )
3574        })
3575    }
3576}
3577
3578impl TensorDeviceTransfer for CpuBackend {
3579    fn download_to_host(&mut self, tensor: &Tensor) -> crate::Result<Tensor> {
3580        if tensor.is_backend_buffer() {
3581            return Err(crate::Error::runtime_state(
3582                "CpuBackend::download_to_host",
3583                "CPU backend received a backend buffer; download the tensor to host with its owning backend before CPU execution",
3584            ));
3585        }
3586        Ok(tensor.clone())
3587    }
3588
3589    fn upload_host_tensor(&mut self, tensor: &Tensor) -> crate::Result<Tensor> {
3590        if tensor.is_backend_buffer() {
3591            return Err(crate::Error::runtime_state(
3592                "CpuBackend::upload_host_tensor",
3593                "CPU backend upload_host_tensor expects a host tensor; download backend buffers to host before CPU execution",
3594            ));
3595        }
3596        Ok(tensor.clone())
3597    }
3598}
3599
// Empty impl: no items are defined here, so `CpuBackend` relies entirely on
// what `TensorBackend` itself provides.
// NOTE(review): presumably `TensorBackend` aggregates the capability traits
// implemented above for `CpuBackend` — confirm against the trait definition.
impl TensorBackend for CpuBackend {}
3601
3602pub(crate) fn reclaim_typed<T: PoolScalar>(pool: &mut BufferPool, typed: TypedTensor<T>) {
3603    let (buffer, _, _) = typed.into_parts();
3604    match buffer {
3605        Buffer::Host(data) => T::pool_release(pool, data),
3606        Buffer::Backend(_) => {}
3607    }
3608}
3609
3610impl Default for CpuBackend {
3611    fn default() -> Self {
3612        Self::new()
3613    }
3614}
3615
3616#[cfg(test)]
3617mod tests;
tenferro_cpu/backend.rs

tenferro_cpu/
backend.rs