diff --git a/src/NumSharp.Core/APIs/np.where.cs b/src/NumSharp.Core/APIs/np.where.cs
new file mode 100644
index 00000000..14633e3c
--- /dev/null
+++ b/src/NumSharp.Core/APIs/np.where.cs
@@ -0,0 +1,230 @@
+using System;
+using NumSharp.Backends.Kernels;
+using NumSharp.Generic;
+
+namespace NumSharp
+{
+ public static partial class np
+ {
+ /// <summary>
+ /// Equivalent to <see cref="nonzero"/> : returns the indices where
+ /// <paramref name="condition"/> is non-zero.
+ /// </summary>
+ /// <param name="condition">Input array. Non-zero entries yield their indices.</param>
+ /// <returns>Tuple of arrays with indices where condition is non-zero, one per dimension.</returns>
+ /// <remarks>https://numpy.org/doc/stable/reference/generated/numpy.where.html</remarks>
+ public static NDArray[] where(NDArray condition)
+ {
+ return nonzero(condition);
+ }
+
+ /// <summary>
+ /// Return elements chosen from `x` or `y` depending on `condition`.
+ /// </summary>
+ /// <param name="condition">Where True, yield `x`, otherwise yield `y`.</param>
+ /// <param name="x">Values from which to choose where condition is True.</param>
+ /// <param name="y">Values from which to choose where condition is False.</param>
+ /// <returns>An array with elements from `x` where `condition` is True, and elements from `y` elsewhere.</returns>
+ /// <remarks>https://numpy.org/doc/stable/reference/generated/numpy.where.html</remarks>
+ public static NDArray where(NDArray condition, NDArray x, NDArray y)
+ {
+ return where_internal(condition, x, y);
+ }
+
+ /// <summary>
+ /// Return elements chosen from `x` or `y` depending on `condition`.
+ /// Scalar overload for x.
+ /// </summary>
+ public static NDArray where(NDArray condition, object x, NDArray y)
+ {
+ return where_internal(condition, asanyarray(x), y);
+ }
+
+ /// <summary>
+ /// Return elements chosen from `x` or `y` depending on `condition`.
+ /// Scalar overload for y.
+ /// </summary>
+ public static NDArray where(NDArray condition, NDArray x, object y)
+ {
+ return where_internal(condition, x, asanyarray(y));
+ }
+
+ /// <summary>
+ /// Return elements chosen from `x` or `y` depending on `condition`.
+ /// Scalar overload for both x and y.
+ /// </summary>
+ public static NDArray where(NDArray condition, object x, object y)
+ {
+ return where_internal(condition, asanyarray(x), asanyarray(y));
+ }
+
+ /// <summary>
+ /// Internal implementation of np.where.
+ /// </summary>
+ private static NDArray where_internal(NDArray condition, NDArray x, NDArray y)
+ {
+ // Skip broadcast_arrays (which allocates 3 NDArrays + helper arrays) when all three
+ // already share a shape — the frequent case of np.where(mask, arr, other_arr).
+ NDArray cond, xArr, yArr;
+ if (condition.Shape == x.Shape && x.Shape == y.Shape)
+ {
+ cond = condition;
+ xArr = x;
+ yArr = y;
+ }
+ else
+ {
+ var broadcasted = broadcast_arrays(condition, x, y);
+ cond = broadcasted[0];
+ xArr = broadcasted[1];
+ yArr = broadcasted[2];
+ }
+
+ // When x and y already agree, skip the NEP50 promotion lookup. Otherwise defer to
+ // _FindCommonType which handles the scalar+array NEP50 rules.
+ var outType = x.GetTypeCode == y.GetTypeCode
+ ? x.GetTypeCode
+ : _FindCommonType(x, y);
+
+ if (xArr.GetTypeCode != outType)
+ xArr = xArr.astype(outType, copy: false);
+ if (yArr.GetTypeCode != outType)
+ yArr = yArr.astype(outType, copy: false);
+
+ // Use cond.shape (dimensions only) not cond.Shape (which may have broadcast strides)
+ var result = empty(cond.shape, outType);
+
+ // Handle empty arrays - nothing to iterate
+ if (result.size == 0)
+ return result;
+
+ // IL Kernel fast path: all arrays contiguous, bool condition, SIMD enabled
+ // Broadcasted arrays (stride=0) are NOT contiguous, so they use iterator path.
+ bool canUseKernel = ILKernelGenerator.Enabled &&
+ cond.typecode == NPTypeCode.Boolean &&
+ cond.Shape.IsContiguous &&
+ xArr.Shape.IsContiguous &&
+ yArr.Shape.IsContiguous;
+
+ if (canUseKernel)
+ {
+ WhereKernelDispatch(cond, xArr, yArr, result, outType);
+ return result;
+ }
+
+ // Iterator fallback for non-contiguous/broadcasted arrays
+ switch (outType)
+ {
+ case NPTypeCode.Boolean:
+ WhereImpl<bool>(cond, xArr, yArr, result);
+ break;
+ case NPTypeCode.Byte:
+ WhereImpl<byte>(cond, xArr, yArr, result);
+ break;
+ case NPTypeCode.Int16:
+ WhereImpl<short>(cond, xArr, yArr, result);
+ break;
+ case NPTypeCode.UInt16:
+ WhereImpl<ushort>(cond, xArr, yArr, result);
+ break;
+ case NPTypeCode.Int32:
+ WhereImpl<int>(cond, xArr, yArr, result);
+ break;
+ case NPTypeCode.UInt32:
+ WhereImpl<uint>(cond, xArr, yArr, result);
+ break;
+ case NPTypeCode.Int64:
+ WhereImpl<long>(cond, xArr, yArr, result);
+ break;
+ case NPTypeCode.UInt64:
+ WhereImpl<ulong>(cond, xArr, yArr, result);
+ break;
+ case NPTypeCode.Char:
+ WhereImpl<char>(cond, xArr, yArr, result);
+ break;
+ case NPTypeCode.Single:
+ WhereImpl<float>(cond, xArr, yArr, result);
+ break;
+ case NPTypeCode.Double:
+ WhereImpl<double>(cond, xArr, yArr, result);
+ break;
+ case NPTypeCode.Decimal:
+ WhereImpl<decimal>(cond, xArr, yArr, result);
+ break;
+ default:
+ throw new NotSupportedException($"Type {outType} not supported for np.where");
+ }
+
+ return result;
+ }
+
+ private static void WhereImpl<T>(NDArray cond, NDArray x, NDArray y, NDArray result) where T : unmanaged
+ {
+ // Use iterators for proper handling of broadcasted/strided arrays
+ using var condIter = cond.AsIterator<bool>();
+ using var xIter = x.AsIterator<T>();
+ using var yIter = y.AsIterator<T>();
+ using var resultIter = result.AsIterator<T>();
+
+ while (condIter.HasNext())
+ {
+ var c = condIter.MoveNext();
+ var xVal = xIter.MoveNext();
+ var yVal = yIter.MoveNext();
+ resultIter.MoveNextReference() = c ? xVal : yVal;
+ }
+ }
+
+ /// <summary>
+ /// IL Kernel dispatch for contiguous arrays.
+ /// Uses IL-generated kernels with SIMD optimization.
+ /// </summary>
+ private static unsafe void WhereKernelDispatch(NDArray cond, NDArray x, NDArray y, NDArray result, NPTypeCode outType)
+ {
+ var condPtr = (bool*)cond.Address;
+ var count = result.size;
+
+ switch (outType)
+ {
+ case NPTypeCode.Boolean:
+ ILKernelGenerator.WhereExecute(condPtr, (bool*)x.Address, (bool*)y.Address, (bool*)result.Address, count);
+ break;
+ case NPTypeCode.Byte:
+ ILKernelGenerator.WhereExecute(condPtr, (byte*)x.Address, (byte*)y.Address, (byte*)result.Address, count);
+ break;
+ case NPTypeCode.Int16:
+ ILKernelGenerator.WhereExecute(condPtr, (short*)x.Address, (short*)y.Address, (short*)result.Address, count);
+ break;
+ case NPTypeCode.UInt16:
+ ILKernelGenerator.WhereExecute(condPtr, (ushort*)x.Address, (ushort*)y.Address, (ushort*)result.Address, count);
+ break;
+ case NPTypeCode.Int32:
+ ILKernelGenerator.WhereExecute(condPtr, (int*)x.Address, (int*)y.Address, (int*)result.Address, count);
+ break;
+ case NPTypeCode.UInt32:
+ ILKernelGenerator.WhereExecute(condPtr, (uint*)x.Address, (uint*)y.Address, (uint*)result.Address, count);
+ break;
+ case NPTypeCode.Int64:
+ ILKernelGenerator.WhereExecute(condPtr, (long*)x.Address, (long*)y.Address, (long*)result.Address, count);
+ break;
+ case NPTypeCode.UInt64:
+ ILKernelGenerator.WhereExecute(condPtr, (ulong*)x.Address, (ulong*)y.Address, (ulong*)result.Address, count);
+ break;
+ case NPTypeCode.Char:
+ ILKernelGenerator.WhereExecute(condPtr, (char*)x.Address, (char*)y.Address, (char*)result.Address, count);
+ break;
+ case NPTypeCode.Single:
+ ILKernelGenerator.WhereExecute(condPtr, (float*)x.Address, (float*)y.Address, (float*)result.Address, count);
+ break;
+ case NPTypeCode.Double:
+ ILKernelGenerator.WhereExecute(condPtr, (double*)x.Address, (double*)y.Address, (double*)result.Address, count);
+ break;
+ case NPTypeCode.Decimal:
+ ILKernelGenerator.WhereExecute(condPtr, (decimal*)x.Address, (decimal*)y.Address, (decimal*)result.Address, count);
+ break;
+ default:
+ throw new NotSupportedException($"Type {outType} not supported for np.where");
+ }
+ }
+ }
+}
diff --git a/src/NumSharp.Core/Backends/Kernels/ILKernelGenerator.Where.cs b/src/NumSharp.Core/Backends/Kernels/ILKernelGenerator.Where.cs
new file mode 100644
index 00000000..72678ca7
--- /dev/null
+++ b/src/NumSharp.Core/Backends/Kernels/ILKernelGenerator.Where.cs
@@ -0,0 +1,699 @@
+using System;
+using System.Collections.Concurrent;
+using System.Reflection;
+using System.Reflection.Emit;
+using System.Runtime.CompilerServices;
+using System.Runtime.Intrinsics;
+using System.Runtime.Intrinsics.X86;
+using NumSharp.Utilities;
+
+// =============================================================================
+// ILKernelGenerator.Where - IL-generated np.where(condition, x, y) kernels
+// =============================================================================
+//
+// RESPONSIBILITY:
+// - Generate optimized kernels for conditional selection
+// - result[i] = cond[i] ? x[i] : y[i]
+//
+// ARCHITECTURE:
+// Uses IL emission to generate type-specific kernels at runtime.
+// The challenge is bool mask expansion: condition is bool[] (1 byte per element),
+// but x/y can be any dtype (1-8 bytes per element).
+//
+// | Element Size | V256 Elements | Bools to Load |
+// |--------------|---------------|---------------|
+// | 1 byte | 32 | 32 |
+// | 2 bytes | 16 | 16 |
+// | 4 bytes | 8 | 8 |
+// | 8 bytes | 4 | 4 |
+//
+// KERNEL TYPES:
+// - WhereKernel: Main kernel delegate (cond*, x*, y*, result*, count)
+//
+// =============================================================================
+
+namespace NumSharp.Backends.Kernels
+{
+ /// <summary>
+ /// Delegate for where operation kernels.
+ /// </summary>
+ public unsafe delegate void WhereKernel<T>(bool* cond, T* x, T* y, T* result, long count) where T : unmanaged;
+
+ public static partial class ILKernelGenerator
+ {
+ /// <summary>
+ /// Cache of IL-generated where kernels.
+ /// Key: Type of T, Value: WhereKernel&lt;T&gt; delegate.
+ /// </summary>
+ private static readonly ConcurrentDictionary<Type, Delegate> _whereKernelCache = new();
+
+ #region Public API
+
+ /// <summary>
+ /// Get or generate an IL-based where kernel for the specified type.
+ /// Returns null if IL generation is disabled or fails.
+ /// </summary>
+ public static WhereKernel<T>? GetWhereKernel<T>() where T : unmanaged
+ {
+ if (!Enabled)
+ return null;
+
+ var type = typeof(T);
+
+ if (_whereKernelCache.TryGetValue(type, out var cached))
+ return (WhereKernel<T>)cached;
+
+ var kernel = TryGenerateWhereKernel<T>();
+ if (kernel == null)
+ return null;
+
+ if (_whereKernelCache.TryAdd(type, kernel))
+ return kernel;
+
+ return (WhereKernel<T>)_whereKernelCache[type];
+ }
+
+ /// <summary>
+ /// Execute where operation using IL-generated kernel or fallback to static helper.
+ /// </summary>
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ public static unsafe void WhereExecute<T>(bool* cond, T* x, T* y, T* result, long count) where T : unmanaged
+ {
+ if (count == 0)
+ return;
+
+ var kernel = GetWhereKernel<T>();
+ if (kernel != null)
+ {
+ kernel(cond, x, y, result, count);
+ }
+ else
+ {
+ // Fallback to scalar loop
+ WhereScalar(cond, x, y, result, count);
+ }
+ }
+
+ #endregion
+
+ #region Kernel Generation
+
+ private static WhereKernel<T>? TryGenerateWhereKernel<T>() where T : unmanaged
+ {
+ try
+ {
+ return GenerateWhereKernelIL<T>();
+ }
+ catch (Exception ex)
+ {
+ System.Diagnostics.Debug.WriteLine($"[ILKernel] TryGenerateWhereKernel<{typeof(T).Name}>: {ex.GetType().Name}: {ex.Message}");
+ return null;
+ }
+ }
+
+ private static unsafe WhereKernel<T> GenerateWhereKernelIL<T>() where T : unmanaged
+ {
+ int elementSize = Unsafe.SizeOf<T>();
+
+ // SIMD eligibility:
+ // - 1-byte types (byte) only touch portable Vector128/Vector256 APIs, so they work
+ // on any SIMD-capable platform (including ARM64/Neon).
+ // - 2/4/8-byte types need Sse41.ConvertToVector128Int* (V128 path) or
+ // Avx2.ConvertToVector256Int* (V256 path) to expand the bool-mask lanes.
+ // These x86 intrinsics throw PlatformNotSupportedException on ARM64.
+ bool canSimdDtype = elementSize <= 8 && IsSimdSupported();
+ bool needsX86 = elementSize > 1;
+ bool useV256 = VectorBits >= 256 && (!needsX86 || Avx2.IsSupported);
+ bool useV128 = !useV256 && VectorBits >= 128 && (!needsX86 || Sse41.IsSupported);
+ bool emitSimd = canSimdDtype && (useV256 || useV128);
+
+ var dm = new DynamicMethod(
+ name: $"IL_Where_{typeof(T).Name}",
+ returnType: typeof(void),
+ parameterTypes: new[] { typeof(bool*), typeof(T*), typeof(T*), typeof(T*), typeof(long) },
+ owner: typeof(ILKernelGenerator),
+ skipVisibility: true
+ );
+
+ var il = dm.GetILGenerator();
+
+ // Locals
+ var locI = il.DeclareLocal(typeof(long)); // loop counter
+
+ // Labels
+ var lblScalarLoop = il.DefineLabel();
+ var lblScalarLoopEnd = il.DefineLabel();
+
+ // i = 0
+ il.Emit(OpCodes.Ldc_I8, 0L);
+ il.Emit(OpCodes.Stloc, locI);
+
+ if (emitSimd)
+ {
+ EmitWhereSIMDLoop<T>(il, locI, useV256);
+ }
+
+ // Scalar loop for remainder
+ il.MarkLabel(lblScalarLoop);
+
+ // if (i >= count) goto end
+ il.Emit(OpCodes.Ldloc, locI);
+ il.Emit(OpCodes.Ldarg, 4); // count
+ il.Emit(OpCodes.Bge, lblScalarLoopEnd);
+
+ // result[i] = cond[i] ? x[i] : y[i]
+ EmitWhereScalarElement<T>(il, locI);
+
+ // i++
+ il.Emit(OpCodes.Ldloc, locI);
+ il.Emit(OpCodes.Ldc_I8, 1L);
+ il.Emit(OpCodes.Add);
+ il.Emit(OpCodes.Stloc, locI);
+
+ il.Emit(OpCodes.Br, lblScalarLoop);
+
+ il.MarkLabel(lblScalarLoopEnd);
+ il.Emit(OpCodes.Ret);
+
+ return (WhereKernel<T>)dm.CreateDelegate(typeof(WhereKernel<T>));
+ }
+
+ private static void EmitWhereSIMDLoop<T>(ILGenerator il, LocalBuilder locI, bool useV256) where T : unmanaged
+ {
+ long elementSize = Unsafe.SizeOf<T>();
+ long vectorCount = useV256 ? (32 / elementSize) : (16 / elementSize);
+ long unrollFactor = 4;
+ long unrollStep = vectorCount * unrollFactor;
+
+ var locUnrollEnd = il.DeclareLocal(typeof(long));
+ var locVectorEnd = il.DeclareLocal(typeof(long));
+
+ var lblUnrollLoop = il.DefineLabel();
+ var lblUnrollLoopEnd = il.DefineLabel();
+ var lblVectorLoop = il.DefineLabel();
+ var lblVectorLoopEnd = il.DefineLabel();
+
+ // unrollEnd = count - unrollStep (for 4x unrolled loop)
+ il.Emit(OpCodes.Ldarg, 4); // count
+ il.Emit(OpCodes.Ldc_I8, unrollStep);
+ il.Emit(OpCodes.Sub);
+ il.Emit(OpCodes.Stloc, locUnrollEnd);
+
+ // vectorEnd = count - vectorCount (for remainder loop)
+ il.Emit(OpCodes.Ldarg, 4); // count
+ il.Emit(OpCodes.Ldc_I8, vectorCount);
+ il.Emit(OpCodes.Sub);
+ il.Emit(OpCodes.Stloc, locVectorEnd);
+
+ // ========== 4x UNROLLED SIMD LOOP ==========
+ il.MarkLabel(lblUnrollLoop);
+
+ // if (i > unrollEnd) goto UnrollLoopEnd
+ il.Emit(OpCodes.Ldloc, locI);
+ il.Emit(OpCodes.Ldloc, locUnrollEnd);
+ il.Emit(OpCodes.Bgt, lblUnrollLoopEnd);
+
+ // Process 4 vectors per iteration
+ for (long u = 0; u < unrollFactor; u++)
+ {
+ long offset = vectorCount * u;
+ if (useV256)
+ EmitWhereV256BodyWithOffset<T>(il, locI, elementSize, offset);
+ else
+ EmitWhereV128BodyWithOffset<T>(il, locI, elementSize, offset);
+ }
+
+ // i += unrollStep
+ il.Emit(OpCodes.Ldloc, locI);
+ il.Emit(OpCodes.Ldc_I8, unrollStep);
+ il.Emit(OpCodes.Add);
+ il.Emit(OpCodes.Stloc, locI);
+
+ il.Emit(OpCodes.Br, lblUnrollLoop);
+
+ il.MarkLabel(lblUnrollLoopEnd);
+
+ // ========== REMAINDER SIMD LOOP (1 vector at a time) ==========
+ il.MarkLabel(lblVectorLoop);
+
+ // if (i > vectorEnd) goto VectorLoopEnd
+ il.Emit(OpCodes.Ldloc, locI);
+ il.Emit(OpCodes.Ldloc, locVectorEnd);
+ il.Emit(OpCodes.Bgt, lblVectorLoopEnd);
+
+ // Process 1 vector
+ if (useV256)
+ EmitWhereV256BodyWithOffset<T>(il, locI, elementSize, 0L);
+ else
+ EmitWhereV128BodyWithOffset<T>(il, locI, elementSize, 0L);
+
+ // i += vectorCount
+ il.Emit(OpCodes.Ldloc, locI);
+ il.Emit(OpCodes.Ldc_I8, vectorCount);
+ il.Emit(OpCodes.Add);
+ il.Emit(OpCodes.Stloc, locI);
+
+ il.Emit(OpCodes.Br, lblVectorLoop);
+
+ il.MarkLabel(lblVectorLoopEnd);
+ }
+
+ private static void EmitWhereV256BodyWithOffset<T>(ILGenerator il, LocalBuilder locI, long elementSize, long offset) where T : unmanaged
+ {
+ var loadMethod = CachedMethods.V256LoadGeneric.MakeGenericMethod(typeof(T));
+ var storeMethod = CachedMethods.V256StoreGeneric.MakeGenericMethod(typeof(T));
+ var selectMethod = CachedMethods.V256ConditionalSelectGeneric.MakeGenericMethod(typeof(T));
+
+ // Load address: cond + (i + offset)
+ il.Emit(OpCodes.Ldarg_0); // cond
+ il.Emit(OpCodes.Ldloc, locI);
+ if (offset > 0)
+ {
+ il.Emit(OpCodes.Ldc_I8, offset);
+ il.Emit(OpCodes.Add);
+ }
+ il.Emit(OpCodes.Conv_I);
+ il.Emit(OpCodes.Add);
+
+ // Inline mask creation - emit AVX2 instructions directly instead of calling helper
+ EmitInlineMaskCreationV256(il, (int)elementSize);
+
+ // Load x vector: x + (i + offset) * elementSize
+ il.Emit(OpCodes.Ldarg_1); // x
+ il.Emit(OpCodes.Ldloc, locI);
+ if (offset > 0)
+ {
+ il.Emit(OpCodes.Ldc_I8, offset);
+ il.Emit(OpCodes.Add);
+ }
+ il.Emit(OpCodes.Ldc_I8, elementSize);
+ il.Emit(OpCodes.Mul);
+ il.Emit(OpCodes.Conv_I);
+ il.Emit(OpCodes.Add);
+ il.Emit(OpCodes.Call, loadMethod);
+
+ // Load y vector: y + (i + offset) * elementSize
+ il.Emit(OpCodes.Ldarg_2); // y
+ il.Emit(OpCodes.Ldloc, locI);
+ if (offset > 0)
+ {
+ il.Emit(OpCodes.Ldc_I8, offset);
+ il.Emit(OpCodes.Add);
+ }
+ il.Emit(OpCodes.Ldc_I8, elementSize);
+ il.Emit(OpCodes.Mul);
+ il.Emit(OpCodes.Conv_I);
+ il.Emit(OpCodes.Add);
+ il.Emit(OpCodes.Call, loadMethod);
+
+ // Stack: mask, xVec, yVec
+ // ConditionalSelect(mask, x, y)
+ il.Emit(OpCodes.Call, selectMethod);
+
+ // Store result: result + (i + offset) * elementSize
+ il.Emit(OpCodes.Ldarg_3); // result
+ il.Emit(OpCodes.Ldloc, locI);
+ if (offset > 0)
+ {
+ il.Emit(OpCodes.Ldc_I8, offset);
+ il.Emit(OpCodes.Add);
+ }
+ il.Emit(OpCodes.Ldc_I8, elementSize);
+ il.Emit(OpCodes.Mul);
+ il.Emit(OpCodes.Conv_I);
+ il.Emit(OpCodes.Add);
+ il.Emit(OpCodes.Call, storeMethod);
+ }
+
+ private static void EmitWhereV128BodyWithOffset<T>(ILGenerator il, LocalBuilder locI, long elementSize, long offset) where T : unmanaged
+ {
+ var loadMethod = CachedMethods.V128LoadGeneric.MakeGenericMethod(typeof(T));
+ var storeMethod = CachedMethods.V128StoreGeneric.MakeGenericMethod(typeof(T));
+ var selectMethod = CachedMethods.V128ConditionalSelectGeneric.MakeGenericMethod(typeof(T));
+
+ // Load address: cond + (i + offset)
+ il.Emit(OpCodes.Ldarg_0);
+ il.Emit(OpCodes.Ldloc, locI);
+ if (offset > 0)
+ {
+ il.Emit(OpCodes.Ldc_I8, offset);
+ il.Emit(OpCodes.Add);
+ }
+ il.Emit(OpCodes.Conv_I);
+ il.Emit(OpCodes.Add);
+
+ // Inline mask creation - emit SSE4.1 instructions directly
+ EmitInlineMaskCreationV128(il, (int)elementSize);
+
+ // Load x vector
+ il.Emit(OpCodes.Ldarg_1);
+ il.Emit(OpCodes.Ldloc, locI);
+ if (offset > 0)
+ {
+ il.Emit(OpCodes.Ldc_I8, offset);
+ il.Emit(OpCodes.Add);
+ }
+ il.Emit(OpCodes.Ldc_I8, elementSize);
+ il.Emit(OpCodes.Mul);
+ il.Emit(OpCodes.Conv_I);
+ il.Emit(OpCodes.Add);
+ il.Emit(OpCodes.Call, loadMethod);
+
+ // Load y vector
+ il.Emit(OpCodes.Ldarg_2);
+ il.Emit(OpCodes.Ldloc, locI);
+ if (offset > 0)
+ {
+ il.Emit(OpCodes.Ldc_I8, offset);
+ il.Emit(OpCodes.Add);
+ }
+ il.Emit(OpCodes.Ldc_I8, elementSize);
+ il.Emit(OpCodes.Mul);
+ il.Emit(OpCodes.Conv_I);
+ il.Emit(OpCodes.Add);
+ il.Emit(OpCodes.Call, loadMethod);
+
+ // ConditionalSelect
+ il.Emit(OpCodes.Call, selectMethod);
+
+ // Store
+ il.Emit(OpCodes.Ldarg_3);
+ il.Emit(OpCodes.Ldloc, locI);
+ if (offset > 0)
+ {
+ il.Emit(OpCodes.Ldc_I8, offset);
+ il.Emit(OpCodes.Add);
+ }
+ il.Emit(OpCodes.Ldc_I8, elementSize);
+ il.Emit(OpCodes.Mul);
+ il.Emit(OpCodes.Conv_I);
+ il.Emit(OpCodes.Add);
+ il.Emit(OpCodes.Call, storeMethod);
+ }
+
+ private static void EmitWhereScalarElement<T>(ILGenerator il, LocalBuilder locI) where T : unmanaged
+ {
+ long elementSize = Unsafe.SizeOf<T>();
+ var typeCode = InfoOf<T>.NPTypeCode;
+
+ // result[i] = cond[i] ? x[i] : y[i]
+ var lblFalse = il.DefineLabel();
+ var lblEnd = il.DefineLabel();
+
+ // Load result address: result + i * elementSize
+ il.Emit(OpCodes.Ldarg_3);
+ il.Emit(OpCodes.Ldloc, locI);
+ il.Emit(OpCodes.Ldc_I8, elementSize);
+ il.Emit(OpCodes.Mul);
+ il.Emit(OpCodes.Conv_I);
+ il.Emit(OpCodes.Add);
+
+ // Load cond[i]: cond + i (bool is 1 byte)
+ il.Emit(OpCodes.Ldarg_0);
+ il.Emit(OpCodes.Ldloc, locI);
+ il.Emit(OpCodes.Conv_I);
+ il.Emit(OpCodes.Add);
+ il.Emit(OpCodes.Ldind_U1); // Load bool as byte
+
+ // if (!cond[i]) goto lblFalse
+ il.Emit(OpCodes.Brfalse, lblFalse);
+
+ // True branch: load x[i]
+ il.Emit(OpCodes.Ldarg_1);
+ il.Emit(OpCodes.Ldloc, locI);
+ il.Emit(OpCodes.Ldc_I8, elementSize);
+ il.Emit(OpCodes.Mul);
+ il.Emit(OpCodes.Conv_I);
+ il.Emit(OpCodes.Add);
+ EmitLoadIndirect(il, typeCode);
+ il.Emit(OpCodes.Br, lblEnd);
+
+ // False branch: load y[i]
+ il.MarkLabel(lblFalse);
+ il.Emit(OpCodes.Ldarg_2);
+ il.Emit(OpCodes.Ldloc, locI);
+ il.Emit(OpCodes.Ldc_I8, elementSize);
+ il.Emit(OpCodes.Mul);
+ il.Emit(OpCodes.Conv_I);
+ il.Emit(OpCodes.Add);
+ EmitLoadIndirect(il, typeCode);
+
+ il.MarkLabel(lblEnd);
+ // Stack: result_ptr, value
+ EmitStoreIndirect(il, typeCode);
+ }
+
+ #endregion
+
+ #region Inline Mask IL Emission
+
+ // Vector-related MethodInfos for np.where are cached in the partial CachedMethods class
+ // below (see "Where Kernel Methods" region at the end of this file).
+
+ /// <summary>
+ /// Emit inline V256 mask creation. Stack: byte* -> Vector256{T} (as mask)
+ /// </summary>
+ private static void EmitInlineMaskCreationV256(ILGenerator il, int elementSize)
+ {
+ // Stack has: byte* pointing to condition bools
+
+ switch (elementSize)
+ {
+ case 8: // double/long: load 4 bytes, expand to 4 qwords
+ // *(uint*)ptr
+ il.Emit(OpCodes.Ldind_U4);
+ // Vector128.CreateScalar(value)
+ il.Emit(OpCodes.Call, CachedMethods.V128CreateScalarUInt);
+ // .AsByte()
+ il.Emit(OpCodes.Call, CachedMethods.V128UIntAsByte);
+ // Avx2.ConvertToVector256Int64(bytes)
+ il.Emit(OpCodes.Call, CachedMethods.Avx2ConvertToV256Int64);
+ // .AsUInt64()
+ il.Emit(OpCodes.Call, CachedMethods.V256LongAsULong);
+ // Vector256.Zero
+ il.Emit(OpCodes.Call, CachedMethods.V256GetZeroULong);
+ // Vector256.GreaterThan(expanded, zero)
+ il.Emit(OpCodes.Call, CachedMethods.V256GreaterThanULong);
+ break;
+
+ case 4: // float/int: load 8 bytes, expand to 8 dwords
+ // *(ulong*)ptr
+ il.Emit(OpCodes.Ldind_I8);
+ // Vector128.CreateScalar(value)
+ il.Emit(OpCodes.Call, CachedMethods.V128CreateScalarULong);
+ // .AsByte()
+ il.Emit(OpCodes.Call, CachedMethods.V128ULongAsByte);
+ // Avx2.ConvertToVector256Int32(bytes)
+ il.Emit(OpCodes.Call, CachedMethods.Avx2ConvertToV256Int32);
+ // .AsUInt32()
+ il.Emit(OpCodes.Call, CachedMethods.V256IntAsUInt);
+ // Vector256.Zero
+ il.Emit(OpCodes.Call, CachedMethods.V256GetZeroUInt);
+ // Vector256.GreaterThan(expanded, zero)
+ il.Emit(OpCodes.Call, CachedMethods.V256GreaterThanUInt);
+ break;
+
+ case 2: // short/char: load 16 bytes, expand to 16 words
+ // Vector128.Load(ptr)
+ il.Emit(OpCodes.Call, CachedMethods.V128LoadByte);
+ // Avx2.ConvertToVector256Int16(bytes)
+ il.Emit(OpCodes.Call, CachedMethods.Avx2ConvertToV256Int16);
+ // .AsUInt16()
+ il.Emit(OpCodes.Call, CachedMethods.V256ShortAsUShort);
+ // Vector256.Zero
+ il.Emit(OpCodes.Call, CachedMethods.V256GetZeroUShort);
+ // Vector256.GreaterThan(expanded, zero)
+ il.Emit(OpCodes.Call, CachedMethods.V256GreaterThanUShort);
+ break;
+
+ case 1: // byte/bool: load 32 bytes, compare directly
+ // Vector256.Load(ptr)
+ il.Emit(OpCodes.Call, CachedMethods.V256LoadByte);
+ // Vector256.Zero
+ il.Emit(OpCodes.Call, CachedMethods.V256GetZeroByte);
+ // Vector256.GreaterThan(vec, zero)
+ il.Emit(OpCodes.Call, CachedMethods.V256GreaterThanByte);
+ break;
+
+ default:
+ throw new NotSupportedException($"Element size {elementSize} not supported");
+ }
+ }
+
+ /// <summary>
+ /// Emit inline V128 mask creation. Stack: byte* -> Vector128{T} (as mask)
+ /// </summary>
+ private static void EmitInlineMaskCreationV128(ILGenerator il, int elementSize)
+ {
+ switch (elementSize)
+ {
+ case 8: // double/long: load 2 bytes, expand to 2 qwords
+ // *(ushort*)ptr
+ il.Emit(OpCodes.Ldind_U2);
+ // Vector128.CreateScalar(value)
+ il.Emit(OpCodes.Call, CachedMethods.V128CreateScalarUShort);
+ // .AsByte()
+ il.Emit(OpCodes.Call, CachedMethods.V128UShortAsByte);
+ // Sse41.ConvertToVector128Int64(bytes)
+ il.Emit(OpCodes.Call, CachedMethods.Sse41ConvertToV128Int64);
+ // .AsUInt64()
+ il.Emit(OpCodes.Call, CachedMethods.V128LongAsULong);
+ // Vector128.Zero
+ il.Emit(OpCodes.Call, CachedMethods.V128GetZeroULong);
+ // Vector128.GreaterThan(expanded, zero)
+ il.Emit(OpCodes.Call, CachedMethods.V128GreaterThanULong);
+ break;
+
+ case 4: // float/int: load 4 bytes, expand to 4 dwords
+ // *(uint*)ptr
+ il.Emit(OpCodes.Ldind_U4);
+ // Vector128.CreateScalar(value)
+ il.Emit(OpCodes.Call, CachedMethods.V128CreateScalarUInt);
+ // .AsByte()
+ il.Emit(OpCodes.Call, CachedMethods.V128UIntAsByte);
+ // Sse41.ConvertToVector128Int32(bytes)
+ il.Emit(OpCodes.Call, CachedMethods.Sse41ConvertToV128Int32);
+ // .AsUInt32()
+ il.Emit(OpCodes.Call, CachedMethods.V128IntAsUInt);
+ // Vector128.Zero
+ il.Emit(OpCodes.Call, CachedMethods.V128GetZeroUInt);
+ // Vector128.GreaterThan(expanded, zero)
+ il.Emit(OpCodes.Call, CachedMethods.V128GreaterThanUInt);
+ break;
+
+ case 2: // short/char: load 8 bytes, expand to 8 words
+ // *(ulong*)ptr
+ il.Emit(OpCodes.Ldind_I8);
+ // Vector128.CreateScalar(value)
+ il.Emit(OpCodes.Call, CachedMethods.V128CreateScalarULong);
+ // .AsByte()
+ il.Emit(OpCodes.Call, CachedMethods.V128ULongAsByte);
+ // Sse41.ConvertToVector128Int16(bytes)
+ il.Emit(OpCodes.Call, CachedMethods.Sse41ConvertToV128Int16);
+ // .AsUInt16()
+ il.Emit(OpCodes.Call, CachedMethods.V128ShortAsUShort);
+ // Vector128.Zero
+ il.Emit(OpCodes.Call, CachedMethods.V128GetZeroUShort);
+ // Vector128.GreaterThan(expanded, zero)
+ il.Emit(OpCodes.Call, CachedMethods.V128GreaterThanUShort);
+ break;
+
+ case 1: // byte/bool: load 16 bytes, compare directly
+ // Vector128.Load(ptr)
+ il.Emit(OpCodes.Call, CachedMethods.V128LoadByte);
+ // Vector128.Zero
+ il.Emit(OpCodes.Call, CachedMethods.V128GetZeroByte);
+ // Vector128.GreaterThan(vec, zero)
+ il.Emit(OpCodes.Call, CachedMethods.V128GreaterThanByte);
+ break;
+
+ default:
+ throw new NotSupportedException($"Element size {elementSize} not supported");
+ }
+ }
+
+ #endregion
+
+ #region Scalar Fallback
+
+ /// <summary>
+ /// Scalar fallback for where operation.
+ /// </summary>
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private static unsafe void WhereScalar<T>(bool* cond, T* x, T* y, T* result, long count) where T : unmanaged
+ {
+ for (long i = 0; i < count; i++)
+ {
+ result[i] = cond[i] ? x[i] : y[i];
+ }
+ }
+
+ #endregion
+
+ // Per the CachedMethods pattern in ILKernelGenerator.cs, reflection lookups for np.where
+ // live alongside the other cached entries. Fail-fast at type init so a renamed API shows
+ // up immediately instead of NREs at first use.
+ private static partial class CachedMethods
+ {
+ #region Where Kernel Methods
+
+ private static MethodInfo FindGenericMethod(Type container, string name, int? paramCount = null)
+ {
+ foreach (var m in container.GetMethods())
+ {
+ if (m.Name == name && m.IsGenericMethodDefinition &&
+ (paramCount is null || m.GetParameters().Length == paramCount.Value))
+ return m;
+ }
+ throw new MissingMethodException(container.FullName, name);
+ }
+
+ private static MethodInfo FindMethodExact(Type container, string name, Type[] argTypes)
+ => container.GetMethod(name, argTypes)
+ ?? throw new MissingMethodException(container.FullName, name);
+
+ private static MethodInfo GetZeroGetter(Type vectorOfT)
+ => vectorOfT.GetProperty("Zero")?.GetMethod
+ ?? throw new MissingMethodException(vectorOfT.FullName, "get_Zero");
+
+ // Generic definitions — caller must MakeGenericMethod(typeof(T)) before emitting.
+ public static readonly MethodInfo V256LoadGeneric = FindGenericMethod(typeof(Vector256), "Load", 1);
+ public static readonly MethodInfo V256StoreGeneric = FindGenericMethod(typeof(Vector256), "Store", 2);
+ public static readonly MethodInfo V256ConditionalSelectGeneric = FindGenericMethod(typeof(Vector256), "ConditionalSelect");
+
+ public static readonly MethodInfo V128LoadGeneric = FindGenericMethod(typeof(Vector128), "Load", 1);
+ public static readonly MethodInfo V128StoreGeneric = FindGenericMethod(typeof(Vector128), "Store", 2);
+ public static readonly MethodInfo V128ConditionalSelectGeneric = FindGenericMethod(typeof(Vector128), "ConditionalSelect");
+
+ // Already-specialised generic methods used during mask creation.
+ public static readonly MethodInfo V256LoadByte = FindGenericMethod(typeof(Vector256), "Load").MakeGenericMethod(typeof(byte));
+ public static readonly MethodInfo V128LoadByte = FindGenericMethod(typeof(Vector128), "Load").MakeGenericMethod(typeof(byte));
+
+ public static readonly MethodInfo V128CreateScalarUInt = FindGenericMethod(typeof(Vector128), "CreateScalar").MakeGenericMethod(typeof(uint));
+ public static readonly MethodInfo V128CreateScalarULong = FindGenericMethod(typeof(Vector128), "CreateScalar").MakeGenericMethod(typeof(ulong));
+ public static readonly MethodInfo V128CreateScalarUShort = FindGenericMethod(typeof(Vector128), "CreateScalar").MakeGenericMethod(typeof(ushort));
+
+ public static readonly MethodInfo V128UIntAsByte = FindGenericMethod(typeof(Vector128), "AsByte").MakeGenericMethod(typeof(uint));
+ public static readonly MethodInfo V128ULongAsByte = FindGenericMethod(typeof(Vector128), "AsByte").MakeGenericMethod(typeof(ulong));
+ public static readonly MethodInfo V128UShortAsByte = FindGenericMethod(typeof(Vector128), "AsByte").MakeGenericMethod(typeof(ushort));
+
+ public static readonly MethodInfo V256LongAsULong = FindGenericMethod(typeof(Vector256), "AsUInt64").MakeGenericMethod(typeof(long));
+ public static readonly MethodInfo V256IntAsUInt = FindGenericMethod(typeof(Vector256), "AsUInt32").MakeGenericMethod(typeof(int));
+ public static readonly MethodInfo V256ShortAsUShort = FindGenericMethod(typeof(Vector256), "AsUInt16").MakeGenericMethod(typeof(short));
+
+ public static readonly MethodInfo V128LongAsULong = FindGenericMethod(typeof(Vector128), "AsUInt64").MakeGenericMethod(typeof(long));
+ public static readonly MethodInfo V128IntAsUInt = FindGenericMethod(typeof(Vector128), "AsUInt32").MakeGenericMethod(typeof(int));
+ public static readonly MethodInfo V128ShortAsUShort = FindGenericMethod(typeof(Vector128), "AsUInt16").MakeGenericMethod(typeof(short));
+
+ public static readonly MethodInfo V256GreaterThanULong = FindGenericMethod(typeof(Vector256), "GreaterThan").MakeGenericMethod(typeof(ulong));
+ public static readonly MethodInfo V256GreaterThanUInt = FindGenericMethod(typeof(Vector256), "GreaterThan").MakeGenericMethod(typeof(uint));
+ public static readonly MethodInfo V256GreaterThanUShort = FindGenericMethod(typeof(Vector256), "GreaterThan").MakeGenericMethod(typeof(ushort));
+ public static readonly MethodInfo V256GreaterThanByte = FindGenericMethod(typeof(Vector256), "GreaterThan").MakeGenericMethod(typeof(byte));
+
+ public static readonly MethodInfo V128GreaterThanULong = FindGenericMethod(typeof(Vector128), "GreaterThan").MakeGenericMethod(typeof(ulong));
+ public static readonly MethodInfo V128GreaterThanUInt = FindGenericMethod(typeof(Vector128), "GreaterThan").MakeGenericMethod(typeof(uint));
+ public static readonly MethodInfo V128GreaterThanUShort = FindGenericMethod(typeof(Vector128), "GreaterThan").MakeGenericMethod(typeof(ushort));
+ public static readonly MethodInfo V128GreaterThanByte = FindGenericMethod(typeof(Vector128), "GreaterThan").MakeGenericMethod(typeof(byte));
+
+ // Non-generic exact overloads on Avx2/Sse41 for byte-lane sign-extend expansion.
+ public static readonly MethodInfo Avx2ConvertToV256Int64 = FindMethodExact(typeof(Avx2), "ConvertToVector256Int64", new[] { typeof(Vector128<byte>) });
+ public static readonly MethodInfo Avx2ConvertToV256Int32 = FindMethodExact(typeof(Avx2), "ConvertToVector256Int32", new[] { typeof(Vector128<byte>) });
+ public static readonly MethodInfo Avx2ConvertToV256Int16 = FindMethodExact(typeof(Avx2), "ConvertToVector256Int16", new[] { typeof(Vector128<byte>) });
+ public static readonly MethodInfo Sse41ConvertToV128Int64 = FindMethodExact(typeof(Sse41), "ConvertToVector128Int64", new[] { typeof(Vector128<byte>) });
+ public static readonly MethodInfo Sse41ConvertToV128Int32 = FindMethodExact(typeof(Sse41), "ConvertToVector128Int32", new[] { typeof(Vector128<byte>) });
+ public static readonly MethodInfo Sse41ConvertToV128Int16 = FindMethodExact(typeof(Sse41), "ConvertToVector128Int16", new[] { typeof(Vector128<byte>) });
+
+ // Vector*.Zero property getters — emitted as a call, not a field load, so we cache the getter MethodInfo.
+ public static readonly MethodInfo V256GetZeroULong = GetZeroGetter(typeof(Vector256<ulong>));
+ public static readonly MethodInfo V256GetZeroUInt = GetZeroGetter(typeof(Vector256<uint>));
+ public static readonly MethodInfo V256GetZeroUShort = GetZeroGetter(typeof(Vector256<ushort>));
+ public static readonly MethodInfo V256GetZeroByte = GetZeroGetter(typeof(Vector256<byte>));
+ public static readonly MethodInfo V128GetZeroULong = GetZeroGetter(typeof(Vector128<ulong>));
+ public static readonly MethodInfo V128GetZeroUInt = GetZeroGetter(typeof(Vector128<uint>));
+ public static readonly MethodInfo V128GetZeroUShort = GetZeroGetter(typeof(Vector128<ushort>));
+ public static readonly MethodInfo V128GetZeroByte = GetZeroGetter(typeof(Vector128<byte>));
+
+ #endregion
+ }
+ }
+}
diff --git a/src/NumSharp.Core/Backends/Kernels/ILKernelGenerator.cs b/src/NumSharp.Core/Backends/Kernels/ILKernelGenerator.cs
index 37536cf0..134ae6a0 100644
--- a/src/NumSharp.Core/Backends/Kernels/ILKernelGenerator.cs
+++ b/src/NumSharp.Core/Backends/Kernels/ILKernelGenerator.cs
@@ -290,7 +290,7 @@ public static partial class ILKernelGenerator
/// Caching these avoids repeated GetMethod() lookups during kernel generation.
/// All fields use ?? throw to fail fast at type load if a method is not found.
///
- private static class CachedMethods
+ private static partial class CachedMethods
{
// Math methods (double versions)
public static readonly MethodInfo MathPow = typeof(Math).GetMethod(nameof(Math.Pow), new[] { typeof(double), typeof(double) })
diff --git a/src/NumSharp.Core/Creation/np.asanyarray.cs b/src/NumSharp.Core/Creation/np.asanyarray.cs
index 5e83dc00..e575250c 100644
--- a/src/NumSharp.Core/Creation/np.asanyarray.cs
+++ b/src/NumSharp.Core/Creation/np.asanyarray.cs
@@ -1,4 +1,9 @@
using System;
+using System.Collections;
+using System.Collections.Generic;
+using System.Linq;
+using System.Runtime.CompilerServices;
+using System.Runtime.InteropServices;
namespace NumSharp
{
@@ -18,29 +23,341 @@ public static NDArray asanyarray(in object a, Type dtype = null) //todo support
case null:
throw new ArgumentNullException(nameof(a));
case NDArray nd:
- return nd;
+ if (dtype == null || Equals(nd.dtype, dtype))
+ return nd;
+ return nd.astype(dtype, true);
+ case object[] objArr:
+ // object[] has no fixed dtype — route through type-promotion path.
+ // new NDArray(object[]) throws NotSupportedException since object isn't a
+ // supported element type.
+ ret = ConvertNonGenericEnumerable(objArr);
+ if (ret is null)
+ throw new NotSupportedException($"Unable to resolve asanyarray for object[] (length {objArr.Length}): element type is not a supported NumSharp dtype.");
+ break;
case Array array:
ret = new NDArray(array);
break;
case string str:
ret = str; //implicit cast located in NDArray.Implicit.Array
break;
+
+ case IEnumerable e: ret = np.array(ToArrayFast(e)); break;
+ case IEnumerable e: ret = np.array(ToArrayFast(e)); break;
+ case IEnumerable e: ret = np.array(ToArrayFast(e)); break;
+ case IEnumerable e: ret = np.array(ToArrayFast(e)); break;
+ case IEnumerable e: ret = np.array(ToArrayFast(e)); break;
+ case IEnumerable e: ret = np.array(ToArrayFast(e)); break;
+ case IEnumerable e: ret = np.array(ToArrayFast(e)); break;
+ case IEnumerable e: ret = np.array(ToArrayFast(e)); break;
+ case IEnumerable e: ret = np.array(ToArrayFast(e)); break;
+ case IEnumerable e: ret = np.array(ToArrayFast(e)); break;
+ case IEnumerable e: ret = np.array(ToArrayFast(e)); break;
+ case IEnumerable e: ret = np.array(ToArrayFast(e)); break;
+
default:
var type = a.GetType();
- //is it a scalar
if (type.IsPrimitive || type == typeof(decimal))
{
ret = NDArray.Scalar(a);
break;
}
- throw new NotSupportedException($"Unable resolve asanyarray for type {a.GetType().Name}");
+ // Memory/ReadOnlyMemory do not implement IEnumerable.
+ if (type.IsGenericType)
+ {
+ var genericDef = type.GetGenericTypeDefinition();
+ if (genericDef == typeof(Memory<>) || genericDef == typeof(ReadOnlyMemory<>))
+ {
+ ret = ConvertMemory(a, type);
+ if (ret is not null)
+ break;
+ }
+ }
+
+ if (a is ITuple tuple)
+ {
+ ret = ConvertTuple(tuple);
+ if (ret is not null)
+ break;
+ }
+
+ if (a is IEnumerable enumerable)
+ {
+ ret = ConvertNonGenericEnumerable(enumerable);
+ if (ret is not null)
+ break;
+ }
+
+ if (a is IEnumerator enumerator)
+ {
+ ret = ConvertEnumerator(enumerator);
+ if (ret is not null)
+ break;
+ }
+
+ throw new NotSupportedException($"Unable to resolve asanyarray for type {type.Name}");
}
- if (dtype != null && a.GetType() != dtype)
+ if (dtype != null && !Equals(ret.dtype, dtype))
return ret.astype(dtype, true);
return ret;
}
+
+        /// <summary>
+        ///     Copies an <see cref="IEnumerable{T}"/> into a freshly allocated <typeparamref name="T"/>[].
+        ///     Specialised for List&lt;T&gt; and ICollection&lt;T&gt; to skip the enumerator and to
+        ///     use <see cref="GC.AllocateUninitializedArray{T}(int, bool)"/> since we overwrite every slot.
+        /// </summary>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        private static T[] ToArrayFast<T>(IEnumerable<T> source)
+        {
+            if (source is List<T> list)
+            {
+                // CollectionsMarshal.AsSpan reads the list's backing array directly — no enumerator.
+                var span = CollectionsMarshal.AsSpan(list);
+                var arr = GC.AllocateUninitializedArray<T>(span.Length);
+                span.CopyTo(arr);
+                return arr;
+            }
+
+            if (source is ICollection<T> collection)
+            {
+                var arr = GC.AllocateUninitializedArray<T>(collection.Count);
+                collection.CopyTo(arr, 0);
+                return arr;
+            }
+
+            // Unknown length — fall back to LINQ's growing-buffer ToArray.
+            return source.ToArray();
+        }
+
+        /// <summary>
+        ///     Converts Memory&lt;T&gt; or ReadOnlyMemory&lt;T&gt; to an NDArray.
+        ///     Uses Span.CopyTo + GC.AllocateUninitializedArray for optimal performance.
+        ///     Returns null when the element type is not a supported NumSharp dtype,
+        ///     letting the caller fall through to its generic error handling.
+        /// </summary>
+        private static NDArray ConvertMemory(object a, Type type)
+        {
+            var elementType = type.GetGenericArguments()[0];
+            var isReadOnly = type.GetGenericTypeDefinition() == typeof(ReadOnlyMemory<>);
+
+            // One unboxing cast per supported dtype; Span<T> converts implicitly to
+            // ReadOnlySpan<T>, so both branches feed the same SpanToArrayFast<T>.
+            if (elementType == typeof(bool)) return np.array(SpanToArrayFast(isReadOnly ? ((ReadOnlyMemory<bool>)a).Span : ((Memory<bool>)a).Span));
+            if (elementType == typeof(byte)) return np.array(SpanToArrayFast(isReadOnly ? ((ReadOnlyMemory<byte>)a).Span : ((Memory<byte>)a).Span));
+            if (elementType == typeof(short)) return np.array(SpanToArrayFast(isReadOnly ? ((ReadOnlyMemory<short>)a).Span : ((Memory<short>)a).Span));
+            if (elementType == typeof(ushort)) return np.array(SpanToArrayFast(isReadOnly ? ((ReadOnlyMemory<ushort>)a).Span : ((Memory<ushort>)a).Span));
+            if (elementType == typeof(int)) return np.array(SpanToArrayFast(isReadOnly ? ((ReadOnlyMemory<int>)a).Span : ((Memory<int>)a).Span));
+            if (elementType == typeof(uint)) return np.array(SpanToArrayFast(isReadOnly ? ((ReadOnlyMemory<uint>)a).Span : ((Memory<uint>)a).Span));
+            if (elementType == typeof(long)) return np.array(SpanToArrayFast(isReadOnly ? ((ReadOnlyMemory<long>)a).Span : ((Memory<long>)a).Span));
+            if (elementType == typeof(ulong)) return np.array(SpanToArrayFast(isReadOnly ? ((ReadOnlyMemory<ulong>)a).Span : ((Memory<ulong>)a).Span));
+            if (elementType == typeof(char)) return np.array(SpanToArrayFast(isReadOnly ? ((ReadOnlyMemory<char>)a).Span : ((Memory<char>)a).Span));
+            if (elementType == typeof(float)) return np.array(SpanToArrayFast(isReadOnly ? ((ReadOnlyMemory<float>)a).Span : ((Memory<float>)a).Span));
+            if (elementType == typeof(double)) return np.array(SpanToArrayFast(isReadOnly ? ((ReadOnlyMemory<double>)a).Span : ((Memory<double>)a).Span));
+            if (elementType == typeof(decimal)) return np.array(SpanToArrayFast(isReadOnly ? ((ReadOnlyMemory<decimal>)a).Span : ((Memory<decimal>)a).Span));
+
+            return null;
+        }
+
+        /// <summary>
+        ///     Optimized Span to Array conversion using GC.AllocateUninitializedArray.
+        ///     Safe to skip zeroing because CopyTo overwrites every element.
+        /// </summary>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        private static T[] SpanToArrayFast<T>(ReadOnlySpan<T> span)
+        {
+            var arr = GC.AllocateUninitializedArray<T>(span.Length);
+            span.CopyTo(arr);
+            return arr;
+        }
+
+        /// <summary>
+        ///     Converts a non-generic IEnumerable to an NDArray.
+        ///     Element type is detected from the first item.
+        /// </summary>
+        private static NDArray ConvertNonGenericEnumerable(IEnumerable enumerable)
+            => ConvertEnumerator(enumerable.GetEnumerator());
+
+ ///
+ /// Converts a non-generic IEnumerator to an NDArray.
+ /// Element type is detected from items with NumPy-like type promotion.
+ /// Empty collections return empty double[] to match NumPy's float64 default.
+ ///
+ private static NDArray ConvertEnumerator(IEnumerator enumerator)
+ {
+ List