Spaces:

Amogh1221
/

deepcastle-api-2

Running

App Files Files Community

Amogh1221 committed on 18 days ago

Commit

78d4aa4

verified ·

1 Parent(s): 1251e32

Upload 70 files

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

.gitattributes +2 -0
Dockerfile +79 -0
main.py +949 -0
openings.json +0 -0
output.nnue +3 -0
small_output.nnue +3 -0
src/Makefile +1184 -0
src/benchmark.cpp +516 -0
src/benchmark.h +42 -0
src/bitboard.cpp +189 -0
src/bitboard.h +458 -0
src/engine.cpp +411 -0
src/engine.h +134 -0
src/evaluate.cpp +124 -0
src/evaluate.h +58 -0
src/history.h +273 -0
src/incbin/UNLICENCE +26 -0
src/incbin/incbin.h +476 -0
src/main.cpp +43 -0
src/memory.cpp +199 -0
src/memory.h +333 -0
src/misc.cpp +549 -0
src/misc.h +538 -0
src/movegen.cpp +312 -0
src/movegen.h +73 -0
src/movepick.cpp +313 -0
src/movepick.h +80 -0
src/nnue/features/full_threats.cpp +343 -0
src/nnue/features/full_threats.h +106 -0
src/nnue/features/half_ka_v2_hm.cpp +69 -0
src/nnue/features/half_ka_v2_hm.h +128 -0
src/nnue/layers/affine_transform.h +312 -0
src/nnue/layers/affine_transform_sparse_input.h +379 -0
src/nnue/layers/clipped_relu.h +170 -0
src/nnue/layers/sqr_clipped_relu.h +109 -0
src/nnue/network.cpp +415 -0
src/nnue/network.h +161 -0
src/nnue/nnue_accumulator.cpp +952 -0
src/nnue/nnue_accumulator.h +206 -0
src/nnue/nnue_architecture.h +165 -0
src/nnue/nnue_common.h +298 -0
src/nnue/nnue_feature_transformer.h +456 -0
src/nnue/nnue_misc.cpp +193 -0
src/nnue/nnue_misc.h +74 -0
src/nnue/simd.h +440 -0
src/numa.h +1718 -0
src/perft.h +67 -0
src/position.cpp +1566 -0
src/position.h +414 -0
src/score.cpp +48 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+output.nnue filter=lfs diff=lfs merge=lfs -text
+small_output.nnue filter=lfs diff=lfs merge=lfs -text

Dockerfile ADDED Viewed

	@@ -0,0 +1,79 @@

+# Use Python 3.12 slim
+FROM python:3.12-slim
+# Install system dependencies
+# (compiler toolchain for the engine build plus download/unpack utilities;
+# the apt lists are removed in the same layer to keep the image small)
+RUN apt-get update && apt-get install -y \
+    build-essential \
+    make \
+    g++ \
+    wget \
+    curl \
+    xz-utils \
+    findutils \
+    && rm -rf /var/lib/apt/lists/*
+# Set working directory
+WORKDIR /app
+# Copy ALL files from the repository
+COPY . .
+# ============================================================
+# CUSTOM DEEPCASTLE ENGINE BUILD
+# Supports both repo layouts:
+# 1) /app/engine/src (full repo)
+# 2) /app/src        (HF minimal repo)
+# ============================================================
+# Steps performed by the RUN below, in order:
+#   - pick the build directory (fails the build if neither layout exists)
+#   - download two .nnue network files into the build directory
+#   - if ../scripts/net.sh is missing, write a stub that exits 0 in its place
+#   - run `make build` and install the resulting `stockfish` binary as
+#     /app/engine_bin/deepcastle
+RUN if [ -d /app/engine/src ]; then BUILD_DIR=/app/engine/src; \
+    elif [ -d /app/src ]; then BUILD_DIR=/app/src; \
+    else echo "Engine source dir not found"; exit 1; fi && \
+    cd "$BUILD_DIR" && \
+    wget -q -O nn-9a0cc2a62c52.nnue https://tests.stockfishchess.org/api/nn/nn-9a0cc2a62c52.nnue && \
+    wget -q -O nn-47fc8b7fff06.nnue https://tests.stockfishchess.org/api/nn/nn-47fc8b7fff06.nnue && \
+    if [ ! -f ../scripts/net.sh ]; then \
+      mkdir -p ../scripts; \
+      printf '#!/bin/sh\n# HF minimal layout fallback: skip default net fetch\nexit 0\n' > ../scripts/net.sh; \
+      chmod +x ../scripts/net.sh; \
+    fi && \
+    make -j2 build ARCH=x86-64 && \
+    mkdir -p /app/engine_bin && \
+    cp stockfish /app/engine_bin/deepcastle && \
+    chmod +x /app/engine_bin/deepcastle
+# ============================================================
+# LAUNCHER PREPARATION
+# ============================================================
+WORKDIR /app
+# Copies the first main.py that `find` reports under /app to /app/launcher.py.
+# NOTE(review): if several main.py files exist, which one is first depends on
+# find's traversal order — confirm only one is expected.
+RUN LAUNCHER_PATH=$(find /app -name "main.py" | head -n 1) && \
+    cp "$LAUNCHER_PATH" /app/launcher.py
+# ============================================================
+# BRAIN PLACEMENT
+# ============================================================
+# Map your custom brains for the server
+# (each file is copied next to the engine binary only if it exists in /app)
+RUN if [ -f /app/output.nnue ]; then cp /app/output.nnue /app/engine_bin/output.nnue; fi && \
+    if [ -f /app/small_output.nnue ]; then cp /app/small_output.nnue /app/engine_bin/small_output.nnue; fi
+# Force permissions
+# NOTE(review): 777 makes the directory world-writable; confirm whether a
+# narrower mode (e.g. 755) is sufficient for the runtime user.
+RUN chmod -R 777 /app/engine_bin
+# ============================================================
+# BACKEND SETUP
+# ============================================================
+# Prefer /app/server/requirements.txt when present; otherwise install a fixed
+# (unpinned) package list.
+RUN if [ -f /app/server/requirements.txt ]; then \
+      pip install --no-cache-dir -r /app/server/requirements.txt; \
+    else \
+      pip install --no-cache-dir fastapi "uvicorn[standard]" websockets python-chess pydantic psutil; \
+    fi
+# Explicit Paths
+# (main.py reads DEEPCASTLE_ENGINE_PATH / ENGINE_PATH, NNUE_PATH and NNUE_SMALL_PATH)
+ENV ENGINE_PATH=/app/engine_bin/deepcastle
+ENV DEEPCASTLE_ENGINE_PATH=/app/engine_bin/deepcastle
+ENV NNUE_PATH=/app/engine_bin/output.nnue
+ENV NNUE_SMALL_PATH=/app/engine_bin/small_output.nnue
+ENV PYTHONPATH="/app:/app/server"
+# Port exposed by the image
+EXPOSE 7860
+# START
+CMD ["python3", "/app/launcher.py"]

main.py ADDED Viewed

	@@ -0,0 +1,949 @@

+from fastapi import FastAPI, HTTPException, WebSocket, WebSocketDisconnect, Request
+from fastapi.middleware.cors import CORSMiddleware
+from fastapi.responses import JSONResponse
+from pydantic import BaseModel
+from typing import Optional, List, Dict, Tuple
+from contextlib import asynccontextmanager
+import os
+import math
+import chess
+import chess.engine
+import asyncio
+import json
+import gc
+import ctypes
+import psutil
+# ─── Force memory back to OS (Linux/HF compatible) ────────────────────────────
+def force_memory_release():
+    """
+    Run GC twice (catches cyclic references missed on first pass),
+    then call malloc_trim to return freed pages back to the OS.
+    Without this, Python holds freed memory in its own pool and
+    the OS still shows high RAM even after objects are deleted.
+    """
+    gc.collect()
+    gc.collect()
+    try:
+        ctypes.CDLL("libc.so.6").malloc_trim(0)
+    except Exception:
+        pass
+# ─── Multiplayer / Challenge Manager ──────────────────────────────────────────
+class ConnectionManager:
+    def __init__(self):
+        self.active_connections: Dict[str, List[WebSocket]] = {}
+    async def connect(self, websocket: WebSocket, match_id: str):
+        await websocket.accept()
+        if match_id not in self.active_connections:
+            self.active_connections[match_id] = []
+        self.active_connections[match_id].append(websocket)
+    def disconnect(self, websocket: WebSocket, match_id: str):
+        if match_id in self.active_connections:
+            if websocket in self.active_connections[match_id]:
+                self.active_connections[match_id].remove(websocket)
+            # FIX: Clean up empty rooms so dict doesn't grow forever
+            if not self.active_connections[match_id]:
+                del self.active_connections[match_id]
+    async def broadcast(self, message: str, match_id: str, exclude: WebSocket = None):
+        if match_id not in self.active_connections:
+            return
+        dead = []
+        for connection in self.active_connections[match_id]:
+            if connection == exclude:
+                continue
+            try:
+                await connection.send_text(message)
+            except Exception:
+                # FIX: Track dead sockets instead of silently ignoring them
+                dead.append(connection)
+        # FIX: Remove dead sockets after iteration to free memory
+        for d in dead:
+            self.active_connections[match_id].remove(d)
+        # FIX: Clean up empty room after removing dead sockets
+        if match_id in self.active_connections and not self.active_connections[match_id]:
+            del self.active_connections[match_id]
+# Single shared room registry used by the WebSocket endpoint and /ram.
+manager = ConnectionManager()
+# Paths relative to the Docker container
+# Engine binary: DEEPCASTLE_ENGINE_PATH wins, then ENGINE_PATH, then the default below.
+DEEPCASTLE_ENGINE_PATH = os.environ.get(
+    "DEEPCASTLE_ENGINE_PATH",
+    os.environ.get("ENGINE_PATH", "/app/engine_bin/deepcastle"),
+)
+# Network files passed to the engine as the EvalFile / EvalFileSmall options.
+NNUE_PATH = os.environ.get("NNUE_PATH", "/app/engine_bin/output.nnue")
+NNUE_SMALL_PATH = os.environ.get("NNUE_SMALL_PATH", "/app/engine_bin/small_output.nnue")
+# Request body shared by /move and /analysis-move.
+class MoveRequest(BaseModel):
+    # Position to search, as a FEN string (parsed with chess.Board).
+    fen: str
+    # Forwarded to chess.engine.Limit(time=...) and used to size the search timeout.
+    time: float = 1.0
+    # Forwarded to chess.engine.Limit(depth=...); None means no depth limit is set.
+    depth: Optional[int] = None
+# Response body shared by /move and /analysis-move.
+class MoveResponse(BaseModel):
+    # Engine's chosen move in UCI notation.
+    bestmove: str
+    # Evaluation from White's point of view, in pawns; mate scores are reported as +/-100.0.
+    score: float
+    # Search statistics taken from the engine info (nps is recomputed from
+    # nodes / time when both are available).
+    depth: int
+    nodes: int
+    nps: int
+    # Up to the first five principal-variation moves in SAN, space-separated.
+    pv: str
+    # Mate distance from White's point of view when the score is a mate, else None.
+    mate_in: Optional[int] = None
+    # Opening name looked up by the piece-placement field of the FEN, if known.
+    opening: Optional[str] = None
+# Request body for /analyze-game.
+class AnalyzeRequest(BaseModel):
+    # Moves of the game to analyse.
+    # NOTE(review): presumably UCI or SAN strings in play order — confirm against analyze_game.
+    moves: List[str]
+    # Forwarded to chess.engine.Limit(time=...) for each analysed position.
+    time_per_move: float = 0.1
+    # NOTE(review): presumably "white" or "black" — confirm accepted values.
+    player_color: str = "white"
+    # Optional starting position; the standard initial position is used when omitted.
+    start_fen: Optional[str] = None
+# Per-move entry of an AnalyzeResponse.
+# NOTE(review): field meanings below are inferred from names and from the
+# classification helpers in this file — confirm against analyze_game.
+class MoveAnalysis(BaseModel):
+    move_num: int
+    # Played move (SAN) and the engine's preferred move.
+    san: str
+    best_move: str
+    # Label such as "Brilliant", "Great", "Best", "Excellent", "Good",
+    # "Inaccuracy", "Mistake", "Blunder" (see get_move_classification) or "Book".
+    classification: str
+    opening: Optional[str] = None
+    # presumably centipawn loss of the played move — TODO confirm
+    cpl: float
+    score_before: float
+    score_after: float
+# Response body for /analyze-game.
+class AnalyzeResponse(BaseModel):
+    # NOTE(review): scale of accuracy / estimated_elo is not visible here — confirm in analyze_game.
+    accuracy: float
+    estimated_elo: int
+    # One entry per analysed move.
+    moves: List[MoveAnalysis]
+    # Number of moves per classification label.
+    counts: Dict[str, int]
+# Global engine instance
+# Shared by every endpoint; None until an engine has been started and reset
+# to None whenever the engine is detached or found terminated.
+_GLOBAL_DEEPCASTLE_ENGINE = None
+# Guards starting/replacing the global engine reference.
+_ENGINE_LOCK = asyncio.Lock()
+# Held by endpoints while they talk to the engine (search, ping, hash clear).
+_ENGINE_IO_LOCK = asyncio.Lock()
+def _engine_hash_mb() -> int:
+    try:
+        # Default used for transposition table size testing.
+        v = int(os.environ.get("ENGINE_HASH_MB", "512"))
+    except ValueError:
+        v = 128
+    return max(8, min(2048, v))
+async def _get_or_start_engine(engine_path: str, *, role: str, options: Optional[dict] = None):
+    global _GLOBAL_DEEPCASTLE_ENGINE
+    current_engine = _GLOBAL_DEEPCASTLE_ENGINE
+    if current_engine is not None:
+        try:
+            if not current_engine.is_terminated():
+                return current_engine
+        except Exception:
+            _GLOBAL_DEEPCASTLE_ENGINE = None
+        else:
+            _GLOBAL_DEEPCASTLE_ENGINE = None
+    async with _ENGINE_LOCK:
+        current_engine = _GLOBAL_DEEPCASTLE_ENGINE
+        if current_engine is not None:
+            try:
+                if not current_engine.is_terminated():
+                    return current_engine
+            except Exception:
+                _GLOBAL_DEEPCASTLE_ENGINE = None
+            else:
+                _GLOBAL_DEEPCASTLE_ENGINE = None
+        if not os.path.exists(engine_path):
+            raise HTTPException(status_code=500, detail=f"{role} binary NOT FOUND at {engine_path}")
+        try:
+            _, engine = await chess.engine.popen_uci(engine_path)
+            if options:
+                await engine.configure(options)
+            if os.path.exists(NNUE_PATH):
+                try:
+                    await engine.configure({"EvalFile": NNUE_PATH})
+                except Exception as ne:
+                    print(f"[ERROR] EvalFile load failed: {str(ne)}")
+            else:
+                print(f"[WARNING] EvalFile not found at {NNUE_PATH}")
+            if os.path.exists(NNUE_SMALL_PATH):
+                try:
+                    await engine.configure({"EvalFileSmall": NNUE_SMALL_PATH})
+                except Exception as ne:
+                    print(f"[ERROR] EvalFileSmall load failed: {str(ne)}")
+            else:
+                print(f"[WARNING] EvalFileSmall not found at {NNUE_SMALL_PATH}")
+            _GLOBAL_DEEPCASTLE_ENGINE = engine
+            return engine
+        except Exception as e:
+            raise HTTPException(status_code=500, detail=f"{role} crash: {str(e)}")
+async def get_deepcastle_engine(hash_mb: Optional[int] = None):
+    h = hash_mb if hash_mb is not None else _engine_hash_mb()
+    return await _get_or_start_engine(
+        DEEPCASTLE_ENGINE_PATH,
+        role="deepcastle",
+        options={"Hash": h, "Threads": 1},
+    )
+async def get_stockfish_engine(hash_mb: Optional[int] = None):
+    """Alias of get_deepcastle_engine(); returns the same shared engine."""
+    # Simply forwards hash_mb; there is no separate Stockfish process.
+    return await get_deepcastle_engine(hash_mb=hash_mb)
+async def _clear_engine_hash(engine) -> None:
+    """Best-effort clear of transposition table/internal game state."""
+    try:
+        # Preferred path for Stockfish-family engines.
+        await engine.configure({"Clear Hash": True})
+    except Exception:
+        pass
+    try:
+        # Also reset game state so the engine does not keep game-history context.
+        if hasattr(engine, "protocol") and hasattr(engine.protocol, "send_line"):
+            engine.protocol.send_line("ucinewgame")
+        await asyncio.wait_for(engine.ping(), timeout=5.0)
+    except Exception as e:
+        print(f"[WARNING] Failed to clear engine hash: {e}")
+async def shutdown_engine_async() -> None:
+    global _GLOBAL_DEEPCASTLE_ENGINE
+    async with _ENGINE_IO_LOCK:
+        async with _ENGINE_LOCK:
+            eng = _GLOBAL_DEEPCASTLE_ENGINE
+            _GLOBAL_DEEPCASTLE_ENGINE = None
+    if eng:
+        try:
+            await asyncio.wait_for(eng.quit(), timeout=5.0)
+        except Exception:
+            pass
+async def _detach_and_quit_engine(engine) -> None:
+    global _GLOBAL_DEEPCASTLE_ENGINE
+    async with _ENGINE_LOCK:
+        if _GLOBAL_DEEPCASTLE_ENGINE is engine:
+            _GLOBAL_DEEPCASTLE_ENGINE = None
+    try:
+        await asyncio.wait_for(engine.quit(), timeout=5.0)
+    except Exception:
+        pass
+def _search_timeout_sec(request_time: float, depth: Optional[int] = None) -> float:
+    try:
+        cap = float(os.environ.get("ENGINE_SEARCH_TIMEOUT_SEC", "120"))
+    except ValueError:
+        cap = 120.0
+    cap = max(15.0, min(600.0, cap))
+    if request_time and request_time > 0:
+        return min(cap, max(request_time * 3.0 + 10.0, 30.0))
+    return cap
+def _analyze_ply_timeout(time_per_move: float) -> float:
+    try:
+        cap = float(os.environ.get("ENGINE_SEARCH_TIMEOUT_SEC", "120"))
+    except ValueError:
+        cap = 120.0
+    cap = max(15.0, min(600.0, cap))
+    if time_per_move and time_per_move > 0:
+        return min(cap, max(time_per_move * 80.0 + 15.0, 30.0))
+    return cap
+async def _engine_call(engine, coro, timeout_sec: float):
+    try:
+        return await asyncio.wait_for(coro, timeout=timeout_sec)
+    except asyncio.TimeoutError:
+        await _detach_and_quit_engine(engine)
+        raise HTTPException(status_code=504, detail="Engine search timed out")
+# ─── Background Memory Cleanup Task ───────────────────────────────────────────
+_RAM_CLEANUP_THRESHOLD_MB = float(os.environ.get("RAM_CLEANUP_THRESHOLD_MB", "300"))
+_RAM_CLEANUP_INTERVAL_SEC = int(os.environ.get("RAM_CLEANUP_INTERVAL_SEC", "60"))
+_CLEAR_HASH_AFTER_MOVE = os.environ.get("CLEAR_HASH_AFTER_MOVE", "1").strip().lower() not in {"0", "false", "no", "off"}
+_RESTART_ENGINE_AFTER_MOVE = os.environ.get("RESTART_ENGINE_AFTER_MOVE", "1").strip().lower() not in {"0", "false", "no", "off"}
+async def memory_cleanup_task():
+    """
+    Background task that runs every 5 minutes.
+    - Always runs GC twice and malloc_trim to return memory to OS.
+    - If RAM exceeds threshold, also clears engine hash table.
+    """
+    while True:
+        await asyncio.sleep(_RAM_CLEANUP_INTERVAL_SEC)
+        try:
+            process = psutil.Process(os.getpid())
+            mem_mb = process.memory_info().rss / 1024 / 1024
+            if mem_mb > _RAM_CLEANUP_THRESHOLD_MB:
+                print(f"[CLEANUP] RAM at {mem_mb:.1f}MB (threshold {_RAM_CLEANUP_THRESHOLD_MB}MB) — clearing engine hash")
+                engine = _GLOBAL_DEEPCASTLE_ENGINE
+                if engine is not None:
+                    try:
+                        if not engine.is_terminated():
+                            async with _ENGINE_IO_LOCK:
+                                await _clear_engine_hash(engine)
+                    except Exception:
+                        pass
+                force_memory_release()
+                after_mb = process.memory_info().rss / 1024 / 1024
+                print(f"[CLEANUP] Done. RAM: {mem_mb:.1f}MB → {after_mb:.1f}MB")
+            else:
+                # Always nudge GC + malloc_trim even when RAM is fine
+                force_memory_release()
+                print(f"[CLEANUP] RAM at {mem_mb:.1f}MB — OK")
+        except Exception as e:
+            print(f"[CLEANUP] Error during cleanup: {e}")
+@asynccontextmanager
+async def lifespan(app: FastAPI):
+    """Start the memory cleanup task on startup; cancel it and quit the engine on shutdown."""
+    # Startup: launch the periodic cleanup loop and log the effective config.
+    cleanup_task = asyncio.create_task(memory_cleanup_task())
+    print(f"[STARTUP] Memory cleanup task started (every {_RAM_CLEANUP_INTERVAL_SEC}s, threshold {_RAM_CLEANUP_THRESHOLD_MB}MB)")
+    print(
+        f"[STARTUP] Engine config: hash_mb={_engine_hash_mb()} "
+        f"clear_after_move={_CLEAR_HASH_AFTER_MOVE} restart_after_move={_RESTART_ENGINE_AFTER_MOVE}"
+    )
+    # The application serves requests while suspended here.
+    yield
+    # Shutdown: stop the cleanup loop and wait for the cancellation to land.
+    cleanup_task.cancel()
+    try:
+        await cleanup_task
+    except asyncio.CancelledError:
+        pass
+    await shutdown_engine_async()
+# ASGI application; lifespan() above handles startup/shutdown work.
+app = FastAPI(title="Deepcastle Engine API", lifespan=lifespan)
+# FIX: Global timeout middleware — kills hung requests so they don't queue in memory
+# Any HTTP request that takes longer than 180 seconds is answered with a 504.
+@app.middleware("http")
+async def timeout_middleware(request: Request, call_next):
+    try:
+        return await asyncio.wait_for(call_next(request), timeout=180.0)
+    except asyncio.TimeoutError:
+        return JSONResponse({"detail": "Request timed out"}, status_code=504)
+# CORS: every origin, method and header is allowed.
+# NOTE(review): wide open by design? Confirm this is intended for production.
+app.add_middleware(
+    CORSMiddleware,
+    allow_origins=["*"],
+    allow_methods=["*"],
+    allow_headers=["*"],
+)
+# ─── WebSocket ─────────────────────────────────────────────────────────────────
+@app.websocket("/ws/{match_id}")
+async def websocket_endpoint(websocket: WebSocket, match_id: str):
+    await manager.connect(websocket, match_id)
+    await manager.broadcast(json.dumps({"type": "join"}), match_id, exclude=websocket)
+    try:
+        while True:
+            data = await websocket.receive_text()
+            await manager.broadcast(data, match_id, exclude=websocket)
+    except WebSocketDisconnect:
+        manager.disconnect(websocket, match_id)
+        await manager.broadcast(json.dumps({"type": "opponent_disconnected"}), match_id)
+        force_memory_release()
+    except Exception:
+        manager.disconnect(websocket, match_id)
+        await manager.broadcast(json.dumps({"type": "opponent_disconnected"}), match_id)
+        force_memory_release()
+# ─── Health & Monitoring ───────────────────────────────────────────────────────
+@app.get("/")
+def home():
+    return {"status": "online", "engine": "Deepcastle Hybrid Neural", "platform": "Hugging Face Spaces"}
+# FIX: Accept HEAD requests from UptimeRobot (was returning 405)
+@app.api_route("/health", methods=["GET", "HEAD"])
+def health():
+    if not os.path.exists(DEEPCASTLE_ENGINE_PATH):
+        return {"status": "error", "message": "Missing engine binary: deepcastle"}
+    force_memory_release()
+    return {"status": "ok", "engine": "deepcastle"}
+# Readiness probe: starts the engine if needed and requires it to answer a
+# ping within 5 seconds. Responds 503 when the binary is missing or the ping
+# fails; HTTPExceptions raised while obtaining the engine pass through as-is.
+@app.get("/health/ready")
+async def health_ready():
+    if not os.path.exists(DEEPCASTLE_ENGINE_PATH):
+        raise HTTPException(status_code=503, detail="Missing engine binary")
+    try:
+        engine = await get_deepcastle_engine()
+        # Hold the IO lock so the ping does not interleave with a running search.
+        async with _ENGINE_IO_LOCK:
+            await asyncio.wait_for(engine.ping(), timeout=5.0)
+        return {"status": "ok", "engine": "responsive"}
+    except HTTPException:
+        raise
+    except Exception as e:
+        raise HTTPException(status_code=503, detail=str(e))
+@app.get("/ram")
+def ram_usage():
+    """Monitor RAM usage for API process + child engine processes."""
+    process = psutil.Process(os.getpid())
+    mem = process.memory_info()
+    mem_mb = mem.rss / 1024 / 1024
+    child_rss_mb = 0.0
+    child_count = 0
+    try:
+        for child in process.children(recursive=True):
+            try:
+                child_rss_mb += child.memory_info().rss / 1024 / 1024
+                child_count += 1
+            except Exception:
+                pass
+    except Exception:
+        pass
+    total_mb = mem_mb + child_rss_mb
+    return {
+        "rss_mb": round(mem_mb, 2),
+        "child_rss_mb": round(child_rss_mb, 2),
+        "total_process_tree_rss_mb": round(total_mb, 2),
+        "child_process_count": child_count,
+        "vms_mb": round(mem.vms / 1024 / 1024, 2),
+        "threshold_mb": _RAM_CLEANUP_THRESHOLD_MB,
+        "cleanup_interval_sec": _RAM_CLEANUP_INTERVAL_SEC,
+        "status": "high" if total_mb > _RAM_CLEANUP_THRESHOLD_MB else "ok",
+        "active_rooms": len(manager.active_connections),
+        "active_connections": sum(len(v) for v in manager.active_connections.values()),
+    }
+# FIX: Call from frontend on game start/end to clear engine hash
+@app.post("/new-game")
+async def new_game():
+    """
+    Clear engine hash table between games.
+    Call this from the frontend at these moments:
+      - When user starts a new game vs bot
+      - When game ends (checkmate / resign / draw)
+      - When multiplayer match starts
+      - When multiplayer match ends
+    """
+    try:
+        # Starts the engine first if none is running.
+        engine = await get_deepcastle_engine()
+        # The clear itself is best-effort and does not raise (see _clear_engine_hash).
+        async with _ENGINE_IO_LOCK:
+            await _clear_engine_hash(engine)
+        force_memory_release()
+        return {"status": "ok", "message": "Engine hash cleared"}
+    except HTTPException:
+        # Keep the status code chosen by the engine helpers.
+        raise
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=str(e))
+# ─── Helpers ───────────────────────────────────────────────────────────────────
+def get_normalized_score(info) -> Tuple[float, Optional[int]]:
+    if "score" not in info:
+        return 0.0, None
+    raw = info["score"].white()
+    if raw.is_mate():
+        m = raw.mate() or 0
+        return (10000.0 if m > 0 else -10000.0), m
+    return float(raw.score() or 0.0), None
+def normalize_search_stats(info: dict) -> Tuple[int, int, int]:
+    depth = int(info.get("depth") or 0)
+    nodes = int(info.get("nodes") or 0)
+    t = info.get("time")
+    nps_raw = int(info.get("nps") or 0)
+    if t is not None and float(t) > 0 and nodes > 0:
+        nps = max(0, int(round(nodes / float(t))))
+    else:
+        nps = nps_raw
+    return depth, nodes, nps
+# ─── Bot Move (/move) ──────────────────────────────────────────────────────────
+@app.post("/move", response_model=MoveResponse)
+async def get_move(request: MoveRequest):
+    try:
+        engine = await get_deepcastle_engine()
+        board = chess.Board(request.fen)
+        limit = chess.engine.Limit(time=request.time, depth=request.depth)
+        tsec = _search_timeout_sec(request.time, request.depth)
+        async with _ENGINE_IO_LOCK:
+            result = await _engine_call(
+                engine,
+                engine.play(board, limit, info=chess.engine.INFO_ALL),
+                tsec,
+            )
+            info = dict(result.info)
+            if not info:
+                info = await _engine_call(
+                    engine,
+                    engine.analyse(board, limit, info=chess.engine.INFO_ALL),
+                    tsec,
+                )
+            score_cp, mate_in = get_normalized_score(info)
+            depth, nodes, nps = normalize_search_stats(info)
+            pv_board = board.copy()
+            pv_parts = []
+            for m in info.get("pv", [])[:5]:
+                if m in pv_board.legal_moves:
+                    try:
+                        pv_parts.append(pv_board.san(m))
+                        pv_board.push(m)
+                    except Exception:
+                        break
+                else:
+                    break
+            pv = " ".join(pv_parts)
+            del pv_board
+            score_pawns = score_cp / 100.0 if abs(score_cp) < 9900 else (100.0 if score_cp > 0 else -100.0)
+            board_fen_only = board.fen().split(" ")[0]
+            opening_name = openings_db.get(board_fen_only)
+            best_move = result.move.uci()
+            response = MoveResponse(
+                bestmove=best_move,
+                score=score_pawns,
+                depth=depth,
+                nodes=nodes,
+                nps=nps,
+                pv=pv,
+                mate_in=mate_in,
+                opening=opening_name
+            )
+            # IMPORTANT: do reset/clear while holding the engine IO lock so no
+            # other /move call can reuse a half-cleared engine.
+            if _RESTART_ENGINE_AFTER_MOVE:
+                await _detach_and_quit_engine(engine)
+                force_memory_release()
+            elif _CLEAR_HASH_AFTER_MOVE:
+                await _clear_engine_hash(engine)
+                force_memory_release()
+            del result
+            del info
+            return response
+    except HTTPException:
+        raise
+    except Exception as e:
+        print(f"Error: {e}")
+        raise HTTPException(status_code=500, detail=str(e))
+# ─── Hint Move (/analysis-move) ───────────────────────────────────────────────
+@app.post("/analysis-move", response_model=MoveResponse)
+async def get_analysis_move(request: MoveRequest):
+    try:
+        engine = await get_stockfish_engine(hash_mb=2048)
+        board = chess.Board(request.fen)
+        limit = chess.engine.Limit(time=request.time, depth=request.depth)
+        tsec = _search_timeout_sec(request.time, request.depth)
+        async with _ENGINE_IO_LOCK:
+            result = await _engine_call(
+                engine,
+                engine.play(board, limit, info=chess.engine.INFO_ALL),
+                tsec,
+            )
+            info = dict(result.info)
+            if not info:
+                info = await _engine_call(
+                    engine,
+                    engine.analyse(board, limit, info=chess.engine.INFO_ALL),
+                    tsec,
+                )
+        score_cp, mate_in = get_normalized_score(info)
+        depth, nodes, nps = normalize_search_stats(info)
+        pv_board = board.copy()
+        pv_parts = []
+        for m in info.get("pv", [])[:5]:
+            if m in pv_board.legal_moves:
+                try:
+                    pv_parts.append(pv_board.san(m))
+                    pv_board.push(m)
+                except Exception:
+                    break
+            else:
+                break
+        pv = " ".join(pv_parts)
+        del pv_board
+        score_pawns = score_cp / 100.0 if abs(score_cp) < 9900 else (100.0 if score_cp > 0 else -100.0)
+        board_fen_only = board.fen().split(" ")[0]
+        opening_name = openings_db.get(board_fen_only)
+        best_move = result.move.uci()
+        del result
+        del info
+        # FIX: clear/restart engine to keep memory stable after hint
+        async with _ENGINE_IO_LOCK:
+            if _RESTART_ENGINE_AFTER_MOVE:
+                await _detach_and_quit_engine(engine)
+            elif _CLEAR_HASH_AFTER_MOVE:
+                await _clear_engine_hash(engine)
+        force_memory_release()
+        return MoveResponse(
+            bestmove=best_move,
+            score=score_pawns,
+            depth=depth,
+            nodes=nodes,
+            nps=nps,
+            pv=pv,
+            mate_in=mate_in,
+            opening=opening_name
+        )
+    except HTTPException:
+        raise
+    except Exception as e:
+        print(f"Analysis move error: {e}")
+        raise HTTPException(status_code=500, detail=str(e))
+# ─── Openings DB ───────────────────────────────────────────────────────────────
+openings_db = {}
+openings_path = os.path.join(os.path.dirname(__file__), "openings.json")
+if os.path.exists(openings_path):
+    try:
+        with open(openings_path, "r", encoding="utf-8") as f:
+            openings_db = json.load(f)
+    except Exception:
+        pass
+# ─── Move Classification Helpers ───────────────────────────────────────────────
+def get_win_percentage_from_cp(cp: int) -> float:
+    cp_ceiled = max(-1000, min(1000, cp))
+    MULTIPLIER = -0.00368208
+    win_chances = 2.0 / (1.0 + math.exp(MULTIPLIER * cp_ceiled)) - 1.0
+    return 50.0 + 50.0 * win_chances
+def get_win_percentage(info: dict) -> float:
+    score = info.get("score")
+    if not score:
+        return 50.0
+    white_score = score.white()
+    if white_score.is_mate():
+        mate_val = white_score.mate()
+        return 100.0 if mate_val > 0 else 0.0
+    return get_win_percentage_from_cp(white_score.score())
+def is_losing_or_alt_winning(pos_win_pct: float, alt_win_pct: float, is_white_move: bool) -> bool:
+    is_losing = pos_win_pct < 50.0 if is_white_move else pos_win_pct > 50.0
+    is_alt_winning = alt_win_pct > 97.0 if is_white_move else alt_win_pct < 3.0
+    return is_losing or is_alt_winning
+def get_has_changed_outcome(last_win_pct: float, pos_win_pct: float, is_white_move: bool) -> bool:
+    diff = (pos_win_pct - last_win_pct) * (1 if is_white_move else -1)
+    return diff > 10.0 and (
+        (last_win_pct < 50.0 and pos_win_pct > 50.0) or
+        (last_win_pct > 50.0 and pos_win_pct < 50.0)
+    )
+def get_is_only_good_move(pos_win_pct: float, alt_win_pct: float, is_white_move: bool) -> bool:
+    diff = (pos_win_pct - alt_win_pct) * (1 if is_white_move else -1)
+    return diff > 10.0
+def is_simple_recapture(fen_two_moves_ago: str, previous_move: chess.Move, played_move: chess.Move) -> bool:
+    if previous_move.to_square != played_move.to_square:
+        return False
+    b = chess.Board(fen_two_moves_ago)
+    result = b.piece_at(previous_move.to_square) is not None
+    del b
+    return result
+def get_material_difference(board: chess.Board) -> int:
+    values = {
+        chess.PAWN: 1, chess.KNIGHT: 3, chess.BISHOP: 3,
+        chess.ROOK: 5, chess.QUEEN: 9, chess.KING: 0
+    }
+    w = sum(values.get(p.piece_type, 0) for p in board.piece_map().values() if p.color == chess.WHITE)
+    b = sum(values.get(p.piece_type, 0) for p in board.piece_map().values() if p.color == chess.BLACK)
+    return w - b
+def get_is_piece_sacrifice(board: chess.Board, played_move: chess.Move, best_pv: list) -> bool:
+    """
+    Decide whether ``played_move`` gives up material along the expected line.
+    The played move followed by ``best_pv`` is simulated on a copy of
+    ``board``. Returns True when, at the point the simulation stops, the
+    material balance has moved against the side that played the move and the
+    exchange was more than a pawn trade. An empty ``best_pv`` returns False.
+    """
+    if not best_pv:
+        return False
+    start_diff = get_material_difference(board)
+    white_to_play = board.turn == chess.WHITE
+    sim_board = board.copy()
+    moves = [played_move] + best_pv
+    # Keep an even number of plies so the line ends after the opponent's reply.
+    if len(moves) % 2 == 1:
+        moves = moves[:-1]
+    # Piece types lost by each colour during the simulated line.
+    captured_w = []
+    captured_b = []
+    # Budget of quiet (non-capturing) moves; reset to 1 by every capture. The
+    # simulation stops at the second quiet move in a row.
+    non_capturing = 1
+    for m in moves:
+        if m in sim_board.legal_moves:
+            captured_piece = sim_board.piece_at(m.to_square)
+            # En passant: the target square is empty, the victim is a pawn.
+            if sim_board.is_en_passant(m):
+                captured_piece = chess.Piece(chess.PAWN, not sim_board.turn)
+            if captured_piece:
+                if sim_board.turn == chess.WHITE:
+                    captured_b.append(captured_piece.piece_type)
+                else:
+                    captured_w.append(captured_piece.piece_type)
+                non_capturing = 1
+            else:
+                non_capturing -= 1
+                if non_capturing < 0:
+                    break
+            sim_board.push(m)
+        else:
+            break
+    # Cancel out like-for-like trades (same piece type lost by both sides).
+    for p in captured_w[:]:
+        if p in captured_b:
+            captured_w.remove(p)
+            captured_b.remove(p)
+    # Only pawns left and at most one pawn of imbalance: not a sacrifice.
+    if abs(len(captured_w) - len(captured_b)) <= 1 and all(p == chess.PAWN for p in captured_w + captured_b):
+        del sim_board
+        return False
+    end_diff = get_material_difference(sim_board)
+    del sim_board
+    # Material change seen from the side that played the move.
+    mat_diff = end_diff - start_diff
+    player_rel = mat_diff if white_to_play else -mat_diff
+    return player_rel < 0
+def get_move_classification(
+    last_win_pct: float,
+    pos_win_pct: float,
+    is_white_move: bool,
+    played_move: chess.Move,
+    best_move_before: chess.Move,
+    alt_win_pct: Optional[float],
+    fen_two_moves_ago: Optional[str],
+    uci_next_two_moves: Optional[Tuple[chess.Move, chess.Move]],
+    board_before_move: chess.Board,
+    best_pv_after: list
+) -> str:
+    diff = (pos_win_pct - last_win_pct) * (1 if is_white_move else -1)
+    if alt_win_pct is not None and diff >= -2.0:
+        if get_is_piece_sacrifice(board_before_move, played_move, best_pv_after):
+            if not is_losing_or_alt_winning(pos_win_pct, alt_win_pct, is_white_move):
+                return "Brilliant"
+    if alt_win_pct is not None and diff >= -2.0:
+        is_recapture = False
+        if fen_two_moves_ago and uci_next_two_moves:
+            is_recapture = is_simple_recapture(
+                fen_two_moves_ago, uci_next_two_moves[0], uci_next_two_moves[1]
+            )
+        if not is_recapture and not is_losing_or_alt_winning(pos_win_pct, alt_win_pct, is_white_move):
+            if get_has_changed_outcome(last_win_pct, pos_win_pct, is_white_move) or \
+               get_is_only_good_move(pos_win_pct, alt_win_pct, is_white_move):
+                return "Great"
+    if best_move_before and played_move == best_move_before:
+        return "Best"
+    if diff < -20.0: return "Blunder"
+    if diff < -10.0: return "Mistake"
+    if diff < -5.0:  return "Inaccuracy"
+    if diff < -2.0:  return "Good"
+    return "Excellent"
+# ─── Game Analysis (/analyze-game) ────────────────────────────────────────────
+@app.post("/analyze-game", response_model=AnalyzeResponse)
+async def analyze_game(request: AnalyzeRequest):
+    # Handler for POST /analyze-game.
+    #
+    # Replays request.moves (SAN) from request.start_fen, or from the standard
+    # start position when no FEN is given. Each position is analysed with the
+    # engine using multipv=2, every move is classified, and the response carries
+    # an overall accuracy, an estimated Elo, the per-move analysis list and the
+    # classification counts.
+    #
+    # Error handling: an HTTPException is re-raised unchanged; any other
+    # exception is printed and converted into an HTTP 500 whose detail is the
+    # exception text.
+    try:
+        engine = await get_stockfish_engine(hash_mb=2048)
+        board = chess.Board(request.start_fen) if request.start_fen else chess.Board()
+        limit = chess.engine.Limit(time=request.time_per_move)
+        analysis_results = []
+        ply_timeout = _analyze_ply_timeout(request.time_per_move)
+        # Analyse the starting position once; inside the loop the "after" result
+        # of each ply is reused as the "before" result of the next one.
+        async with _ENGINE_IO_LOCK:
+            infos_before = await _engine_call(
+                engine,
+                engine.analyse(board, limit, multipv=2),
+                ply_timeout,
+            )
+            # Restart after initial evaluation to clear memory
+            await _detach_and_quit_engine(engine)
+        force_memory_release()
+        # Normalise to a list so infos_before[0] is always the primary line.
+        infos_before = infos_before if isinstance(infos_before, list) else [infos_before]
+        counts = {
+            "Book": 0, "Brilliant": 0, "Great": 0, "Best": 0,
+            "Excellent": 0, "Good": 0, "Inaccuracy": 0,
+            "Mistake": 0, "Blunder": 0
+        }
+        player_is_white = (request.player_color.lower() == "white")
+        # FIX: Sliding window — only keep last 3 FENs and last 2 moves, never grows
+        fen_window: List[str] = [board.fen()]
+        move_window: List[chess.Move] = []
+        total_cpl = 0.0
+        player_moves_count = 0
+        # NOTE(review): current_score is assigned here and again inside the loop
+        # but is never read in this function — confirm it can be dropped.
+        current_score, _ = get_normalized_score(infos_before[0])
+        for i, san_move in enumerate(request.moves):
+            is_white_turn = board.turn == chess.WHITE
+            is_player_turn = is_white_turn if player_is_white else not is_white_turn
+            # A SAN string that cannot be parsed ends the replay; the moves
+            # analysed so far are still returned.
+            try:
+                move = board.parse_san(san_move)
+            except Exception:
+                break
+            info_dict = infos_before[0]
+            pv_list = info_dict.get("pv", [])
+            best_move_before = pv_list[0] if pv_list else None
+            score_before, _ = get_normalized_score(info_dict)
+            win_pct_before = get_win_percentage(info_dict)
+            # Win percentage of the first engine line whose first PV move differs
+            # from the move actually played; stays None if there is no such line.
+            alt_win_pct_before: Optional[float] = None
+            if len(infos_before) > 1:
+                for line in infos_before:
+                    if line.get("pv") and line.get("pv")[0] != move:
+                        alt_win_pct_before = get_win_percentage(line)
+                        break
+            # Snapshot of the position before the move, needed by the classifier.
+            board_before_move = board.copy()
+            board.push(move)
+            # FIX: Sliding window — discard oldest beyond what we need
+            move_window.append(move)
+            if len(move_window) > 2:
+                move_window.pop(0)
+            fen_window.append(board.fen())
+            if len(fen_window) > 3:
+                fen_window.pop(0)
+            # A fresh engine is obtained, used for one analysis and quit again,
+            # all while holding the engine I/O lock.
+            async with _ENGINE_IO_LOCK:
+                engine = await get_stockfish_engine(hash_mb=2048)
+                infos_after_raw = await _engine_call(
+                    engine,
+                    engine.analyse(board, limit, multipv=2),
+                    ply_timeout,
+                )
+                # Restart engine after each move in full game review
+                await _detach_and_quit_engine(engine)
+            force_memory_release()
+            infos_after: List[dict] = infos_after_raw if isinstance(infos_after_raw, list) else [infos_after_raw]
+            info_after_dict: dict = infos_after[0]
+            win_pct_after = get_win_percentage(info_after_dict)
+            score_after, _ = get_normalized_score(info_after_dict)
+            current_score = score_after
+            best_pv_after = info_after_dict.get("pv", [])
+            # The recapture check needs the FEN from two plies back and the last
+            # two moves; both are None until enough plies have been played.
+            fen_two_moves_ago = fen_window[0] if len(fen_window) == 3 else None
+            uci_next_two_moves = tuple(move_window[-2:]) if len(move_window) >= 2 else None
+            cls = "Book"
+            opening_name = None
+            # Openings are looked up by the piece-placement field of the FEN only.
+            board_fen_only = board.fen().split(" ")[0]
+            if board_fen_only in openings_db:
+                cls = "Book"
+                opening_name = openings_db[board_fen_only]
+            else:
+                cls = get_move_classification(
+                    last_win_pct=win_pct_before,
+                    pos_win_pct=win_pct_after,
+                    is_white_move=is_white_turn,
+                    played_move=move,
+                    best_move_before=best_move_before,
+                    alt_win_pct=alt_win_pct_before,
+                    fen_two_moves_ago=fen_two_moves_ago,
+                    uci_next_two_moves=uci_next_two_moves,
+                    board_before_move=board_before_move,
+                    best_pv_after=best_pv_after
+                )
+            # FIX: Free board copy immediately after classification
+            del board_before_move
+            # Score change seen from the mover's side; the loss is clamped to
+            # the range [0, 1000].
+            # (presumably scores are White-relative centipawns, since they are
+            # divided by 100 for the response — confirm in get_normalized_score)
+            move_gain = score_after - score_before if is_white_turn else score_before - score_after
+            cpl = max(0.0, min(-move_gain, 1000.0))
+            # Only the requesting player's moves feed the averages and the counts;
+            # every move is still reported in the per-move list.
+            if is_player_turn:
+                total_cpl += cpl
+                player_moves_count += 1
+                counts[cls] = counts.get(cls, 0) + 1
+            analysis_results.append(MoveAnalysis(
+                move_num=i + 1,
+                san=san_move,
+                classification=cls,
+                cpl=float(cpl),
+                score_before=float(score_before / 100.0),
+                score_after=float(score_after / 100.0),
+                best_move=best_move_before.uci() if best_move_before else "",
+                opening=opening_name
+            ))
+            # FIX: Release large engine result objects after each ply
+            infos_before = infos_after
+            infos_after = None
+            info_after_dict = None
+            infos_after_raw = None
+        # FIX: Free sliding windows after loop
+        del fen_window
+        del move_window
+        # max(1, ...) avoids a division by zero when the player made no moves.
+        avg_cpl = total_cpl / max(1, player_moves_count)
+        # Exponential decay in average loss, clamped to [10, 100] for accuracy
+        # and to [400, 3600] for the Elo estimate.
+        accuracy = max(10.0, min(100.0, 100.0 * math.exp(-0.005 * avg_cpl)))
+        estimated_elo = int(max(400, min(3600, round(3600 * math.exp(-0.015 * avg_cpl)))))
+        # FIX: Clear/restart engine to keep memory stable after full game analysis
+        # NOTE(review): `engine` here is the instance that was already passed to
+        # _detach_and_quit_engine above (after the last ply, or after the initial
+        # evaluation if no ply was analysed) — confirm the helpers tolerate that.
+        async with _ENGINE_IO_LOCK:
+            if _RESTART_ENGINE_AFTER_MOVE:
+                await _detach_and_quit_engine(engine)
+            elif _CLEAR_HASH_AFTER_MOVE:
+                await _clear_engine_hash(engine)
+        force_memory_release()
+        return AnalyzeResponse(
+            accuracy=round(accuracy, 1),
+            estimated_elo=estimated_elo,
+            moves=analysis_results,
+            counts=counts
+        )
+    except HTTPException:
+        raise
+    except Exception as e:
+        print(f"Analysis Error: {e}")
+        raise HTTPException(status_code=500, detail=str(e))
+# When executed as a script, serve the app with uvicorn on all interfaces,
+# port 7860. uvicorn is imported here so that importing this module does not
+# require it.
+if __name__ == "__main__":
+    import uvicorn
+    uvicorn.run(app, host="0.0.0.0", port=7860)

openings.json ADDED Viewed

The diff for this file is too large to render. See raw diff

output.nnue ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a91f3d2cd46fcac0c95fcc06c34435b2d57682ed7a25b0eb280117219da8d636
+size 6480707

small_output.nnue ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:d6e84f3019891b33db86da21922f79dce91eabb9918f96c10a60633bf37f30fa
+size 3508587

src/Makefile ADDED Viewed

	@@ -0,0 +1,1184 @@

+# Stockfish, a UCI chess playing engine derived from Glaurung 2.1
+# Copyright (C) 2004-2026 The Stockfish developers (see AUTHORS file)
+#
+# Stockfish is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# Stockfish is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+### ==========================================================================
+### Section 1. General Configuration
+### ==========================================================================
+### Establish the operating system name
+KERNEL := $(shell uname -s)
+ifeq ($(KERNEL),Linux)
+	OS := $(shell uname -o)
+endif
+### Command prefix to run the built executable (e.g. wine, sde, qemu)
+### Backward compatible alias: WINE_PATH (deprecated)
+ifneq ($(strip $(WINE_PATH)),)
+ifeq ($(strip $(RUN_PREFIX)),)
+RUN_PREFIX := $(WINE_PATH)
+endif
+ifeq ($(MAKELEVEL),0)
+ifneq ($(strip $(RUN_PREFIX)),$(strip $(WINE_PATH)))
+$(warning *** Both RUN_PREFIX and WINE_PATH are set; ignoring WINE_PATH. ***)
+else
+$(warning *** WINE_PATH is deprecated; use RUN_PREFIX instead. ***)
+endif
+endif
+endif
+### Target Windows OS
+ifeq ($(OS),Windows_NT)
+	ifneq ($(COMP),ndk)
+		target_windows = yes
+	endif
+else ifeq ($(COMP),mingw)
+	target_windows = yes
+	ifeq ($(RUN_PREFIX),)
+		RUN_PREFIX := $(shell which wine)
+	endif
+endif
+### Executable name
+ifeq ($(target_windows),yes)
+	EXE = stockfish.exe
+else
+	EXE = stockfish
+endif
+### Installation dir definitions
+PREFIX = /usr/local
+BINDIR = $(PREFIX)/bin
+### Built-in benchmark for pgo-builds
+PGOBENCH = $(RUN_PREFIX) ./$(EXE) bench
+### Source and object files
+SRCS = benchmark.cpp bitboard.cpp evaluate.cpp main.cpp \
+	misc.cpp movegen.cpp movepick.cpp position.cpp \
+	search.cpp thread.cpp timeman.cpp tt.cpp uci.cpp ucioption.cpp tune.cpp syzygy/tbprobe.cpp \
+	nnue/nnue_accumulator.cpp nnue/nnue_misc.cpp nnue/network.cpp \
+	nnue/features/half_ka_v2_hm.cpp nnue/features/full_threats.cpp \
+	engine.cpp score.cpp memory.cpp
+HEADERS = benchmark.h bitboard.h evaluate.h misc.h movegen.h movepick.h history.h \
+		nnue/nnue_misc.h nnue/features/half_ka_v2_hm.h nnue/features/full_threats.h \
+		nnue/layers/affine_transform.h nnue/layers/affine_transform_sparse_input.h \
+		nnue/layers/clipped_relu.h nnue/layers/sqr_clipped_relu.h nnue/nnue_accumulator.h \
+		nnue/nnue_architecture.h nnue/nnue_common.h nnue/nnue_feature_transformer.h nnue/simd.h \
+		position.h search.h syzygy/tbprobe.h thread.h thread_win32_osx.h timeman.h \
+		tt.h tune.h types.h uci.h ucioption.h perft.h nnue/network.h engine.h score.h numa.h memory.h shm.h shm_linux.h
+OBJS = $(notdir $(SRCS:.cpp=.o))
+VPATH = syzygy:nnue:nnue/features
+### ==========================================================================
+### Section 2. High-level Configuration
+### ==========================================================================
+#
+# flag                --- Comp switch        --- Description
+# ----------------------------------------------------------------------------
+#
+# debug = yes/no      --- -DNDEBUG           --- Enable/Disable debug mode
+# sanitize = none/<sanitizer> ... (-fsanitize )
+#                     --- ( undefined )      --- enable undefined behavior checks
+#                     --- ( thread    )      --- enable threading error checks
+#                     --- ( address   )      --- enable memory access checks
+#                     --- ...etc...          --- see compiler documentation for supported sanitizers
+# optimize = yes/no   --- (-O3/-fast etc.)   --- Enable/Disable optimizations
+# arch = (name)       --- (-arch)            --- Target architecture
+# bits = 64/32        --- -DIS_64BIT         --- 64-/32-bit operating system
+# prefetch = yes/no   --- -DUSE_PREFETCH     --- Use prefetch asm-instruction
+# popcnt = yes/no     --- -DUSE_POPCNT       --- Use popcnt asm-instruction
+# pext = yes/no       --- -DUSE_PEXT         --- Use pext x86_64 asm-instruction
+# sse = yes/no        --- -msse              --- Use Intel Streaming SIMD Extensions
+# mmx = yes/no        --- -mmmx              --- Use Intel MMX instructions
+# sse2 = yes/no       --- -msse2             --- Use Intel Streaming SIMD Extensions 2
+# ssse3 = yes/no      --- -mssse3            --- Use Intel Supplemental Streaming SIMD Extensions 3
+# sse41 = yes/no      --- -msse4.1           --- Use Intel Streaming SIMD Extensions 4.1
+# avx2 = yes/no       --- -mavx2             --- Use Intel Advanced Vector Extensions 2
+# avxvnni = yes/no    --- -mavxvnni          --- Use Intel Vector Neural Network Instructions AVX
+# avx512 = yes/no     --- -mavx512bw         --- Use Intel Advanced Vector Extensions 512
+# vnni512 = yes/no    --- -mavx512vnni       --- Use Intel Vector Neural Network Instructions 512
+# avx512icl = yes/no  --- ... multiple ...   --- Use All AVX-512 features available on both Intel Ice Lake and AMD Zen 4
+# altivec = yes/no    --- -maltivec          --- Use PowerPC Altivec SIMD extension
+# vsx = yes/no        --- -mvsx              --- Use POWER VSX SIMD extension
+# neon = yes/no       --- -DUSE_NEON         --- Use ARM SIMD architecture
+# dotprod = yes/no    --- -DUSE_NEON_DOTPROD --- Use ARM advanced SIMD Int8 dot product instructions
+# lsx = yes/no        --- -mlsx              --- Use Loongson SIMD eXtension
+# lasx = yes/no       --- -mlasx             --- use Loongson Advanced SIMD eXtension
+#
+# Note that Makefile is space sensitive, so when adding new architectures
+# or modifying existing flags, you have to make sure there are no extra spaces
+# at the end of the line for flag values.
+#
+# Example of use for these flags:
+# make build ARCH=x86-64-avx512 debug=yes sanitize="address undefined"
+### 2.1. General and architecture defaults
+# An unset ARCH falls back to "native".
+ifeq ($(ARCH),)
+   ARCH = native
+endif
+# "native" is resolved by running ../scripts/get_native_properties.sh and
+# taking the first space-separated field of its output. `override` makes the
+# assignment take effect even when ARCH=native was given on the command line.
+ifeq ($(ARCH), native)
+   override ARCH := $(shell $(SHELL) ../scripts/get_native_properties.sh | cut -d " " -f 1)
+endif
+# explicitly check for the list of supported architectures (as listed with make help),
+# the user can override with `make ARCH=x86-64-avx512icl SUPPORTED_ARCH=true`
+ifeq ($(ARCH), $(filter $(ARCH), \
+                 x86-64-avx512icl x86-64-vnni512 x86-64-avx512 x86-64-avxvnni \
+                 x86-64-bmi2 x86-64-avx2 x86-64-sse41-popcnt x86-64-modern x86-64-ssse3 x86-64-sse3-popcnt \
+                 x86-64 x86-32-sse41-popcnt x86-32-sse2 x86-32 ppc-64 ppc-64-altivec ppc-64-vsx ppc-32 e2k \
+                 armv7 armv7-neon armv8 armv8-dotprod apple-silicon general-64 general-32 riscv64 \
+                 loongarch64 loongarch64-lsx loongarch64-lasx))
+   SUPPORTED_ARCH=true
+else
+   SUPPORTED_ARCH=false
+endif
+optimize = yes
+debug = no
+sanitize = none
+bits = 64
+prefetch = no
+popcnt = no
+pext = no
+sse = no
+mmx = no
+sse2 = no
+ssse3 = no
+sse41 = no
+avx2 = no
+avxvnni = no
+avx512 = no
+vnni512 = no
+avx512icl = no
+altivec = no
+vsx = no
+neon = no
+dotprod = no
+arm_version = 0
+lsx = no
+lasx = no
+STRIP = strip
+ifneq ($(shell which clang-format-20 2> /dev/null),)
+	CLANG-FORMAT = clang-format-20
+else
+	CLANG-FORMAT = clang-format
+endif
+### 2.2 Architecture specific
+ifeq ($(findstring x86,$(ARCH)),x86)
+# x86-32/64
+ifeq ($(findstring x86-32,$(ARCH)),x86-32)
+	arch = i386
+	bits = 32
+	sse = no
+	mmx = yes
+else
+	arch = x86_64
+	sse = yes
+	sse2 = yes
+endif
+ifeq ($(findstring -sse,$(ARCH)),-sse)
+	sse = yes
+endif
+ifeq ($(findstring -popcnt,$(ARCH)),-popcnt)
+	popcnt = yes
+endif
+ifeq ($(findstring -mmx,$(ARCH)),-mmx)
+	mmx = yes
+endif
+ifeq ($(findstring -sse2,$(ARCH)),-sse2)
+	sse = yes
+	sse2 = yes
+endif
+ifeq ($(findstring -ssse3,$(ARCH)),-ssse3)
+	sse = yes
+	sse2 = yes
+	ssse3 = yes
+endif
+ifeq ($(findstring -sse41,$(ARCH)),-sse41)
+	sse = yes
+	sse2 = yes
+	ssse3 = yes
+	sse41 = yes
+endif
+ifeq ($(findstring -modern,$(ARCH)),-modern)
+        $(warning *** ARCH=$(ARCH) is deprecated, defaulting to ARCH=x86-64-sse41-popcnt. Execute `make help` for a list of available architectures. ***)
+        $(shell sleep 5)
+	popcnt = yes
+	sse = yes
+	sse2 = yes
+	ssse3 = yes
+	sse41 = yes
+endif
+ifeq ($(findstring -avx2,$(ARCH)),-avx2)
+	popcnt = yes
+	sse = yes
+	sse2 = yes
+	ssse3 = yes
+	sse41 = yes
+	avx2 = yes
+endif
+ifeq ($(findstring -avxvnni,$(ARCH)),-avxvnni)
+	popcnt = yes
+	sse = yes
+	sse2 = yes
+	ssse3 = yes
+	sse41 = yes
+	avx2 = yes
+	avxvnni = yes
+	pext = yes
+endif
+ifeq ($(findstring -bmi2,$(ARCH)),-bmi2)
+	popcnt = yes
+	sse = yes
+	sse2 = yes
+	ssse3 = yes
+	sse41 = yes
+	avx2 = yes
+	pext = yes
+endif
+ifeq ($(findstring -avx512,$(ARCH)),-avx512)
+	popcnt = yes
+	sse = yes
+	sse2 = yes
+	ssse3 = yes
+	sse41 = yes
+	avx2 = yes
+	pext = yes
+	avx512 = yes
+endif
+ifeq ($(findstring -vnni512,$(ARCH)),-vnni512)
+	popcnt = yes
+	sse = yes
+	sse2 = yes
+	ssse3 = yes
+	sse41 = yes
+	avx2 = yes
+	pext = yes
+	avx512 = yes
+	vnni512 = yes
+endif
+# Note: "-avx512" is a substring of "-avx512icl", so for ARCH=x86-64-avx512icl
+# the `findstring -avx512` test earlier in this section matches as well. The
+# flags set here are a superset of that block's flags.
+ifeq ($(findstring -avx512icl,$(ARCH)),-avx512icl)
+	popcnt = yes
+	sse = yes
+	sse2 = yes
+	ssse3 = yes
+	sse41 = yes
+	avx2 = yes
+	pext = yes
+	avx512 = yes
+	vnni512 = yes
+	avx512icl = yes
+endif
+ifeq ($(sse),yes)
+	prefetch = yes
+endif
+# 64-bit pext is not available on x86-32
+ifeq ($(bits),32)
+	pext = no
+endif
+else
+# all other architectures
+ifeq ($(ARCH),general-32)
+	arch = any
+	bits = 32
+endif
+ifeq ($(ARCH),general-64)
+	arch = any
+endif
+ifeq ($(ARCH),armv7)
+	arch = armv7
+	prefetch = yes
+	bits = 32
+	arm_version = 7
+endif
+ifeq ($(ARCH),armv7-neon)
+	arch = armv7
+	prefetch = yes
+	popcnt = yes
+	neon = yes
+	bits = 32
+	arm_version = 7
+endif
+ifeq ($(ARCH),armv8)
+	arch = armv8
+	prefetch = yes
+	popcnt = yes
+	neon = yes
+	arm_version = 8
+endif
+ifeq ($(ARCH),armv8-dotprod)
+	arch = armv8
+	prefetch = yes
+	popcnt = yes
+	neon = yes
+	dotprod = yes
+	arm_version = 8
+endif
+ifeq ($(ARCH),apple-silicon)
+	arch = arm64
+	prefetch = yes
+	popcnt = yes
+	neon = yes
+	dotprod = yes
+	arm_version = 8
+endif
+ifeq ($(ARCH),ppc-32)
+	arch = ppc
+	bits = 32
+endif
+ifeq ($(ARCH),ppc-64)
+	arch = ppc64
+	popcnt = yes
+	prefetch = yes
+endif
+ifeq ($(ARCH),ppc-64-altivec)
+	arch = ppc64
+	popcnt = yes
+	prefetch = yes
+	altivec = yes
+endif
+ifeq ($(ARCH),ppc-64-vsx)
+	arch = ppc64
+	popcnt = yes
+	prefetch = yes
+	vsx = yes
+endif
+ifeq ($(findstring e2k,$(ARCH)),e2k)
+	arch = e2k
+	mmx = yes
+	bits = 64
+	sse = yes
+	sse2 = yes
+	ssse3 = yes
+	sse41 = yes
+	popcnt = yes
+endif
+ifeq ($(ARCH),riscv64)
+	arch = riscv64
+endif
+ifeq ($(findstring loongarch64,$(ARCH)),loongarch64)
+	arch = loongarch64
+	prefetch = yes
+ifeq ($(findstring -lasx,$(ARCH)),-lasx)
+	lsx = yes
+	lasx = yes
+endif
+ifeq ($(findstring -lsx,$(ARCH)),-lsx)
+	lsx = yes
+endif
+endif
+endif
+### ==========================================================================
+### Section 3. Low-level Configuration
+### ==========================================================================
+### 3.1 Selecting compiler (default = gcc)
+# Only in the top-level make (MAKELEVEL 0): snapshot the caller-supplied
+# CXXFLAGS / DEPENDFLAGS / LDFLAGS into exported ENV_* variables.
+# (presumably so that recursive make invocations start from the user's
+# original flags rather than from already-extended ones — confirm)
+ifeq ($(MAKELEVEL),0)
+       export ENV_CXXFLAGS := $(CXXFLAGS)
+       export ENV_DEPENDFLAGS := $(DEPENDFLAGS)
+       export ENV_LDFLAGS := $(LDFLAGS)
+endif
+# The working flag variables are rebuilt from the snapshots plus the project
+# defaults and the optional EXTRACXXFLAGS / EXTRALDFLAGS.
+CXXFLAGS = $(ENV_CXXFLAGS) -Wall -Wcast-qual -fno-exceptions -std=c++17 $(EXTRACXXFLAGS)
+DEPENDFLAGS = $(ENV_DEPENDFLAGS) -std=c++17
+LDFLAGS = $(ENV_LDFLAGS) $(EXTRALDFLAGS)
+ifeq ($(COMP),)
+	COMP=gcc
+endif
+ifeq ($(COMP),gcc)
+	comp=gcc
+	CXX=g++
+	CXXFLAGS += -pedantic -Wextra -Wshadow -Wmissing-declarations
+	ifeq ($(arch),$(filter $(arch),armv7 armv8 riscv64))
+		ifeq ($(OS),Android)
+			CXXFLAGS += -m$(bits)
+			LDFLAGS += -m$(bits)
+		endif
+		ifeq ($(ARCH),riscv64)
+			CXXFLAGS += -latomic
+		endif
+	else ifeq ($(arch),loongarch64)
+		CXXFLAGS += -latomic
+	else
+		CXXFLAGS += -m$(bits)
+		LDFLAGS += -m$(bits)
+	endif
+	ifeq ($(arch),$(filter $(arch),armv7))
+		LDFLAGS += -latomic
+	endif
+	ifneq ($(KERNEL),Darwin)
+	   LDFLAGS += -Wl,--no-as-needed
+	endif
+endif
+ifeq ($(target_windows),yes)
+	LDFLAGS += -static
+endif
+ifeq ($(COMP),mingw)
+	comp=mingw
+	ifeq ($(bits),64)
+		ifeq ($(shell which x86_64-w64-mingw32-c++-posix 2> /dev/null),)
+			CXX=x86_64-w64-mingw32-c++
+		else
+			CXX=x86_64-w64-mingw32-c++-posix
+		endif
+	else
+		ifeq ($(shell which i686-w64-mingw32-c++-posix 2> /dev/null),)
+			CXX=i686-w64-mingw32-c++
+		else
+			CXX=i686-w64-mingw32-c++-posix
+		endif
+	endif
+	CXXFLAGS += -pedantic -Wextra -Wshadow -Wmissing-declarations
+endif
+ifeq ($(COMP),icx)
+	comp=icx
+	CXX=icpx
+	CXXFLAGS += --intel -pedantic -Wextra -Wshadow -Wmissing-prototypes \
+		-Wconditional-uninitialized -Wabi -Wdeprecated
+endif
+ifeq ($(COMP),clang)
+	comp=clang
+	CXX=clang++
+	ifeq ($(target_windows),yes)
+		CXX=x86_64-w64-mingw32-clang++
+	endif
+	CXXFLAGS += -pedantic -Wextra -Wshadow -Wmissing-prototypes \
+	            -Wconditional-uninitialized -flax-vector-conversions=none
+	ifeq ($(filter $(KERNEL),Darwin OpenBSD FreeBSD),)
+	ifeq ($(target_windows),)
+	ifneq ($(RTLIB),compiler-rt)
+		LDFLAGS += -latomic
+	endif
+	endif
+	endif
+	ifeq ($(arch),$(filter $(arch),armv7 armv8 riscv64))
+		ifeq ($(OS),Android)
+			CXXFLAGS += -m$(bits)
+			LDFLAGS += -m$(bits)
+		endif
+		ifeq ($(ARCH),riscv64)
+			CXXFLAGS += -latomic
+		endif
+	else ifeq ($(arch),loongarch64)
+		CXXFLAGS += -latomic
+	else
+		CXXFLAGS += -m$(bits)
+		LDFLAGS += -m$(bits)
+	endif
+endif
+ifeq ($(KERNEL),Darwin)
+	CXXFLAGS += -mmacosx-version-min=10.15
+	LDFLAGS += -mmacosx-version-min=10.15
+	ifneq ($(arch),any)
+		CXXFLAGS += -arch $(arch)
+		LDFLAGS += -arch $(arch)
+	endif
+	XCRUN = xcrun
+endif
+# To cross-compile for Android, use NDK version r27c or later.
+ifeq ($(COMP),ndk)
+	CXXFLAGS += -stdlib=libc++
+	comp=clang
+	ifeq ($(arch),armv7)
+		CXX=armv7a-linux-androideabi29-clang++
+		CXXFLAGS += -mthumb -march=armv7-a -mfloat-abi=softfp -mfpu=neon
+		ifneq ($(shell which arm-linux-androideabi-strip 2>/dev/null),)
+			STRIP=arm-linux-androideabi-strip
+		else
+			STRIP=llvm-strip
+		endif
+	endif
+	ifeq ($(arch),armv8)
+		CXX=aarch64-linux-android29-clang++
+		ifneq ($(shell which aarch64-linux-android-strip 2>/dev/null),)
+			STRIP=aarch64-linux-android-strip
+		else
+			STRIP=llvm-strip
+		endif
+	endif
+	ifeq ($(arch),x86_64)
+		CXX=x86_64-linux-android29-clang++
+		ifneq ($(shell which x86_64-linux-android-strip 2>/dev/null),)
+			STRIP=x86_64-linux-android-strip
+		else
+			STRIP=llvm-strip
+		endif
+	endif
+	LDFLAGS += -static-libstdc++
+endif
+### Allow overwriting CXX from command line
+ifdef COMPCXX
+	CXX=$(COMPCXX)
+endif
+# llvm-profdata must be version compatible with the specified CXX (be it clang, or the gcc alias)
+# make -j profile-build CXX=clang++-20 COMP=clang
+# Locate the version in the same directory as the compiler used,
+# with fallback to a generic one if it can't be located
+	LLVM_PROFDATA := $(dir $(realpath $(shell which $(CXX) 2> /dev/null)))llvm-profdata
+# for icx
+ifeq ($(wildcard $(LLVM_PROFDATA)),)
+	LLVM_PROFDATA := $(dir $(realpath $(shell which $(CXX) 2> /dev/null)))/compiler/llvm-profdata
+endif
+ifeq ($(wildcard $(LLVM_PROFDATA)),)
+	LLVM_PROFDATA := llvm-profdata
+endif
+ifeq ($(comp),icx)
+	profile_make = icx-profile-make
+	profile_use = icx-profile-use
+else ifeq ($(comp),clang)
+	profile_make = clang-profile-make
+	profile_use = clang-profile-use
+else
+	profile_make = gcc-profile-make
+	profile_use = gcc-profile-use
+	ifeq ($(KERNEL),Darwin)
+		EXTRAPROFILEFLAGS = -fvisibility=hidden
+	endif
+endif
+### Sometimes gcc is really clang
+ifeq ($(COMP),gcc)
+	gccversion := $(shell $(CXX) --version 2>/dev/null)
+	gccisclang := $(findstring clang,$(gccversion))
+	ifneq ($(gccisclang),)
+		profile_make = clang-profile-make
+		profile_use = clang-profile-use
+	else
+		CXXFLAGS += -Wstack-usage=128000
+	endif
+endif
+### On mingw use Windows threads, otherwise POSIX
+ifneq ($(comp),mingw)
+	CXXFLAGS += -DUSE_PTHREADS
+	# On Android Bionic's C library comes with its own pthread implementation bundled in
+	ifneq ($(OS),Android)
+		# Haiku has pthreads in its libroot, so only link it in on other platforms
+		ifneq ($(KERNEL),Haiku)
+			ifneq ($(COMP),ndk)
+				LDFLAGS += -lpthread
+				add_lrt = yes
+				ifeq ($(target_windows),yes)
+					add_lrt = no
+				endif
+				ifeq ($(KERNEL),Darwin)
+					add_lrt = no
+				endif
+				ifeq ($(add_lrt),yes)
+					LDFLAGS += -lrt
+				endif
+			endif
+		endif
+	endif
+endif
+### 3.2.1 Debugging
+ifeq ($(debug),no)
+	CXXFLAGS += -DNDEBUG
+else
+	CXXFLAGS += -g
+	CXXFLAGS += -D_GLIBCXX_ASSERTIONS -D_GLIBCXX_DEBUG
+endif
+### 3.2.2 Debugging with undefined behavior sanitizers
+ifneq ($(sanitize),none)
+        CXXFLAGS += -g3 $(addprefix -fsanitize=,$(sanitize))
+        LDFLAGS += $(addprefix -fsanitize=,$(sanitize))
+endif
+### 3.3 Optimization
+ifeq ($(optimize),yes)
+	CXXFLAGS += -O3 -funroll-loops
+	ifeq ($(comp),gcc)
+		ifeq ($(OS), Android)
+			CXXFLAGS += -fno-gcse -mthumb -march=armv7-a -mfloat-abi=softfp
+		endif
+	endif
+	ifeq ($(KERNEL),Darwin)
+		ifeq ($(comp),$(filter $(comp),clang icx))
+			CXXFLAGS += -mdynamic-no-pic
+		endif
+		ifeq ($(comp),gcc)
+			ifneq ($(arch),arm64)
+				CXXFLAGS += -mdynamic-no-pic
+			endif
+		endif
+	endif
+	ifeq ($(comp),clang)
+		clangmajorversion := $(shell $(CXX) -dumpversion 2>/dev/null | cut -f1 -d.)
+		ifeq ($(shell expr $(clangmajorversion) \< 16),1)
+			CXXFLAGS += -fexperimental-new-pass-manager
+		endif
+	endif
+endif
+### 3.4 Bits
+ifeq ($(bits),64)
+	CXXFLAGS += -DIS_64BIT
+endif
+### 3.5 prefetch and popcount
+ifeq ($(prefetch),yes)
+	ifeq ($(sse),yes)
+		CXXFLAGS += -msse
+	endif
+else
+	CXXFLAGS += -DNO_PREFETCH
+endif
+ifeq ($(popcnt),yes)
+	ifeq ($(arch),$(filter $(arch),ppc64 ppc64-altivec ppc64-vsx armv7 armv8 arm64))
+		CXXFLAGS += -DUSE_POPCNT
+	else
+		CXXFLAGS += -msse3 -mpopcnt -DUSE_POPCNT
+	endif
+endif
+### 3.6 SIMD architectures
+ifeq ($(avx2),yes)
+	CXXFLAGS += -DUSE_AVX2
+	ifeq ($(comp),$(filter $(comp),gcc clang mingw icx))
+		CXXFLAGS += -mavx2 -mbmi
+	endif
+endif
+ifeq ($(avxvnni),yes)
+	CXXFLAGS += -DUSE_VNNI -DUSE_AVXVNNI
+	ifeq ($(comp),$(filter $(comp),gcc clang mingw icx))
+		CXXFLAGS += -mavxvnni
+	endif
+endif
+ifeq ($(avx512),yes)
+	CXXFLAGS += -DUSE_AVX512
+	ifeq ($(comp),$(filter $(comp),gcc clang mingw icx))
+		CXXFLAGS += -mavx512f -mavx512bw -mavx512dq -mavx512vl
+	endif
+endif
+ifeq ($(vnni512),yes)
+	CXXFLAGS += -DUSE_VNNI
+	ifeq ($(comp),$(filter $(comp),gcc clang mingw icx))
+		CXXFLAGS += -mavx512f -mavx512bw -mavx512vnni -mavx512dq -mavx512vl
+	endif
+endif
+ifeq ($(avx512icl),yes)
+	CXXFLAGS += -DUSE_AVX512 -DUSE_VNNI -DUSE_AVX512ICL
+	ifeq ($(comp),$(filter $(comp),gcc clang mingw icx))
+		CXXFLAGS += -mavx512f -mavx512cd -mavx512vl -mavx512dq -mavx512bw -mavx512ifma -mavx512vbmi -mavx512vbmi2 -mavx512vpopcntdq -mavx512bitalg -mavx512vnni -mvpclmulqdq -mgfni -mvaes
+	endif
+endif
+ifeq ($(sse41),yes)
+	CXXFLAGS += -DUSE_SSE41
+	ifeq ($(comp),$(filter $(comp),gcc clang mingw icx))
+		CXXFLAGS += -msse4.1
+	endif
+endif
+ifeq ($(ssse3),yes)
+	CXXFLAGS += -DUSE_SSSE3
+	ifeq ($(comp),$(filter $(comp),gcc clang mingw icx))
+		CXXFLAGS += -mssse3
+	endif
+endif
+ifeq ($(sse2),yes)
+	CXXFLAGS += -DUSE_SSE2
+	ifeq ($(comp),$(filter $(comp),gcc clang mingw icx))
+		CXXFLAGS += -msse2
+	endif
+endif
+ifeq ($(mmx),yes)
+	ifeq ($(comp),$(filter $(comp),gcc clang mingw icx))
+		CXXFLAGS += -mmmx
+	endif
+endif
+ifeq ($(altivec),yes)
+	CXXFLAGS += -maltivec
+	ifeq ($(COMP),gcc)
+		CXXFLAGS += -mabi=altivec
+	endif
+endif
+ifeq ($(vsx),yes)
+	CXXFLAGS += -mvsx
+	ifeq ($(COMP),gcc)
+		CXXFLAGS += -DNO_WARN_X86_INTRINSICS -DUSE_SSE2
+	endif
+endif
+ifeq ($(neon),yes)
+	CXXFLAGS += -DUSE_NEON=$(arm_version)
+	ifeq ($(KERNEL),Linux)
+	ifneq ($(COMP),ndk)
+	ifneq ($(arch),armv8)
+		CXXFLAGS += -mfpu=neon
+	endif
+	endif
+	endif
+endif
+ifeq ($(dotprod),yes)
+	CXXFLAGS += -march=armv8.2-a+dotprod -DUSE_NEON_DOTPROD
+endif
+ifeq ($(lasx),yes)
+	ifeq ($(comp),$(filter $(comp),gcc clang mingw icx))
+		CXXFLAGS += -mlasx
+	endif
+endif
+ifeq ($(lsx),yes)
+	ifeq ($(comp),$(filter $(comp),gcc clang mingw icx))
+		CXXFLAGS += -mlsx
+	endif
+endif
+### 3.7 pext
+ifeq ($(pext),yes)
+	CXXFLAGS += -DUSE_PEXT
+	ifeq ($(comp),$(filter $(comp),gcc clang mingw icx))
+		CXXFLAGS += -mbmi2
+	endif
+endif
+### 3.8.1 Try to include git commit sha for versioning
+GIT_SHA := $(shell git rev-parse HEAD 2>/dev/null | cut -c 1-8)
+ifneq ($(GIT_SHA), )
+	CXXFLAGS += -DGIT_SHA=$(GIT_SHA)
+endif
+### 3.8.2 Try to include git commit date for versioning
+GIT_DATE := $(shell git show -s --date=format:'%Y%m%d' --format=%cd HEAD 2>/dev/null)
+ifneq ($(GIT_DATE), )
+	CXXFLAGS += -DGIT_DATE=$(GIT_DATE)
+endif
+### 3.8.3 Try to include architecture
+ifneq ($(ARCH), )
+	CXXFLAGS += -DARCH=$(ARCH)
+endif
+### 3.9 Link Time Optimization
+### This is a mix of compile and link time options because the lto link phase
+### needs access to the optimization flags.
+ifeq ($(optimize),yes)
+ifeq ($(debug),no)
+	ifneq ($(KERNEL),Darwin)
+		LLD_BIN := $(shell command -v ld.lld 2>/dev/null)
+		ifeq ($(LLD_BIN),)
+			LLD_BIN := $(shell command -v lld 2>/dev/null)
+		endif
+		ifneq ($(LLD_BIN),)
+			ifeq ($(comp),clang)
+				LDFLAGS += -fuse-ld=lld
+			else ifeq ($(comp),gcc)
+				ifneq ($(gccisclang),)
+					LDFLAGS += -fuse-ld=lld
+				endif
+			endif
+		endif
+	endif
+	ifeq ($(comp),$(filter $(comp),clang icx))
+		CXXFLAGS += -flto=full
+		ifeq ($(comp),icx)
+			CXXFLAGS += -fwhole-program-vtables
+		endif
+		LDFLAGS += $(CXXFLAGS)
+# GCC and CLANG use different methods for parallelizing LTO and CLANG pretends to be
+# GCC on some systems.
+	else ifeq ($(comp),gcc)
+		ifeq ($(gccisclang),)
+			CXXFLAGS += -flto -flto-partition=one
+			LDFLAGS += $(CXXFLAGS) -flto=jobserver
+		else
+			CXXFLAGS += -flto=full
+			LDFLAGS += $(CXXFLAGS)
+		endif
+# To use LTO and static linking on Windows,
+# the tool chain requires gcc version 10.1 or later.
+	else ifeq ($(comp),mingw)
+		CXXFLAGS += -flto -flto-partition=one
+		LDFLAGS += $(CXXFLAGS) -save-temps
+	endif
+endif
+endif
+### 3.10 Android 5 can only run position independent executables. Note that this
+### breaks Android 4.0 and earlier.
+ifeq ($(OS), Android)
+	CXXFLAGS += -fPIE
+	LDFLAGS += -fPIE -pie
+endif
+### 3.11 Inline settings
+ifeq ($(optimize), yes)
+	ifeq ($(comp), clang)
+		CXXFLAGS += -Xclang -mllvm -Xclang -inline-threshold=500
+	endif
+endif
+### ==========================================================================
+### Section 4. Public Targets
+### ==========================================================================
+# help: print the usage text (targets, ARCH values, compilers, examples).
+# The whole text is one shell command chained with '&&'. When the selected
+# ARCH is not a supported one (SUPPORTED_ARCH != true), an extra hint asking
+# for a supported ARCH is appended.
+help:
+	@echo "" && \
+	echo "To compile stockfish, type: " && \
+	echo "" && \
+	echo "make -j target [ARCH=arch] [COMP=compiler] [COMPCXX=cxx]" && \
+	echo "" && \
+	echo "Supported targets:" && \
+	echo "" && \
+	echo "help                    > Display architecture details" && \
+	echo "profile-build           > standard build with profile-guided optimization" && \
+	echo "build                   > skip profile-guided optimization" && \
+	echo "net                     > Download the default nnue nets" && \
+	echo "strip                   > Strip executable" && \
+	echo "install                 > Install executable" && \
+	echo "clean                   > Clean up" && \
+	echo "" && \
+	echo "Supported archs:" && \
+	echo "" && \
+	echo "native                  > select the best architecture for the host processor (default)" && \
+	echo "x86-64-avx512icl        > x86 64-bit with minimum avx512 support of Intel Ice Lake or AMD Zen 4" && \
+	echo "x86-64-vnni512          > x86 64-bit with vnni 512bit support" && \
+	echo "x86-64-avx512           > x86 64-bit with avx512 support" && \
+	echo "x86-64-avxvnni          > x86 64-bit with vnni 256bit support" && \
+	echo "x86-64-bmi2             > x86 64-bit with bmi2 support" && \
+	echo "x86-64-avx2             > x86 64-bit with avx2 support" && \
+	echo "x86-64-sse41-popcnt     > x86 64-bit with sse41 and popcnt support" && \
+	echo "x86-64-modern           > deprecated, currently x86-64-sse41-popcnt" && \
+	echo "x86-64-ssse3            > x86 64-bit with ssse3 support" && \
+	echo "x86-64-sse3-popcnt      > x86 64-bit with sse3 compile and popcnt support" && \
+	echo "x86-64                  > x86 64-bit generic (with sse2 support)" && \
+	echo "x86-32-sse41-popcnt     > x86 32-bit with sse41 and popcnt support" && \
+	echo "x86-32-sse2             > x86 32-bit with sse2 support" && \
+	echo "x86-32                  > x86 32-bit generic (with mmx compile support)" && \
+	echo "ppc-64                  > PPC 64-bit" && \
+	echo "ppc-64-altivec          > PPC 64-bit with altivec support" && \
+	echo "ppc-64-vsx              > PPC 64-bit with vsx support" && \
+	echo "ppc-32                  > PPC 32-bit" && \
+	echo "armv7                   > ARMv7 32-bit" && \
+	echo "armv7-neon              > ARMv7 32-bit with popcnt and neon" && \
+	echo "armv8                   > ARMv8 64-bit with popcnt and neon" && \
+	echo "armv8-dotprod           > ARMv8 64-bit with popcnt, neon and dot product support" && \
+	echo "e2k                     > Elbrus 2000" && \
+	echo "apple-silicon           > Apple silicon ARM64" && \
+	echo "general-64              > unspecified 64-bit" && \
+	echo "general-32              > unspecified 32-bit" && \
+	echo "riscv64                 > RISC-V 64-bit" && \
+	echo "loongarch64             > LoongArch 64-bit" && \
+	echo "loongarch64-lsx         > LoongArch 64-bit with SIMD eXtension" && \
+	echo "loongarch64-lasx        > LoongArch 64-bit with Advanced SIMD eXtension" && \
+	echo "" && \
+	echo "Supported compilers:" && \
+	echo "" && \
+	echo "gcc                     > GNU compiler (default)" && \
+	echo "mingw                   > GNU compiler with MinGW under Windows" && \
+	echo "clang                   > LLVM Clang compiler" && \
+	echo "icx                     > Intel oneAPI DPC++/C++ Compiler" && \
+	echo "ndk                     > Google NDK to cross-compile for Android" && \
+	echo "" && \
+	echo "Simple examples. If you don't know what to do, you likely want to run one of: " && \
+	echo "" && \
+	echo "make -j profile-build ARCH=x86-64-avx2    # typically a fast compile for common systems " && \
+	echo "make -j profile-build ARCH=x86-64-sse41-popcnt  # A more portable compile for 64-bit systems " && \
+	echo "make -j profile-build ARCH=x86-64         # A portable compile for 64-bit systems " && \
+	echo "" && \
+	echo "Advanced examples, for experienced users: " && \
+	echo "" && \
+	echo "make -j profile-build ARCH=x86-64-avxvnni" && \
+	echo "make -j profile-build ARCH=x86-64-avxvnni COMP=gcc COMPCXX=g++-12.0" && \
+	echo "make -j build ARCH=x86-64-ssse3 COMP=clang" && \
+	echo ""
+ifneq ($(SUPPORTED_ARCH), true)
+	@echo "Specify a supported architecture with the ARCH option for more details"
+	@echo ""
+endif
+# Targets that do not name real files. Declaring them phony keeps make from
+# mistaking a same-named file for an up-to-date target. Each name is listed
+# once, and the recipe-less aggregate target 'all' is included as well.
+.PHONY: help analyze build profile-build strip install clean net \
+	objclean profileclean config-sanity \
+	icx-profile-use icx-profile-make \
+	gcc-profile-use gcc-profile-make \
+	clang-profile-use clang-profile-make FORCE \
+	format all
+# analyze: after fetching the nets, checking the configuration and removing old
+# objects, compile every object file in a sub-make without linking. -k tells
+# make to keep building the remaining objects after one of them fails.
+analyze: net config-sanity objclean
+	$(MAKE) -k ARCH=$(ARCH) COMP=$(COMP) $(OBJS)
+# build: plain build without profile-guided optimization. Runs 'net' and
+# 'config-sanity' first, then builds 'all' in a sub-make with the same
+# ARCH and COMP.
+build: net config-sanity
+	$(MAKE) ARCH=$(ARCH) COMP=$(COMP) all
+# profile-build: profile-guided build in four steps, starting from a clean tree:
+#   1. build an instrumented executable via the $(profile_make) target,
+#   2. run $(PGOBENCH), capturing its output in PGOBENCH.out and showing the
+#      last four lines,
+#   3. remove the objects and rebuild via the $(profile_use) target,
+#   4. delete the profile data.
+# profile_make / profile_use are defined outside this section; presumably they
+# name the <compiler>-profile-make / <compiler>-profile-use targets below.
+profile-build: net config-sanity objclean profileclean
+	@echo ""
+	@echo "Step 1/4. Building instrumented executable ..."
+	$(MAKE) ARCH=$(ARCH) COMP=$(COMP) $(profile_make)
+	@echo ""
+	@echo "Step 2/4. Running benchmark for pgo-build ..."
+	$(PGOBENCH) > PGOBENCH.out 2>&1
+	tail -n 4 PGOBENCH.out
+	@echo ""
+	@echo "Step 3/4. Building optimized executable ..."
+	$(MAKE) ARCH=$(ARCH) COMP=$(COMP) objclean
+	$(MAKE) ARCH=$(ARCH) COMP=$(COMP) $(profile_use)
+	@echo ""
+	@echo "Step 4/4. Deleting profile data ..."
+	$(MAKE) ARCH=$(ARCH) COMP=$(COMP) profileclean
+# strip: run $(STRIP) on the built executable in place.
+strip:
+	$(STRIP) $(EXE)
+# install: create $(BINDIR) with mode 755, copy the executable into it and strip
+# the installed copy. The leading '-' makes make ignore a failing mkdir or cp;
+# the final strip is not ignored.
+install:
+	-mkdir -p -m 755 $(BINDIR)
+	-cp $(EXE) $(BINDIR)
+	$(STRIP) $(BINDIR)/$(EXE)
+# clean all: run objclean and profileclean, then remove the generated
+# dependency file (.depend), editor backup files (*~) and any 'core' file.
+clean: objclean profileclean
+	@rm -f .depend *~ core
+# clean binaries and objects: removes the stockfish / stockfish.exe binaries and
+# the object files in this directory, ./syzygy, ./nnue and ./nnue/features.
+# NOTE(review): the binary names are hard-coded here rather than taken from
+# $(EXE) - confirm they match the configured executable name.
+objclean:
+	@rm -f stockfish stockfish.exe *.o ./syzygy/*.o ./nnue/*.o ./nnue/features/*.o
+# clean auxiliary profiling files: the profdir directory used by the gcc
+# profile targets, *.gcda / *.gcno data, assembler listings (*.s), the captured
+# benchmark output (bench.txt, PGOBENCH.out) and the stockfish.profdata /
+# *.profraw files used by the clang and icx profile targets.
+# The stockfish.*args*, stockfish.*lt*, stockfish.res and ./-lstdc++.res
+# patterns are presumably toolchain leftovers (e.g. from -save-temps / LTO) -
+# TODO confirm.
+profileclean:
+	@rm -rf profdir
+	@rm -f bench.txt *.gcda *.gcno ./syzygy/*.gcda ./nnue/*.gcda ./nnue/features/*.gcda *.s PGOBENCH.out
+	@rm -f stockfish.profdata *.profraw
+	@rm -f stockfish.*args*
+	@rm -f stockfish.*lt*
+	@rm -f stockfish.res
+	@rm -f ./-lstdc++.res
+# evaluation network (nnue): runs ../scripts/net.sh with $(SHELL). The help
+# text describes this target as downloading the default nnue nets.
+net:
+	@$(SHELL) ../scripts/net.sh
+# format: reformat all sources and headers in place (-i) with $(CLANG-FORMAT),
+# using the style file found by -style=file.
+format:
+	$(CLANG-FORMAT) -i $(SRCS) $(HEADERS) -style=file
+### ==========================================================================
+### Section 5. Private Targets
+### ==========================================================================
+# all: build the executable and the generated dependency file. This target has
+# no recipe of its own; the public targets invoke it through a sub-make.
+all: $(EXE) .depend
+# config-sanity: print the resolved configuration and compiler/linker flags,
+# then validate them. Everything after the first echo is a single shell command
+# chained with '&&', so the recipe fails as soon as one 'test' group is false
+# (e.g. an unsupported ARCH or an unknown compiler).
+# NOTE(review): avxvnni, dotprod, arm_version and target_windows are printed
+# but not validated by any of the tests below.
+config-sanity: net
+	@echo ""
+	@echo "Config:" && \
+	echo "debug: '$(debug)'" && \
+	echo "sanitize: '$(sanitize)'" && \
+	echo "optimize: '$(optimize)'" && \
+	echo "arch: '$(arch)'" && \
+	echo "bits: '$(bits)'" && \
+	echo "kernel: '$(KERNEL)'" && \
+	echo "os: '$(OS)'" && \
+	echo "prefetch: '$(prefetch)'" && \
+	echo "popcnt: '$(popcnt)'" && \
+	echo "pext: '$(pext)'" && \
+	echo "sse: '$(sse)'" && \
+	echo "mmx: '$(mmx)'" && \
+	echo "sse2: '$(sse2)'" && \
+	echo "ssse3: '$(ssse3)'" && \
+	echo "sse41: '$(sse41)'" && \
+	echo "avx2: '$(avx2)'" && \
+	echo "avxvnni: '$(avxvnni)'" && \
+	echo "avx512: '$(avx512)'" && \
+	echo "vnni512: '$(vnni512)'" && \
+	echo "avx512icl: '$(avx512icl)'" && \
+	echo "altivec: '$(altivec)'" && \
+	echo "vsx: '$(vsx)'" && \
+	echo "neon: '$(neon)'" && \
+	echo "dotprod: '$(dotprod)'" && \
+	echo "arm_version: '$(arm_version)'" && \
+	echo "lsx: '$(lsx)'" && \
+	echo "lasx: '$(lasx)'" && \
+	echo "target_windows: '$(target_windows)'" && \
+	echo "" && \
+	echo "Flags:" && \
+	echo "CXX: $(CXX)" && \
+	echo "CXXFLAGS: $(CXXFLAGS)" && \
+	echo "LDFLAGS: $(LDFLAGS)" && \
+	echo "" && \
+	echo "Testing config sanity. If this fails, try 'make help' ..." && \
+	echo "" && \
+	(test "$(debug)" = "yes" || test "$(debug)" = "no") && \
+	(test "$(optimize)" = "yes" || test "$(optimize)" = "no") && \
+	(test "$(SUPPORTED_ARCH)" = "true") && \
+	(test "$(arch)" = "any" || test "$(arch)" = "x86_64" || test "$(arch)" = "i386" || \
+	 test "$(arch)" = "ppc64" || test "$(arch)" = "ppc" || test "$(arch)" = "e2k" || \
+	 test "$(arch)" = "armv7" || test "$(arch)" = "armv8" || test "$(arch)" = "arm64" || \
+	 test "$(arch)" = "riscv64" || test "$(arch)" = "loongarch64") && \
+	(test "$(bits)" = "32" || test "$(bits)" = "64") && \
+	(test "$(prefetch)" = "yes" || test "$(prefetch)" = "no") && \
+	(test "$(popcnt)" = "yes" || test "$(popcnt)" = "no") && \
+	(test "$(pext)" = "yes" || test "$(pext)" = "no") && \
+	(test "$(sse)" = "yes" || test "$(sse)" = "no") && \
+	(test "$(mmx)" = "yes" || test "$(mmx)" = "no") && \
+	(test "$(sse2)" = "yes" || test "$(sse2)" = "no") && \
+	(test "$(ssse3)" = "yes" || test "$(ssse3)" = "no") && \
+	(test "$(sse41)" = "yes" || test "$(sse41)" = "no") && \
+	(test "$(avx2)" = "yes" || test "$(avx2)" = "no") && \
+	(test "$(avx512)" = "yes" || test "$(avx512)" = "no") && \
+	(test "$(vnni512)" = "yes" || test "$(vnni512)" = "no") && \
+	(test "$(avx512icl)" = "yes" || test "$(avx512icl)" = "no") && \
+	(test "$(altivec)" = "yes" || test "$(altivec)" = "no") && \
+	(test "$(vsx)" = "yes" || test "$(vsx)" = "no") && \
+	(test "$(neon)" = "yes" || test "$(neon)" = "no") && \
+	(test "$(lsx)" = "yes" || test "$(lsx)" = "no") && \
+	(test "$(lasx)" = "yes" || test "$(lasx)" = "no") && \
+	(test "$(comp)" = "gcc" || test "$(comp)" = "icx" || test "$(comp)" = "mingw" || \
+	 test "$(comp)" = "clang" || test "$(comp)" = "armv7a-linux-androideabi16-clang" || \
+	 test "$(comp)" = "aarch64-linux-android21-clang")
+# Link the executable from all object files. The leading '+' marks the recipe
+# line as one make should treat like a recursive invocation (it runs even
+# under -n and may use the jobserver); presumably this is what lets the
+# -flto=jobserver link step parallelize - TODO confirm.
+$(EXE): $(OBJS)
+	+$(CXX) -o $@ $(OBJS) $(LDFLAGS)
+# Force recompilation to ensure version info is up-to-date.
+# FORCE has neither prerequisites nor a recipe and is declared phony, so any
+# target that depends on it (here misc.o) is rebuilt on every run.
+misc.o: FORCE
+FORCE:
+# clang-profile-make: build 'all' in a sub-make with -fprofile-generate added to
+# both the extra compiler flags and the extra linker flags.
+clang-profile-make:
+	$(MAKE) ARCH=$(ARCH) COMP=$(COMP) \
+	EXTRACXXFLAGS='-fprofile-generate ' \
+	EXTRALDFLAGS=' -fprofile-generate' \
+	all
+# clang-profile-use: merge all *.profraw files into stockfish.profdata with
+# $(LLVM_PROFDATA) (run through $(XCRUN)), then build 'all' in a sub-make that
+# compiles with -fprofile-use=stockfish.profdata and links with -fprofile-use.
+clang-profile-use:
+	$(XCRUN) $(LLVM_PROFDATA) merge -output=stockfish.profdata *.profraw
+	$(MAKE) ARCH=$(ARCH) COMP=$(COMP) \
+	EXTRACXXFLAGS='-fprofile-use=stockfish.profdata' \
+	EXTRALDFLAGS='-fprofile-use ' \
+	all
+# gcc-profile-make: create the profdir directory, then build 'all' in a sub-make
+# compiled with -fprofile-generate=profdir plus $(EXTRAPROFILEFLAGS) and linked
+# with -lgcov.
+gcc-profile-make:
+	@mkdir -p profdir
+	$(MAKE) ARCH=$(ARCH) COMP=$(COMP) \
+	EXTRACXXFLAGS='-fprofile-generate=profdir' \
+	EXTRACXXFLAGS+=$(EXTRAPROFILEFLAGS) \
+	EXTRALDFLAGS='-lgcov' \
+	all
+# gcc-profile-use: build 'all' in a sub-make compiled with the profile in
+# profdir (-fprofile-use=profdir), with loop peeling and the tracer pass
+# disabled (-fno-peel-loops -fno-tracer), plus $(EXTRAPROFILEFLAGS); links
+# with -lgcov.
+gcc-profile-use:
+	$(MAKE) ARCH=$(ARCH) COMP=$(COMP) \
+	EXTRACXXFLAGS='-fprofile-use=profdir -fno-peel-loops -fno-tracer' \
+	EXTRACXXFLAGS+=$(EXTRAPROFILEFLAGS) \
+	EXTRALDFLAGS='-lgcov' \
+	all
+# icx-profile-make: build 'all' in a sub-make with -fprofile-instr-generate
+# added to both the extra compiler flags and the extra linker flags.
+icx-profile-make:
+	$(MAKE) ARCH=$(ARCH) COMP=$(COMP) \
+	EXTRACXXFLAGS='-fprofile-instr-generate ' \
+	EXTRALDFLAGS=' -fprofile-instr-generate' \
+	all
+# icx-profile-use: merge all *.profraw files into stockfish.profdata with
+# $(LLVM_PROFDATA) (run through $(XCRUN)), then build 'all' in a sub-make that
+# compiles with -fprofile-instr-use=stockfish.profdata and links with
+# -fprofile-use.
+icx-profile-use:
+	$(XCRUN) $(LLVM_PROFDATA) merge -output=stockfish.profdata *.profraw
+	$(MAKE) ARCH=$(ARCH) COMP=$(COMP) \
+	EXTRACXXFLAGS='-fprofile-instr-use=stockfish.profdata' \
+	EXTRALDFLAGS='-fprofile-use ' \
+	all
+# .depend: header dependencies of all sources, generated with the compiler's -MM
+# option. Failures are ignored ('-') and compiler diagnostics are discarded.
+.depend: $(SRCS)
+	-@$(CXX) $(DEPENDFLAGS) -MM $(SRCS) > $@ 2> /dev/null
+# Pull in the generated dependencies unless one of the command-line goals is
+# among the maintenance targets listed here. '-include' does not fail when the
+# file does not exist yet.
+ifeq (, $(filter $(MAKECMDGOALS), help strip install clean net objclean profileclean format config-sanity))
+-include .depend
+endif

src/benchmark.cpp ADDED Viewed

	@@ -0,0 +1,516 @@

+/*
+  Stockfish, a UCI chess playing engine derived from Glaurung 2.1
+  Copyright (C) 2004-2026 The Stockfish developers (see AUTHORS file)
+  Stockfish is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+  Stockfish is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+  You should have received a copy of the GNU General Public License
+  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+#include "benchmark.h"
+#include "numa.h"
+#include <cstdlib>
+#include <fstream>
+#include <iostream>
+#include <vector>
+namespace {
+// clang-format off
+// Built-in bench list: setup_bench() uses it as its position list when the
+// FEN-file argument is "default". Entries are FEN strings, optionally followed
+// by "moves ..." with moves to play from that position; the list also contains
+// "setoption ..." lines that switch UCI_Chess960 on and off around the
+// Chess 960 positions.
+const std::vector<std::string> Defaults = {
+  "setoption name UCI_Chess960 value false",
+  "rnbqkbnr/pppppppp/8/8/8/8/PPPPPPPP/RNBQKBNR w KQkq - 0 1",
+  "r3k2r/p1ppqpb1/bn2pnp1/3PN3/1p2P3/2N2Q1p/PPPBBPPP/R3K2R w KQkq - 0 10",
+  "8/2p5/3p4/KP5r/1R3p1k/8/4P1P1/8 w - - 0 11",
+  "4rrk1/pp1n3p/3q2pQ/2p1pb2/2PP4/2P3N1/P2B2PP/4RRK1 b - - 7 19",
+  "rq3rk1/ppp2ppp/1bnpb3/3N2B1/3NP3/7P/PPPQ1PP1/2KR3R w - - 7 14 moves d4e6",
+  "r1bq1r1k/1pp1n1pp/1p1p4/4p2Q/4Pp2/1BNP4/PPP2PPP/3R1RK1 w - - 2 14 moves g2g4",
+  "r3r1k1/2p2ppp/p1p1bn2/8/1q2P3/2NPQN2/PPP3PP/R4RK1 b - - 2 15",
+  "r1bbk1nr/pp3p1p/2n5/1N4p1/2Np1B2/8/PPP2PPP/2KR1B1R w kq - 0 13",
+  "r1bq1rk1/ppp1nppp/4n3/3p3Q/3P4/1BP1B3/PP1N2PP/R4RK1 w - - 1 16",
+  "4r1k1/r1q2ppp/ppp2n2/4P3/5Rb1/1N1BQ3/PPP3PP/R5K1 w - - 1 17",
+  "2rqkb1r/ppp2p2/2npb1p1/1N1Nn2p/2P1PP2/8/PP2B1PP/R1BQK2R b KQ - 0 11",
+  "r1bq1r1k/b1p1npp1/p2p3p/1p6/3PP3/1B2NN2/PP3PPP/R2Q1RK1 w - - 1 16",
+  "3r1rk1/p5pp/bpp1pp2/8/q1PP1P2/b3P3/P2NQRPP/1R2B1K1 b - - 6 22",
+  "r1q2rk1/2p1bppp/2Pp4/p6b/Q1PNp3/4B3/PP1R1PPP/2K4R w - - 2 18",
+  "4k2r/1pb2ppp/1p2p3/1R1p4/3P4/2r1PN2/P4PPP/1R4K1 b - - 3 22",
+  "3q2k1/pb3p1p/4pbp1/2r5/PpN2N2/1P2P2P/5PP1/Q2R2K1 b - - 4 26",
+  "6k1/6p1/6Pp/ppp5/3pn2P/1P3K2/1PP2P2/3N4 b - - 0 1",
+  "3b4/5kp1/1p1p1p1p/pP1PpP1P/P1P1P3/3KN3/8/8 w - - 0 1",
+  "2K5/p7/7P/5pR1/8/5k2/r7/8 w - - 0 1 moves g5g6 f3e3 g6g5 e3f3",
+  "8/6pk/1p6/8/PP3p1p/5P2/4KP1q/3Q4 w - - 0 1",
+  "7k/3p2pp/4q3/8/4Q3/5Kp1/P6b/8 w - - 0 1",
+  "8/2p5/8/2kPKp1p/2p4P/2P5/3P4/8 w - - 0 1",
+  "8/1p3pp1/7p/5P1P/2k3P1/8/2K2P2/8 w - - 0 1",
+  "8/pp2r1k1/2p1p3/3pP2p/1P1P1P1P/P5KR/8/8 w - - 0 1",
+  "8/3p4/p1bk3p/Pp6/1Kp1PpPp/2P2P1P/2P5/5B2 b - - 0 1",
+  "5k2/7R/4P2p/5K2/p1r2P1p/8/8/8 b - - 0 1",
+  "6k1/6p1/P6p/r1N5/5p2/7P/1b3PP1/4R1K1 w - - 0 1",
+  "1r3k2/4q3/2Pp3b/3Bp3/2Q2p2/1p1P2P1/1P2KP2/3N4 w - - 0 1",
+  "6k1/4pp1p/3p2p1/P1pPb3/R7/1r2P1PP/3B1P2/6K1 w - - 0 1",
+  "8/3p3B/5p2/5P2/p7/PP5b/k7/6K1 w - - 0 1",
+  "5rk1/q6p/2p3bR/1pPp1rP1/1P1Pp3/P3B1Q1/1K3P2/R7 w - - 93 90",
+  "4rrk1/1p1nq3/p7/2p1P1pp/3P2bp/3Q1Bn1/PPPB4/1K2R1NR w - - 40 21",
+  "r3k2r/3nnpbp/q2pp1p1/p7/Pp1PPPP1/4BNN1/1P5P/R2Q1RK1 w kq - 0 16",
+  "3Qb1k1/1r2ppb1/pN1n2q1/Pp1Pp1Pr/4P2p/4BP2/4B1R1/1R5K b - - 11 40",
+  "4k3/3q1r2/1N2r1b1/3ppN2/2nPP3/1B1R2n1/2R1Q3/3K4 w - - 5 1",
+  "1r6/1P4bk/3qr1p1/N6p/3pp2P/6R1/3Q1PP1/1R4K1 w - - 1 42",
+  // Positions with high numbers of changed threats
+  "k7/2n1n3/1nbNbn2/2NbRBn1/1nbRQR2/2NBRBN1/3N1N2/7K w - - 0 1",
+  "K7/8/8/BNQNQNB1/N5N1/R1Q1q2r/n5n1/bnqnqnbk w - - 0 1",
+  // 5-man positions
+  "8/8/8/8/5kp1/P7/8/1K1N4 w - - 0 1",     // Kc2 - mate
+  "8/8/8/5N2/8/p7/8/2NK3k w - - 0 1",      // Na2 - mate
+  "8/3k4/8/8/8/4B3/4KB2/2B5 w - - 0 1",    // draw
+  // 6-man positions
+  "8/8/1P6/5pr1/8/4R3/7k/2K5 w - - 0 1",   // Re5 - mate
+  "8/2p4P/8/kr6/6R1/8/8/1K6 w - - 0 1",    // Ka2 - mate
+  "8/8/3P3k/8/1p6/8/1P6/1K3n2 b - - 0 1",  // Nd2 - draw
+  // 7-man positions
+  "8/R7/2q5/8/6k1/8/1P5p/K6R w - - 0 124", // Draw
+  // Mate and stalemate positions
+  "6k1/3b3r/1p1p4/p1n2p2/1PPNpP1q/P3Q1p1/1R1RB1P1/5K2 b - - 0 1",
+  "r2r1n2/pp2bk2/2p1p2p/3q4/3PN1QP/2P3R1/P4PP1/5RK1 w - - 0 1",
+  "8/8/8/8/8/6k1/6p1/6K1 w - -",
+  "7k/7P/6K1/8/3B4/8/8/8 b - -",
+  // Chess 960
+  "setoption name UCI_Chess960 value true",
+  "bbqnnrkr/pppppppp/8/8/8/8/PPPPPPPP/BBQNNRKR w HFhf - 0 1 moves g2g3 d7d5 d2d4 c8h3 c1g5 e8d6 g5e7 f7f6",
+  "nqbnrkrb/pppppppp/8/8/8/8/PPPPPPPP/NQBNRKRB w KQkq - 0 1",
+  "setoption name UCI_Chess960 value false"
+};
+// clang-format on
+// clang-format off
+// Five games with fewer than 60 moves each, picked by hand at random from
+// https://tests.stockfishchess.org/tests/view/665c71f9fd45fb0f907c21e0
+// Each game lists only the positions in which one of the two sides is to move.
+const std::vector<std::vector<std::string>> BenchmarkPositions = {
+    {
+        "rnbq1k1r/ppp1bppp/4pn2/8/2B5/2NP1N2/PPP2PPP/R1BQR1K1 b - - 2 8",
+        "rnbq1k1r/pp2bppp/4pn2/2p5/2B2B2/2NP1N2/PPP2PPP/R2QR1K1 b - - 1 9",
+        "r1bq1k1r/pp2bppp/2n1pn2/2p5/2B1NB2/3P1N2/PPP2PPP/R2QR1K1 b - - 3 10",
+        "r1bq1k1r/pp2bppp/2n1p3/2p5/2B1PB2/5N2/PPP2PPP/R2QR1K1 b - - 0 11",
+        "r1b2k1r/pp2bppp/2n1p3/2p5/2B1PB2/5N2/PPP2PPP/3RR1K1 b - - 0 12",
+        "r1b1k2r/pp2bppp/2n1p3/2p5/2B1PB2/2P2N2/PP3PPP/3RR1K1 b - - 0 13",
+        "r1b1k2r/1p2bppp/p1n1p3/2p5/4PB2/2P2N2/PP2BPPP/3RR1K1 b - - 1 14",
+        "r1b1k2r/4bppp/p1n1p3/1pp5/P3PB2/2P2N2/1P2BPPP/3RR1K1 b - - 0 15",
+        "r1b1k2r/4bppp/p1n1p3/1P6/2p1PB2/2P2N2/1P2BPPP/3RR1K1 b - - 0 16",
+        "r1b1k2r/4bppp/2n1p3/1p6/2p1PB2/1PP2N2/4BPPP/3RR1K1 b - - 0 17",
+        "r3k2r/3bbppp/2n1p3/1p6/2P1PB2/2P2N2/4BPPP/3RR1K1 b - - 0 18",
+        "r3k2r/3bbppp/2n1p3/8/1pP1P3/2P2N2/3BBPPP/3RR1K1 b - - 1 19",
+        "1r2k2r/3bbppp/2n1p3/8/1pPNP3/2P5/3BBPPP/3RR1K1 b - - 3 20",
+        "1r2k2r/3bbppp/2n1p3/8/2PNP3/2B5/4BPPP/3RR1K1 b - - 0 21",
+        "1r2k2r/3bb1pp/2n1pp2/1N6/2P1P3/2B5/4BPPP/3RR1K1 b - - 1 22",
+        "1r2k2r/3b2pp/2n1pp2/1N6/1BP1P3/8/4BPPP/3RR1K1 b - - 0 23",
+        "1r2k2r/3b2pp/4pp2/1N6/1nP1P3/8/3RBPPP/4R1K1 b - - 1 24",
+        "1r5r/3bk1pp/4pp2/1N6/1nP1PP2/8/3RB1PP/4R1K1 b - - 0 25",
+        "1r5r/3bk1pp/2n1pp2/1N6/2P1PP2/8/3RBKPP/4R3 b - - 2 26",
+        "1r5r/3bk1pp/2n2p2/1N2p3/2P1PP2/6P1/3RBK1P/4R3 b - - 0 27",
+        "1r1r4/3bk1pp/2n2p2/1N2p3/2P1PP2/6P1/3RBK1P/R7 b - - 2 28",
+        "1r1r4/N3k1pp/2n1bp2/4p3/2P1PP2/6P1/3RBK1P/R7 b - - 4 29",
+        "1r1r4/3bk1pp/2N2p2/4p3/2P1PP2/6P1/3RBK1P/R7 b - - 0 30",
+        "1r1R4/4k1pp/2b2p2/4p3/2P1PP2/6P1/4BK1P/R7 b - - 0 31",
+        "3r4/4k1pp/2b2p2/4P3/2P1P3/6P1/4BK1P/R7 b - - 0 32",
+        "3r4/R3k1pp/2b5/4p3/2P1P3/6P1/4BK1P/8 b - - 1 33",
+        "8/3rk1pp/2b5/R3p3/2P1P3/6P1/4BK1P/8 b - - 3 34",
+        "8/3r2pp/2bk4/R1P1p3/4P3/6P1/4BK1P/8 b - - 0 35",
+        "8/2kr2pp/2b5/R1P1p3/4P3/4K1P1/4B2P/8 b - - 2 36",
+        "1k6/3r2pp/2b5/RBP1p3/4P3/4K1P1/7P/8 b - - 4 37",
+        "8/1k1r2pp/2b5/R1P1p3/4P3/3BK1P1/7P/8 b - - 6 38",
+        "1k6/3r2pp/2b5/2P1p3/4P3/3BK1P1/7P/R7 b - - 8 39",
+        "1k6/r5pp/2b5/2P1p3/4P3/3BK1P1/7P/5R2 b - - 10 40",
+        "1k3R2/6pp/2b5/2P1p3/4P3/r2BK1P1/7P/8 b - - 12 41",
+        "5R2/2k3pp/2b5/2P1p3/4P3/r2B2P1/3K3P/8 b - - 14 42",
+        "5R2/2k3pp/2b5/2P1p3/4P3/3BK1P1/r6P/8 b - - 16 43",
+        "5R2/2k3pp/2b5/2P1p3/4P3/r2B2P1/4K2P/8 b - - 18 44",
+        "5R2/2k3pp/2b5/2P1p3/4P3/3B1KP1/r6P/8 b - - 20 45",
+        "8/2k2Rpp/2b5/2P1p3/4P3/r2B1KP1/7P/8 b - - 22 46",
+        "3k4/5Rpp/2b5/2P1p3/4P3/r2B2P1/4K2P/8 b - - 24 47",
+        "3k4/5Rpp/2b5/2P1p3/4P3/3B1KP1/r6P/8 b - - 26 48",
+        "3k4/5Rpp/2b5/2P1p3/4P3/r2B2P1/4K2P/8 b - - 28 49",
+        "3k4/5Rpp/2b5/2P1p3/4P3/3BK1P1/r6P/8 b - - 30 50",
+        "3k4/5Rpp/2b5/2P1p3/4P3/r2B2P1/3K3P/8 b - - 32 51",
+        "3k4/5Rpp/2b5/2P1p3/4P3/2KB2P1/r6P/8 b - - 34 52",
+        "3k4/5Rpp/2b5/2P1p3/4P3/r2B2P1/2K4P/8 b - - 36 53",
+        "3k4/5Rpp/2b5/2P1p3/4P3/1K1B2P1/r6P/8 b - - 38 54",
+        "3k4/6Rp/2b5/2P1p3/4P3/1K1B2P1/7r/8 b - - 0 55",
+        "3k4/8/2b3Rp/2P1p3/4P3/1K1B2P1/7r/8 b - - 1 56",
+        "8/2k3R1/2b4p/2P1p3/4P3/1K1B2P1/7r/8 b - - 3 57",
+        "3k4/8/2b3Rp/2P1p3/4P3/1K1B2P1/7r/8 b - - 5 58",
+        "8/2k5/2b3Rp/2P1p3/1K2P3/3B2P1/7r/8 b - - 7 59",
+        "8/2k5/2b3Rp/2P1p3/4P3/2KB2P1/3r4/8 b - - 9 60",
+        "8/2k5/2b3Rp/2P1p3/1K2P3/3B2P1/6r1/8 b - - 11 61",
+        "8/2k5/2b3Rp/2P1p3/4P3/2KB2P1/3r4/8 b - - 13 62",
+        "8/2k5/2b3Rp/2P1p3/2K1P3/3B2P1/6r1/8 b - - 15 63",
+        "4b3/2k3R1/7p/2P1p3/2K1P3/3B2P1/6r1/8 b - - 17 64",
+    },
+    {
+        "r1bqkbnr/npp1pppp/p7/3P4/4pB2/2N5/PPP2PPP/R2QKBNR w KQkq - 1 6",
+        "r1bqkb1r/npp1pppp/p4n2/3P4/4pB2/2N5/PPP1QPPP/R3KBNR w KQkq - 3 7",
+        "r2qkb1r/npp1pppp/p4n2/3P1b2/4pB2/2N5/PPP1QPPP/2KR1BNR w kq - 5 8",
+        "r2qkb1r/1pp1pppp/p4n2/1n1P1b2/4pB2/2N4P/PPP1QPP1/2KR1BNR w kq - 1 9",
+        "r2qkb1r/1pp1pppp/5n2/1p1P1b2/4pB2/7P/PPP1QPP1/2KR1BNR w kq - 0 10",
+        "r2qkb1r/1ppbpppp/5n2/1Q1P4/4pB2/7P/PPP2PP1/2KR1BNR w kq - 1 11",
+        "3qkb1r/1Qpbpppp/5n2/3P4/4pB2/7P/rPP2PP1/2KR1BNR w k - 0 12",
+        "q3kb1r/1Qpbpppp/5n2/3P4/4pB2/7P/rPP2PP1/1K1R1BNR w k - 2 13",
+        "r3kb1r/2pbpppp/5n2/3P4/4pB2/7P/1PP2PP1/1K1R1BNR w k - 0 14",
+        "r3kb1r/2Bb1ppp/4pn2/3P4/4p3/7P/1PP2PP1/1K1R1BNR w k - 0 15",
+        "r3kb1r/2Bb2pp/4pn2/8/4p3/7P/1PP2PP1/1K1R1BNR w k - 0 16",
+        "r3k2r/2Bb2pp/4pn2/2b5/4p3/7P/1PP1NPP1/1K1R1B1R w k - 2 17",
+        "r6r/2Bbk1pp/4pn2/2b5/3Np3/7P/1PP2PP1/1K1R1B1R w - - 4 18",
+        "r6r/b2bk1pp/4pn2/4B3/3Np3/7P/1PP2PP1/1K1R1B1R w - - 6 19",
+        "r1r5/b2bk1pp/4pn2/4B3/2BNp3/7P/1PP2PP1/1K1R3R w - - 8 20",
+        "r7/b2bk1pp/4pn2/2r1B3/2BNp3/1P5P/2P2PP1/1K1R3R w - - 1 21",
+        "rb6/3bk1pp/4pn2/2r1B3/2BNpP2/1P5P/2P3P1/1K1R3R w - - 1 22",
+        "1r6/3bk1pp/4pn2/2r5/2BNpP2/1P5P/2P3P1/1K1R3R w - - 0 23",
+        "1r6/3bk1p1/4pn1p/2r5/2BNpP2/1P5P/2P3P1/2KR3R w - - 0 24",
+        "8/3bk1p1/1r2pn1p/2r5/2BNpP1P/1P6/2P3P1/2KR3R w - - 1 25",
+        "8/3bk3/1r2pnpp/2r5/2BNpP1P/1P6/2P3P1/2K1R2R w - - 0 26",
+        "2b5/4k3/1r2pnpp/2r5/2BNpP1P/1P4P1/2P5/2K1R2R w - - 1 27",
+        "8/1b2k3/1r2pnpp/2r5/2BNpP1P/1P4P1/2P5/2K1R1R1 w - - 3 28",
+        "8/1b1nk3/1r2p1pp/2r5/2BNpPPP/1P6/2P5/2K1R1R1 w - - 1 29",
+        "8/1b2k3/1r2p1pp/2r1nP2/2BNp1PP/1P6/2P5/2K1R1R1 w - - 1 30",
+        "8/1b2k3/1r2p1p1/2r1nPp1/2BNp2P/1P6/2P5/2K1R1R1 w - - 0 31",
+        "8/1b2k3/1r2p1n1/2r3p1/2BNp2P/1P6/2P5/2K1R1R1 w - - 0 32",
+        "8/1b2k3/1r2p1n1/6r1/2BNp2P/1P6/2P5/2K1R3 w - - 0 33",
+        "8/1b2k3/1r2p3/4n1P1/2BNp3/1P6/2P5/2K1R3 w - - 1 34",
+        "8/1b2k3/1r2p3/4n1P1/2BN4/1P2p3/2P5/2K4R w - - 0 35",
+        "8/1b2k3/1r2p2R/6P1/2nN4/1P2p3/2P5/2K5 w - - 0 36",
+        "8/1b2k3/3rp2R/6P1/2PN4/4p3/2P5/2K5 w - - 1 37",
+        "8/4k3/3rp2R/6P1/2PN4/2P1p3/6b1/2K5 w - - 1 38",
+        "8/4k3/r3p2R/2P3P1/3N4/2P1p3/6b1/2K5 w - - 1 39",
+        "8/3k4/r3p2R/2P2NP1/8/2P1p3/6b1/2K5 w - - 3 40",
+        "8/3k4/4p2R/2P3P1/8/2P1N3/6b1/r1K5 w - - 1 41",
+        "8/3k4/4p2R/2P3P1/8/2P1N3/3K2b1/6r1 w - - 3 42",
+        "8/3k4/4p2R/2P3P1/8/2PKNb2/8/6r1 w - - 5 43",
+        "8/4k3/4p1R1/2P3P1/8/2PKNb2/8/6r1 w - - 7 44",
+        "8/4k3/4p1R1/2P3P1/3K4/2P1N3/8/6rb w - - 9 45",
+        "8/3k4/4p1R1/2P1K1P1/8/2P1N3/8/6rb w - - 11 46",
+        "8/3k4/4p1R1/2P3P1/5K2/2P1N3/8/4r2b w - - 13 47",
+        "8/3k4/2b1p2R/2P3P1/5K2/2P1N3/8/4r3 w - - 15 48",
+        "8/3k4/2b1p3/2P3P1/5K2/2P1N2R/8/6r1 w - - 17 49",
+        "2k5/7R/2b1p3/2P3P1/5K2/2P1N3/8/6r1 w - - 19 50",
+        "2k5/7R/4p3/2P3P1/b1P2K2/4N3/8/6r1 w - - 1 51",
+        "2k5/3bR3/4p3/2P3P1/2P2K2/4N3/8/6r1 w - - 3 52",
+        "3k4/3b2R1/4p3/2P3P1/2P2K2/4N3/8/6r1 w - - 5 53",
+        "3kb3/6R1/4p1P1/2P5/2P2K2/4N3/8/6r1 w - - 1 54",
+        "3kb3/6R1/4p1P1/2P5/2P2KN1/8/8/2r5 w - - 3 55",
+        "3kb3/6R1/4p1P1/2P1N3/2P2K2/8/8/5r2 w - - 5 56",
+        "3kb3/6R1/4p1P1/2P1N3/2P5/4K3/8/4r3 w - - 7 57",
+    },
+    {
+        "rnbq1rk1/ppp1npb1/4p1p1/3P3p/3PP3/2N2N2/PP2BPPP/R1BQ1RK1 b - - 0 8",
+        "rnbq1rk1/ppp1npb1/6p1/3pP2p/3P4/2N2N2/PP2BPPP/R1BQ1RK1 b - - 0 9",
+        "rn1q1rk1/ppp1npb1/6p1/3pP2p/3P2b1/2N2N2/PP2BPPP/R1BQR1K1 b - - 2 10",
+        "r2q1rk1/ppp1npb1/2n3p1/3pP2p/3P2bN/2N5/PP2BPPP/R1BQR1K1 b - - 4 11",
+        "r4rk1/pppqnpb1/2n3p1/3pP2p/3P2bN/2N4P/PP2BPP1/R1BQR1K1 b - - 0 12",
+        "r4rk1/pppqnpb1/2n3p1/3pP2p/3P3N/7P/PP2NPP1/R1BQR1K1 b - - 0 13",
+        "r4rk1/pppq1pb1/2n3p1/3pPN1p/3P4/7P/PP2NPP1/R1BQR1K1 b - - 0 14",
+        "r4rk1/ppp2pb1/2n3p1/3pPq1p/3P1N2/7P/PP3PP1/R1BQR1K1 b - - 1 15",
+        "r4rk1/pppq1pb1/2n3p1/3pP2p/P2P1N2/7P/1P3PP1/R1BQR1K1 b - - 0 16",
+        "r2n1rk1/pppq1pb1/6p1/3pP2p/P2P1N2/R6P/1P3PP1/2BQR1K1 b - - 2 17",
+        "r4rk1/pppq1pb1/4N1p1/3pP2p/P2P4/R6P/1P3PP1/2BQR1K1 b - - 0 18",
+        "r4rk1/ppp2pb1/4q1p1/3pP1Bp/P2P4/R6P/1P3PP1/3QR1K1 b - - 1 19",
+        "r3r1k1/ppp2pb1/4q1p1/3pP1Bp/P2P1P2/R6P/1P4P1/3QR1K1 b - - 0 20",
+        "r3r1k1/ppp3b1/4qpp1/3pP2p/P2P1P1B/R6P/1P4P1/3QR1K1 b - - 1 21",
+        "r3r1k1/ppp3b1/4q1p1/3pP2p/P4P1B/R6P/1P4P1/3QR1K1 b - - 0 22",
+        "r4rk1/ppp3b1/4q1p1/3pP1Bp/P4P2/R6P/1P4P1/3QR1K1 b - - 2 23",
+        "r4rk1/pp4b1/4q1p1/2ppP1Bp/P4P2/3R3P/1P4P1/3QR1K1 b - - 1 24",
+        "r4rk1/pp4b1/4q1p1/2p1P1Bp/P2p1PP1/3R3P/1P6/3QR1K1 b - - 0 25",
+        "r4rk1/pp4b1/4q1p1/2p1P1B1/P2p1PP1/3R4/1P6/3QR1K1 b - - 0 26",
+        "r5k1/pp3rb1/4q1p1/2p1P1B1/P2p1PP1/6R1/1P6/3QR1K1 b - - 2 27",
+        "5rk1/pp3rb1/4q1p1/2p1P1B1/P2pRPP1/6R1/1P6/3Q2K1 b - - 4 28",
+        "5rk1/1p3rb1/p3q1p1/P1p1P1B1/3pRPP1/6R1/1P6/3Q2K1 b - - 0 29",
+        "4r1k1/1p3rb1/p3q1p1/P1p1P1B1/3pRPP1/1P4R1/8/3Q2K1 b - - 0 30",
+        "4r1k1/5rb1/pP2q1p1/2p1P1B1/3pRPP1/1P4R1/8/3Q2K1 b - - 0 31",
+        "4r1k1/5rb1/pq4p1/2p1P1B1/3pRPP1/1P4R1/4Q3/6K1 b - - 1 32",
+        "4r1k1/1r4b1/pq4p1/2p1P1B1/3pRPP1/1P4R1/2Q5/6K1 b - - 3 33",
+        "4r1k1/1r4b1/1q4p1/p1p1P1B1/3p1PP1/1P4R1/2Q5/4R1K1 b - - 1 34",
+        "4r1k1/3r2b1/1q4p1/p1p1P1B1/2Qp1PP1/1P4R1/8/4R1K1 b - - 3 35",
+        "4r1k1/3r2b1/4q1p1/p1p1P1B1/2Qp1PP1/1P4R1/5K2/4R3 b - - 5 36",
+        "4r1k1/3r2b1/6p1/p1p1P1B1/2Pp1PP1/6R1/5K2/4R3 b - - 0 37",
+        "4r1k1/3r2b1/6p1/p1p1P1B1/2P2PP1/3p2R1/5K2/3R4 b - - 1 38",
+        "5rk1/3r2b1/6p1/p1p1P1B1/2P2PP1/3p2R1/8/3RK3 b - - 3 39",
+        "5rk1/6b1/6p1/p1p1P1B1/2Pr1PP1/3R4/8/3RK3 b - - 0 40",
+        "5rk1/3R2b1/6p1/p1p1P1B1/2r2PP1/8/8/3RK3 b - - 1 41",
+        "5rk1/3R2b1/6p1/p1p1P1B1/4rPP1/8/3K4/3R4 b - - 3 42",
+        "1r4k1/3R2b1/6p1/p1p1P1B1/4rPP1/2K5/8/3R4 b - - 5 43",
+        "1r4k1/3R2b1/6p1/p1p1P1B1/2K2PP1/4r3/8/3R4 b - - 7 44",
+        "1r3bk1/8/3R2p1/p1p1P1B1/2K2PP1/4r3/8/3R4 b - - 9 45",
+        "1r3bk1/8/6R1/2p1P1B1/p1K2PP1/4r3/8/3R4 b - - 0 46",
+        "1r3b2/5k2/R7/2p1P1B1/p1K2PP1/4r3/8/3R4 b - - 2 47",
+        "5b2/1r3k2/R7/2p1P1B1/p1K2PP1/4r3/8/7R b - - 4 48",
+        "5b2/5k2/R7/2pKP1B1/pr3PP1/4r3/8/7R b - - 6 49",
+        "5b2/5k2/R1K5/2p1P1B1/p2r1PP1/4r3/8/7R b - - 8 50",
+        "8/R4kb1/2K5/2p1P1B1/p2r1PP1/4r3/8/7R b - - 10 51",
+        "8/R5b1/2K3k1/2p1PPB1/p2r2P1/4r3/8/7R b - - 0 52",
+        "8/6R1/2K5/2p1PPk1/p2r2P1/4r3/8/7R b - - 0 53",
+        "8/6R1/2K5/2p1PP2/p2r1kP1/4r3/8/5R2 b - - 2 54",
+        "8/6R1/2K2P2/2p1P3/p2r2P1/4r1k1/8/5R2 b - - 0 55",
+        "8/5PR1/2K5/2p1P3/p2r2P1/4r3/6k1/5R2 b - - 0 56",
+    },
+    {
+        "rn1qkb1r/p1pbpppp/5n2/8/2pP4/2N5/1PQ1PPPP/R1B1KBNR w KQkq - 0 7",
+        "r2qkb1r/p1pbpppp/2n2n2/8/2pP4/2N2N2/1PQ1PPPP/R1B1KB1R w KQkq - 2 8",
+        "r2qkb1r/p1pbpppp/5n2/8/1npPP3/2N2N2/1PQ2PPP/R1B1KB1R w KQkq - 1 9",
+        "r2qkb1r/p1pb1ppp/4pn2/8/1npPP3/2N2N2/1P3PPP/R1BQKB1R w KQkq - 0 10",
+        "r2qk2r/p1pbbppp/4pn2/8/1nBPP3/2N2N2/1P3PPP/R1BQK2R w KQkq - 1 11",
+        "r2q1rk1/p1pbbppp/4pn2/8/1nBPP3/2N2N2/1P3PPP/R1BQ1RK1 w - - 3 12",
+        "r2q1rk1/2pbbppp/p3pn2/8/1nBPPB2/2N2N2/1P3PPP/R2Q1RK1 w - - 0 13",
+        "r2q1rk1/2p1bppp/p3pn2/1b6/1nBPPB2/2N2N2/1P3PPP/R2QR1K1 w - - 2 14",
+        "r2q1rk1/4bppp/p1p1pn2/1b6/1nBPPB2/1PN2N2/5PPP/R2QR1K1 w - - 0 15",
+        "r4rk1/3qbppp/p1p1pn2/1b6/1nBPPB2/1PN2N2/3Q1PPP/R3R1K1 w - - 2 16",
+        "r4rk1/1q2bppp/p1p1pn2/1b6/1nBPPB2/1PN2N1P/3Q1PP1/R3R1K1 w - - 1 17",
+        "r3r1k1/1q2bppp/p1p1pn2/1b6/1nBPPB2/1PN2N1P/4QPP1/R3R1K1 w - - 3 18",
+        "r3r1k1/1q1nbppp/p1p1p3/1b6/1nBPPB2/1PN2N1P/4QPP1/3RR1K1 w - - 5 19",
+        "r3rbk1/1q1n1ppp/p1p1p3/1b6/1nBPPB2/1PN2N1P/3RQPP1/4R1K1 w - - 7 20",
+        "r3rbk1/1q3ppp/pnp1p3/1b6/1nBPPB2/1PN2N1P/3RQPP1/4R2K w - - 9 21",
+        "2r1rbk1/1q3ppp/pnp1p3/1b6/1nBPPB2/1PN2N1P/3RQPP1/1R5K w - - 11 22",
+        "2r1rbk1/1q4pp/pnp1pp2/1b6/1nBPPB2/1PN2N1P/4QPP1/1R1R3K w - - 0 23",
+        "2r1rbk1/5qpp/pnp1pp2/1b6/1nBPP3/1PN1BN1P/4QPP1/1R1R3K w - - 2 24",
+        "2r1rbk1/5qp1/pnp1pp1p/1b6/1nBPP3/1PN1BN1P/4QPP1/1R1R2K1 w - - 0 25",
+        "2r1rbk1/5qp1/pnp1pp1p/1b6/2BPP3/1P2BN1P/n3QPP1/1R1R2K1 w - - 0 26",
+        "r3rbk1/5qp1/pnp1pp1p/1b6/2BPP3/1P2BN1P/Q4PP1/1R1R2K1 w - - 1 27",
+        "rr3bk1/5qp1/pnp1pp1p/1b6/2BPP3/1P2BN1P/Q4PP1/R2R2K1 w - - 3 28",
+        "rr2qbk1/6p1/pnp1pp1p/1b6/2BPP3/1P2BN1P/4QPP1/R2R2K1 w - - 5 29",
+        "rr2qbk1/6p1/1np1pp1p/pb6/2BPP3/1P1QBN1P/5PP1/R2R2K1 w - - 0 30",
+        "rr2qbk1/6p1/1n2pp1p/pp6/3PP3/1P1QBN1P/5PP1/R2R2K1 w - - 0 31",
+        "rr2qbk1/6p1/1n2pp1p/1p1P4/p3P3/1P1QBN1P/5PP1/R2R2K1 w - - 0 32",
+        "rr2qbk1/3n2p1/3Ppp1p/1p6/p3P3/1P1QBN1P/5PP1/R2R2K1 w - - 1 33",
+        "rr3bk1/3n2p1/3Ppp1p/1p5q/pP2P3/3QBN1P/5PP1/R2R2K1 w - - 1 34",
+        "rr3bk1/3n2p1/3Ppp1p/1p5q/1P2P3/p2QBN1P/5PP1/2RR2K1 w - - 0 35",
+        "1r3bk1/3n2p1/r2Ppp1p/1p5q/1P2P3/pQ2BN1P/5PP1/2RR2K1 w - - 2 36",
+        "1r2qbk1/2Rn2p1/r2Ppp1p/1p6/1P2P3/pQ2BN1P/5PP1/3R2K1 w - - 4 37",
+        "1r2qbk1/2Rn2p1/r2Ppp1p/1pB5/1P2P3/1Q3N1P/p4PP1/3R2K1 w - - 0 38",
+        "1r2q1k1/2Rn2p1/r2bpp1p/1pB5/1P2P3/1Q3N1P/p4PP1/R5K1 w - - 0 39",
+        "1r2q1k1/2Rn2p1/3rpp1p/1p6/1P2P3/1Q3N1P/p4PP1/R5K1 w - - 0 40",
+        "2r1q1k1/2Rn2p1/3rpp1p/1p6/1P2P3/5N1P/Q4PP1/R5K1 w - - 1 41",
+        "1r2q1k1/1R1n2p1/3rpp1p/1p6/1P2P3/5N1P/Q4PP1/R5K1 w - - 3 42",
+        "2r1q1k1/2Rn2p1/3rpp1p/1p6/1P2P3/5N1P/Q4PP1/R5K1 w - - 5 43",
+        "1r2q1k1/1R1n2p1/3rpp1p/1p6/1P2P3/5N1P/Q4PP1/R5K1 w - - 7 44",
+        "1rq3k1/R2n2p1/3rpp1p/1p6/1P2P3/5N1P/Q4PP1/R5K1 w - - 9 45",
+        "2q3k1/Rr1n2p1/3rpp1p/1p6/1P2P3/5N1P/4QPP1/R5K1 w - - 11 46",
+        "Rrq3k1/3n2p1/3rpp1p/1p6/1P2P3/5N1P/4QPP1/R5K1 w - - 13 47",
+    },
+    {
+        "rn1qkb1r/1pp2ppp/p4p2/3p1b2/5P2/1P2PN2/P1PP2PP/RN1QKB1R b KQkq - 1 6",
+        "r2qkb1r/1pp2ppp/p1n2p2/3p1b2/3P1P2/1P2PN2/P1P3PP/RN1QKB1R b KQkq - 0 7",
+        "r2qkb1r/1pp2ppp/p4p2/3p1b2/1n1P1P2/1P1BPN2/P1P3PP/RN1QK2R b KQkq - 2 8",
+        "r2qkb1r/1pp2ppp/p4p2/3p1b2/3P1P2/1P1PPN2/P5PP/RN1QK2R b KQkq - 0 9",
+        "r2qk2r/1pp2ppp/p2b1p2/3p1b2/3P1P2/1PNPPN2/P5PP/R2QK2R b KQkq - 2 10",
+        "r2qk2r/1p3ppp/p1pb1p2/3p1b2/3P1P2/1PNPPN2/P5PP/R2Q1RK1 b kq - 1 11",
+        "r2q1rk1/1p3ppp/p1pb1p2/3p1b2/3P1P2/1PNPPN2/P2Q2PP/R4RK1 b - - 3 12",
+        "r2qr1k1/1p3ppp/p1pb1p2/3p1b2/3P1P2/1P1PPN2/P2QN1PP/R4RK1 b - - 5 13",
+        "r3r1k1/1p3ppp/pqpb1p2/3p1b2/3P1P2/1P1PPNN1/P2Q2PP/R4RK1 b - - 7 14",
+        "r3r1k1/1p3ppp/pqp2p2/3p1b2/1b1P1P2/1P1PPNN1/P1Q3PP/R4RK1 b - - 9 15",
+        "r3r1k1/1p1b1ppp/pqp2p2/3p4/1b1P1P2/1P1PPNN1/P4QPP/R4RK1 b - - 11 16",
+        "2r1r1k1/1p1b1ppp/pqp2p2/3p4/1b1PPP2/1P1P1NN1/P4QPP/R4RK1 b - - 0 17",
+        "2r1r1k1/1p1b1ppp/pq3p2/2pp4/1b1PPP2/PP1P1NN1/5QPP/R4RK1 b - - 0 18",
+        "2r1r1k1/1p1b1ppp/pq3p2/2Pp4/4PP2/PPbP1NN1/5QPP/R4RK1 b - - 0 19",
+        "2r1r1k1/1p1b1ppp/p4p2/2Pp4/4PP2/PqbP1NN1/5QPP/RR4K1 b - - 1 20",
+        "2r1r1k1/1p1b1ppp/p4p2/2Pp4/q3PP2/P1bP1NN1/R4QPP/1R4K1 b - - 3 21",
+        "2r1r1k1/1p3ppp/p4p2/1bPP4/q4P2/P1bP1NN1/R4QPP/1R4K1 b - - 0 22",
+        "2r1r1k1/1p3ppp/p4p2/2PP4/q4P2/P1bb1NN1/R4QPP/2R3K1 b - - 1 23",
+        "2r1r1k1/1p3ppp/p2P1p2/2P5/2q2P2/P1bb1NN1/R4QPP/2R3K1 b - - 0 24",
+        "2rr2k1/1p3ppp/p2P1p2/2P5/2q2P2/P1bb1NN1/R4QPP/2R4K b - - 2 25",
+        "2rr2k1/1p3ppp/p2P1p2/2Q5/5P2/P1bb1NN1/R5PP/2R4K b - - 0 26",
+        "3r2k1/1p3ppp/p2P1p2/2r5/5P2/P1bb1N2/R3N1PP/2R4K b - - 1 27",
+        "3r2k1/1p3ppp/p2P1p2/2r5/5P2/P1b2N2/4R1PP/2R4K b - - 0 28",
+        "3r2k1/1p3ppp/p2P1p2/2r5/1b3P2/P4N2/4R1PP/3R3K b - - 2 29",
+        "3r2k1/1p2Rppp/p2P1p2/b1r5/5P2/P4N2/6PP/3R3K b - - 4 30",
+        "3r2k1/1R3ppp/p1rP1p2/b7/5P2/P4N2/6PP/3R3K b - - 0 31",
+        "3r2k1/1R3ppp/p2R1p2/b7/5P2/P4N2/6PP/7K b - - 0 32",
+        "6k1/1R3ppp/p2r1p2/b7/5P2/P4NP1/7P/7K b - - 0 33",
+        "6k1/1R3p1p/p2r1pp1/b7/5P1P/P4NP1/8/7K b - - 0 34",
+        "6k1/3R1p1p/pr3pp1/b7/5P1P/P4NP1/8/7K b - - 2 35",
+        "6k1/5p2/pr3pp1/b2R3p/5P1P/P4NP1/8/7K b - - 1 36",
+        "6k1/5p2/pr3pp1/7p/5P1P/P1bR1NP1/8/7K b - - 3 37",
+        "6k1/5p2/p1r2pp1/7p/5P1P/P1bR1NP1/6K1/8 b - - 5 38",
+        "6k1/5p2/p1r2pp1/b2R3p/5P1P/P4NP1/6K1/8 b - - 7 39",
+        "6k1/5p2/p4pp1/b2R3p/5P1P/P4NPK/2r5/8 b - - 9 40",
+        "6k1/2b2p2/p4pp1/7p/5P1P/P2R1NPK/2r5/8 b - - 11 41",
+        "6k1/2b2p2/5pp1/p6p/3N1P1P/P2R2PK/2r5/8 b - - 1 42",
+        "6k1/2b2p2/5pp1/p6p/3N1P1P/P1R3PK/r7/8 b - - 3 43",
+        "6k1/5p2/1b3pp1/p6p/5P1P/P1R3PK/r1N5/8 b - - 5 44",
+        "8/5pk1/1bR2pp1/p6p/5P1P/P5PK/r1N5/8 b - - 7 45",
+        "3b4/5pk1/2R2pp1/p4P1p/7P/P5PK/r1N5/8 b - - 0 46",
+        "8/4bpk1/2R2pp1/p4P1p/6PP/P6K/r1N5/8 b - - 0 47",
+        "8/5pk1/2R2pP1/p6p/6PP/b6K/r1N5/8 b - - 0 48",
+        "8/6k1/2R2pp1/p6P/7P/b6K/r1N5/8 b - - 0 49",
+        "8/6k1/2R2p2/p6p/7P/b5K1/r1N5/8 b - - 1 50",
+        "8/8/2R2pk1/p6p/7P/b4K2/r1N5/8 b - - 3 51",
+        "8/8/2R2pk1/p6p/7P/4NK2/rb6/8 b - - 5 52",
+        "2R5/8/5pk1/7p/p6P/4NK2/rb6/8 b - - 1 53",
+        "6R1/8/5pk1/7p/p6P/4NK2/1b6/r7 b - - 3 54",
+        "R7/5k2/5p2/7p/p6P/4NK2/1b6/r7 b - - 5 55",
+        "R7/5k2/5p2/7p/7P/p3N3/1b2K3/r7 b - - 1 56",
+        "8/R4k2/5p2/7p/7P/p3N3/1b2K3/7r b - - 3 57",
+        "8/8/5pk1/7p/R6P/p3N3/1b2K3/7r b - - 5 58",
+        "8/8/5pk1/7p/R6P/p7/4K3/2bN3r b - - 7 59",
+        "8/8/5pk1/7p/R6P/p7/4KN1r/2b5 b - - 9 60",
+        "8/8/5pk1/7p/R6P/p3K3/1b3N1r/8 b - - 11 61",
+        "8/8/R4pk1/7p/7P/p1b1K3/5N1r/8 b - - 13 62",
+        "8/8/5pk1/7p/7P/2b1K3/R4N1r/8 b - - 0 63",
+        "8/8/5pk1/7p/3K3P/8/R4N1r/4b3 b - - 2 64",
+    }
+};
+// clang-format on
+}  // namespace
+namespace Stockfish::Benchmark {
+// Builds a list of UCI commands to be run by bench. There
+// are five parameters: TT size in MB, number of search threads that
+// should be used, the limit value spent for each position, a file name
+// where to look for positions in FEN format, and the type of the limit:
+// depth, perft, nodes and movetime (in milliseconds). The special type "eval"
+// issues a plain "eval" command for each position instead. Examples:
+//
+// bench                            : search default positions up to depth 13
+// bench 64 1 15                    : search default positions up to depth 15 (TT = 64MB)
+// bench 64 1 100000 default nodes  : search default positions for 100K nodes each
+// bench 64 4 5000 current movetime : search current position with 4 threads for 5 sec
+// bench 16 1 5 blah perft          : run a perft 5 on positions in file "blah"
+std::vector<std::string> setup_bench(const std::string& currentFen, std::istream& is) {
+    // Pulls the next whitespace-delimited argument off the stream, falling
+    // back to the supplied default once the arguments are exhausted.
+    const auto nextArg = [&is](const char* fallback) {
+        std::string word;
+        return (is >> word) ? word : std::string(fallback);
+    };
+    // Positional arguments, consumed strictly in this order
+    const std::string ttSize    = nextArg("16");
+    const std::string threads   = nextArg("1");
+    const std::string limit     = nextArg("13");
+    const std::string fenFile   = nextArg("default");
+    const std::string limitType = nextArg("depth");
+    // "eval" is issued as a bare command; every other limit type becomes a "go" command
+    std::string go;
+    if (limitType == "eval")
+        go = "eval";
+    else
+        go = "go " + limitType + " " + limit;
+    std::vector<std::string> fens;
+    if (fenFile == "default")
+        fens = Defaults;
+    else if (fenFile == "current")
+        fens.push_back(currentFen);
+    else
+    {
+        // Anything else is treated as the name of a file with one entry per line
+        std::ifstream input(fenFile);
+        if (!input.is_open())
+        {
+            std::cerr << "Unable to open file " << fenFile << std::endl;
+            exit(EXIT_FAILURE);
+        }
+        for (std::string line; getline(input, line);)
+        {
+            if (line.empty())
+                continue;
+            fens.push_back(line);
+        }
+        input.close();
+    }
+    std::vector<std::string> list;
+    list.emplace_back("setoption name Threads value " + threads);
+    list.emplace_back("setoption name Hash value " + ttSize);
+    list.emplace_back("ucinewgame");
+    for (const std::string& entry : fens)
+    {
+        // Entries containing "setoption" are forwarded verbatim instead of being searched
+        if (entry.find("setoption") != std::string::npos)
+        {
+            list.emplace_back(entry);
+            continue;
+        }
+        list.emplace_back("position fen " + entry);
+        list.emplace_back(go);
+    }
+    return list;
+}
+// Builds the command list for the timed benchmark. The stream supplies up to
+// three optional positional arguments: number of threads, TT size and the
+// desired total duration in seconds. The returned setup holds the resolved
+// values, the generated UCI commands and two textual forms of the invocation
+// (only the arguments actually given, and all arguments with defaults filled in).
+BenchmarkSetup setup_benchmark(std::istream& is) {
+    // TT_SIZE_PER_THREAD is chosen such that roughly half of the hash is used after all
+    // positions for the current sequence have been searched.
+    static constexpr int TT_SIZE_PER_THREAD = 128;
+    static constexpr int DEFAULT_DURATION_S = 150;
+    BenchmarkSetup setup{};
+    // Assign default values to missing arguments. Once one extraction fails the
+    // stream stays in a failed state, so every later argument uses its default too.
+    int desiredTimeS = 0;
+    if (!(is >> setup.threads))
+        setup.threads = int(get_hardware_concurrency());
+    else
+        setup.originalInvocation += std::to_string(setup.threads);
+    if (!(is >> setup.ttSize))
+        setup.ttSize = TT_SIZE_PER_THREAD * setup.threads;
+    else
+        setup.originalInvocation += " " + std::to_string(setup.ttSize);
+    if (!(is >> desiredTimeS))
+        desiredTimeS = DEFAULT_DURATION_S;
+    else
+        setup.originalInvocation += " " + std::to_string(desiredTimeS);
+    setup.filledInvocation += std::to_string(setup.threads) + " " + std::to_string(setup.ttSize)
+                            + " " + std::to_string(desiredTimeS);
+    // Relative time weight of the position at the given ply. The curve is fit
+    // roughly based on LTC games:
+    //   seconds = 50/{ply+15}
+    //   ms = 50000/{ply+15}
+    // (with this fit the 10th move gets 2000ms). Only the shape matters here, as
+    // the values are rescaled below so that they add up to the desired duration.
+    const auto getCorrectedTime = [](int ply) {
+        return 50000.0 / (static_cast<double>(ply) + 15.0);
+    };
+    // Sum of the unscaled weights over all positions of all games
+    float totalTime = 0;
+    for (const auto& game : BenchmarkPositions)
+        for (int ply = 1; ply <= static_cast<int>(game.size()); ++ply)
+            totalTime += float(getCorrectedTime(ply));
+    // Convert seconds to milliseconds in floating point: multiplying in int first
+    // overflows (undefined behavior) for durations above roughly 2.1 million seconds.
+    const float timeScaleFactor = static_cast<float>(desiredTimeS) * 1000.0f / totalTime;
+    for (const auto& game : BenchmarkPositions)
+    {
+        setup.commands.emplace_back("ucinewgame");
+        int ply = 1;
+        for (const std::string& fen : game)
+        {
+            setup.commands.emplace_back("position fen " + fen);
+            const int correctedTime = static_cast<int>(getCorrectedTime(ply) * timeScaleFactor);
+            setup.commands.emplace_back("go movetime " + std::to_string(correctedTime));
+            ply += 1;
+        }
+    }
+    return setup;
+}
+}  // namespace Stockfish

src/benchmark.h ADDED Viewed

	@@ -0,0 +1,42 @@

+/*
+  Stockfish, a UCI chess playing engine derived from Glaurung 2.1
+  Copyright (C) 2004-2026 The Stockfish developers (see AUTHORS file)
+  Stockfish is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+  Stockfish is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+  You should have received a copy of the GNU General Public License
+  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+#ifndef BENCHMARK_H_INCLUDED
+#define BENCHMARK_H_INCLUDED
+#include <iosfwd>
+#include <string>
+#include <vector>
+namespace Stockfish::Benchmark {
+// Builds the list of UCI commands for the "bench" command. The string is the
+// FEN used when the positions source is "current"; the stream supplies the
+// remaining optional bench arguments.
+std::vector<std::string> setup_bench(const std::string&, std::istream&);
+// Result of parsing the arguments of the timed benchmark (see setup_benchmark)
+struct BenchmarkSetup {
+    // TT size: taken from the arguments, otherwise derived from the thread
+    // count (presumably in MB, confirm against the caller)
+    int                      ttSize;
+    // Thread count: taken from the arguments, otherwise the hardware concurrency
+    int                      threads;
+    // Generated UCI commands, in the order they are meant to be run
+    std::vector<std::string> commands;
+    // Only the arguments that were actually supplied, space separated
+    std::string              originalInvocation;
+    // All three arguments with defaults filled in, space separated
+    std::string              filledInvocation;
+};
+// Parses "threads ttSize seconds" from the stream and builds the timed benchmark
+BenchmarkSetup setup_benchmark(std::istream&);
+}  // namespace Stockfish::Benchmark
+#endif  // #ifndef BENCHMARK_H_INCLUDED

src/bitboard.cpp ADDED Viewed

	@@ -0,0 +1,189 @@

+/*
+  Stockfish, a UCI chess playing engine derived from Glaurung 2.1
+  Copyright (C) 2004-2026 The Stockfish developers (see AUTHORS file)
+  Stockfish is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+  Stockfish is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+  You should have received a copy of the GNU General Public License
+  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+#include "bitboard.h"
+#include <algorithm>
+#include <bitset>
+#include <initializer_list>
+#include "misc.h"
+namespace Stockfish {
+// Number of set bits for every 16-bit value; filled by Bitboards::init()
+uint8_t PopCnt16[1 << 16];
+// max(file distance, rank distance) for every pair of squares; filled by Bitboards::init()
+uint8_t SquareDistance[SQUARE_NB][SQUARE_NB];
+// Per square-pair tables, filled by Bitboards::init()
+Bitboard LineBB[SQUARE_NB][SQUARE_NB];
+Bitboard BetweenBB[SQUARE_NB][SQUARE_NB];
+Bitboard RayPassBB[SQUARE_NB][SQUARE_NB];
+// Magic entries per square; the second index is (piece type - BISHOP)
+alignas(64) Magic Magics[SQUARE_NB][2];
+namespace {
+Bitboard RookTable[0x19000];   // To store rook attacks
+Bitboard BishopTable[0x1480];  // To store bishop attacks
+// Forward declaration; the definition is at the end of this file
+void init_magics(PieceType pt, Bitboard table[], Magic magics[][2]);
+}
+// Returns an ASCII representation of a bitboard suitable
+// to be printed to standard output. Useful for debugging.
+std::string Bitboards::pretty(Bitboard b) {
+    std::string out = "+---+---+---+---+---+---+---+---+\n";
+    // Ranks are emitted top-down so that rank 8 ends up on the first board row
+    for (int r = RANK_8; r >= RANK_1; --r)
+    {
+        for (File f = FILE_A; f <= FILE_H; ++f)
+        {
+            if (b & make_square(f, Rank(r)))
+                out += "| X ";
+            else
+                out += "|   ";
+        }
+        // Close the row with its 1-based rank number and a separator line
+        out += "| " + std::to_string(1 + r) + "\n+---+---+---+---+---+---+---+---+\n";
+    }
+    out += "  a   b   c   d   e   f   g   h\n";
+    return out;
+}
+// Initializes various bitboard tables. It is called at
+// startup and relies on global objects to be already zero-initialized.
+void Bitboards::init() {
+    // Bit counts of all 16-bit values, used by the table-based popcount()
+    for (unsigned i = 0; i < (1 << 16); ++i)
+        PopCnt16[i] = uint8_t(std::bitset<16>(i).count());
+    // Distance between two squares is the larger of the file and rank distances
+    for (Square s1 = SQ_A1; s1 <= SQ_H8; ++s1)
+        for (Square s2 = SQ_A1; s2 <= SQ_H8; ++s2)
+            SquareDistance[s1][s2] = std::max(distance<File>(s1, s2), distance<Rank>(s1, s2));
+    // The magics must be ready before the attacks_bb() calls on sliders below
+    init_magics(ROOK, RookTable, Magics);
+    init_magics(BISHOP, BishopTable, Magics);
+    for (Square s1 = SQ_A1; s1 <= SQ_H8; ++s1)
+    {
+        for (PieceType pt : {BISHOP, ROOK})
+            for (Square s2 = SQ_A1; s2 <= SQ_H8; ++s2)
+            {
+                // Only pairs where s2 is reachable from s1 by 'pt' on an empty board
+                if (PseudoAttacks[pt][s1] & s2)
+                {
+                    // Empty-board attacks common to both squares, plus the squares themselves
+                    LineBB[s1][s2] = (attacks_bb(pt, s1, 0) & attacks_bb(pt, s2, 0)) | s1 | s2;
+                    // Squares seen from s1 with s2 as blocker and from s2 with s1 as blocker
+                    BetweenBB[s1][s2] =
+                      (attacks_bb(pt, s1, square_bb(s2)) & attacks_bb(pt, s2, square_bb(s1)));
+                    // Empty-board attacks from s1 restricted to what s2 sees with s1 as
+                    // blocker, plus s2 itself
+                    RayPassBB[s1][s2] =
+                      attacks_bb(pt, s1, 0) & (attacks_bb(pt, s2, square_bb(s1)) | s2);
+                }
+                // Runs for every pair, so s2 is always part of BetweenBB[s1][s2],
+                // including pairs that do not share a line
+                BetweenBB[s1][s2] |= s2;
+            }
+    }
+}
+namespace {
+// Computes all rook and bishop attacks at startup. Magic
+// bitboards are used to look up attacks of sliding pieces. As a reference see
+// https://www.chessprogramming.org/Magic_Bitboards. In particular, here we use
+// the so called "fancy" approach.
+// pt     : slider type (ROOK or BISHOP) whose attacks are computed
+// table  : backing storage that the per-square attack pointers point into
+// magics : per-square entries; only the column (pt - BISHOP) is written
+void init_magics(PieceType pt, Bitboard table[], Magic magics[][2]) {
+#ifndef USE_PEXT
+    // Optimal PRNG seeds to pick the correct magics in the shortest time
+    int seeds[][RANK_NB] = {{8977, 44560, 54343, 38998, 5731, 95205, 104912, 17020},
+                            {728, 10316, 55013, 32803, 12281, 15100, 16645, 255}};
+    Bitboard occupancy[4096];
+    int      epoch[4096] = {}, cnt = 0;
+#endif
+    Bitboard reference[4096];
+    int      size = 0;
+    for (Square s = SQ_A1; s <= SQ_H8; ++s)
+    {
+        // Board edges are not considered in the relevant occupancies
+        Bitboard edges = ((Rank1BB | Rank8BB) & ~rank_bb(s)) | ((FileABB | FileHBB) & ~file_bb(s));
+        // Given a square 's', the mask is the bitboard of sliding attacks from
+        // 's' computed on an empty board. The index must be big enough to contain
+        // all the attacks for each possible subset of the mask and so is 2 power
+        // the number of 1s of the mask. Hence we deduce the size of the shift to
+        // apply to the 64 or 32 bits word to get the index.
+        Magic& m = magics[s][pt - BISHOP];
+        m.mask   = Bitboards::sliding_attack(pt, s, 0) & ~edges;
+#ifndef USE_PEXT
+        m.shift = (Is64Bit ? 64 : 32) - popcount(m.mask);
+#endif
+        // Set the offset for the attacks table of the square. We have individual
+        // table sizes for each square with "Fancy Magic Bitboards".
+        // At this point 'size' still holds the entry count of the previous square.
+        m.attacks = s == SQ_A1 ? table : magics[s - 1][pt - BISHOP].attacks + size;
+        size      = 0;
+        // Use Carry-Rippler trick to enumerate all subsets of masks[s] and
+        // store the corresponding sliding attack bitboard in reference[].
+        Bitboard b = 0;
+        do
+        {
+#ifndef USE_PEXT
+            occupancy[size] = b;
+#endif
+            reference[size] = Bitboards::sliding_attack(pt, s, b);
+            if (HasPext)
+                m.attacks[pext(b, m.mask)] = reference[size];
+            size++;
+            // Next subset of the mask; wraps back to 0 after the full mask
+            b = (b - m.mask) & m.mask;
+        } while (b);
+#ifndef USE_PEXT
+        PRNG rng(seeds[Is64Bit][rank_of(s)]);
+        // Find a magic for square 's' picking up an (almost) random number
+        // until we find the one that passes the verification test.
+        for (int i = 0; i < size;)
+        {
+            // Draw candidates until the top byte of magic * mask has at least 6 bits set
+            for (m.magic = 0; popcount((m.magic * m.mask) >> 56) < 6;)
+                m.magic = rng.sparse_rand<Bitboard>();
+            // A good magic must map every possible occupancy to an index that
+            // looks up the correct sliding attack in the attacks[s] database.
+            // Note that we build up the database for square 's' as a side
+            // effect of verifying the magic. Keep track of the attempt count
+            // and save it in epoch[], little speed-up trick to avoid resetting
+            // m.attacks[] after every failed attempt.
+            for (++cnt, i = 0; i < size; ++i)
+            {
+                unsigned idx = m.index(occupancy[i]);
+                if (epoch[idx] < cnt)
+                {
+                    // Slot not yet written during this attempt: claim it
+                    epoch[idx]     = cnt;
+                    m.attacks[idx] = reference[i];
+                }
+                // Slot already used in this attempt with a different attack set:
+                // the candidate fails and i < size restarts the outer loop
+                else if (m.attacks[idx] != reference[i])
+                    break;
+            }
+        }
+#endif
+    }
+}
+}
+}  // namespace Stockfish

src/bitboard.h ADDED Viewed

	@@ -0,0 +1,458 @@

+/*
+  Stockfish, a UCI chess playing engine derived from Glaurung 2.1
+  Copyright (C) 2004-2026 The Stockfish developers (see AUTHORS file)
+  Stockfish is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+  Stockfish is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+  You should have received a copy of the GNU General Public License
+  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+#ifndef BITBOARD_H_INCLUDED
+#define BITBOARD_H_INCLUDED
+#include <algorithm>
+#include <cassert>
+#include <cmath>
+#include <cstring>
+#include <cstdint>
+#include <cstdlib>
+#include <string>
+#include <initializer_list>
+#include <array>
+#include "types.h"
+namespace Stockfish {
+namespace Bitboards {
+// Fills the lookup tables declared below; see bitboard.cpp
+void        init();
+// Returns an ASCII drawing of the given bitboard
+std::string pretty(Bitboard b);
+}  // namespace Stockfish::Bitboards
+// One bitboard per file: file A is bit 0 of every byte, each further file is
+// the previous one shifted left by one bit.
+constexpr Bitboard FileABB = 0x0101010101010101ULL;
+constexpr Bitboard FileBBB = FileABB << 1;
+constexpr Bitboard FileCBB = FileABB << 2;
+constexpr Bitboard FileDBB = FileABB << 3;
+constexpr Bitboard FileEBB = FileABB << 4;
+constexpr Bitboard FileFBB = FileABB << 5;
+constexpr Bitboard FileGBB = FileABB << 6;
+constexpr Bitboard FileHBB = FileABB << 7;
+// One bitboard per rank: rank 1 is the lowest byte, each further rank is the
+// previous one shifted left by eight bits.
+constexpr Bitboard Rank1BB = 0xFF;
+constexpr Bitboard Rank2BB = Rank1BB << (8 * 1);
+constexpr Bitboard Rank3BB = Rank1BB << (8 * 2);
+constexpr Bitboard Rank4BB = Rank1BB << (8 * 3);
+constexpr Bitboard Rank5BB = Rank1BB << (8 * 4);
+constexpr Bitboard Rank6BB = Rank1BB << (8 * 5);
+constexpr Bitboard Rank7BB = Rank1BB << (8 * 6);
+constexpr Bitboard Rank8BB = Rank1BB << (8 * 7);
+// Lookup tables defined in bitboard.cpp and filled by Bitboards::init()
+extern uint8_t PopCnt16[1 << 16];
+extern uint8_t SquareDistance[SQUARE_NB][SQUARE_NB];
+extern Bitboard BetweenBB[SQUARE_NB][SQUARE_NB];
+extern Bitboard LineBB[SQUARE_NB][SQUARE_NB];
+extern Bitboard RayPassBB[SQUARE_NB][SQUARE_NB];
+// Magic holds all magic bitboards relevant data for a single square
+struct Magic {
+    // Relevant occupancy squares for this square and slider type
+    Bitboard  mask;
+    // Start of this square's slice of the shared attacks table
+    Bitboard* attacks;
+#ifndef USE_PEXT
+    // Multiplier and shift used to hash a masked occupancy into an index
+    Bitboard magic;
+    unsigned shift;
+#endif
+    // Compute the attack's index using the 'magic bitboards' approach
+    unsigned index(Bitboard occupied) const {
+#ifdef USE_PEXT
+        return unsigned(pext(occupied, mask));
+#else
+        if (Is64Bit)
+            return unsigned(((occupied & mask) * magic) >> shift);
+        // 32-bit path: hash the low and high halves separately and combine with xor
+        unsigned lo = unsigned(occupied) & unsigned(mask);
+        unsigned hi = unsigned(occupied >> 32) & unsigned(mask >> 32);
+        return (lo * unsigned(magic) ^ hi * unsigned(magic >> 32)) >> shift;
+#endif
+    }
+    // Looks up the attack set stored for the given occupancy
+    Bitboard attacks_bb(Bitboard occupied) const { return attacks[index(occupied)]; }
+};
+extern Magic Magics[SQUARE_NB][2];
+// Returns a bitboard with only the bit of the given square set
+constexpr Bitboard square_bb(Square s) {
+    assert(is_ok(s));
+    return 1ULL << s;
+}
+// Overloads of bitwise operators between a Bitboard and a Square for testing
+// whether a given bit is set in a bitboard, and for setting and clearing bits.
+constexpr Bitboard  operator&(Bitboard b, Square s) { return b & square_bb(s); }
+constexpr Bitboard  operator|(Bitboard b, Square s) { return b | square_bb(s); }
+constexpr Bitboard  operator^(Bitboard b, Square s) { return b ^ square_bb(s); }
+constexpr Bitboard& operator|=(Bitboard& b, Square s) { return b |= square_bb(s); }
+constexpr Bitboard& operator^=(Bitboard& b, Square s) { return b ^= square_bb(s); }
+// Same operators with the operands swapped, and Square | Square
+constexpr Bitboard operator&(Square s, Bitboard b) { return b & s; }
+constexpr Bitboard operator|(Square s, Bitboard b) { return b | s; }
+constexpr Bitboard operator^(Square s, Bitboard b) { return b ^ s; }
+constexpr Bitboard operator|(Square s1, Square s2) { return square_bb(s1) | s2; }
+// True if at least two bits are set: clearing the lowest set bit leaves a non-zero value
+constexpr bool more_than_one(Bitboard b) { return b & (b - 1); }
+// rank_bb() and file_bb() return a bitboard representing all the squares on
+// the given file or rank.
+constexpr Bitboard rank_bb(Rank r) { return Rank1BB << (8 * r); }
+constexpr Bitboard rank_bb(Square s) { return rank_bb(rank_of(s)); }
+constexpr Bitboard file_bb(File f) { return FileABB << f; }
+constexpr Bitboard file_bb(Square s) { return file_bb(file_of(s)); }
+// Moves a bitboard one or two steps as specified by the direction D
+template<Direction D>
+constexpr Bitboard shift(Bitboard b) {
+    // Vertical steps need no masking: bits pushed past either end simply drop out.
+    if constexpr (D == NORTH)
+        return b << 8;
+    else if constexpr (D == SOUTH)
+        return b >> 8;
+    else if constexpr (D == NORTH + NORTH)
+        return b << 16;
+    else if constexpr (D == SOUTH + SOUTH)
+        return b >> 16;
+    // Steps with a horizontal component first drop the file that would
+    // otherwise wrap around to the opposite board edge.
+    else if constexpr (D == EAST)
+        return (b & ~FileHBB) << 1;
+    else if constexpr (D == WEST)
+        return (b & ~FileABB) >> 1;
+    else if constexpr (D == NORTH_EAST)
+        return (b & ~FileHBB) << 9;
+    else if constexpr (D == NORTH_WEST)
+        return (b & ~FileABB) << 7;
+    else if constexpr (D == SOUTH_EAST)
+        return (b & ~FileHBB) >> 7;
+    else if constexpr (D == SOUTH_WEST)
+        return (b & ~FileABB) >> 9;
+    // Any other direction yields an empty bitboard
+    else
+        return 0;
+}
+// Returns the squares attacked by pawns of the given color
+// from the squares in the given bitboard.
+template<Color C>
+constexpr Bitboard pawn_attacks_bb(Bitboard b) {
+    // White pawns capture diagonally towards the north, any other color towards the south
+    if constexpr (C == WHITE)
+        return shift<NORTH_WEST>(b) | shift<NORTH_EAST>(b);
+    else
+        return shift<SOUTH_WEST>(b) | shift<SOUTH_EAST>(b);
+}
+// Returns a bitboard representing an entire line (from board edge
+// to board edge) that intersects the two given squares. If the given squares
+// are not on a same file/rank/diagonal, the function returns 0. For instance,
+// line_bb(SQ_C4, SQ_F7) will return a bitboard with the A2-G8 diagonal.
+// Plain lookup in LineBB, which is filled by Bitboards::init()
+inline Bitboard line_bb(Square s1, Square s2) {
+    assert(is_ok(s1) && is_ok(s2));
+    return LineBB[s1][s2];
+}
+// Returns a bitboard representing the squares in the semi-open
+// segment between the squares s1 and s2 (excluding s1 but including s2). If the
+// given squares are not on a same file/rank/diagonal, it returns s2. For instance,
+// between_bb(SQ_C4, SQ_F7) will return a bitboard with squares D5, E6 and F7, but
+// between_bb(SQ_E6, SQ_F8) will return a bitboard with the square F8. This trick
+// allows to generate non-king evasion moves faster: the defending piece must either
+// interpose itself to cover the check or capture the checking piece.
+// Plain lookup in BetweenBB, which is filled by Bitboards::init().
+inline Bitboard between_bb(Square s1, Square s2) {
+    assert(is_ok(s1) && is_ok(s2));
+    return BetweenBB[s1][s2];
+}
+// distance() functions return the distance between x and y, defined as the
+// number of steps for a king in x to reach y.
+// Primary template is only declared; the File, Rank and Square specializations
+// below provide the implementations.
+template<typename T1 = Square>
+inline int distance(Square x, Square y);
+// Number of files separating the two squares
+template<>
+inline int distance<File>(Square x, Square y) {
+    return std::abs(file_of(x) - file_of(y));
+}
+// Number of ranks separating the two squares
+template<>
+inline int distance<Rank>(Square x, Square y) {
+    return std::abs(rank_of(x) - rank_of(y));
+}
+// Table lookup; SquareDistance is filled by Bitboards::init()
+template<>
+inline int distance<Square>(Square x, Square y) {
+    return SquareDistance[x][y];
+}
+// Distance of the file to the nearer of the two side edges (files A and H)
+inline int edge_distance(File f) { return std::min(f, File(FILE_H - f)); }
+// Bit count usable in constant expressions: partial counts are accumulated in
+// parallel inside 2-bit, 4-bit and 8-bit fields, then the byte counts are summed
+// into the top byte with a single multiplication.
+constexpr int constexpr_popcount(Bitboard b) {
+    constexpr Bitboard EveryOtherBit = 0x5555555555555555ULL;
+    constexpr Bitboard LowPairs      = 0x3333333333333333ULL;
+    constexpr Bitboard LowNibbles    = 0x0F0F0F0F0F0F0F0FULL;
+    constexpr Bitboard ByteOnes      = 0x0101010101010101ULL;
+    const Bitboard pairCounts   = b - ((b >> 1) & EveryOtherBit);
+    const Bitboard nibbleCounts = (pairCounts & LowPairs) + ((pairCounts >> 2) & LowPairs);
+    const Bitboard byteCounts   = (nibbleCounts + (nibbleCounts >> 4)) & LowNibbles;
+    return static_cast<int>((byteCounts * ByteOnes) >> 56);
+}
+// Counts the number of non-zero bits in a bitboard.
+inline int popcount(Bitboard b) {
+#ifndef USE_POPCNT
+    // Table-based fallback: split the value into four 16-bit chunks and add
+    // their precomputed counts (PopCnt16 is filled by Bitboards::init()).
+    std::uint16_t indices[4];
+    std::memcpy(indices, &b, sizeof(b));
+    return PopCnt16[indices[0]] + PopCnt16[indices[1]] + PopCnt16[indices[2]]
+         + PopCnt16[indices[3]];
+#elif defined(_MSC_VER)
+    return int(_mm_popcnt_u64(b));
+#else  // Assumed gcc or compatible compiler
+    return __builtin_popcountll(b);
+#endif
+}
+// Returns the least significant bit in a non-zero bitboard.
+inline Square lsb(Bitboard b) {
+    // The scan intrinsics below are only meaningful for a non-zero argument
+    assert(b);
+#if defined(__GNUC__)  // GCC, Clang, ICX
+    return Square(__builtin_ctzll(b));
+#elif defined(_MSC_VER)
+    #ifdef _WIN64  // MSVC, WIN64
+    unsigned long idx;
+    _BitScanForward64(&idx, b);
+    return Square(idx);
+    #else  // MSVC, WIN32
+    // No 64-bit scan here: try the low half first, then the high half
+    unsigned long idx;
+    if (b & 0xffffffff)
+    {
+        _BitScanForward(&idx, int32_t(b));
+        return Square(idx);
+    }
+    else
+    {
+        _BitScanForward(&idx, int32_t(b >> 32));
+        return Square(idx + 32);
+    }
+    #endif
+#else  // Compiler is neither GCC nor MSVC compatible
+    #error "Compiler not supported."
+#endif
+}
+// Returns the most significant bit in a non-zero bitboard.
+inline Square msb(Bitboard b) {
+    // The scan intrinsics below are only meaningful for a non-zero argument
+    assert(b);
+#if defined(__GNUC__)  // GCC, Clang, ICX
+    // 63 ^ leading-zero count converts the count into a bit index
+    return Square(63 ^ __builtin_clzll(b));
+#elif defined(_MSC_VER)
+    #ifdef _WIN64  // MSVC, WIN64
+    unsigned long idx;
+    _BitScanReverse64(&idx, b);
+    return Square(idx);
+    #else  // MSVC, WIN32
+    // No 64-bit scan here: try the high half first, then the low half
+    unsigned long idx;
+    if (b >> 32)
+    {
+        _BitScanReverse(&idx, int32_t(b >> 32));
+        return Square(idx + 32);
+    }
+    else
+    {
+        _BitScanReverse(&idx, int32_t(b));
+        return Square(idx);
+    }
+    #endif
+#else  // Compiler is neither GCC nor MSVC compatible
+    #error "Compiler not supported."
+#endif
+}
+// Returns the bitboard of the least significant
+// square of a non-zero bitboard. It is equivalent to square_bb(lsb(bb)).
+inline Bitboard least_significant_square_bb(Bitboard b) {
+    assert(b);
+    // b & -b keeps only the lowest set bit of b
+    return b & -b;
+}
+// Finds and clears the least significant bit in a non-zero bitboard.
+inline Square pop_lsb(Bitboard& b) {
+    assert(b);
+    const Square s = lsb(b);
+    // b & (b - 1) clears the lowest set bit in place
+    b &= b - 1;
+    return s;
+}
+namespace Bitboards {
+// Returns the bitboard of target square for the given step
+// from the given square. If the step is off the board, returns empty bitboard.
+constexpr Bitboard safe_destination(Square s, int step) {
+    const Square target = Square(s + step);
+    if (!is_ok(target))
+        return Bitboard(0);
+    // A step that changes the file by more than two has wrapped around a
+    // board edge and is rejected.
+    const int  fileDelta = file_of(s) - file_of(target);
+    const bool wrapped   = fileDelta > 2 || fileDelta < -2;
+    return wrapped ? Bitboard(0) : square_bb(target);
+}
+// Computes the squares a slider on 'sq' attacks for the given occupancy by
+// walking each ray square by square. ROOK uses the four orthogonal directions,
+// any other piece type the four diagonal ones.
+constexpr Bitboard sliding_attack(PieceType pt, Square sq, Bitboard occupied) {
+    const Direction  orthogonal[4] = {NORTH, SOUTH, EAST, WEST};
+    const Direction  diagonal[4]   = {NORTH_EAST, SOUTH_EAST, SOUTH_WEST, NORTH_WEST};
+    const Direction* rays          = pt == ROOK ? orthogonal : diagonal;
+    Bitboard         reach         = 0;
+    for (int i = 0; i < 4; ++i)
+    {
+        const Direction step = rays[i];
+        // Advance until the next step would leave the board or a blocker is hit;
+        // the blocker square itself is included in the result.
+        for (Square cur = sq; safe_destination(cur, step);)
+        {
+            cur += step;
+            reach |= cur;
+            if (occupied & cur)
+                break;
+        }
+    }
+    return reach;
+}
+// Union of the on-board targets of the eight knight steps, given as square
+// index offsets; safe_destination() drops steps that leave the board.
+constexpr Bitboard knight_attack(Square sq) {
+    Bitboard b = {};
+    for (int step : {-17, -15, -10, -6, 6, 10, 15, 17})
+        b |= safe_destination(sq, step);
+    return b;
+}
+// Union of the on-board targets of the eight king steps, given as square
+// index offsets; safe_destination() drops steps that leave the board.
+constexpr Bitboard king_attack(Square sq) {
+    Bitboard b = {};
+    for (int step : {-9, -8, -7, -1, 1, 7, 8, 9})
+        b |= safe_destination(sq, step);
+    return b;
+}
+// Returns the attacks of the given piece type from 'sq' on an empty board.
+// Pawns (and any other unlisted type) are not supported and hit the assert.
+constexpr Bitboard pseudo_attacks(PieceType pt, Square sq) {
+    switch (pt)
+    {
+    case PieceType::ROOK :
+    case PieceType::BISHOP :
+        return sliding_attack(pt, sq, 0);
+    case PieceType::QUEEN :
+        // A queen combines the rook and bishop rays
+        return sliding_attack(PieceType::ROOK, sq, 0) | sliding_attack(PieceType::BISHOP, sq, 0);
+    case PieceType::KNIGHT :
+        return knight_attack(sq);
+    case PieceType::KING :
+        return king_attack(sq);
+    default :
+        assert(false);
+        return 0;
+    }
+}
+}
+// Empty-board attack table, built at compile time and indexed as [row][square].
+// The rows WHITE and BLACK hold the pawn attacks of that color; the remaining
+// rows used here are indexed by piece type.
+inline constexpr auto PseudoAttacks = []() constexpr {
+    std::array<std::array<Bitboard, SQUARE_NB>, PIECE_TYPE_NB> attacks{};
+    for (Square s1 = SQ_A1; s1 <= SQ_H8; ++s1)
+    {
+        attacks[WHITE][s1] = pawn_attacks_bb<WHITE>(square_bb(s1));
+        attacks[BLACK][s1] = pawn_attacks_bb<BLACK>(square_bb(s1));
+        attacks[KING][s1]   = Bitboards::pseudo_attacks(KING, s1);
+        attacks[KNIGHT][s1] = Bitboards::pseudo_attacks(KNIGHT, s1);
+        // The queen row is the union of the bishop and rook rows, which are
+        // stored by the same two statements
+        attacks[QUEEN][s1] = attacks[BISHOP][s1] = Bitboards::pseudo_attacks(BISHOP, s1);
+        attacks[QUEEN][s1] |= attacks[ROOK][s1]  = Bitboards::pseudo_attacks(ROOK, s1);
+    }
+    return attacks;
+}();
+// Returns the pseudo attacks of the given piece type
+// assuming an empty board.
+// The color argument is only used for pawns, where it selects the table row.
+template<PieceType Pt>
+inline Bitboard attacks_bb(Square s, Color c = COLOR_NB) {
+    // A pawn lookup needs a real color; the default COLOR_NB is rejected here
+    assert((Pt != PAWN || c < COLOR_NB) && is_ok(s));
+    return Pt == PAWN ? PseudoAttacks[c][s] : PseudoAttacks[Pt][s];
+}
+// Returns the attacks by the given piece
+// assuming the board is occupied according to the passed Bitboard.
+// Sliding piece attacks do not continue passed an occupied square.
+template<PieceType Pt>
+inline Bitboard attacks_bb(Square s, Bitboard occupied) {
+    assert(Pt != PAWN && is_ok(s));
+    switch (Pt)
+    {
+    case BISHOP :
+    case ROOK :
+        // Sliders go through the magic lookup; the column is (Pt - BISHOP)
+        return Magics[s][Pt - BISHOP].attacks_bb(occupied);
+    case QUEEN :
+        return attacks_bb<BISHOP>(s, occupied) | attacks_bb<ROOK>(s, occupied);
+    default :
+        // Other piece types ignore the occupancy
+        return PseudoAttacks[Pt][s];
+    }
+}
+// Returns the attacks by the given piece
+// assuming the board is occupied according to the passed Bitboard.
+// Sliding piece attacks do not continue passed an occupied square.
+inline Bitboard attacks_bb(PieceType pt, Square s, Bitboard occupied) {
+    assert(pt != PAWN && is_ok(s));
+    // Sliders depend on the occupancy and dispatch to the templated lookups
+    if (pt == BISHOP)
+        return attacks_bb<BISHOP>(s, occupied);
+    if (pt == ROOK)
+        return attacks_bb<ROOK>(s, occupied);
+    if (pt == QUEEN)
+        return attacks_bb<BISHOP>(s, occupied) | attacks_bb<ROOK>(s, occupied);
+    // Every other piece type ignores the occupancy
+    return PseudoAttacks[pt][s];
+}
+// Returns the attacks of a concrete (colored) piece. Pawns use the row of
+// their color in PseudoAttacks; all other pieces defer to the piece-type overload.
+inline Bitboard attacks_bb(Piece pc, Square s, Bitboard occupied) {
+    return type_of(pc) == PAWN ? PseudoAttacks[color_of(pc)][s]
+                               : attacks_bb(type_of(pc), s, occupied);
+}
+}  // namespace Stockfish
+#endif  // #ifndef BITBOARD_H_INCLUDED

src/engine.cpp ADDED Viewed

	@@ -0,0 +1,411 @@

+/*
+  Stockfish, a UCI chess playing engine derived from Glaurung 2.1
+  Copyright (C) 2004-2026 The Stockfish developers (see AUTHORS file)
+  Stockfish is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+  Stockfish is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+  You should have received a copy of the GNU General Public License
+  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+#include "engine.h"
+#include <algorithm>
+#include <cassert>
+#include <deque>
+#include <iosfwd>
+#include <memory>
+#include <ostream>
+#include <sstream>
+#include <string_view>
+#include <utility>
+#include <vector>
+#include "evaluate.h"
+#include "misc.h"
+#include "nnue/network.h"
+#include "nnue/nnue_common.h"
+#include "nnue/nnue_misc.h"
+#include "numa.h"
+#include "perft.h"
+#include "position.h"
+#include "search.h"
+#include "shm.h"
+#include "syzygy/tbprobe.h"
+#include "types.h"
+#include "uci.h"
+#include "ucioption.h"
+namespace Stockfish {
+namespace NN = Eval::NNUE;
+// FEN of the position the engine is set to on construction
+constexpr auto StartFEN   = "rnbqkbnr/pppppppp/8/8/8/8/PPPPPPPP/RNBQKBNR w KQkq - 0 1";
+// Upper bound of the "Hash" option (presumably in MB, going by the name)
+constexpr int  MaxHashMB  = Is64Bit ? 33554432 : 2048;
+// Upper bound of the "Threads" option: at least 1024, or four times the
+// hardware concurrency when that is larger
+int            MaxThreads = std::max(1024, 4 * int(get_hardware_concurrency()));
+// The default configuration will attempt to group L3 domains up to 32 threads.
+// This size was found to be a good balance between the Elo gain of increased
+// history sharing and the speed loss from more cross-cache accesses (see
+// PR#6526). The user can always explicitly override this behavior.
+constexpr NumaAutoPolicy DefaultNumaPolicy = BundledL3Policy{32};
+// Constructs the engine: resolves the binary directory from the optional path,
+// sets up the NUMA context and networks, sets the start position, registers all
+// options with their change callbacks and finally sizes the thread pool.
+Engine::Engine(std::optional<std::string> path) :
+    binaryDirectory(path ? CommandLine::get_binary_directory(*path) : ""),
+    numaContext(NumaConfig::from_system(DefaultNumaPolicy)),
+    states(new std::deque<StateInfo>(1)),
+    threads(),
+    networks(numaContext, get_default_networks()) {
+    pos.set(StartFEN, false, &states->back());
+    // Options with a callback: the lambda runs when the option is changed and
+    // may return a string; std::nullopt means there is nothing to return.
+    options.add(  //
+      "Debug Log File", Option("", [](const Option& o) {
+          start_logger(o);
+          return std::nullopt;
+      }));
+    options.add(  //
+      "NumaPolicy", Option("auto", [this](const Option& o) {
+          set_numa_config_from_option(o);
+          return numa_config_information_as_string() + "\n"
+               + thread_allocation_information_as_string();
+      }));
+    options.add(  //
+      "Threads", Option(1, 1, MaxThreads, [this](const Option&) {
+          resize_threads();
+          return thread_allocation_information_as_string();
+      }));
+    options.add(  //
+      "Hash", Option(16, 1, MaxHashMB, [this](const Option& o) {
+          set_tt_size(o);
+          return std::nullopt;
+      }));
+    options.add(  //
+      "Clear Hash", Option([this](const Option&) {
+          search_clear();
+          return std::nullopt;
+      }));
+    // Plain options without a change callback
+    options.add(  //
+      "Ponder", Option(false));
+    options.add(  //
+      "MultiPV", Option(1, 1, MAX_MOVES));
+    options.add("Skill Level", Option(20, 0, 20));
+    options.add("Move Overhead", Option(10, 0, 5000));
+    options.add("nodestime", Option(0, 0, 10000));
+    options.add("UCI_Chess960", Option(false));
+    options.add("UCI_LimitStrength", Option(false));
+    options.add("UCI_Elo",
+                Option(Stockfish::Search::Skill::LowestElo, Stockfish::Search::Skill::LowestElo,
+                       Stockfish::Search::Skill::HighestElo));
+    options.add("UCI_ShowWDL", Option(false));
+    options.add(  //
+      "SyzygyPath", Option("", [](const Option& o) {
+          Tablebases::init(o);
+          return std::nullopt;
+      }));
+    options.add("SyzygyProbeDepth", Option(1, 1, 100));
+    options.add("Syzygy50MoveRule", Option(true));
+    options.add("SyzygyProbeLimit", Option(7, 0, 7));
+    // Changing either evaluation file option loads the corresponding network
+    options.add(  //
+      "EvalFile", Option(EvalFileDefaultNameBig, [this](const Option& o) {
+          load_big_network(o);
+          return std::nullopt;
+      }));
+    options.add(  //
+      "EvalFileSmall", Option(EvalFileDefaultNameSmall, [this](const Option& o) {
+          load_small_network(o);
+          return std::nullopt;
+      }));
+    // Final setup of the thread pool, performed once all options exist
+    threads.clear();
+    threads.ensure_network_replicated();
+    resize_threads();
+}
+std::uint64_t Engine::perft(const std::string& fen, Depth depth, bool isChess960) {
+    verify_networks();
+    return Benchmark::perft(fen, depth, isChess960);
+}
+void Engine::go(Search::LimitsType& limits) {  // Verifies the networks, then hands the search to the thread pool.
+    assert(limits.perft == 0);  // callers must not pass a perft request through this entry point
+    verify_networks();
+    threads.start_thinking(options, pos, states, limits);  // receives the current options, position and state list
+}
+void Engine::stop() { threads.stop = true; }  // Only raises the pool's stop flag; it does not wait for anything.
+void Engine::search_clear() {  // Clears the transposition table and the thread pool, then re-initializes the tablebases.
+    wait_for_search_finished();  // nothing below may run while a search is still in progress
+    tt.clear(threads);
+    threads.clear();
+    // @TODO won't work with multiple instances
+    Tablebases::init(options["SyzygyPath"]);  // Free mapped files
+}
+void Engine::set_on_update_no_moves(std::function<void(const Engine::InfoShort&)>&& f) {
+    updateContext.onUpdateNoMoves = std::move(f);  // the callback is moved in; nothing is invoked here
+}
+void Engine::set_on_update_full(std::function<void(const Engine::InfoFull&)>&& f) {
+    updateContext.onUpdateFull = std::move(f);  // stored in updateContext, like the other search callbacks
+}
+void Engine::set_on_iter(std::function<void(const Engine::InfoIter&)>&& f) {
+    updateContext.onIter = std::move(f);  // stored in updateContext
+}
+void Engine::set_on_bestmove(std::function<void(std::string_view, std::string_view)>&& f) {
+    updateContext.onBestmove = std::move(f);  // callback receives two string views
+}
+void Engine::set_on_verify_networks(std::function<void(std::string_view)>&& f) {
+    onVerifyNetworks = std::move(f);  // kept on the Engine itself (not in updateContext); used by verify_networks()
+}
+void Engine::wait_for_search_finished() { threads.main_thread()->wait_for_search_finished(); }  // Delegates to the main thread's wait.
+// Replaces the current position with the one described by `fen`, then plays
+// `moves` on top of it. Processing stops at the first move string for which
+// UCIEngine::to_move() returns Move::none(); later entries are ignored.
+void Engine::set_position(const std::string& fen, const std::vector<std::string>& moves) {
+    // Discard the previous state list and start over with a single root StateInfo
+    states = StateListPtr(new std::deque<StateInfo>(1));
+    pos.set(fen, options["UCI_Chess960"], &states->back());
+    for (const std::string& moveStr : moves)
+    {
+        auto parsed = UCIEngine::to_move(pos, moveStr);
+        if (parsed == Move::none())
+            break;
+        // Every applied move gets its own freshly appended StateInfo
+        pos.do_move(parsed, states->emplace_back());
+    }
+}
+// modifiers
+// Installs the NUMA configuration selected by the option string `o`:
+// "auto"/"system" and "hardware" query the system, "none" uses a
+// default-constructed NumaConfig, anything else is parsed by
+// NumaConfig::from_string(). The thread pool is rebuilt afterwards.
+void Engine::set_numa_config_from_option(const std::string& o) {
+    const auto buildConfig = [&]() -> NumaConfig {
+        if (o == "auto" || o == "system")
+            return NumaConfig::from_system(DefaultNumaPolicy);
+        // Don't respect affinity set in the system.
+        if (o == "hardware")
+            return NumaConfig::from_system(DefaultNumaPolicy, false);
+        if (o == "none")
+            return NumaConfig{};
+        return NumaConfig::from_string(o);
+    };
+    numaContext.set_numa_config(buildConfig());
+    // Force reallocation of threads in case affinities need to change.
+    resize_threads();
+    threads.ensure_network_replicated();
+}
+void Engine::resize_threads() {  // Rebuilds the thread pool, then resizes the hash and replicates the networks.
+    threads.wait_for_search_finished();  // the pool must be idle before it is reconfigured
+    threads.set(numaContext.get_numa_config(), {options, threads, tt, sharedHists, networks},
+                updateContext);
+    // Reallocate the hash with the new threadpool size
+    set_tt_size(options["Hash"]);
+    threads.ensure_network_replicated();
+}
+void Engine::set_tt_size(size_t mb) {  // Resizes the transposition table; `mb` is forwarded unchanged to tt.resize().
+    wait_for_search_finished();  // never resize the table under a running search
+    tt.resize(mb, threads);
+}
+void Engine::set_ponderhit(bool b) { threads.main_manager()->ponder = b; }  // Writes `b` into the main manager's ponder flag.
+// network related
+// Verifies the big and small networks against the "EvalFile" and
+// "EvalFileSmall" options, then reports the allocation status of every network
+// replica through onVerifyNetworks, one message per replica (numbered from 1).
+void Engine::verify_networks() const {
+    networks->big.verify(options["EvalFile"], onVerifyNetworks);
+    networks->small.verify(options["EvalFileSmall"], onVerifyNetworks);
+    // Maps an allocation status to the fixed text used in the report
+    const auto describe = [](const auto& st) -> const char* {
+        if (st == SystemWideSharedConstantAllocationStatus::NoAllocation)
+            return "No allocation.";
+        if (st == SystemWideSharedConstantAllocationStatus::LocalMemory)
+            return "Local memory.";
+        if (st == SystemWideSharedConstantAllocationStatus::SharedMemory)
+            return "Shared memory.";
+        return "Unknown status.";
+    };
+    auto statuses = networks.get_status_and_errors();
+    for (size_t replica = 0; replica < statuses.size(); ++replica)
+    {
+        const auto [status, error] = statuses[replica];
+        std::string message        = "Network replica " + std::to_string(replica + 1) + ": ";
+        message += describe(status);
+        // Append the error text, if any, after a single space
+        if (error.has_value())
+            message += " " + *error;
+        onVerifyNetworks(message);
+    }
+}
+// Creates a Networks object whose big and small nets are described by the
+// default file names, loads both with an empty file argument (passing
+// binaryDirectory), and returns ownership to the caller.
+std::unique_ptr<Eval::NNUE::Networks> Engine::get_default_networks() const {
+    NN::EvalFile bigFile{EvalFileDefaultNameBig, "None", ""};
+    NN::EvalFile smallFile{EvalFileDefaultNameSmall, "None", ""};
+    auto defaults = std::make_unique<NN::Networks>(std::move(bigFile), std::move(smallFile));
+    defaults->big.load(binaryDirectory, "");
+    defaults->small.load(binaryDirectory, "");
+    return defaults;
+}
+void Engine::load_big_network(const std::string& file) {  // Loads the big network from `file`, then refreshes the thread pool.
+    networks.modify_and_replicate(
+      [this, &file](NN::Networks& networks_) { networks_.big.load(binaryDirectory, file); });  // only the big net is touched
+    threads.clear();
+    threads.ensure_network_replicated();
+}
+void Engine::load_small_network(const std::string& file) {  // Loads the small network from `file`, then refreshes the thread pool.
+    networks.modify_and_replicate(
+      [this, &file](NN::Networks& networks_) { networks_.small.load(binaryDirectory, file); });  // only the small net is touched
+    threads.clear();
+    threads.ensure_network_replicated();
+}
+void Engine::save_network(const std::pair<std::optional<std::string>, std::string> files[2]) {  // files[0]: big net, files[1]: small net
+    networks.modify_and_replicate([&files](NN::Networks& networks_) {
+        networks_.big.save(files[0].first);  // only the optional `.first` of each pair is read here
+        networks_.small.save(files[1].first);
+    });
+}
+// utility functions
+void Engine::trace_eval() const {  // Writes Eval::trace() for the current position to sync_cout.
+    StateListPtr trace_states(new std::deque<StateInfo>(1));
+    Position     p;
+    p.set(pos.fen(), options["UCI_Chess960"], &trace_states->back());  // local Position rebuilt from the FEN, with its own state list
+    verify_networks();
+    sync_cout << "\n" << Eval::trace(p, *networks) << sync_endl;
+}
+const OptionsMap& Engine::get_options() const { return options; }  // read-only view of the option map
+OptionsMap&       Engine::get_options() { return options; }  // mutable access to the same map
+std::string Engine::fen() const { return pos.fen(); }  // FEN of the engine's current position
+void Engine::flip() { pos.flip(); }  // forwards to Position::flip() on the current position
+// Formats the current position with its stream insertion operator and returns
+// the resulting text.
+std::string Engine::visualize() const {
+    std::ostringstream out;
+    out << pos;
+    return out.str();
+}
+int Engine::get_hashfull(int maxAge) const { return tt.hashfull(maxAge); }  // Forwards `maxAge` to the transposition table's hashfull().
+// Pairs, for each NUMA node, the number of bound threads with the number of
+// CPUs in that node. Nodes beyond those reported by the thread pool are listed
+// with zero bound threads, but only if the pool reported at least one node;
+// otherwise the result is empty.
+std::vector<std::pair<size_t, size_t>> Engine::get_bound_thread_count_by_numa_node() const {
+    const auto        bound  = threads.get_bound_thread_count_by_numa_node();
+    const NumaConfig& config = numaContext.get_numa_config();
+    std::vector<std::pair<size_t, size_t>> result;
+    for (NumaIndex node = 0; node < bound.size(); ++node)
+        result.emplace_back(bound[node], config.num_cpus_in_numa_node(node));
+    if (bound.empty())
+        return result;
+    // Remaining nodes of the configuration have no bound threads
+    for (NumaIndex node = bound.size(); node < config.num_numa_nodes(); ++node)
+        result.emplace_back(0, config.num_cpus_in_numa_node(node));
+    return result;
+}
+std::string Engine::get_numa_config_as_string() const {  // String form of the active NUMA configuration.
+    return numaContext.get_numa_config().to_string();
+}
+// Returns the NUMA configuration string prefixed with "Available processors: ".
+std::string Engine::numa_config_information_as_string() const {
+    return "Available processors: " + get_numa_config_as_string();
+}
+// Formats the per-node thread binding as "bound/total" entries joined by ':'.
+// Returns an empty string when there is no binding information.
+std::string Engine::thread_binding_information_as_string() const {
+    const auto        perNode = get_bound_thread_count_by_numa_node();
+    std::stringstream out;
+    const char*       separator = "";
+    for (const auto& [bound, available] : perNode)
+    {
+        out << separator << bound << "/" << available;
+        // Every entry after the first is preceded by a colon
+        separator = ":";
+    }
+    return out.str();
+}
+// Builds "Using N thread(s)" and, when binding information is available,
+// appends " with NUMA node thread binding: " followed by that information.
+std::string Engine::thread_allocation_information_as_string() const {
+    const size_t      count = threads.size();
+    std::stringstream out;
+    out << "Using " << count << (count > 1 ? " threads" : " thread");
+    const std::string binding = thread_binding_information_as_string();
+    if (!binding.empty())
+        out << " with NUMA node thread binding: " << binding;
+    return out.str();
+}
+}

src/engine.h ADDED Viewed

	@@ -0,0 +1,134 @@

+/*
+  Stockfish, a UCI chess playing engine derived from Glaurung 2.1
+  Copyright (C) 2004-2026 The Stockfish developers (see AUTHORS file)
+  Stockfish is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+  Stockfish is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+  You should have received a copy of the GNU General Public License
+  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+#ifndef ENGINE_H_INCLUDED
+#define ENGINE_H_INCLUDED
+#include <cstddef>
+#include <cstdint>
+#include <functional>
+#include <map>
+#include <memory>
+#include <optional>
+#include <string>
+#include <string_view>
+#include <utility>
+#include <vector>
+#include "history.h"
+#include "nnue/network.h"
+#include "numa.h"
+#include "position.h"
+#include "search.h"
+#include "syzygy/tbprobe.h"  // for Stockfish::Depth
+#include "thread.h"
+#include "tt.h"
+#include "ucioption.h"
+namespace Stockfish {
+class Engine {  // Owns the position, options, thread pool, hash table and networks behind one interface.
+   public:
+    using InfoShort = Search::InfoShort;
+    using InfoFull  = Search::InfoFull;
+    using InfoIter  = Search::InfoIteration;
+    Engine(std::optional<std::string> path = std::nullopt);
+    // Neither copyable nor movable: components hold back-references to fields
+    Engine(const Engine&)            = delete;
+    Engine(Engine&&)                 = delete;
+    Engine& operator=(const Engine&) = delete;
+    Engine& operator=(Engine&&)      = delete;
+    ~Engine() { wait_for_search_finished(); }  // blocks until any running search is done before members are destroyed
+    std::uint64_t perft(const std::string& fen, Depth depth, bool isChess960);
+    // non-blocking call to start searching
+    void go(Search::LimitsType&);
+    // non-blocking call to stop searching
+    void stop();
+    // blocking call to wait for the search to finish
+    void wait_for_search_finished();
+    // set a new position; moves are given in UCI format
+    void set_position(const std::string& fen, const std::vector<std::string>& moves);
+    // modifiers
+    void set_numa_config_from_option(const std::string& o);
+    void resize_threads();
+    void set_tt_size(size_t mb);
+    void set_ponderhit(bool);
+    void search_clear();
+    void set_on_update_no_moves(std::function<void(const InfoShort&)>&&);
+    void set_on_update_full(std::function<void(const InfoFull&)>&&);
+    void set_on_iter(std::function<void(const InfoIter&)>&&);
+    void set_on_bestmove(std::function<void(std::string_view, std::string_view)>&&);
+    void set_on_verify_networks(std::function<void(std::string_view)>&&);
+    // network related
+    void                                  verify_networks() const;
+    std::unique_ptr<Eval::NNUE::Networks> get_default_networks() const;
+    void                                  load_big_network(const std::string& file);
+    void                                  load_small_network(const std::string& file);
+    void save_network(const std::pair<std::optional<std::string>, std::string> files[2]);  // [0] = big net, [1] = small net
+    // utility functions
+    void trace_eval() const;
+    const OptionsMap& get_options() const;
+    OptionsMap&       get_options();
+    int get_hashfull(int maxAge = 0) const;
+    std::string                            fen() const;
+    void                                   flip();
+    std::string                            visualize() const;
+    std::vector<std::pair<size_t, size_t>> get_bound_thread_count_by_numa_node() const;  // (bound threads, CPUs) per node
+    std::string                            get_numa_config_as_string() const;
+    std::string                            numa_config_information_as_string() const;
+    std::string                            thread_allocation_information_as_string() const;
+    std::string                            thread_binding_information_as_string() const;
+   private:
+    const std::string binaryDirectory;  // passed to the network load() calls
+    NumaReplicationContext numaContext;
+    Position     pos;
+    StateListPtr states;  // state list backing `pos`; replaced by set_position()
+    OptionsMap                                         options;
+    ThreadPool                                         threads;
+    TranspositionTable                                 tt;
+    LazyNumaReplicatedSystemWide<Eval::NNUE::Networks> networks;
+    Search::SearchManager::UpdateContext  updateContext;  // search callbacks installed by the set_on_* setters
+    std::function<void(std::string_view)> onVerifyNetworks;  // message sink used by verify_networks()
+    std::map<NumaIndex, SharedHistories>  sharedHists;  // keyed by NUMA node index
+};
+}  // namespace Stockfish
+#endif  // #ifndef ENGINE_H_INCLUDED

src/evaluate.cpp ADDED Viewed

	@@ -0,0 +1,124 @@

+/*
+  Stockfish, a UCI chess playing engine derived from Glaurung 2.1
+  Copyright (C) 2004-2026 The Stockfish developers (see AUTHORS file)
+  Stockfish is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+  Stockfish is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+  You should have received a copy of the GNU General Public License
+  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+#include "evaluate.h"
+#include <algorithm>
+#include <cassert>
+#include <cmath>
+#include <cstdlib>
+#include <iomanip>
+#include <iostream>
+#include <memory>
+#include <sstream>
+#include <tuple>
+#include "nnue/network.h"
+#include "nnue/nnue_misc.h"
+#include "position.h"
+#include "types.h"
+#include "uci.h"
+#include "nnue/nnue_accumulator.h"
+namespace Stockfish {
+// Purely material-based static evaluation from the side to move's point of
+// view: the pawn-count difference scaled by PawnValue plus the difference in
+// non-pawn material. Dividing the result by PawnValue approximates the
+// material advantage in pawns.
+int Eval::simple_eval(const Position& pos) {
+    const Color us       = pos.side_to_move();
+    const Color them     = ~us;
+    const auto  pawnTerm = PawnValue * (pos.count<PAWN>(us) - pos.count<PAWN>(them));
+    return pawnTerm + pos.non_pawn_material(us) - pos.non_pawn_material(them);
+}
+bool Eval::use_smallnet(const Position& pos) { return std::abs(simple_eval(pos)) > 962; }  // true when |simple_eval| exceeds 962
+// Evaluate is the evaluator for the outer world. It returns a static evaluation
+// of the position from the point of view of the side to move. The position must
+// not have checkers (asserted). The result is clamped strictly inside the
+// tablebase score range.
+Value Eval::evaluate(const Eval::NNUE::Networks&    networks,
+                     const Position&                pos,
+                     Eval::NNUE::AccumulatorStack&  accumulators,
+                     Eval::NNUE::AccumulatorCaches& caches,
+                     int                            optimism) {
+    assert(!pos.checkers());
+    bool smallNet           = use_smallnet(pos);  // first choice of network, based on material only
+    auto [psqt, positional] = smallNet ? networks.small.evaluate(pos, accumulators, caches.small)
+                                       : networks.big.evaluate(pos, accumulators, caches.big);
+    Value nnue = (125 * psqt + 131 * positional) / 128;  // weighted mix of the two network outputs
+    // Re-evaluate with the big network when the small-net result is close to zero,
+    // i.e. when higher eval accuracy is worth the time spent
+    if (smallNet && (std::abs(nnue) < 277))
+    {
+        std::tie(psqt, positional) = networks.big.evaluate(pos, accumulators, caches.big);
+        nnue                       = (125 * psqt + 131 * positional) / 128;
+        smallNet                   = false;
+    }
+    // Blend optimism and eval with nnue complexity
+    int nnueComplexity = std::abs(psqt - positional);  // disagreement between the two outputs
+    optimism += optimism * nnueComplexity / 476;
+    nnue -= nnue * nnueComplexity / 18236;
+    int material = 534 * pos.count<PAWN>() + pos.non_pawn_material();
+    int v        = (nnue * (77871 + material) + optimism * (7191 + material)) / 77871;  // both terms grow with material
+    // Damp down the evaluation linearly when shuffling
+    v -= v * pos.rule50_count() / 199;
+    // Guarantee evaluation does not hit the tablebase range
+    v = std::clamp(v, VALUE_TB_LOSS_IN_MAX_PLY + 1, VALUE_TB_WIN_IN_MAX_PLY - 1);
+    return v;
+}
+// Like evaluate(), but instead of returning a value, it returns
+// a string (suitable for outputting to stdout) that contains the detailed
+// descriptions and values of each evaluation term. Useful for debugging.
+// Trace scores are from white's point of view. When the position has checkers,
+// only a short "none (in check)" line is returned.
+std::string Eval::trace(Position& pos, const Eval::NNUE::Networks& networks) {
+    if (pos.checkers())
+        return "Final evaluation: none (in check)";
+    auto accumulators = std::make_unique<Eval::NNUE::AccumulatorStack>();  // fresh stack and caches created for this trace only
+    auto caches       = std::make_unique<Eval::NNUE::AccumulatorCaches>(networks);
+    std::stringstream ss;
+    ss << std::showpoint << std::noshowpos << std::fixed << std::setprecision(2);
+    ss << '\n' << NNUE::trace(pos, networks, *caches) << '\n';
+    ss << std::showpoint << std::showpos << std::fixed << std::setprecision(2) << std::setw(15);
+    auto [psqt, positional] = networks.big.evaluate(pos, *accumulators, caches->big);
+    Value v                 = psqt + positional;  // plain sum of the big-network outputs, without evaluate()'s scaling
+    v                       = pos.side_to_move() == WHITE ? v : -v;  // convert to white's point of view
+    ss << "NNUE evaluation        " << 0.01 * UCIEngine::to_cp(v, pos) << " (white side)\n";
+    v = evaluate(networks, pos, *accumulators, *caches, VALUE_ZERO);  // full evaluation with zero optimism
+    v = pos.side_to_move() == WHITE ? v : -v;
+    ss << "Final evaluation       " << 0.01 * UCIEngine::to_cp(v, pos) << " (white side)";
+    ss << " [with scaled NNUE, ...]";
+    ss << "\n";
+    return ss.str();
+}
+}  // namespace Stockfish

src/evaluate.h ADDED Viewed

	@@ -0,0 +1,58 @@

+/*
+  Stockfish, a UCI chess playing engine derived from Glaurung 2.1
+  Copyright (C) 2004-2026 The Stockfish developers (see AUTHORS file)
+  Stockfish is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+  Stockfish is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+  You should have received a copy of the GNU General Public License
+  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+#ifndef EVALUATE_H_INCLUDED
+#define EVALUATE_H_INCLUDED
+#include <string>
+#include "types.h"
+namespace Stockfish {
+class Position;
+namespace Eval {
+// The default net name MUST follow the format nn-[SHA256 first 12 digits].nnue
+// for the build process (profile-build and fishtest) to work. Do not change the
+// name of the macro or the location where this macro is defined, as it is used
+// in the Makefile/Fishtest.
+#define EvalFileDefaultNameBig "nn-9a0cc2a62c52.nnue"
+#define EvalFileDefaultNameSmall "nn-47fc8b7fff06.nnue"
+namespace NNUE {
+struct Networks;  // forward declarations only; this header does not include the NNUE headers
+struct AccumulatorCaches;
+class AccumulatorStack;
+}
+std::string trace(Position& pos, const Eval::NNUE::Networks& networks);  // human-readable evaluation breakdown
+int   simple_eval(const Position& pos);  // material-only evaluation, side to move's point of view
+bool  use_smallnet(const Position& pos);  // whether evaluate() starts with the small network
+Value evaluate(const NNUE::Networks&          networks,
+               const Position&                pos,
+               Eval::NNUE::AccumulatorStack&  accumulators,
+               Eval::NNUE::AccumulatorCaches& caches,
+               int                            optimism);
+}  // namespace Eval
+}  // namespace Stockfish
+#endif  // #ifndef EVALUATE_H_INCLUDED

src/history.h ADDED Viewed

	@@ -0,0 +1,273 @@

+/*
+  Stockfish, a UCI chess playing engine derived from Glaurung 2.1
+  Copyright (C) 2004-2026 The Stockfish developers (see AUTHORS file)
+  Stockfish is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+  Stockfish is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+  You should have received a copy of the GNU General Public License
+  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+#ifndef HISTORY_H_INCLUDED
+#define HISTORY_H_INCLUDED
+#include <algorithm>
+#include <array>
+#include <atomic>
+#include <cassert>
+#include <cmath>
+#include <cstdint>
+#include <cstdlib>
+#include <limits>
+#include <type_traits>  // IWYU pragma: keep
+#include "memory.h"
+#include "misc.h"
+#include "position.h"
+namespace Stockfish {
+constexpr int PAWN_HISTORY_BASE_SIZE   = 8192;  // has to be a power of 2
+constexpr int UINT_16_HISTORY_SIZE     = std::numeric_limits<uint16_t>::max() + 1;  // 65536: one slot per 16-bit value
+constexpr int CORRHIST_BASE_SIZE       = UINT_16_HISTORY_SIZE;  // has to be a power of 2 as well (checked below)
+constexpr int CORRECTION_HISTORY_LIMIT = 1024;  // used as the D bound of the correction-history entries
+constexpr int LOW_PLY_HISTORY_SIZE     = 5;  // first dimension of LowPlyHistory
+static_assert((PAWN_HISTORY_BASE_SIZE & (PAWN_HISTORY_BASE_SIZE - 1)) == 0,
+              "PAWN_HISTORY_BASE_SIZE has to be a power of 2");
+static_assert((CORRHIST_BASE_SIZE & (CORRHIST_BASE_SIZE - 1)) == 0,
+              "CORRHIST_BASE_SIZE has to be a power of 2");
+// StatsEntry is the container of various numerical statistics. We use a class
+// instead of a naked value to directly call history update operator<<() on
+// the entry. The first template parameter T is the base type of the array,
+// the second template parameter D limits the range of updates in [-D, D]
+// when we update values with the << operator, and the third parameter Atomic
+// selects std::atomic<T> storage accessed with relaxed loads and stores.
+template<typename T, int D, bool Atomic = false>
+struct StatsEntry {
+    static_assert(std::is_arithmetic_v<T>, "Not an arithmetic type");
+   private:
+    std::conditional_t<Atomic, std::atomic<T>, T> entry;
+   public:
+    void operator=(const T& v) {
+        if constexpr (Atomic)
+            entry.store(v, std::memory_order_relaxed);
+        else
+            entry = v;
+    }
+    operator T() const {
+        if constexpr (Atomic)
+            return entry.load(std::memory_order_relaxed);
+        else
+            return entry;
+    }
+    void operator<<(int bonus) {
+        // Make sure that bonus is in range [-D, D]
+        int clampedBonus = std::clamp(bonus, -D, D);
+        T   val          = *this;  // with Atomic this is a separate load; the update below is not one atomic read-modify-write
+        *this            = val + clampedBonus - val * std::abs(clampedBonus) / D;  // old value shrinks by |bonus| / D, so a bonus of +-D lands exactly on +-D
+        assert(std::abs(T(*this)) <= D);
+    }
+};
+enum StatsType {
+    NoCaptures,
+    Captures
+};
+template<typename T, int D, std::size_t... Sizes>
+using Stats = MultiArray<StatsEntry<T, D>, Sizes...>;  // multi-dimensional table of plain (non-atomic) entries
+template<typename T, int D, std::size_t... Sizes>
+using AtomicStats = MultiArray<StatsEntry<T, D, true>, Sizes...>;  // same shape, entries use std::atomic storage
+// DynStats is a dynamically sized array of Stats, used for thread-shared histories
+// which should scale with the total number of threads. The SizeMultiplier gives
+// the per-thread allocation count of T.
+template<typename T, int SizeMultiplier>
+struct DynStats {
+    explicit DynStats(size_t s) {
+        size = s * SizeMultiplier;  // total element count: `s` times the per-thread count
+        data = make_unique_large_page<T[]>(size);
+    }
+    // Fills one slice of the array with `value`: the elements from
+    // threadIdx * size / numaTotal up to (threadIdx + 1) * size / numaTotal,
+    // where the slice of the last index (threadIdx + 1 == numaTotal) runs to the end
+    void clear_range(int value, size_t threadIdx, size_t numaTotal) {
+        size_t start = uint64_t(threadIdx) * size / numaTotal;
+        assert(start < size);
+        size_t end = threadIdx + 1 == numaTotal ? size : uint64_t(threadIdx + 1) * size / numaTotal;
+        while (start < end)
+            data[start++].fill(value);
+    }
+    size_t get_size() const { return size; }
+    T&     operator[](size_t index) {
+        assert(index < size);  // bounds are only checked in builds with assertions enabled
+        return data.get()[index];
+    }
+    const T& operator[](size_t index) const {
+        assert(index < size);
+        return data.get()[index];
+    }
+   private:
+    size_t            size;
+    LargePagePtr<T[]> data;
+};
+// ButterflyHistory records how often quiet moves have been successful or unsuccessful
+// during the current search, and is used for reduction and move ordering decisions.
+// It uses 2 tables (one for each color) indexed by the move's from and to squares,
+// see https://www.chessprogramming.org/Butterfly_Boards
+using ButterflyHistory = Stats<std::int16_t, 7183, COLOR_NB, UINT_16_HISTORY_SIZE>;
+// LowPlyHistory is addressed by ply and move's from and to squares, used
+// to improve move ordering near the root
+using LowPlyHistory = Stats<std::int16_t, 7183, LOW_PLY_HISTORY_SIZE, UINT_16_HISTORY_SIZE>;
+// CapturePieceToHistory is addressed by a move's [piece][to][captured piece type]
+using CapturePieceToHistory = Stats<std::int16_t, 10692, PIECE_NB, SQUARE_NB, PIECE_TYPE_NB>;
+// PieceToHistory is like ButterflyHistory but is addressed by a move's [piece][to]
+using PieceToHistory = Stats<std::int16_t, 30000, PIECE_NB, SQUARE_NB>;
+// ContinuationHistory is the combined history of a given pair of moves, usually
+// the current one given a previous one. The nested history table is based on
+// PieceToHistory instead of ButterflyBoards.
+using ContinuationHistory = MultiArray<PieceToHistory, PIECE_NB, SQUARE_NB>;
+// PawnHistory is addressed by the pawn structure and a move's [piece][to].
+// Unlike the tables above it is a DynStats of atomic entries, sized at runtime.
+using PawnHistory =
+  DynStats<AtomicStats<std::int16_t, 8192, PIECE_NB, SQUARE_NB>, PAWN_HISTORY_BASE_SIZE>;
+// Correction histories record differences between the static evaluation of
+// positions and their search score. They are used to improve the static evaluation
+// used by some search heuristics.
+// see https://www.chessprogramming.org/Static_Evaluation_Correction_History
+enum CorrHistType {
+    Pawn,          // By color and pawn structure
+    Minor,         // By color and positions of minor pieces (Knight, Bishop)
+    NonPawn,       // By non-pawn material positions and color
+    PieceTo,       // By [piece][to] move
+    Continuation,  // Combined history of move pairs
+};
+template<typename T, int D>
+struct CorrectionBundle {  // Four atomic correction entries stored side by side.
+    StatsEntry<T, D, true> pawn;
+    StatsEntry<T, D, true> minor;
+    StatsEntry<T, D, true> nonPawnWhite;
+    StatsEntry<T, D, true> nonPawnBlack;
+    void operator=(T val) {  // assigns the same value to all four entries
+        pawn         = val;
+        minor        = val;
+        nonPawnWhite = val;
+        nonPawnBlack = val;
+    }
+};
+namespace Detail {
+template<CorrHistType>
+struct CorrHistTypedef {  // primary template: applies to every CorrHistType without a specialization below
+    using type =
+      DynStats<Stats<std::int16_t, CORRECTION_HISTORY_LIMIT, COLOR_NB>, CORRHIST_BASE_SIZE>;
+};
+template<>
+struct CorrHistTypedef<PieceTo> {  // fixed-size [piece][to] table
+    using type = Stats<std::int16_t, CORRECTION_HISTORY_LIMIT, PIECE_NB, SQUARE_NB>;
+};
+template<>
+struct CorrHistTypedef<Continuation> {  // a PieceTo table for every [piece][to] pair
+    using type = MultiArray<CorrHistTypedef<PieceTo>::type, PIECE_NB, SQUARE_NB>;
+};
+template<>
+struct CorrHistTypedef<NonPawn> {  // like the primary template, but with two COLOR_NB dimensions
+    using type = DynStats<Stats<std::int16_t, CORRECTION_HISTORY_LIMIT, COLOR_NB, COLOR_NB>,
+                          CORRHIST_BASE_SIZE>;
+};
+}
+using UnifiedCorrectionHistory =
+  DynStats<MultiArray<CorrectionBundle<std::int16_t, CORRECTION_HISTORY_LIMIT>, COLOR_NB>,
+           CORRHIST_BASE_SIZE>;  // one CorrectionBundle per color in every slot
+template<CorrHistType T>
+using CorrectionHistory = typename Detail::CorrHistTypedef<T>::type;  // resolves to the table type chosen in Detail
+using TTMoveHistory = StatsEntry<std::int16_t, 8192>;  // a single non-atomic entry bounded by 8192
+// Set of histories shared between groups of threads. To avoid excessive
+// cross-node data transfer, histories are shared only between threads
+// on a given NUMA node. The passed size must be a power of two to make
+// the indexing more efficient.
+struct SharedHistories {
+    SharedHistories(size_t threadCount) :
+        correctionHistory(threadCount),
+        pawnHistory(threadCount) {
+        assert((threadCount & (threadCount - 1)) == 0 && threadCount != 0);  // non-zero power of two
+        sizeMinus1         = correctionHistory.get_size() - 1;  // used as index masks by the accessors below
+        pawnHistSizeMinus1 = pawnHistory.get_size() - 1;
+    }
+    size_t get_size() const { return sizeMinus1 + 1; }  // size of the correction history table
+    auto& pawn_entry(const Position& pos) {
+        return pawnHistory[pos.pawn_key() & pawnHistSizeMinus1];
+    }
+    const auto& pawn_entry(const Position& pos) const {
+        return pawnHistory[pos.pawn_key() & pawnHistSizeMinus1];
+    }
+    auto& pawn_correction_entry(const Position& pos) {  // the three correction accessors share one table and differ only in the key
+        return correctionHistory[pos.pawn_key() & sizeMinus1];
+    }
+    const auto& pawn_correction_entry(const Position& pos) const {
+        return correctionHistory[pos.pawn_key() & sizeMinus1];
+    }
+    auto& minor_piece_correction_entry(const Position& pos) {
+        return correctionHistory[pos.minor_piece_key() & sizeMinus1];
+    }
+    const auto& minor_piece_correction_entry(const Position& pos) const {
+        return correctionHistory[pos.minor_piece_key() & sizeMinus1];
+    }
+    template<Color c>
+    auto& nonpawn_correction_entry(const Position& pos) {
+        return correctionHistory[pos.non_pawn_key(c) & sizeMinus1];
+    }
+    template<Color c>
+    const auto& nonpawn_correction_entry(const Position& pos) const {
+        return correctionHistory[pos.non_pawn_key(c) & sizeMinus1];
+    }
+    UnifiedCorrectionHistory correctionHistory;
+    PawnHistory              pawnHistory;
+   private:
+    size_t sizeMinus1, pawnHistSizeMinus1;
+};
+}  // namespace Stockfish
+#endif  // #ifndef HISTORY_H_INCLUDED

src/incbin/UNLICENCE ADDED Viewed

	@@ -0,0 +1,26 @@

+The file "incbin.h" is free and unencumbered software released into
+the public domain by Dale Weiler, see:
+   <https://github.com/graphitemaster/incbin>
+Anyone is free to copy, modify, publish, use, compile, sell, or
+distribute this software, either in source code form or as a compiled
+binary, for any purpose, commercial or non-commercial, and by any
+means.
+In jurisdictions that recognize copyright laws, the author or authors
+of this software dedicate any and all copyright interest in the
+software to the public domain. We make this dedication for the benefit
+of the public at large and to the detriment of our heirs and
+successors. We intend this dedication to be an overt act of
+relinquishment in perpetuity of all present and future rights to this
+software under copyright law.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+OTHER DEALINGS IN THE SOFTWARE.
+For more information, please refer to <http://unlicense.org/>

src/incbin/incbin.h ADDED Viewed

	@@ -0,0 +1,476 @@

+/**
+ * @file incbin.h
+ * @author Dale Weiler
+ * @brief Utility for including binary files
+ *
+ * Facilities for including binary files into the current translation unit and
+ * making use from them externally in other translation units.
+ */
+#ifndef INCBIN_HDR
+#define INCBIN_HDR
+#include <limits.h>
+#if   defined(__AVX512BW__) || \
+      defined(__AVX512CD__) || \
+      defined(__AVX512DQ__) || \
+      defined(__AVX512ER__) || \
+      defined(__AVX512PF__) || \
+      defined(__AVX512VL__) || \
+      defined(__AVX512F__)
+# define INCBIN_ALIGNMENT_INDEX 6 /* 1 << 6 = 64 */
+#elif defined(__AVX__)      || \
+      defined(__AVX2__)
+# define INCBIN_ALIGNMENT_INDEX 5 /* 1 << 5 = 32 */
+#elif defined(__SSE__)      || \
+      defined(__SSE2__)     || \
+      defined(__SSE3__)     || \
+      defined(__SSSE3__)    || \
+      defined(__SSE4_1__)   || \
+      defined(__SSE4_2__)   || \
+      defined(__neon__)     || \
+      defined(__ARM_NEON)   || \
+      defined(__ALTIVEC__)
+# define INCBIN_ALIGNMENT_INDEX 4 /* 1 << 4 = 16 */
+#elif ULONG_MAX != 0xffffffffu
+# define INCBIN_ALIGNMENT_INDEX 3 /* unsigned long wider than 32 bits: 8 */
+# else
+# define INCBIN_ALIGNMENT_INDEX 2 /* fallback: 4 */
+#endif
+/* Lookup table of (1 << n) where `n' is `INCBIN_ALIGNMENT_INDEX' */
+#define INCBIN_ALIGN_SHIFT_0 1
+#define INCBIN_ALIGN_SHIFT_1 2
+#define INCBIN_ALIGN_SHIFT_2 4
+#define INCBIN_ALIGN_SHIFT_3 8
+#define INCBIN_ALIGN_SHIFT_4 16
+#define INCBIN_ALIGN_SHIFT_5 32
+#define INCBIN_ALIGN_SHIFT_6 64
+/* Actual alignment value: expands to INCBIN_ALIGN_SHIFT_<INCBIN_ALIGNMENT_INDEX>.
+   INCBIN_CONCATENATE is defined further down; that is fine because macros are
+   only expanded at the point of use. */
+#define INCBIN_ALIGNMENT \
+    INCBIN_CONCATENATE( \
+        INCBIN_CONCATENATE(INCBIN_ALIGN_SHIFT, _), \
+        INCBIN_ALIGNMENT_INDEX)
+/* Stringize */
+#define INCBIN_STR(X) \
+    #X
+#define INCBIN_STRINGIZE(X) \
+    INCBIN_STR(X)
+/* Concatenate */
+#define INCBIN_CAT(X, Y) \
+    X ## Y
+#define INCBIN_CONCATENATE(X, Y) \
+    INCBIN_CAT(X, Y)
+/* Deferred macro expansion */
+#define INCBIN_EVAL(X) \
+    X
+#define INCBIN_INVOKE(N, ...) \
+    INCBIN_EVAL(N(__VA_ARGS__))
+/* Variable argument count for overloading by arity */
+#define INCBIN_VA_ARG_COUNTER(_1, _2, _3, N, ...) N
+#define INCBIN_VA_ARGC(...) INCBIN_VA_ARG_COUNTER(__VA_ARGS__, 3, 2, 1, 0)
+/* Green Hills uses a different directive for including binary data */
+#if defined(__ghs__)
+#  if (__ghs_asm == 2)
+#    define INCBIN_MACRO ".file"
+/* Or consider the ".myrawdata" entry in the ld file */
+#  else
+#    define INCBIN_MACRO "\tINCBIN"
+#  endif
+#else
+#  define INCBIN_MACRO ".incbin"
+#endif
+#ifndef _MSC_VER
+#  define INCBIN_ALIGN \
+    __attribute__((aligned(INCBIN_ALIGNMENT)))
+#else
+#  define INCBIN_ALIGN __declspec(align(INCBIN_ALIGNMENT))
+#endif
+#if defined(__arm__) || /* GNU C and RealView */ \
+    defined(__arm) || /* Diab */ \
+    defined(_ARM) /* ImageCraft */
+#  define INCBIN_ARM
+#endif
+#ifdef __GNUC__
+/* Utilize .balign where supported */
+#  define INCBIN_ALIGN_HOST ".balign " INCBIN_STRINGIZE(INCBIN_ALIGNMENT) "\n"
+#  define INCBIN_ALIGN_BYTE ".balign 1\n"
+#elif defined(INCBIN_ARM)
+/*
+ * On arm assemblers, the alignment value is calculated as (1 << n) where `n' is
+ * the shift count. This is the value passed to `.align'
+ */
+#  define INCBIN_ALIGN_HOST ".align " INCBIN_STRINGIZE(INCBIN_ALIGNMENT_INDEX) "\n"
+#  define INCBIN_ALIGN_BYTE ".align 0\n"
+#else
+/* We assume other inline assemblers treat `.align' as `.balign' */
+#  define INCBIN_ALIGN_HOST ".align " INCBIN_STRINGIZE(INCBIN_ALIGNMENT) "\n"
+#  define INCBIN_ALIGN_BYTE ".align 1\n"
+#endif
+/* INCBIN_CONST is used by incbin.c generated files */
+#if defined(__cplusplus)
+#  define INCBIN_EXTERNAL extern "C"
+#  define INCBIN_CONST    extern const
+#else
+#  define INCBIN_EXTERNAL extern
+#  define INCBIN_CONST    const
+#endif
+/**
+ * @brief Optionally override the linker section into which size and data is
+ * emitted.
+ *
+ * @warning If you use this facility, you might have to deal with
+ * platform-specific linker output section naming on your own.
+ */
+#if !defined(INCBIN_OUTPUT_SECTION)
+#  if defined(__APPLE__)
+#    define INCBIN_OUTPUT_SECTION ".const_data"
+#  else
+#    define INCBIN_OUTPUT_SECTION ".rodata"
+#  endif
+#endif
+/**
+ * @brief Optionally override the linker section into which data is emitted.
+ *
+ * @warning If you use this facility, you might have to deal with
+ * platform-specific linker output section naming on your own.
+ */
+#if !defined(INCBIN_OUTPUT_DATA_SECTION)
+#  define INCBIN_OUTPUT_DATA_SECTION INCBIN_OUTPUT_SECTION
+#endif
+/**
+ * @brief Optionally override the linker section into which size is emitted.
+ *
+ * @warning If you use this facility, you might have to deal with
+ * platform-specific linker output section naming on your own.
+ *
+ * @note This is useful for Harvard architectures where program memory cannot
+ * be directly read from the program without special instructions. With this you
+ * can choose to put the size variable in RAM rather than ROM.
+ */
+#if !defined(INCBIN_OUTPUT_SIZE_SECTION)
+#  define INCBIN_OUTPUT_SIZE_SECTION INCBIN_OUTPUT_SECTION
+#endif
+#if defined(__APPLE__)
+#  include "TargetConditionals.h"
+#  if defined(TARGET_OS_IPHONE) && !defined(INCBIN_SILENCE_BITCODE_WARNING)
+#    warning "incbin is incompatible with bitcode. Using the library will break upload to App Store if you have bitcode enabled. Add `#define INCBIN_SILENCE_BITCODE_WARNING` before including this header to silence this warning."
+#  endif
+/* The directives are different for Apple branded compilers */
+#  define INCBIN_SECTION         INCBIN_OUTPUT_SECTION "\n"
+#  define INCBIN_GLOBAL(NAME)    ".globl " INCBIN_MANGLE INCBIN_STRINGIZE(INCBIN_PREFIX) #NAME "\n"
+#  define INCBIN_INT             ".long "
+#  define INCBIN_MANGLE          "_"
+#  define INCBIN_BYTE            ".byte "
+#  define INCBIN_TYPE(...)
+#else
+#  define INCBIN_SECTION         ".section " INCBIN_OUTPUT_SECTION "\n"
+#  define INCBIN_GLOBAL(NAME)    ".global " INCBIN_STRINGIZE(INCBIN_PREFIX) #NAME "\n"
+#  if defined(__ghs__)
+#    define INCBIN_INT           ".word "
+#  else
+#    define INCBIN_INT           ".int "
+#  endif
+#  if defined(__USER_LABEL_PREFIX__)
+#    define INCBIN_MANGLE        INCBIN_STRINGIZE(__USER_LABEL_PREFIX__)
+#  else
+#    define INCBIN_MANGLE        ""
+#  endif
+#  if defined(INCBIN_ARM)
+/* On arm assemblers, `@' is used as a line comment token */
+#    define INCBIN_TYPE(NAME)    ".type " INCBIN_STRINGIZE(INCBIN_PREFIX) #NAME ", %object\n"
+#  elif defined(__MINGW32__) || defined(__MINGW64__)
+/* Mingw doesn't support this directive either */
+#    define INCBIN_TYPE(NAME)
+#  else
+/* It's safe to use `@' on other architectures */
+#    define INCBIN_TYPE(NAME)    ".type " INCBIN_STRINGIZE(INCBIN_PREFIX) #NAME ", @object\n"
+#  endif
+#  define INCBIN_BYTE            ".byte "
+#endif
+/* List of style types used for symbol names */
+#define INCBIN_STYLE_CAMEL 0
+#define INCBIN_STYLE_SNAKE 1
+/**
+ * @brief Specify the prefix to use for symbol names.
+ *
+ * @note By default this is "g".
+ *
+ * @code
+ * #define INCBIN_PREFIX incbin
+ * #include "incbin.h"
+ * INCBIN(Foo, "foo.txt");
+ *
+ * // Now you have the following symbols instead:
+ * // const unsigned char incbinFoo<data>[];
+ * // const unsigned char *const incbinFoo<end>;
+ * // const unsigned int incbinFoo<size>;
+ * @endcode
+ */
+#if !defined(INCBIN_PREFIX)
+#  define INCBIN_PREFIX g
+#endif
+/**
+ * @brief Specify the style used for symbol names.
+ *
+ * Possible options are
+ * - INCBIN_STYLE_CAMEL "CamelCase"
+ * - INCBIN_STYLE_SNAKE "snake_case"
+ *
+ * @note By default this is INCBIN_STYLE_CAMEL
+ *
+ * @code
+ * #define INCBIN_STYLE INCBIN_STYLE_SNAKE
+ * #include "incbin.h"
+ * INCBIN(foo, "foo.txt");
+ *
+ * // Now you have the following symbols:
+ * // const unsigned char <prefix>foo_data[];
+ * // const unsigned char *const <prefix>foo_end;
+ * // const unsigned int <prefix>foo_size;
+ * @endcode
+ */
+#if !defined(INCBIN_STYLE)
+#  define INCBIN_STYLE INCBIN_STYLE_CAMEL
+#endif
+/* Style lookup tables */
+#define INCBIN_STYLE_0_DATA Data
+#define INCBIN_STYLE_0_END End
+#define INCBIN_STYLE_0_SIZE Size
+#define INCBIN_STYLE_1_DATA _data
+#define INCBIN_STYLE_1_END _end
+#define INCBIN_STYLE_1_SIZE _size
+/* Style lookup: returning identifier */
+#define INCBIN_STYLE_IDENT(TYPE) \
+    INCBIN_CONCATENATE( \
+        INCBIN_STYLE_, \
+        INCBIN_CONCATENATE( \
+            INCBIN_EVAL(INCBIN_STYLE), \
+            INCBIN_CONCATENATE(_, TYPE)))
+/* Style lookup: returning string literal.
+ * Expands to the style suffix selected by INCBIN_STYLE for TYPE (DATA, END or
+ * SIZE) as a string literal. The definition deliberately does not end in a
+ * line continuation: a trailing backslash would splice whatever line follows
+ * into this macro. */
+#define INCBIN_STYLE_STRING(TYPE) \
+    INCBIN_STRINGIZE( \
+        INCBIN_STYLE_IDENT(TYPE))
+/* Generate the global labels by indirectly invoking the macro with our style
+ * type and concatenating the name against them. */
+#define INCBIN_GLOBAL_LABELS(NAME, TYPE) \
+    INCBIN_INVOKE( \
+        INCBIN_GLOBAL, \
+        INCBIN_CONCATENATE( \
+            NAME, \
+            INCBIN_INVOKE( \
+                INCBIN_STYLE_IDENT, \
+                TYPE))) \
+    INCBIN_INVOKE( \
+        INCBIN_TYPE, \
+        INCBIN_CONCATENATE( \
+            NAME, \
+            INCBIN_INVOKE( \
+                INCBIN_STYLE_IDENT, \
+                TYPE)))
+/**
+ * @brief Externally reference binary data included in another translation unit.
+ *
+ * Produces three external symbols that reference the binary data included in
+ * another translation unit.
+ *
+ * The symbol names are a concatenation of `INCBIN_PREFIX' before *NAME*; with
+ * "Data", as well as "End" and "Size" after. An example is provided below.
+ *
+ * @param TYPE Optional array type. Omitting this picks a default of `unsigned char`.
+ * @param NAME The name given for the binary data
+ *
+ * @code
+ * INCBIN_EXTERN(Foo);
+ *
+ * // Now you have the following symbols:
+ * // extern const unsigned char <prefix>Foo<data>[];
+ * // extern const unsigned char *const <prefix>Foo<end>;
+ * // extern const unsigned int <prefix>Foo<size>;
+ * @endcode
+ *
+ * You may specify a custom optional data type as well as the first argument.
+ * @code
+ * INCBIN_EXTERN(custom_type, Foo);
+ *
+ * // Now you have the following symbols:
+ * // extern const custom_type <prefix>Foo<data>[];
+ * // extern const custom_type *const <prefix>Foo<end>;
+ * // extern const unsigned int <prefix>Foo<size>;
+ * @endcode
+ */
+#define INCBIN_EXTERN(...) \
+    INCBIN_CONCATENATE(INCBIN_EXTERN_, INCBIN_VA_ARGC(__VA_ARGS__))(__VA_ARGS__)
+#define INCBIN_EXTERN_1(NAME, ...) \
+    INCBIN_EXTERN_2(unsigned char, NAME)
+#define INCBIN_EXTERN_2(TYPE, NAME) \
+    INCBIN_EXTERNAL const INCBIN_ALIGN TYPE \
+        INCBIN_CONCATENATE( \
+            INCBIN_CONCATENATE(INCBIN_PREFIX, NAME), \
+            INCBIN_STYLE_IDENT(DATA))[]; \
+    INCBIN_EXTERNAL const INCBIN_ALIGN TYPE *const \
+    INCBIN_CONCATENATE( \
+        INCBIN_CONCATENATE(INCBIN_PREFIX, NAME), \
+        INCBIN_STYLE_IDENT(END)); \
+    INCBIN_EXTERNAL const unsigned int \
+        INCBIN_CONCATENATE( \
+            INCBIN_CONCATENATE(INCBIN_PREFIX, NAME), \
+            INCBIN_STYLE_IDENT(SIZE))
+/**
+ * @brief Externally reference textual data included in another translation unit.
+ *
+ * Produces three external symbols that reference the textual data included in
+ * another translation unit.
+ *
+ * The symbol names are a concatenation of `INCBIN_PREFIX' before *NAME*; with
+ * "Data", as well as "End" and "Size" after. An example is provided below.
+ *
+ * @param NAME The name given for the textual data
+ *
+ * @code
+ * INCTXT_EXTERN(Foo);
+ *
+ * // Now you have the following symbols:
+ * // extern const char <prefix>Foo<data>[];
+ * // extern const char *const <prefix>Foo<end>;
+ * // extern const unsigned int <prefix>Foo<size>;
+ * @endcode
+ */
+#define INCTXT_EXTERN(NAME) \
+    INCBIN_EXTERN_2(char, NAME)
+/**
+ * @brief Include a binary file into the current translation unit.
+ *
+ * Includes a binary file into the current translation unit, producing three symbols
+ * for objects that encode the data and size respectively.
+ *
+ * The symbol names are a concatenation of `INCBIN_PREFIX' before *NAME*; with
+ * "Data", as well as "End" and "Size" after. An example is provided below.
+ *
+ * @param TYPE Optional array type. Omitting this picks a default of `unsigned char`.
+ * @param NAME The name to associate with this binary data (as an identifier.)
+ * @param FILENAME The file to include (as a string literal.)
+ *
+ * @code
+ * INCBIN(Icon, "icon.png");
+ *
+ * // Now you have the following symbols:
+ * // const unsigned char <prefix>Icon<data>[];
+ * // const unsigned char *const <prefix>Icon<end>;
+ * // const unsigned int <prefix>Icon<size>;
+ * @endcode
+ *
+ * You may specify a custom optional data type as well as the first argument.
+ * These macros are specialized by arity.
+ * @code
+ * INCBIN(custom_type, Icon, "icon.png");
+ *
+ * // Now you have the following symbols:
+ * // const custom_type <prefix>Icon<data>[];
+ * // const custom_type *const <prefix>Icon<end>;
+ * // const unsigned int <prefix>Icon<size>;
+ * @endcode
+ *
+ * @warning This must be used in global scope
+ * @warning The identifiers may be different if INCBIN_STYLE is not default
+ *
+ * To externally reference the data included by this in another translation unit
+ * please @see INCBIN_EXTERN.
+ */
+#ifdef _MSC_VER
+#  define INCBIN(NAME, FILENAME) \
+      INCBIN_EXTERN(NAME)
+#else
+#  define INCBIN(...) \
+     INCBIN_CONCATENATE(INCBIN_, INCBIN_VA_ARGC(__VA_ARGS__))(__VA_ARGS__)
+#  if defined(__GNUC__)
+#    define INCBIN_1(...) _Pragma("GCC error \"Single argument INCBIN not allowed\"")
+#  elif defined(__clang__)
+#    define INCBIN_1(...) _Pragma("clang error \"Single argument INCBIN not allowed\"")
+#  else
+#    define INCBIN_1(...) /* Cannot do anything here */
+#  endif
+#  define INCBIN_2(NAME, FILENAME) \
+      INCBIN_3(unsigned char, NAME, FILENAME)
+#  define INCBIN_3(TYPE, NAME, FILENAME) INCBIN_COMMON(TYPE, NAME, FILENAME, /* No terminator for binary data */)
+#  define INCBIN_COMMON(TYPE, NAME, FILENAME, TERMINATOR) \
+    __asm__(INCBIN_SECTION \
+            INCBIN_GLOBAL_LABELS(NAME, DATA) \
+            INCBIN_ALIGN_HOST \
+            INCBIN_MANGLE INCBIN_STRINGIZE(INCBIN_PREFIX) #NAME INCBIN_STYLE_STRING(DATA) ":\n" \
+            INCBIN_MACRO " \"" FILENAME "\"\n" \
+                TERMINATOR \
+            INCBIN_GLOBAL_LABELS(NAME, END) \
+            INCBIN_ALIGN_BYTE \
+            INCBIN_MANGLE INCBIN_STRINGIZE(INCBIN_PREFIX) #NAME INCBIN_STYLE_STRING(END) ":\n" \
+                INCBIN_BYTE "1\n" \
+            INCBIN_GLOBAL_LABELS(NAME, SIZE) \
+            INCBIN_ALIGN_HOST \
+            INCBIN_MANGLE INCBIN_STRINGIZE(INCBIN_PREFIX) #NAME INCBIN_STYLE_STRING(SIZE) ":\n" \
+                INCBIN_INT INCBIN_MANGLE INCBIN_STRINGIZE(INCBIN_PREFIX) #NAME INCBIN_STYLE_STRING(END) " - " \
+                           INCBIN_MANGLE INCBIN_STRINGIZE(INCBIN_PREFIX) #NAME INCBIN_STYLE_STRING(DATA) "\n" \
+            INCBIN_ALIGN_HOST \
+            ".text\n" \
+    ); \
+    INCBIN_EXTERN(TYPE, NAME)
+#endif
+/**
+ * @brief Include a textual file into the current translation unit.
+ *
+ * This behaves the same as INCBIN except it produces char compatible arrays
+ * and implicitly adds a null-terminator byte, thus the size of data included
+ * by this is one byte larger than that of INCBIN.
+ *
+ * Includes a textual file into the current translation unit, producing three
+ * symbols for objects that encode the data and size respectively.
+ *
+ * The symbol names are a concatenation of `INCBIN_PREFIX' before *NAME*; with
+ * "Data", as well as "End" and "Size" after. An example is provided below.
+ *
+ * @param NAME The name to associate with this textual data (as an identifier.)
+ * @param FILENAME The file to include (as a string literal.)
+ *
+ * @code
+ * INCTXT(Readme, "readme.txt");
+ *
+ * // Now you have the following symbols:
+ * // const char <prefix>Readme<data>[];
+ * // const char *const <prefix>Readme<end>;
+ * // const unsigned int <prefix>Readme<size>;
+ * @endcode
+ *
+ * @warning This must be used in global scope
+ * @warning The identifiers may be different if INCBIN_STYLE is not default
+ *
+ * To externally reference the data included by this in another translation unit
+ * please @see INCTXT_EXTERN.
+ */
+#if defined(_MSC_VER)
+#  define INCTXT(NAME, FILENAME) \
+     INCBIN_EXTERN(NAME)
+#else
+#  define INCTXT(NAME, FILENAME) \
+     INCBIN_COMMON(char, NAME, FILENAME, INCBIN_BYTE "0\n")
+#endif
+#endif

src/main.cpp ADDED Viewed

	@@ -0,0 +1,43 @@

+/*
+  Stockfish, a UCI chess playing engine derived from Glaurung 2.1
+  Copyright (C) 2004-2026 The Stockfish developers (see AUTHORS file)
+  Stockfish is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+  Stockfish is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+  You should have received a copy of the GNU General Public License
+  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+#include <iostream>
+#include <memory>
+#include "bitboard.h"
+#include "misc.h"
+#include "position.h"
+#include "tune.h"
+#include "uci.h"
+using namespace Stockfish;
+int main(int argc, char* argv[]) {  // Program entry point; argc/argv are passed on to UCIEngine below
+    std::cout << engine_info() << std::endl;  // Write the text returned by engine_info() to stdout and flush
+    Bitboards::init();  // Global initialization of the Bitboards module (presumably needed before the engine is built - confirm)
+    Position::init();  // Global initialization of the Position module
+    auto uci = std::make_unique<UCIEngine>(argc, argv);  // Heap-allocate the UCI front end, owned by a unique_ptr
+    Tune::init(uci->engine_options());  // Give the tuning code access to the engine's options
+    uci->loop();  // Run the UCI loop; execution continues here once it returns
+    return 0;
+}

src/memory.cpp ADDED Viewed

	@@ -0,0 +1,199 @@

+/*
+  Stockfish, a UCI chess playing engine derived from Glaurung 2.1
+  Copyright (C) 2004-2026 The Stockfish developers (see AUTHORS file)
+  Stockfish is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+  Stockfish is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+  You should have received a copy of the GNU General Public License
+  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+#include "memory.h"
+#include <cstdlib>
+#if __has_include("features.h")
+    #include <features.h>
+#endif
+#if defined(__linux__) && !defined(__ANDROID__)
+    #include <sys/mman.h>
+#endif
+#if defined(__APPLE__) || defined(__ANDROID__) || defined(__OpenBSD__) \
+  || (defined(__GLIBCXX__) && !defined(_GLIBCXX_HAVE_ALIGNED_ALLOC) && !defined(_WIN32)) \
+  || defined(__e2k__)
+    #define POSIXALIGNEDALLOC
+    #include <stdlib.h>
+#endif
+#ifdef _WIN32
+    #if _WIN32_WINNT < 0x0601
+        #undef _WIN32_WINNT
+        #define _WIN32_WINNT 0x0601  // Force to include needed API prototypes
+    #endif
+    #ifndef NOMINMAX
+        #define NOMINMAX
+    #endif
+    #include <ios>       // std::hex, std::dec
+    #include <iostream>  // std::cerr
+    #include <ostream>   // std::endl
+    #include <windows.h>
+// The needed Windows API for processor groups could be missed from old Windows
+// versions, so instead of calling them directly (forcing the linker to resolve
+// the calls at compile time), try to load them at runtime. To do this we need
+// first to define the corresponding function pointers.
+#endif
+namespace Stockfish {
+// Wrappers for systems where the c++17 implementation does not guarantee the
+// availability of aligned_alloc(). Memory allocated with std_aligned_alloc()
+// must be freed with std_aligned_free().
+//
+// Requests a block of `size` bytes aligned to `alignment` from whichever
+// aligned allocator the platform macros select, and returns the pointer that
+// allocator produced.
+void* std_aligned_alloc(size_t alignment, size_t size) {
+#if defined(_ISOC11_SOURCE)
+    return aligned_alloc(alignment, size);
+#elif defined(POSIXALIGNEDALLOC)
+    // posix_memalign() reports failure through its return value, and older POSIX
+    // editions leave the output pointer unspecified in that case, so translate
+    // any error into an explicit nullptr instead of trusting `mem`.
+    void* mem = nullptr;
+    return posix_memalign(&mem, alignment, size) == 0 ? mem : nullptr;
+#elif defined(_WIN32) && !defined(_M_ARM) && !defined(_M_ARM64)
+    return _mm_malloc(size, alignment);
+#elif defined(_WIN32)
+    return _aligned_malloc(size, alignment);
+#else
+    return std::aligned_alloc(alignment, size);
+#endif
+}
+// Releases a block obtained from std_aligned_alloc(), using the deallocation
+// routine that matches the platform selection made at compile time.
+void std_aligned_free(void* ptr) {
+#if !defined(POSIXALIGNEDALLOC) && defined(_WIN32)
+    #if defined(_M_ARM) || defined(_M_ARM64)
+    _aligned_free(ptr);
+    #else
+    _mm_free(ptr);
+    #endif
+#else
+    // POSIX-style and generic builds both release through plain free()
+    free(ptr);
+#endif
+}
+// aligned_large_pages_alloc() hands back suitably aligned memory and, where
+// the platform permits it, asks for that memory to be backed by large pages.
+#if defined(_WIN32)
+// Attempts a large-page VirtualAlloc() of at least allocSize bytes; yields
+// nullptr when the large page privilege cannot be used.
+static void* aligned_large_pages_alloc_windows([[maybe_unused]] size_t allocSize) {
+    const auto allocateLargePages = [&](size_t largePageSize) {
+        // Pad the request to a whole number of large pages before committing it
+        const size_t paddedSize = (allocSize + largePageSize - 1) & ~size_t(largePageSize - 1);
+        return VirtualAlloc(nullptr, paddedSize, MEM_RESERVE | MEM_COMMIT | MEM_LARGE_PAGES,
+                            PAGE_READWRITE);
+    };
+    const auto largePagesUnavailable = []() { return (void*) nullptr; };
+    return windows_try_with_large_page_priviliges(allocateLargePages, largePagesUnavailable);
+}
+void* aligned_large_pages_alloc(size_t allocSize) {
+    // Large pages are preferred ...
+    if (void* largePageMem = aligned_large_pages_alloc_windows(allocSize))
+        return largePageMem;
+    // ... otherwise settle for a regular, page-aligned allocation
+    return VirtualAlloc(nullptr, allocSize, MEM_RESERVE | MEM_COMMIT, PAGE_READWRITE);
+}
+#else
+void* aligned_large_pages_alloc(size_t allocSize) {
+    #if defined(__linux__)
+    constexpr size_t pageAlignment = 2 * 1024 * 1024;  // 2MB page size assumed
+    #else
+    constexpr size_t pageAlignment = 4096;  // small page size assumed
+    #endif
+    // Pad the request so that it covers a whole number of pages
+    const size_t pageCount  = (allocSize + pageAlignment - 1) / pageAlignment;
+    const size_t paddedSize = pageCount * pageAlignment;
+    void* const  block      = std_aligned_alloc(pageAlignment, paddedSize);
+    #if defined(MADV_HUGEPAGE)
+    // Hint to the kernel that this range should use huge pages
+    madvise(block, paddedSize, MADV_HUGEPAGE);
+    #endif
+    return block;
+}
+#endif
+// Reports whether large pages can be used. On Windows this is probed with a
+// trial large-page allocation; on Linux it depends on MADV_HUGEPAGE being
+// defined at compile time; every other platform answers false.
+bool has_large_pages() {
+#if defined(_WIN32)
+    constexpr size_t probeSize = 2 * 1024 * 1024;  // 2MB page size assumed
+    void* const      probe     = aligned_large_pages_alloc_windows(probeSize);
+    if (probe == nullptr)
+        return false;
+    // The probe succeeded: give the memory back and report availability
+    aligned_large_pages_free(probe);
+    return true;
+#elif defined(__linux__) && defined(MADV_HUGEPAGE)
+    return true;
+#else
+    return false;
+#endif
+}
+// aligned_large_pages_free() releases memory previously obtained from
+// aligned_large_pages_alloc(). On Windows a null pointer is ignored; on other
+// platforms the pointer is passed on to std_aligned_free().
+#if defined(_WIN32)
+void aligned_large_pages_free(void* mem) {
+    if (!mem)
+        return;
+    if (VirtualFree(mem, 0, MEM_RELEASE))
+        return;
+    // The release failed: report the Windows error code and abort the process
+    const DWORD err = GetLastError();
+    std::cerr << "Failed to free large page memory. Error code: 0x" << std::hex << err
+              << std::dec << std::endl;
+    exit(EXIT_FAILURE);
+}
+#else
+void aligned_large_pages_free(void* mem) {
+    std_aligned_free(mem);
+}
+#endif
+}  // namespace Stockfish

src/memory.h ADDED Viewed

	@@ -0,0 +1,333 @@

+/*
+  Stockfish, a UCI chess playing engine derived from Glaurung 2.1
+  Copyright (C) 2004-2026 The Stockfish developers (see AUTHORS file)
+  Stockfish is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+  Stockfish is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+  You should have received a copy of the GNU General Public License
+  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+#ifndef MEMORY_H_INCLUDED
+#define MEMORY_H_INCLUDED
+#include <algorithm>
+#include <cstdint>
+#include <memory>
+#include <new>
+#include <type_traits>
+#include <utility>
+#include <cstring>
+#include "types.h"
+#if defined(_WIN64)
+    #if _WIN32_WINNT < 0x0601
+        #undef _WIN32_WINNT
+        #define _WIN32_WINNT 0x0601  // Force to include needed API prototypes
+    #endif
+    #if !defined(NOMINMAX)
+        #define NOMINMAX
+    #endif
+    #include <windows.h>
+    // Some Windows headers (RPC/old headers) define short macros such
+    // as 'small' expanding to 'char', which breaks identifiers in the code.
+    // Undefine those macros immediately after including <windows.h>.
+    #ifdef small
+        #undef small
+    #endif
+    #include <psapi.h>
+extern "C" {
+using OpenProcessToken_t      = bool (*)(HANDLE, DWORD, PHANDLE);
+using LookupPrivilegeValueA_t = bool (*)(LPCSTR, LPCSTR, PLUID);
+using AdjustTokenPrivileges_t =
+  bool (*)(HANDLE, BOOL, PTOKEN_PRIVILEGES, DWORD, PTOKEN_PRIVILEGES, PDWORD);
+}
+#endif
+namespace Stockfish {
+void* std_aligned_alloc(size_t alignment, size_t size);
+void  std_aligned_free(void* ptr);
+// Memory aligned by page size, min alignment: 4096 bytes
+void* aligned_large_pages_alloc(size_t size);
+void  aligned_large_pages_free(void* mem);
+bool has_large_pages();
+// Destroys a single object that was created with placement new and then hands
+// its storage to free_func. A null pointer is ignored.
+template<typename T, typename FREE_FUNC>
+void memory_deleter(T* ptr, FREE_FUNC free_func) {
+    if (ptr != nullptr)
+    {
+        // Placement new means nobody else will run the destructor for us
+        if constexpr (!std::is_trivially_destructible_v<T>)
+            ptr->~T();
+        free_func(ptr);
+    }
+}
+// Destroys an array that was created with placement new and releases its
+// storage through free_func. The element count is read from a size_t header
+// located max(sizeof(size_t), alignof(T)) bytes in front of the first element,
+// and the pointer passed to free_func is the start of that header.
+template<typename T, typename FREE_FUNC>
+void memory_deleter_array(T* ptr, FREE_FUNC free_func) {
+    if (ptr == nullptr)
+        return;
+    // Step back from the first element to where the block really begins
+    const size_t header_bytes = std::max(sizeof(size_t), alignof(T));
+    char* const  block_start  = reinterpret_cast<char*>(ptr) - header_bytes;
+    if constexpr (!std::is_trivially_destructible_v<T>)
+    {
+        // Run the destructors back to front, i.e. last element first
+        size_t remaining = *reinterpret_cast<size_t*>(block_start);
+        while (remaining != 0)
+            ptr[--remaining].~T();
+    }
+    free_func(block_start);
+}
+// Obtains sizeof(T) bytes from alloc_func and constructs a T in that storage
+// with placement new, forwarding args to the constructor.
+template<typename T, typename ALLOC_FUNC, typename... Args>
+inline std::enable_if_t<!std::is_array_v<T>, T*> memory_allocator(ALLOC_FUNC alloc_func,
+                                                                  Args&&... args) {
+    void* storage = alloc_func(sizeof(T));
+    ASSERT_ALIGNED(storage, alignof(T));
+    T* const object = new (storage) T(std::forward<Args>(args)...);
+    return object;
+}
+// Obtains storage for `num` elements of an array of unknown bound from
+// alloc_func and value-initializes each element with placement new. The
+// element count is stored in a header of max(sizeof(size_t), alignof(element))
+// bytes in front of the elements.
+template<typename T, typename ALLOC_FUNC>
+inline std::enable_if_t<std::is_array_v<T>, std::remove_extent_t<T>*>
+memory_allocator(ALLOC_FUNC alloc_func, size_t num) {
+    using Elem = std::remove_extent_t<T>;
+    const size_t header_bytes = std::max(sizeof(size_t), alignof(Elem));
+    const size_t total_bytes  = header_bytes + num * sizeof(Elem);
+    char*        block        = reinterpret_cast<char*>(alloc_func(total_bytes));
+    ASSERT_ALIGNED(block, alignof(T));
+    // The header remembers how many elements follow it
+    new (block) size_t(num);
+    char* const first_slot = block + header_bytes;
+    char* const end_slot   = first_slot + num * sizeof(Elem);
+    for (char* slot = first_slot; slot != end_slot; slot += sizeof(Elem))
+        new (slot) Elem();
+    // Callers get the address of element 0 so that indexing through
+    // unique_ptr<T[]> works.
+    return reinterpret_cast<Elem*>(first_slot);
+}
+//
+//
+// aligned large page unique ptr
+//
+//
+// Deleter for a single object: forwards to memory_deleter() with
+// aligned_large_pages_free() as the release function.
+template<typename T>
+struct LargePageDeleter {
+    void operator()(T* ptr) const {
+        memory_deleter<T>(ptr, aligned_large_pages_free);
+    }
+};
+// Deleter for an array: forwards to memory_deleter_array() with
+// aligned_large_pages_free() as the release function.
+template<typename T>
+struct LargePageArrayDeleter {
+    void operator()(T* ptr) const {
+        memory_deleter_array<T>(ptr, aligned_large_pages_free);
+    }
+};
+// unique_ptr alias that picks LargePageDeleter for single objects and
+// LargePageArrayDeleter (on the element type) for arrays.
+template<typename T>
+using LargePagePtr =
+  std::conditional_t<!std::is_array_v<T>,
+                     std::unique_ptr<T, LargePageDeleter<T>>,
+                     std::unique_ptr<T, LargePageArrayDeleter<std::remove_extent_t<T>>>>;
+// make_unique_large_page for single objects: constructs a T from args in
+// memory obtained through aligned_large_pages_alloc() and wraps it in a
+// LargePagePtr.
+template<typename T, typename... Args>
+std::enable_if_t<!std::is_array_v<T>, LargePagePtr<T>> make_unique_large_page(Args&&... args) {
+    static_assert(alignof(T) <= 4096,
+                  "aligned_large_pages_alloc() may fail for such a big alignment requirement of T");
+    return LargePagePtr<T>(
+      memory_allocator<T>(aligned_large_pages_alloc, std::forward<Args>(args)...));
+}
+// make_unique_large_page for arrays of unknown bound: builds `num` elements in
+// memory obtained through aligned_large_pages_alloc() and wraps the array in a
+// LargePagePtr.
+template<typename T>
+std::enable_if_t<std::is_array_v<T>, LargePagePtr<T>> make_unique_large_page(size_t num) {
+    using Elem = std::remove_extent_t<T>;
+    static_assert(alignof(Elem) <= 4096,
+                  "aligned_large_pages_alloc() may fail for such a big alignment requirement of T");
+    Elem* const elements = memory_allocator<T>(aligned_large_pages_alloc, num);
+    return LargePagePtr<T>(elements);
+}
+//
+//
+// aligned unique ptr
+//
+//
+// Deleter for a single object: forwards to memory_deleter() with
+// std_aligned_free() as the release function.
+template<typename T>
+struct AlignedDeleter {
+    void operator()(T* ptr) const {
+        memory_deleter<T>(ptr, std_aligned_free);
+    }
+};
+// Deleter for an array: forwards to memory_deleter_array() with
+// std_aligned_free() as the release function.
+template<typename T>
+struct AlignedArrayDeleter {
+    void operator()(T* ptr) const {
+        memory_deleter_array<T>(ptr, std_aligned_free);
+    }
+};
+// unique_ptr alias that picks AlignedDeleter for single objects and
+// AlignedArrayDeleter (on the element type) for arrays.
+template<typename T>
+using AlignedPtr =
+  std::conditional_t<!std::is_array_v<T>,
+                     std::unique_ptr<T, AlignedDeleter<T>>,
+                     std::unique_ptr<T, AlignedArrayDeleter<std::remove_extent_t<T>>>>;
+// make_unique_aligned for single objects: constructs a T from args in storage
+// obtained from std_aligned_alloc() at alignof(T) and wraps it in an AlignedPtr.
+template<typename T, typename... Args>
+std::enable_if_t<!std::is_array_v<T>, AlignedPtr<T>> make_unique_aligned(Args&&... args) {
+    return AlignedPtr<T>(
+      memory_allocator<T>([](size_t bytes) { return std_aligned_alloc(alignof(T), bytes); },
+                          std::forward<Args>(args)...));
+}
+// make_unique_aligned for arrays of unknown bound: builds `num` elements in
+// storage obtained from std_aligned_alloc() at the element type's alignment
+// and wraps the array in an AlignedPtr.
+template<typename T>
+std::enable_if_t<std::is_array_v<T>, AlignedPtr<T>> make_unique_aligned(size_t num) {
+    using Elem = std::remove_extent_t<T>;
+    Elem* const elements = memory_allocator<T>(
+      [](size_t bytes) { return std_aligned_alloc(alignof(Elem), bytes); }, num);
+    return AlignedPtr<T>(elements);
+}
+// Returns the first address at or after ptr that is a multiple of Alignment.
+// ptr must point into a buffer of at least `sizeof(T) * N + alignment` bytes,
+// where N is the number of elements that will be used from the result.
+template<uintptr_t Alignment, typename T>
+T* align_ptr_up(T* ptr) {
+    static_assert(alignof(T) < Alignment);
+    const auto address = reinterpret_cast<uintptr_t>(reinterpret_cast<char*>(ptr));
+    // Overshoot by Alignment - 1, then drop the remainder to land on a boundary
+    const uintptr_t bumped  = address + (Alignment - 1);
+    const uintptr_t aligned = bumped - bumped % Alignment;
+    return reinterpret_cast<T*>(reinterpret_cast<char*>(aligned));
+}
+#if defined(_WIN32)
+// Runs fyes(largePageSize) with SeLockMemoryPrivilege temporarily enabled for
+// the current process and returns its result. Whenever large pages are not
+// available, a required API cannot be resolved, or the privilege cannot be
+// obtained, the result of fno() is returned instead. On builds without _WIN64
+// fno() is always used.
+template<typename FuncYesT, typename FuncNoT>
+auto windows_try_with_large_page_priviliges([[maybe_unused]] FuncYesT&& fyes, FuncNoT&& fno) {
+    #if !defined(_WIN64)
+    return fno();
+    #else
+    HANDLE hProcessToken{};
+    LUID   luid{};
+    const size_t largePageSize = GetLargePageMinimum();
+    if (!largePageSize)
+        return fno();
+    // Dynamically link OpenProcessToken, LookupPrivilegeValue and AdjustTokenPrivileges
+    HMODULE hAdvapi32 = GetModuleHandle(TEXT("advapi32.dll"));
+    if (!hAdvapi32)
+        hAdvapi32 = LoadLibrary(TEXT("advapi32.dll"));
+    // Without the module there is nothing to resolve the functions from
+    if (!hAdvapi32)
+        return fno();
+    auto OpenProcessToken_f =
+      OpenProcessToken_t((void (*)()) GetProcAddress(hAdvapi32, "OpenProcessToken"));
+    if (!OpenProcessToken_f)
+        return fno();
+    auto LookupPrivilegeValueA_f =
+      LookupPrivilegeValueA_t((void (*)()) GetProcAddress(hAdvapi32, "LookupPrivilegeValueA"));
+    if (!LookupPrivilegeValueA_f)
+        return fno();
+    auto AdjustTokenPrivileges_f =
+      AdjustTokenPrivileges_t((void (*)()) GetProcAddress(hAdvapi32, "AdjustTokenPrivileges"));
+    if (!AdjustTokenPrivileges_f)
+        return fno();
+    // We need SeLockMemoryPrivilege, so try to enable it for the process
+    if (!OpenProcessToken_f(  // OpenProcessToken()
+          GetCurrentProcess(), TOKEN_ADJUST_PRIVILEGES | TOKEN_QUERY, &hProcessToken))
+        return fno();
+    // From here on the token handle is open, so every exit path must close it
+    if (!LookupPrivilegeValueA_f(nullptr, "SeLockMemoryPrivilege", &luid))
+    {
+        CloseHandle(hProcessToken);
+        return fno();
+    }
+    TOKEN_PRIVILEGES tp{};
+    TOKEN_PRIVILEGES prevTp{};
+    DWORD            prevTpLen = 0;
+    tp.PrivilegeCount           = 1;
+    tp.Privileges[0].Luid       = luid;
+    tp.Privileges[0].Attributes = SE_PRIVILEGE_ENABLED;
+    // Try to enable SeLockMemoryPrivilege. Note that even if AdjustTokenPrivileges()
+    // succeeds, we still need to query GetLastError() to ensure that the privileges
+    // were actually obtained.
+    if (!AdjustTokenPrivileges_f(hProcessToken, FALSE, &tp, sizeof(TOKEN_PRIVILEGES), &prevTp,
+                                 &prevTpLen)
+        || GetLastError() != ERROR_SUCCESS)
+    {
+        CloseHandle(hProcessToken);
+        return fno();
+    }
+    auto&& ret = fyes(largePageSize);
+    // Privilege no longer needed, restore previous state
+    AdjustTokenPrivileges_f(hProcessToken, FALSE, &prevTp, 0, nullptr, nullptr);
+    CloseHandle(hProcessToken);
+    return std::forward<decltype(ret)>(ret);
+    #endif
+}
+#endif
+// Reads an object of type T from a byte buffer by copying sizeof(T) bytes out of it,
+// so the buffer does not need to be aligned for T. ByteT must be a one-byte type.
+template<typename T, typename ByteT>
+T load_as(const ByteT* buffer) {
+    static_assert(sizeof(ByteT) == 1);
+    static_assert(std::is_trivially_copyable<T>::value, "Type must be trivially copyable");
+    T result;
+    std::memcpy(&result, buffer, sizeof(result));
+    return result;
+}
+}  // namespace Stockfish
+#endif  // #ifndef MEMORY_H_INCLUDED

src/misc.cpp ADDED Viewed

	@@ -0,0 +1,549 @@

+/*
+  Stockfish, a UCI chess playing engine derived from Glaurung 2.1
+  Copyright (C) 2004-2026 The Stockfish developers (see AUTHORS file)
+  Stockfish is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+  Stockfish is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+  You should have received a copy of the GNU General Public License
+  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+#include "misc.h"
+#include <array>
+#include <atomic>
+#include <cassert>
+#include <cctype>
+#include <cmath>
+#include <cstdlib>
+#include <fstream>
+#include <iomanip>
+#include <iostream>
+#include <iterator>
+#include <limits>
+#include <mutex>
+#include <sstream>
+#include <string_view>
+#include "types.h"
+namespace Stockfish {
+namespace {
+// Version number, or "dev" for development builds (engine_version_info() appends date and git details for "dev").
+constexpr std::string_view version = "dev";
+// Logging facility: a stream buffer that forwards every character to a primary
+// buffer and mirrors it into a log buffer. Logger installs one Tie in place of
+// cin.rdbuf() and one in place of cout.rdbuf(), so logging can be toggled at
+// runtime while the usual I/O keeps working, without touching any calling code.
+// Idea from http://groups.google.com/group/comp.lang.c++/msg/1d941c0f26ea0d81
+struct Tie: public std::streambuf {  // MSVC requires split streambuf for cin and cout
+    Tie(std::streambuf* primary, std::streambuf* mirror) :
+        buf(primary),
+        logBuf(mirror) {}
+    // Flushes the log buffer, then reports the sync status of the primary buffer
+    int sync() override {
+        logBuf->pubsync();
+        return buf->pubsync();
+    }
+    // Output path: write to the primary buffer and log what it returned
+    int overflow(int c) override {
+        const int written = buf->sputc(char(c));
+        return log(written, "<< ");
+    }
+    // Peeking at input is not logged; only consumed characters are
+    int underflow() override { return buf->sgetc(); }
+    int uflow() override {
+        const int consumed = buf->sbumpc();
+        return log(consumed, ">> ");
+    }
+    std::streambuf *buf, *logBuf;
+    // Mirrors one character into the log, writing the 3-character direction prefix
+    // first whenever the previously logged character ended a line.
+    int log(int c, const char* prefix) {
+        static int last = '\n';  // Single log file
+        if (last == '\n')
+            logBuf->sputn(prefix, 3);
+        last = logBuf->sputc(char(c));
+        return last;
+    }
+};
+// Owns the debug log file and the two Tie buffers that mirror std::cin and
+// std::cout into it. The only instance is the function-local static in start().
+class Logger {
+    Logger() :
+        in(std::cin.rdbuf(), file.rdbuf()),
+        out(std::cout.rdbuf(), file.rdbuf()) {}
+    // An empty file name only stops logging
+    ~Logger() { start(""); }
+    std::ofstream file;
+    Tie           in, out;
+   public:
+    // Stops any logging in progress, then, if fname is not empty, starts logging
+    // to that file. Exits the process when the file cannot be opened.
+    static void start(const std::string& fname) {
+        static Logger logger;
+        if (logger.file.is_open())
+        {
+            // Put the original stream buffers back before closing the file
+            std::cout.rdbuf(logger.out.buf);
+            std::cin.rdbuf(logger.in.buf);
+            logger.file.close();
+        }
+        if (fname.empty())
+            return;
+        logger.file.open(fname, std::ifstream::out);
+        if (!logger.file.is_open())
+        {
+            std::cerr << "Unable to open debug log file " << fname << std::endl;
+            exit(EXIT_FAILURE);
+        }
+        std::cin.rdbuf(&logger.in);
+        std::cout.rdbuf(&logger.out);
+    }
+};
+}  // namespace
+// Builds the full name of the current Stockfish version.
+//
+// Dev builds get a date and a commit identifier appended. The date is GIT_DATE
+// when defined, otherwise the compilation date; the identifier is GIT_SHA when
+// defined, otherwise "nogit":
+//      Stockfish dev-YYYYMMDD-SHA
+//      or
+//      Stockfish dev-YYYYMMDD-nogit
+//
+// Releases (non-dev builds) only carry the version number:
+//      Stockfish version
+std::string engine_version_info() {
+    std::stringstream info;
+    info << "Stockfish " << version << std::setfill('0');
+    if constexpr (version == "dev")
+    {
+        info << "-";
+#ifdef GIT_DATE
+        info << stringify(GIT_DATE);
+#else
+        constexpr std::string_view monthNames("Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec");
+        std::string       monthName, dayOfMonth, yearText;
+        std::stringstream buildDate(__DATE__);  // From compiler, format is "Sep 21 2008"
+        buildDate >> monthName >> dayOfMonth >> yearText;
+        // Each month name occupies 4 characters of monthNames, hence the division
+        const auto monthNumber = 1 + monthNames.find(monthName) / 4;
+        info << yearText;
+        info << std::setw(2) << std::setfill('0') << monthNumber;
+        info << std::setw(2) << std::setfill('0') << dayOfMonth;
+#endif
+        info << "-";
+#ifdef GIT_SHA
+        info << stringify(GIT_SHA);
+#else
+        info << "nogit";
+#endif
+    }
+    return info.str();
+}
+// Returns the version string followed by the author credit. With to_uci set the
+// credit goes on its own "id author" line, otherwise it is joined with " by ".
+std::string engine_info(bool to_uci) {
+    const char* separator = to_uci ? "\nid author " : " by ";
+    std::string info      = engine_version_info();
+    info += separator;
+    info += "the Stockfish developers (see AUTHORS file)";
+    return info;
+}
+// Returns a multi-line string describing the compiler, target system, architecture, build settings and __VERSION__ macro
+std::string compiler_info() {
+#define make_version_string(major, minor, patch) \
+    stringify(major) "." stringify(minor) "." stringify(patch)
+    // Predefined macros hell:
+    //
+    // __GNUC__                Compiler is GCC, Clang or ICX
+    // __clang__               Compiler is Clang or ICX
+    // __INTEL_LLVM_COMPILER   Compiler is ICX
+    // _MSC_VER                Compiler is MSVC
+    // _WIN32                  Building on Windows (any)
+    // _WIN64                  Building on Windows 64 bit
+    std::string compiler = "\nCompiled by                : ";  // First line: compiler name and version
+#if defined(__INTEL_LLVM_COMPILER)
+    compiler += "ICX ";
+    compiler += stringify(__INTEL_LLVM_COMPILER);
+#elif defined(__clang__)
+    compiler += "clang++ ";
+    compiler += make_version_string(__clang_major__, __clang_minor__, __clang_patchlevel__);
+#elif _MSC_VER
+    compiler += "MSVC ";
+    compiler += "(version ";
+    compiler += stringify(_MSC_FULL_VER) "." stringify(_MSC_BUILD);
+    compiler += ")";
+#elif defined(__e2k__) && defined(__LCC__)
+    #define dot_ver2(n) \
+        compiler += char('.'); \
+        compiler += char('0' + (n) / 10); \
+        compiler += char('0' + (n) % 10);
+    compiler += "MCST LCC ";
+    compiler += "(version ";
+    compiler += std::to_string(__LCC__ / 100);
+    dot_ver2(__LCC__ % 100) dot_ver2(__LCC_MINOR__) compiler += ")";
+#elif __GNUC__
+    compiler += "g++ (GNUC) ";
+    compiler += make_version_string(__GNUC__, __GNUC_MINOR__, __GNUC_PATCHLEVEL__);
+#else
+    compiler += "Unknown compiler ";
+    compiler += "(unknown version)";
+#endif
+#if defined(__APPLE__)  // Still on the first line: the platform the binary was built for
+    compiler += " on Apple";
+#elif defined(__CYGWIN__)
+    compiler += " on Cygwin";
+#elif defined(__MINGW64__)
+    compiler += " on MinGW64";
+#elif defined(__MINGW32__)
+    compiler += " on MinGW32";
+#elif defined(__ANDROID__)
+    compiler += " on Android";
+#elif defined(__linux__)
+    compiler += " on Linux";
+#elif defined(_WIN64)
+    compiler += " on Microsoft Windows 64-bit";
+#elif defined(_WIN32)
+    compiler += " on Microsoft Windows 32-bit";
+#else
+    compiler += " on unknown system";
+#endif
+    compiler += "\nCompilation architecture   : ";
+#if defined(ARCH)
+    compiler += stringify(ARCH);
+#else
+    compiler += "(undefined architecture)";
+#endif
+    compiler += "\nCompilation settings       : ";
+    compiler += (Is64Bit ? "64bit" : "32bit");
+#if defined(USE_AVX512ICL)  // One token is appended for every build option that is enabled
+    compiler += " AVX512ICL";
+#endif
+#if defined(USE_VNNI)
+    compiler += " VNNI";
+#endif
+#if defined(USE_AVX512)
+    compiler += " AVX512";
+#endif
+    compiler += (HasPext ? " BMI2" : "");
+#if defined(USE_AVX2)
+    compiler += " AVX2";
+#endif
+#if defined(USE_SSE41)
+    compiler += " SSE41";
+#endif
+#if defined(USE_SSSE3)
+    compiler += " SSSE3";
+#endif
+#if defined(USE_SSE2)
+    compiler += " SSE2";
+#endif
+#if defined(USE_NEON_DOTPROD)
+    compiler += " NEON_DOTPROD";
+#elif defined(USE_NEON)
+    compiler += " NEON";
+#endif
+    compiler += (HasPopCnt ? " POPCNT" : "");
+#if !defined(NDEBUG)  // NDEBUG is not defined, so the build is reported as a debug build
+    compiler += " DEBUG";
+#endif
+    compiler += "\nCompiler __VERSION__ macro : ";
+#ifdef __VERSION__
+    compiler += __VERSION__;
+#else
+    compiler += "(undefined macro)";
+#endif
+    compiler += "\n";
+    return compiler;
+}
+// Debug functions used mainly to collect run-time statistics; each kind of statistic has MaxDebugSlots slots
+constexpr int MaxDebugSlots = 32;
+namespace {
+// A group of N atomic 64-bit counters that together make up one debug statistic.
+template<size_t N>
+struct DebugInfo {
+    std::array<std::atomic<int64_t>, N> data = {0};
+    // Access to a single counter, bounds-checked by assert
+    [[nodiscard]] constexpr std::atomic<int64_t>& operator[](size_t index) {
+        assert(index < N);
+        return data[index];
+    }
+    // std::atomic is not copy-assignable, so copy counter by counter via load/store
+    constexpr DebugInfo& operator=(const DebugInfo& other) {
+        size_t target = 0;
+        for (const auto& counter : other.data)
+        {
+            data[target].store(counter.load());
+            ++target;
+        }
+        return *this;
+    }
+};
+// Counters for dbg_extremes_of(): [0] is left at its default, [1] holds the maximum
+// seen and [2] the minimum seen. They start at the opposite limits so that the first
+// recorded value replaces both.
+struct DebugExtremes: public DebugInfo<3> {
+    DebugExtremes() {
+        using Limits = std::numeric_limits<int64_t>;
+        data[1]      = Limits::min();
+        data[2]      = Limits::max();
+    }
+};
+std::array<DebugInfo<2>, MaxDebugSlots>  hit;       // [0] calls, [1] calls where cond was true
+std::array<DebugInfo<2>, MaxDebugSlots>  mean;      // [0] calls, [1] sum of values
+std::array<DebugInfo<3>, MaxDebugSlots>  stdev;     // [0] calls, [1] sum of values, [2] sum of squares
+std::array<DebugInfo<6>, MaxDebugSlots>  correl;    // [0] calls, [1]/[2] sum and sum of squares of value1, [3]/[4] same for value2, [5] sum of products
+std::array<DebugExtremes, MaxDebugSlots> extremes;  // [0] calls, [1] maximum seen, [2] minimum seen
+}  // namespace
+// Counts one call in the given slot, and one hit when cond is true.
+void dbg_hit_on(bool cond, int slot) {
+    auto& counters = hit.at(slot);
+    ++counters[0];
+    if (!cond)
+        return;
+    ++counters[1];
+}
+// Adds one sample to the running count and sum of the given slot.
+void dbg_mean_of(int64_t value, int slot) {
+    auto& sums = mean.at(slot);
+    ++sums[0];
+    sums[1] += value;
+}
+// Adds one sample to the count, sum and sum of squares of the given slot.
+void dbg_stdev_of(int64_t value, int slot) {
+    auto& sums = stdev.at(slot);
+    ++sums[0];
+    sums[1] += value;
+    sums[2] += value * value;
+}
+// Counts one sample in the given slot and raises the stored maximum / lowers the
+// stored minimum to value when value lies beyond them.
+void dbg_extremes_of(int64_t value, int slot) {
+    ++extremes.at(slot)[0];
+    // compare_exchange_weak refreshes the expected value on failure, so each loop
+    // retries until the store succeeds or the stored extreme already covers value.
+    auto&   maxSeen    = extremes.at(slot)[1];
+    int64_t currentMax = maxSeen.load();
+    while (currentMax < value)
+        if (maxSeen.compare_exchange_weak(currentMax, value))
+            break;
+    auto&   minSeen    = extremes.at(slot)[2];
+    int64_t currentMin = minSeen.load();
+    while (currentMin > value)
+        if (minSeen.compare_exchange_weak(currentMin, value))
+            break;
+}
+// Adds one (value1, value2) pair to the sums from which dbg_print() derives the
+// correlation coefficient of the given slot.
+void dbg_correl_of(int64_t value1, int64_t value2, int slot) {
+    auto& sums = correl.at(slot);
+    ++sums[0];
+    sums[1] += value1;
+    sums[2] += value1 * value1;
+    sums[3] += value2;
+    sums[4] += value2 * value2;
+    sums[5] += value1 * value2;
+}
+// Writes every debug slot that has recorded at least one sample to std::cerr,
+// one line per slot, grouped by kind of statistic.
+void dbg_print() {
+    int64_t total;
+    // Average of an accumulated sum over the sample count of the slot being printed
+    auto avg    = [&total](int64_t x) { return double(x) / total; };
+    auto square = [](double x) { return x * x; };
+    for (int i = 0; i < MaxDebugSlots; ++i)
+    {
+        total = hit[i][0];
+        if (!total)
+            continue;
+        std::cerr << "Hit #" << i << ": Total " << total << " Hits " << hit[i][1]
+                  << " Hit Rate (%) " << 100.0 * avg(hit[i][1]) << std::endl;
+    }
+    for (int i = 0; i < MaxDebugSlots; ++i)
+    {
+        total = mean[i][0];
+        if (!total)
+            continue;
+        std::cerr << "Mean #" << i << ": Total " << total << " Mean " << avg(mean[i][1])
+                  << std::endl;
+    }
+    for (int i = 0; i < MaxDebugSlots; ++i)
+    {
+        total = stdev[i][0];
+        if (!total)
+            continue;
+        // sqrt(E[x^2] - E[x]^2)
+        const double meanOfSquares = avg(stdev[i][2]);
+        const double r             = sqrt(meanOfSquares - square(avg(stdev[i][1])));
+        std::cerr << "Stdev #" << i << ": Total " << total << " Stdev " << r << std::endl;
+    }
+    for (int i = 0; i < MaxDebugSlots; ++i)
+    {
+        total = extremes[i][0];
+        if (!total)
+            continue;
+        std::cerr << "Extremity #" << i << ": Total " << total << " Min " << extremes[i][2]
+                  << " Max " << extremes[i][1] << std::endl;
+    }
+    for (int i = 0; i < MaxDebugSlots; ++i)
+    {
+        total = correl[i][0];
+        if (!total)
+            continue;
+        // Covariance divided by the product of both standard deviations
+        const double covariance = avg(correl[i][5]) - avg(correl[i][1]) * avg(correl[i][3]);
+        const double spread1    = sqrt(avg(correl[i][2]) - square(avg(correl[i][1])));
+        const double spread2    = sqrt(avg(correl[i][4]) - square(avg(correl[i][3])));
+        const double r          = covariance / (spread1 * spread2);
+        std::cerr << "Correl. #" << i << ": Total " << total << " Coefficient " << r
+                  << std::endl;
+    }
+}
+// Resets every debug slot of every statistic to a default-constructed entry.
+void dbg_clear() {
+    const auto reset = [](auto& slots) { slots.fill({}); };
+    reset(hit);
+    reset(mean);
+    reset(stdev);
+    reset(correl);
+    reset(extremes);
+}
+// Stream manipulator that serializes access to std::cout so that several threads
+// do not write at the same time: IO_LOCK locks a mutex shared by all callers and
+// IO_UNLOCK unlocks it. Nothing is written to the stream itself.
+std::ostream& operator<<(std::ostream& os, SyncCout sc) {
+    static std::mutex ioMutex;
+    switch (sc)
+    {
+    case IO_LOCK :
+        ioMutex.lock();
+        break;
+    case IO_UNLOCK :
+        ioMutex.unlock();
+        break;
+    }
+    return os;
+}
+// Takes the std::cout lock by streaming IO_LOCK into std::cout.
+void sync_cout_start() {
+    std::cout << IO_LOCK;
+}
+// Releases the std::cout lock by streaming IO_UNLOCK into std::cout.
+void sync_cout_end() {
+    std::cout << IO_UNLOCK;
+}
+// Hash function based on public domain MurmurHash64A, by Austin Appleby.
+// Hashes `size` bytes starting at `data` (seed 0). Whole 8-byte words are read in
+// the machine's native byte order; the remaining 1-7 bytes are folded in with the
+// first remaining byte in the lowest position.
+uint64_t hash_bytes(const char* data, size_t size) {
+    const uint64_t m = 0xc6a4a7935bd1e995ull;
+    const int      r = 47;
+    uint64_t h = size * m;
+    // End of the part that consists of whole 8-byte words
+    const char* end = data + (size & ~(size_t) 7);
+    for (const char* p = data; p != end; p += 8)
+    {
+        uint64_t k;
+        std::memcpy(&k, p, sizeof(k));
+        k *= m;
+        k ^= k >> r;
+        k *= m;
+        h ^= k;
+        h *= m;
+    }
+    if (size & 7)
+    {
+        uint64_t k = 0;
+        // Go through unsigned char: where plain char is signed, a byte >= 0x80 would
+        // otherwise sign-extend to 64 bits and overwrite every other tail byte in k.
+        for (int i = (size & 7) - 1; i >= 0; i--)
+            k = (k << 8) | uint64_t(static_cast<unsigned char>(end[i]));
+        h ^= k;
+        h *= m;
+    }
+    h ^= h >> r;
+    h *= m;
+    h ^= h >> r;
+    return h;
+}
+// Trampoline that forwards to Logger::start(), so Logger does not have to move to misc.h
+void start_logger(const std::string& fname) {
+    Logger::start(fname);
+}
+#ifdef _WIN32
+    #include <direct.h>
+    #define GETCWD _getcwd
+#else
+    #include <unistd.h>
+    #define GETCWD getcwd
+#endif
+// Parses s with std::stoull and returns the value as size_t. The process exits
+// with EXIT_FAILURE when the value does not fit into size_t; parse errors surface
+// as the exceptions thrown by std::stoull.
+size_t str_to_size_t(const std::string& s) {
+    const unsigned long long parsed = std::stoull(s);
+    if (parsed <= std::numeric_limits<size_t>::max())
+        return static_cast<size_t>(parsed);
+    std::exit(EXIT_FAILURE);
+}
+// Opens path in binary mode and returns its entire content, or std::nullopt when
+// the stream could not be opened.
+std::optional<std::string> read_file_to_string(const std::string& path) {
+    std::ifstream input(path, std::ios_base::binary);
+    if (input)
+    {
+        std::istreambuf_iterator<char> first(input), last;
+        return std::string(first, last);
+    }
+    return std::nullopt;
+}
+// Erases every character of s that std::isspace classifies as whitespace.
+void remove_whitespace(std::string& s) {
+    // std::isspace requires a value representable as unsigned char (or EOF); passing
+    // a negative plain char is undefined behaviour, so take the character as unsigned char.
+    const auto isSpace = [](unsigned char c) { return std::isspace(c) != 0; };
+    s.erase(std::remove_if(s.begin(), s.end(), isSpace), s.end());
+}
+// Returns true when every character of s is whitespace according to std::isspace;
+// an empty view therefore yields true.
+bool is_whitespace(std::string_view s) {
+    // std::isspace requires a value representable as unsigned char (or EOF); passing
+    // a negative plain char is undefined behaviour, so take the character as unsigned char.
+    return std::all_of(s.begin(), s.end(), [](unsigned char c) { return std::isspace(c) != 0; });
+}
+// Derives the directory of the executable from argv0. The result keeps its trailing
+// separator; a leading "." followed by the path separator is replaced by the working
+// directory.
+std::string CommandLine::get_binary_directory(std::string argv0) {
+#ifdef _WIN32
+    const std::string pathSeparator = "\\";
+    #ifdef _MSC_VER
+    // Under windows argv[0] may not have the extension. Also _get_pgmptr() had
+    // issues in some Windows 10 versions, so check returned values carefully.
+    char* pgmptr = nullptr;
+    if (!_get_pgmptr(&pgmptr) && pgmptr != nullptr && *pgmptr)
+        argv0 = pgmptr;
+    #endif
+#else
+    const std::string pathSeparator = "/";
+#endif
+    const std::string workingDirectory = CommandLine::get_working_directory();
+    const std::string currentDirPrefix = "." + pathSeparator;
+    // Keep everything up to and including the last separator of argv0; a bare
+    // program name stands for the current directory.
+    std::string  binaryDirectory = argv0;
+    const size_t lastSeparator   = binaryDirectory.find_last_of("\\/");
+    if (lastSeparator != std::string::npos)
+        binaryDirectory.resize(lastSeparator + 1);
+    else
+        binaryDirectory = currentDirPrefix;
+    // Pattern replacement: "./" at the start of path is replaced by the working directory
+    if (binaryDirectory.find(currentDirPrefix) == 0)
+        binaryDirectory.replace(0, 1, workingDirectory);
+    return binaryDirectory;
+}
+// Returns the current working directory as reported by GETCWD, or an empty string
+// when the call fails.
+std::string CommandLine::get_working_directory() {
+    char buffer[40000];
+    if (const char* cwd = GETCWD(buffer, 40000))
+        return cwd;
+    return "";
+}
+}  // namespace Stockfish

src/misc.h ADDED Viewed

	@@ -0,0 +1,538 @@

+/*
+  Stockfish, a UCI chess playing engine derived from Glaurung 2.1
+  Copyright (C) 2004-2026 The Stockfish developers (see AUTHORS file)
+  Stockfish is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+  Stockfish is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+  You should have received a copy of the GNU General Public License
+  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+#ifndef MISC_H_INCLUDED
+#define MISC_H_INCLUDED
+#include <algorithm>
+#include <array>
+#include <cassert>
+#include <chrono>
+#include <cstdint>
+#include <cstdio>
+#include <exception>  // IWYU pragma: keep
+// IWYU pragma: no_include <__exception/terminate.h>
+#include <functional>
+#include <iosfwd>
+#include <optional>
+#include <cstring>
+#include <memory>
+#include <string>
+#include <string_view>
+#include <type_traits>
+#include <vector>
+#if !defined(NO_PREFETCH) && (defined(_MSC_VER) || defined(__INTEL_COMPILER))
+    #include <immintrin.h>
+#endif
+#define stringify2(x) #x
+#define stringify(x) stringify2(x)
+namespace Stockfish {
+std::string engine_version_info();
+std::string engine_info(bool to_uci = false);
+std::string compiler_info();
+// Prefetch hint enums for explicit call-site control.
+// The numeric values are spelled out because the generic prefetch() passes them
+// straight to __builtin_prefetch as its read/write argument.
+enum class PrefetchRw {
+    READ  = 0,
+    WRITE = 1
+};
+// NOTE: PrefetchLoc selects locality / cache level, not whether a prefetch is
+//       issued. In particular, PrefetchLoc::NONE maps to a non-temporal /
+//       lowest-locality prefetch (Intel: _MM_HINT_NTA, GCC/Clang: locality = 0)
+//       and therefore still performs a prefetch. To completely disable
+//       prefetching, define NO_PREFETCH so that prefetch() becomes a no-op.
+//       The numeric values are spelled out because the generic prefetch() passes
+//       them straight to __builtin_prefetch as its locality argument.
+enum class PrefetchLoc {
+    NONE     = 0,  // Non-temporal / no cache locality (still issues a prefetch)
+    LOW      = 1,  // Low locality (e.g. T2 / L2)
+    MODERATE = 2,  // Moderate locality (e.g. T1 / L1)
+    HIGH     = 3   // High locality (e.g. T0 / closest cache)
+};
+// Preloads the given address into cache. This is a non-blocking
+// function that doesn't stall the CPU waiting for data to be loaded from memory,
+// which can be quite slow. RW and LOC are compile-time hints that default to a read with the highest locality.
+#ifdef NO_PREFETCH  // Prefetching disabled: prefetch() is an empty function
+template<PrefetchRw RW = PrefetchRw::READ, PrefetchLoc LOC = PrefetchLoc::HIGH>
+void prefetch(const void*) {}
+#elif defined(_MSC_VER) || defined(__INTEL_COMPILER)  // Implemented with the _mm_prefetch intrinsic
+constexpr int get_intel_hint(PrefetchRw rw, PrefetchLoc loc) {  // Maps the two hint enums to one _MM_HINT_* constant
+    if (rw == PrefetchRw::WRITE)  // Write requests do not look at loc
+    {
+    #ifdef _MM_HINT_ET0
+        return _MM_HINT_ET0;
+    #else
+        // Fallback when write-prefetch hint is not available: use T0
+        return _MM_HINT_T0;
+    #endif
+    }
+    switch (loc)
+    {
+    case PrefetchLoc::NONE :
+        return _MM_HINT_NTA;
+    case PrefetchLoc::LOW :
+        return _MM_HINT_T2;
+    case PrefetchLoc::MODERATE :
+        return _MM_HINT_T1;
+    case PrefetchLoc::HIGH :
+        return _MM_HINT_T0;
+    default :
+        return _MM_HINT_T0;
+    }
+}
+template<PrefetchRw RW = PrefetchRw::READ, PrefetchLoc LOC = PrefetchLoc::HIGH>
+void prefetch(const void* addr) {
+    _mm_prefetch(static_cast<const char*>(addr), get_intel_hint(RW, LOC));
+}
+#else  // Remaining compilers: use __builtin_prefetch
+template<PrefetchRw RW = PrefetchRw::READ, PrefetchLoc LOC = PrefetchLoc::HIGH>
+void prefetch(const void* addr) {
+    __builtin_prefetch(addr, static_cast<int>(RW), static_cast<int>(LOC));  // The enum values are passed through unchanged
+}
+#endif
+void start_logger(const std::string& fname);
+size_t str_to_size_t(const std::string& s);
+#if defined(__linux__)
+// Function object that closes a FILE* with pclose(); a null pointer is ignored.
+struct PipeDeleter {
+    void operator()(FILE* file) const {
+        if (file == nullptr)
+            return;
+        pclose(file);
+    }
+};
+#endif
+// Reads the file as bytes.
+// Returns std::nullopt if the file does not exist.
+std::optional<std::string> read_file_to_string(const std::string& path);
+void dbg_hit_on(bool cond, int slot = 0);
+void dbg_mean_of(int64_t value, int slot = 0);
+void dbg_stdev_of(int64_t value, int slot = 0);
+void dbg_extremes_of(int64_t value, int slot = 0);
+void dbg_correl_of(int64_t value1, int64_t value2, int slot = 0);
+void dbg_print();
+void dbg_clear();
+using TimePoint = std::chrono::milliseconds::rep;  // A value in milliseconds
+static_assert(sizeof(TimePoint) == sizeof(int64_t), "TimePoint should be 64 bits");
+// Returns the current reading of std::chrono::steady_clock, as whole milliseconds
+// since that clock's epoch.
+inline TimePoint now() {
+    using namespace std::chrono;
+    const auto sinceEpoch = steady_clock::now().time_since_epoch();
+    return duration_cast<milliseconds>(sinceEpoch).count();
+}
+// Splits s at every occurrence of delimiter and returns the pieces as views into s.
+// Adjacent delimiters and a trailing delimiter produce empty pieces. An empty s
+// yields an empty vector; an empty delimiter yields s as the only piece.
+inline std::vector<std::string_view> split(std::string_view s, std::string_view delimiter) {
+    std::vector<std::string_view> res;
+    if (s.empty())
+        return res;
+    // An empty delimiter is found at every position without ever advancing the
+    // search, which would loop forever; treat it as "nothing to split on".
+    if (delimiter.empty())
+    {
+        res.emplace_back(s);
+        return res;
+    }
+    size_t begin = 0;
+    for (size_t end = s.find(delimiter, begin); end != std::string_view::npos;
+         end        = s.find(delimiter, begin))
+    {
+        res.emplace_back(s.substr(begin, end - begin));
+        begin = end + delimiter.size();
+    }
+    // Whatever follows the last delimiter (possibly nothing) is the final piece
+    res.emplace_back(s.substr(begin));
+    return res;
+}
+void remove_whitespace(std::string& s);
+bool is_whitespace(std::string_view s);
+// Tokens streamed into an std::ostream to take (IO_LOCK) or release (IO_UNLOCK)
+// the output lock; see the sync_cout / sync_endl macros.
+enum SyncCout {
+    IO_LOCK   = 0,
+    IO_UNLOCK = 1
+};
+std::ostream& operator<<(std::ostream&, SyncCout);
+#define sync_cout std::cout << IO_LOCK
+#define sync_endl std::endl << IO_UNLOCK
+void sync_cout_start();
+void sync_cout_end();
+// True if and only if the binary is compiled on a little-endian machine (checked by reading the first byte of Le)
+static inline const std::uint16_t Le             = 1;  // 16-bit probe value whose first byte in memory is inspected below
+static inline const bool          IsLittleEndian = *reinterpret_cast<const char*>(&Le) == 1;
+// Fixed-capacity list with room for MaxSize elements stored inline. Elements can
+// only be appended; the capacity is enforced by assert.
+template<typename T, std::size_t MaxSize>
+class ValueList {
+   public:
+    std::size_t size() const { return size_; }
+    int         ssize() const { return static_cast<int>(size_); }
+    void        push_back(const T& value) {
+        assert(size_ < MaxSize);
+        values_[size_] = value;
+        ++size_;
+    }
+    const T* begin() const { return values_; }
+    const T* end() const { return begin() + size_; }
+    const T& operator[](int index) const { return values_[index]; }
+    // Extends the list by count elements and returns a pointer to the first new
+    // one, so that the caller can fill them in directly.
+    T* make_space(size_t count) {
+        T* const firstNew = values_ + size_;
+        size_ += count;
+        assert(size_ <= MaxSize);
+        return firstNew;
+    }
+   private:
+    T           values_[MaxSize];
+    std::size_t size_ = 0;
+};
+template<typename T, std::size_t Size, std::size_t... Sizes>
+class MultiArray;
+namespace Detail {
+// Element type of one MultiArray level: a MultiArray over the remaining dimensions...
+template<typename T, std::size_t Size, std::size_t... Sizes>
+struct MultiArrayHelper {
+    using ChildType = MultiArray<T, Sizes...>;
+};
+// ...or T itself once a single dimension is left.
+template<typename T, std::size_t Size>
+struct MultiArrayHelper<T, Size> {
+    using ChildType = T;
+};
+// True when a From can be assigned to a To and is either the same type or a type
+// that does not implicitly convert to To.
+template<typename To, typename From>
+constexpr bool is_strictly_assignable_v =
+  std::is_assignable_v<To&, From> && (std::is_same_v<To, From> || !std::is_convertible_v<From, To>);
+}
+// MultiArray is a generic N-dimensional array.
+// The template parameters (Size and Sizes) encode the dimensions of the array.
+// It wraps a std::array of the next-lower dimension and mirrors its interface.
+template<typename T, std::size_t Size, std::size_t... Sizes>
+class MultiArray {
+    using ChildType = typename Detail::MultiArrayHelper<T, Size, Sizes...>::ChildType;
+    using ArrayType = std::array<ChildType, Size>;
+    ArrayType data_;
+   public:
+    using value_type             = typename ArrayType::value_type;
+    using size_type              = typename ArrayType::size_type;
+    using difference_type        = typename ArrayType::difference_type;
+    using reference              = typename ArrayType::reference;
+    using const_reference        = typename ArrayType::const_reference;
+    using pointer                = typename ArrayType::pointer;
+    using const_pointer          = typename ArrayType::const_pointer;
+    using iterator               = typename ArrayType::iterator;
+    using const_iterator         = typename ArrayType::const_iterator;
+    using reverse_iterator       = typename ArrayType::reverse_iterator;
+    using const_reverse_iterator = typename ArrayType::const_reverse_iterator;
+    // Checked access. Deliberately not noexcept: std::array::at throws
+    // std::out_of_range, and a noexcept wrapper would turn that into std::terminate.
+    constexpr auto&       at(size_type index) { return data_.at(index); }
+    constexpr const auto& at(size_type index) const { return data_.at(index); }
+    constexpr auto&       operator[](size_type index) noexcept { return data_[index]; }
+    constexpr const auto& operator[](size_type index) const noexcept { return data_[index]; }
+    constexpr auto&       front() noexcept { return data_.front(); }
+    constexpr const auto& front() const noexcept { return data_.front(); }
+    constexpr auto&       back() noexcept { return data_.back(); }
+    constexpr const auto& back() const noexcept { return data_.back(); }
+    auto*       data() { return data_.data(); }
+    const auto* data() const { return data_.data(); }
+    constexpr auto begin() noexcept { return data_.begin(); }
+    constexpr auto end() noexcept { return data_.end(); }
+    constexpr auto begin() const noexcept { return data_.begin(); }
+    constexpr auto end() const noexcept { return data_.end(); }
+    constexpr auto cbegin() const noexcept { return data_.cbegin(); }
+    constexpr auto cend() const noexcept { return data_.cend(); }
+    constexpr auto rbegin() noexcept { return data_.rbegin(); }
+    constexpr auto rend() noexcept { return data_.rend(); }
+    constexpr auto rbegin() const noexcept { return data_.rbegin(); }
+    constexpr auto rend() const noexcept { return data_.rend(); }
+    constexpr auto crbegin() const noexcept { return data_.crbegin(); }
+    constexpr auto crend() const noexcept { return data_.crend(); }
+    constexpr bool      empty() const noexcept { return data_.empty(); }
+    constexpr size_type size() const noexcept { return data_.size(); }
+    constexpr size_type max_size() const noexcept { return data_.max_size(); }
+    // Assigns v to every T in the array, recursing through all dimensions.
+    template<typename U>
+    void fill(const U& v) {
+        static_assert(Detail::is_strictly_assignable_v<T, U>,
+                      "Cannot assign fill value to entry type");
+        for (auto& ele : data_)
+        {
+            if constexpr (sizeof...(Sizes) == 0)
+                ele = v;
+            else
+                ele.fill(v);
+        }
+    }
+    constexpr void swap(MultiArray<T, Size, Sizes...>& other) noexcept { data_.swap(other.data_); }
+};
+// xorshift64star Pseudo-Random Number Generator
+// This class is based on original code written and dedicated
+// to the public domain by Sebastiano Vigna (2014).
+// It has the following characteristics:
+//
+//  -  Outputs 64-bit numbers
+//  -  Passes Dieharder and SmallCrush test batteries
+//  -  Does not require warm-up, no zeroland to escape
+//  -  Internal state is a single 64-bit integer
+//  -  Period is 2^64 - 1
+//  -  Speed: 1.60 ns/call (Core i7 @3.40GHz)
+//
+// For further analysis see
+//   <http://vigna.di.unimi.it/ftp/papers/xorshift.pdf>
+class PRNG {
+    uint64_t s;
+    // Advances the state by three xorshift steps and scrambles it with a multiplication
+    uint64_t rand64() {
+        s ^= s >> 12;
+        s ^= s << 25;
+        s ^= s >> 27;
+        return s * 2685821657736338717LL;
+    }
+   public:
+    // The seed must be non-zero (checked by assert)
+    PRNG(uint64_t seed) :
+        s(seed) {
+        assert(seed);
+    }
+    // Next 64-bit output converted to T
+    template<typename T>
+    T rand() {
+        return T(rand64());
+    }
+    // Special generator used to fast init magic numbers.
+    // Output values only have 1/8th of their bits set on average.
+    template<typename T>
+    T sparse_rand() {
+        const uint64_t first  = rand64();
+        const uint64_t second = rand64();
+        const uint64_t third  = rand64();
+        return T(first & second & third);
+    }
+};
+// Returns the upper 64 bits of the 128-bit product a * b.
+inline uint64_t mul_hi64(uint64_t a, uint64_t b) {
+#if defined(__GNUC__) && defined(IS_64BIT)
+    __extension__ using uint128 = unsigned __int128;
+    const uint128 product       = uint128(a) * uint128(b);
+    return product >> 64;
+#else
+    // Schoolbook multiplication on 32-bit halves, carrying into the upper word
+    const uint64_t aLow = uint32_t(a), aHigh = a >> 32;
+    const uint64_t bLow = uint32_t(b), bHigh = b >> 32;
+    const uint64_t lowCarry = (aLow * bLow) >> 32;
+    const uint64_t midFirst = aHigh * bLow + lowCarry;
+    const uint64_t midSecond = aLow * bHigh + uint32_t(midFirst);
+    return aHigh * bHigh + (midFirst >> 32) + (midSecond >> 32);
+#endif
+}
+uint64_t hash_bytes(const char*, size_t);
+// Hashes the object representation of value with hash_bytes().
+template<typename T>
+inline std::size_t get_raw_data_hash(const T& value) {
+    // We must have no padding bytes because we're reinterpreting as char
+    static_assert(std::has_unique_object_representations<T>());
+    const char* bytes = reinterpret_cast<const char*>(&value);
+    return static_cast<std::size_t>(hash_bytes(bytes, sizeof(value)));
+}
+// Mixes the hash of v into seed.
+template<typename T>
+inline void hash_combine(std::size_t& seed, const T& v) {
+    std::size_t contribution;
+    // For primitive types we avoid using the default hasher, which may be
+    // nondeterministic across program invocations
+    if constexpr (!std::is_integral<T>())
+        contribution = std::hash<T>{}(v);
+    else
+        contribution = v;
+    const std::size_t mix = contribution + 0x9e3779b9 + (seed << 6) + (seed >> 2);
+    seed ^= mix;
+}
+// Hashes the characters of sv with hash_bytes().
+inline std::uint64_t hash_string(const std::string& sv) {
+    return hash_bytes(sv.data(), sv.size());
+}
+// String with inline storage for at most Capacity characters plus a null
+// terminator. Any operation that would exceed the capacity calls std::terminate().
+template<std::size_t Capacity>
+class FixedString {
+   public:
+    FixedString() { clear(); }
+    FixedString(const char* str) { assign(str, std::strlen(str)); }
+    FixedString(const std::string& str) { assign(str.data(), str.size()); }
+    std::size_t size() const { return length_; }
+    std::size_t capacity() const { return Capacity; }
+    const char* c_str() const { return data_; }
+    const char* data() const { return data_; }
+    char& operator[](std::size_t i) { return data_[i]; }
+    const char& operator[](std::size_t i) const { return data_[i]; }
+    // Appends a null-terminated string
+    FixedString& operator+=(const char* str) {
+        const std::size_t extra = std::strlen(str);
+        if (length_ + extra > Capacity)
+            std::terminate();
+        std::memcpy(data_ + length_, str, extra);
+        length_ += extra;
+        data_[length_] = '\0';
+        return *this;
+    }
+    FixedString& operator+=(const FixedString& other) { return (*this += other.c_str()); }
+    operator std::string() const { return std::string(data_, length_); }
+    operator std::string_view() const { return std::string_view(data_, length_); }
+    // Comparisons are made through the string_view of the stored characters
+    template<typename T>
+    bool operator==(const T& other) const noexcept {
+        return static_cast<std::string_view>(*this) == other;
+    }
+    template<typename T>
+    bool operator!=(const T& other) const noexcept {
+        return static_cast<std::string_view>(*this) != other;
+    }
+    void clear() {
+        length_  = 0;
+        data_[0] = '\0';
+    }
+   private:
+    // Replaces the content with the first count characters of src
+    void assign(const char* src, std::size_t count) {
+        if (count > Capacity)
+            std::terminate();
+        std::memcpy(data_, src, count);
+        length_        = count;
+        data_[length_] = '\0';
+    }
+    char        data_[Capacity + 1];  // +1 for null terminator
+    std::size_t length_;
+};
+// Holds the argument count and argument vector it is constructed with, and offers
+// helpers for locating the binary and the working directory.
+struct CommandLine {
+   public:
+    CommandLine(int argCount, char** argValues) :
+        argc(argCount),
+        argv(argValues) {}
+    static std::string get_binary_directory(std::string argv0);
+    static std::string get_working_directory();
+    int    argc;
+    char** argv;
+};
+namespace Utility {
+// Moves the first element that satisfies pred to the front of vec, keeping the
+// relative order of all other elements. Does nothing when no element matches.
+template<typename T, typename Predicate>
+void move_to_front(std::vector<T>& vec, Predicate pred) {
+    const auto match = std::find_if(vec.begin(), vec.end(), pred);
+    if (match == vec.end())
+        return;
+    std::rotate(vec.begin(), match, match + 1);
+}
+}
+#if defined(__GNUC__)
+    #define sf_always_inline __attribute__((always_inline))
+#elif defined(_MSC_VER)
+    #define sf_always_inline __forceinline
+#else
+    // do nothing for other compilers
+    #define sf_always_inline
+#endif
+#if defined(__clang__)  // sf_assume(cond): optimizer hint; __clang__ is tested before __GNUC__
+    #define sf_assume(cond) __builtin_assume(cond)
+#elif defined(__GNUC__)
+    #if __GNUC__ >= 13  // attribute form is used from major version 13 onwards
+        #define sf_assume(cond) __attribute__((assume(cond)))
+    #else  // older versions: mark the path where cond is false as unreachable
+        #define sf_assume(cond) \
+            do \
+            { \
+                if (!(cond)) \
+                    __builtin_unreachable(); \
+            } while (0)
+    #endif
+#elif defined(_MSC_VER)
+    #define sf_assume(cond) __assume(cond)
+#else
+    // Unknown compiler: expands to nothing, so cond is not evaluated at all
+    #define sf_assume(cond)
+#endif
+#ifdef __GNUC__  // sf_unreachable(): marks a code path that must never execute
+    #define sf_unreachable() __builtin_unreachable()
+#elif defined(_MSC_VER)
+    #define sf_unreachable() __assume(0)
+#else  // unknown compiler: expands to nothing
+    #define sf_unreachable()
+#endif
+}  // namespace Stockfish
+// std::hash specialization for FixedString: hashes the string's bytes with
+// Stockfish::hash_bytes over the range [data(), data() + size()).
+template<std::size_t N>
+struct std::hash<Stockfish::FixedString<N>> {
+    std::size_t operator()(const Stockfish::FixedString<N>& fstr) const noexcept {
+        const auto bytes = fstr.data();
+        const auto count = fstr.size();
+        return Stockfish::hash_bytes(bytes, count);
+    }
+};
+#endif  // #ifndef MISC_H_INCLUDED

src/movegen.cpp ADDED Viewed

	@@ -0,0 +1,312 @@

+/*
+  Stockfish, a UCI chess playing engine derived from Glaurung 2.1
+  Copyright (C) 2004-2026 The Stockfish developers (see AUTHORS file)
+  Stockfish is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+  Stockfish is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+  You should have received a copy of the GNU General Public License
+  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+#include "movegen.h"
+#include <cassert>
+#include <initializer_list>
+#include "bitboard.h"
+#include "position.h"
+#if defined(USE_AVX512ICL)
+    #include <array>
+    #include <algorithm>
+    #include <immintrin.h>
+#endif
+namespace Stockfish {
+namespace {
+#if defined(USE_AVX512ICL)
+// Packs the 16-bit lanes of `vector` selected by `mask` to the front and
+// stores the whole 512-bit result at moveList with an unaligned store.
+// Returns the position just past the last selected move. Note that a full
+// vector is always written, regardless of how many lanes were selected.
+inline Move* write_moves(Move* moveList, uint32_t mask, __m512i vector) {
+    // Avoid _mm512_mask_compressstoreu_epi16() as it's 256 uOps on Zen4
+    const __m512i packed = _mm512_maskz_compress_epi16(mask, vector);
+    _mm512_storeu_si512(reinterpret_cast<__m512i*>(moveList), packed);
+    return moveList + popcount(mask);
+}
+// AVX-512 variant: emits one move per set bit of to_bb, where each move's
+// origin is its destination shifted back by `offset`. The 64 candidate moves
+// are precomputed at compile time and written out as two 32-lane halves.
+template<Direction offset>
+inline Move* splat_pawn_moves(Move* moveList, Bitboard to_bb) {
+    alignas(64) static constexpr auto SPLAT_TABLE = [] {
+        std::array<Move, 64> moves{};
+        for (int to = 0; to < 64; ++to)
+        {
+            // Clamping keeps the computed origin index within 0..63
+            const int fromIdx = std::clamp(to - offset, 0, 63);
+            moves[to]         = Move(Square{uint8_t(fromIdx)}, Square{uint8_t(to)});
+        }
+        return moves;
+    }();
+    const auto* halves   = reinterpret_cast<const __m512i*>(SPLAT_TABLE.data());
+    const auto  lowBits  = static_cast<uint32_t>(to_bb);
+    const auto  highBits = static_cast<uint32_t>(to_bb >> 32);
+    moveList             = write_moves(moveList, lowBits, _mm512_load_si512(halves));
+    moveList             = write_moves(moveList, highBits, _mm512_load_si512(halves + 1));
+    return moveList;
+}
+// AVX-512 variant: emits one move from `from` to every square set in to_bb.
+// A compile-time table holds the 64 moves with origin SQUARE_ZERO; the raw
+// encoding of Move(from, SQUARE_ZERO) is OR-ed into every lane before the
+// selected lanes are written out as two 32-lane halves.
+inline Move* splat_moves(Move* moveList, Square from, Bitboard to_bb) {
+    alignas(64) static constexpr auto SPLAT_TABLE = [] {
+        std::array<Move, 64> moves{};
+        for (uint8_t to = 0; to < 64; ++to)
+            moves[to] = Move(SQUARE_ZERO, Square{to});
+        return moves;
+    }();
+    const __m512i originBits = _mm512_set1_epi16(Move(from, SQUARE_ZERO).raw());
+    const auto*   halves     = reinterpret_cast<const __m512i*>(SPLAT_TABLE.data());
+    const __m512i lowMoves   = _mm512_or_si512(_mm512_load_si512(halves), originBits);
+    const __m512i highMoves  = _mm512_or_si512(_mm512_load_si512(halves + 1), originBits);
+    moveList = write_moves(moveList, static_cast<uint32_t>(to_bb), lowMoves);
+    moveList = write_moves(moveList, static_cast<uint32_t>(to_bb >> 32), highMoves);
+    return moveList;
+}
+#else
+// Scalar variant: for every destination set in to_bb (lowest bit first),
+// appends the move whose origin is that destination shifted back by `offset`.
+// Returns the new end of the move list.
+template<Direction offset>
+inline Move* splat_pawn_moves(Move* moveList, Bitboard to_bb) {
+    for (; to_bb; ++moveList)
+    {
+        const Square dest = pop_lsb(to_bb);
+        *moveList         = Move(dest - offset, dest);
+    }
+    return moveList;
+}
+// Scalar variant: appends a move from `from` to every square set in to_bb
+// (lowest bit first) and returns the new end of the move list.
+inline Move* splat_moves(Move* moveList, Square from, Bitboard to_bb) {
+    for (; to_bb; ++moveList)
+        *moveList = Move(from, pop_lsb(to_bb));
+    return moveList;
+}
+#endif
+// Appends the promotion moves of a pawn arriving on `to` from `to - D`.
+// Which pieces are emitted depends on the generation type:
+//   - queen promotion: CAPTURES, EVASIONS and NON_EVASIONS
+//   - rook/bishop/knight: EVASIONS and NON_EVASIONS always, CAPTURES only
+//     when Enemy is true, QUIETS only when Enemy is false
+// For some combinations nothing is emitted, hence [[maybe_unused]] on `to`.
+template<GenType Type, Direction D, bool Enemy>
+Move* make_promotions(Move* moveList, [[maybe_unused]] Square to) {
+    constexpr bool AllKinds  = Type == EVASIONS || Type == NON_EVASIONS;
+    constexpr bool WithQueen = Type == CAPTURES || AllKinds;
+    constexpr bool WithUnderpromotions =
+      (Type == CAPTURES && Enemy) || (Type == QUIETS && !Enemy) || AllKinds;
+    if constexpr (WithQueen)
+        *moveList++ = Move::make<PROMOTION>(to - D, to, QUEEN);
+    if constexpr (WithUnderpromotions)
+    {
+        *moveList++ = Move::make<PROMOTION>(to - D, to, ROOK);
+        *moveList++ = Move::make<PROMOTION>(to - D, to, BISHOP);
+        *moveList++ = Move::make<PROMOTION>(to - D, to, KNIGHT);
+    }
+    return moveList;
+}
+// Appends the pseudo-legal pawn moves of side Us for the given generation
+// type and returns the new end of the list. `target` is consulted only for
+// EVASIONS, where it restricts pushes and push-promotions to the squares it
+// contains and is used to reject en passant. Emission order: pushes,
+// promotions, ordinary captures, en passant.
+template<Color Us, GenType Type>
+Move* generate_pawn_moves(const Position& pos, Move* moveList, Bitboard target) {
+    constexpr Color     Them         = ~Us;
+    constexpr Direction Up           = pawn_push(Us);
+    constexpr Direction UpRight      = (Us == WHITE ? NORTH_EAST : SOUTH_WEST);
+    constexpr Direction UpLeft       = (Us == WHITE ? NORTH_WEST : SOUTH_EAST);
+    constexpr Bitboard  PromoRankBB  = (Us == WHITE ? Rank7BB : Rank2BB);
+    constexpr Bitboard  DoublePushBB = (Us == WHITE ? Rank3BB : Rank6BB);
+    const Bitboard ourPawns       = pos.pieces(Us, PAWN);
+    const Bitboard aboutToPromote = ourPawns & PromoRankBB;
+    const Bitboard regularPawns   = ourPawns & ~PromoRankBB;
+    const Bitboard vacant         = ~pos.pieces();
+    // When evading, the only capturable pieces are the checkers
+    const Bitboard victims = Type == EVASIONS ? pos.checkers() : pos.pieces(Them);
+    // Single and double pawn pushes, no promotions
+    if constexpr (Type != CAPTURES)
+    {
+        Bitboard singlePush = shift<Up>(regularPawns) & vacant;
+        // A double push continues from a single push that landed on the third rank
+        Bitboard doublePush = shift<Up>(singlePush & DoublePushBB) & vacant;
+        if constexpr (Type == EVASIONS)  // Consider only blocking squares
+        {
+            singlePush &= target;
+            doublePush &= target;
+        }
+        moveList = splat_pawn_moves<Up>(moveList, singlePush);
+        moveList = splat_pawn_moves<Up + Up>(moveList, doublePush);
+    }
+    // Promotions and underpromotions
+    if (aboutToPromote)
+    {
+        Bitboard captureRight = shift<UpRight>(aboutToPromote) & victims;
+        Bitboard captureLeft  = shift<UpLeft>(aboutToPromote) & victims;
+        Bitboard advance      = shift<Up>(aboutToPromote) & vacant;
+        if constexpr (Type == EVASIONS)
+            advance &= target;
+        while (captureRight)
+            moveList = make_promotions<Type, UpRight, true>(moveList, pop_lsb(captureRight));
+        while (captureLeft)
+            moveList = make_promotions<Type, UpLeft, true>(moveList, pop_lsb(captureLeft));
+        while (advance)
+            moveList = make_promotions<Type, Up, false>(moveList, pop_lsb(advance));
+    }
+    // Standard and en passant captures
+    if constexpr (Type == CAPTURES || Type == EVASIONS || Type == NON_EVASIONS)
+    {
+        moveList = splat_pawn_moves<UpRight>(moveList, shift<UpRight>(regularPawns) & victims);
+        moveList = splat_pawn_moves<UpLeft>(moveList, shift<UpLeft>(regularPawns) & victims);
+        const Square epSquare = pos.ep_square();
+        if (epSquare != SQ_NONE)
+        {
+            assert(rank_of(pos.ep_square()) == relative_rank(Us, RANK_6));
+            // An en passant capture cannot resolve a discovered check
+            if (Type == EVASIONS && (target & (epSquare + Up)))
+                return moveList;
+            // Our pawns standing where an enemy pawn on the ep square would attack
+            Bitboard epCapturers = regularPawns & attacks_bb<PAWN>(epSquare, Them);
+            assert(epCapturers);
+            while (epCapturers)
+                *moveList++ = Move::make<EN_PASSANT>(pop_lsb(epCapturers), epSquare);
+        }
+    }
+    return moveList;
+}
+// Appends, for every piece of type Pt belonging to Us, the moves to the
+// squares it attacks (given the current occupancy) that are also in `target`.
+// Kings and pawns are rejected at compile time.
+template<Color Us, PieceType Pt>
+Move* generate_moves(const Position& pos, Move* moveList, Bitboard target) {
+    static_assert(Pt != KING && Pt != PAWN, "Unsupported piece type in generate_moves()");
+    for (Bitboard remaining = pos.pieces(Us, Pt); remaining;)
+    {
+        const Square   origin    = pop_lsb(remaining);
+        const Bitboard reachable = attacks_bb<Pt>(origin, pos.pieces()) & target;
+        moveList                 = splat_moves(moveList, origin, reachable);
+    }
+    return moveList;
+}
+// Appends all pseudo-legal moves of side Us for the given generation type
+// (anything but LEGAL): pawns, knights, bishops, rooks, queens, then king
+// moves and finally castling. Returns the new end of the list.
+template<Color Us, GenType Type>
+Move* generate_all(const Position& pos, Move* moveList) {
+    static_assert(Type != LEGAL, "Unsupported type in generate_all()");
+    constexpr bool Evading = Type == EVASIONS;
+    const Square   ksq     = pos.square<KING>(Us);
+    Bitboard       target;  // left unset (and unread) in the double-check case
+    // Skip generating non-king moves when in double check
+    const bool doubleCheck = Evading && more_than_one(pos.checkers());
+    if (!doubleCheck)
+    {
+        if constexpr (Type == EVASIONS)
+            target = between_bb(ksq, lsb(pos.checkers()));
+        else if constexpr (Type == NON_EVASIONS)
+            target = ~pos.pieces(Us);
+        else if constexpr (Type == CAPTURES)
+            target = pos.pieces(~Us);
+        else  // QUIETS
+            target = ~pos.pieces();
+        moveList = generate_pawn_moves<Us, Type>(pos, moveList, target);
+        moveList = generate_moves<Us, KNIGHT>(pos, moveList, target);
+        moveList = generate_moves<Us, BISHOP>(pos, moveList, target);
+        moveList = generate_moves<Us, ROOK>(pos, moveList, target);
+        moveList = generate_moves<Us, QUEEN>(pos, moveList, target);
+    }
+    // When evading, the king may go to any square not occupied by its own
+    // side; otherwise it obeys the same target as the other pieces.
+    Bitboard kingMoves = attacks_bb<KING>(ksq);
+    if constexpr (Evading)
+        kingMoves &= ~pos.pieces(Us);
+    else
+        kingMoves &= target;
+    moveList = splat_moves(moveList, ksq, kingMoves);
+    // Castling is encoded as a king move onto the castling rook's square
+    if constexpr (Type == QUIETS || Type == NON_EVASIONS)
+    {
+        if (pos.can_castle(Us & ANY_CASTLING))
+            for (CastlingRights cr : {Us & KING_SIDE, Us & QUEEN_SIDE})
+                if (!pos.castling_impeded(cr) && pos.can_castle(cr))
+                    *moveList++ = Move::make<CASTLING>(ksq, pos.castling_rook_square(cr));
+    }
+    return moveList;
+}
+}  // namespace
+// Generates pseudo-legal moves of the requested type for the side to move:
+//   <CAPTURES>     captures (including capturing underpromotions) plus all
+//                  queen promotions
+//   <QUIETS>       non-captures plus non-capturing underpromotions
+//   <EVASIONS>     check evasions; requires the side to move to be in check
+//   <NON_EVASIONS> captures and non-captures; requires not being in check
+//
+// Returns a pointer to the end of the move list.
+template<GenType Type>
+Move* generate(const Position& pos, Move* moveList) {
+    static_assert(Type != LEGAL, "Unsupported type in generate()");
+    assert((Type == EVASIONS) == bool(pos.checkers()));
+    if (pos.side_to_move() == WHITE)
+        return generate_all<WHITE, Type>(pos, moveList);
+    return generate_all<BLACK, Type>(pos, moveList);
+}
+// Explicit template instantiations of generate() for the four pseudo-legal generation types
+template Move* generate<CAPTURES>(const Position&, Move*);
+template Move* generate<QUIETS>(const Position&, Move*);
+template Move* generate<EVASIONS>(const Position&, Move*);
+template Move* generate<NON_EVASIONS>(const Position&, Move*);
+// generate<LEGAL> generates all the legal moves in the given position.
+// It first generates pseudo-legal moves (evasions when in check), then drops
+// the illegal ones in place. Only moves that could be illegal are passed to
+// pos.legal(): moves of pinned pieces, king moves and en passant captures.
+// A dropped move is overwritten by the current last move, so the order of
+// the returned list is not the generation order.
+template<>
+Move* generate<LEGAL>(const Position& pos, Move* moveList) {
+    const Color    us     = pos.side_to_move();
+    const Bitboard pinned = pos.blockers_for_king(us) & pos.pieces(us);
+    const Square   ksq    = pos.square<KING>(us);
+    Move* const    first  = moveList;
+    Move*          last =
+      pos.checkers() ? generate<EVASIONS>(pos, first) : generate<NON_EVASIONS>(pos, first);
+    for (Move* it = first; it != last;)
+    {
+        const bool needsCheck =
+          (pinned & it->from_sq()) || it->from_sq() == ksq || it->type_of() == EN_PASSANT;
+        if (needsCheck && !pos.legal(*it))
+            *it = *(--last);  // re-examine the move that was swapped in
+        else
+            ++it;
+    }
+    return last;
+}
+}  // namespace Stockfish

src/movegen.h ADDED Viewed

	@@ -0,0 +1,73 @@

+/*
+  Stockfish, a UCI chess playing engine derived from Glaurung 2.1
+  Copyright (C) 2004-2026 The Stockfish developers (see AUTHORS file)
+  Stockfish is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+  Stockfish is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+  You should have received a copy of the GNU General Public License
+  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+#ifndef MOVEGEN_H_INCLUDED
+#define MOVEGEN_H_INCLUDED
+#include <algorithm>  // IWYU pragma: keep
+#include <cstddef>
+#include "types.h"
+namespace Stockfish {
+class Position;
+enum GenType {  // selects which class of moves generate<>() produces
+    CAPTURES,      // pseudo-legal captures and queen promotions
+    QUIETS,        // pseudo-legal non-captures
+    EVASIONS,      // pseudo-legal check evasions
+    NON_EVASIONS,  // pseudo-legal captures and non-captures
+    LEGAL          // fully legal moves
+};
+struct ExtMove: public Move {  // a Move extended with an integer sort score
+    int value;  // ordering score; not initialized here and not touched by operator=(Move)
+    void operator=(Move m) { data = m.raw(); }  // copies only the move encoding
+    // Inhibit unwanted implicit conversions to Move
+    // by creating an ambiguity that results in a compile error.
+    operator float() const = delete;
+};
+// Orders ExtMoves by their sort score only; the move encoding is ignored.
+inline bool operator<(const ExtMove& f, const ExtMove& s) {
+    return s.value > f.value;
+}
+template<GenType>
+Move* generate(const Position& pos, Move* moveList);
+// MoveList owns a fixed-size buffer of MAX_MOVES moves that is filled by
+// generate<T>() on construction, and exposes the generated moves as a
+// read-only range. It is often more convenient than calling the lower level
+// generate() function directly.
+template<GenType T>
+struct MoveList {
+    explicit MoveList(const Position& pos) :
+        last(generate<T>(pos, moveList)) {}
+    const Move* begin() const { return moveList; }
+    const Move* end() const { return last; }
+    size_t      size() const { return static_cast<size_t>(end() - begin()); }
+    // Linear search for `move` among the generated moves
+    bool contains(Move move) const {
+        for (const Move* it = begin(); it != end(); ++it)
+            if (*it == move)
+                return true;
+        return false;
+    }
+   private:
+    Move  moveList[MAX_MOVES];  // storage written by generate<T>()
+    Move* last;                 // one past the last generated move
+};
+}  // namespace Stockfish
+#endif  // #ifndef MOVEGEN_H_INCLUDED

src/movepick.cpp ADDED Viewed

	@@ -0,0 +1,313 @@

+/*
+  Stockfish, a UCI chess playing engine derived from Glaurung 2.1
+  Copyright (C) 2004-2026 The Stockfish developers (see AUTHORS file)
+  Stockfish is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+  Stockfish is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+  You should have received a copy of the GNU General Public License
+  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+#include "movepick.h"
+#include <cassert>
+#include <limits>
+#include <utility>
+#include "bitboard.h"
+#include "misc.h"
+#include "position.h"
+namespace Stockfish {
+namespace {
+enum Stages {  // order matters: the picker advances with ++stage and skips a *_TT stage by adding 1
+    // main search stages (entered when not in check and depth > 0)
+    MAIN_TT,       // emit the transposition-table move
+    CAPTURE_INIT,  // generate, score and sort captures
+    GOOD_CAPTURE,  // emit captures that pass the SEE test
+    QUIET_INIT,    // generate, score and partially sort quiets
+    GOOD_QUIET,    // emit quiets scoring above the good-quiet threshold
+    BAD_CAPTURE,   // emit the captures set aside during GOOD_CAPTURE
+    BAD_QUIET,     // emit the remaining quiets
+    // evasion stages (entered when the side to move is in check)
+    EVASION_TT,
+    EVASION_INIT,
+    EVASION,
+    // ProbCut stages (entered via the ProbCut constructor)
+    PROBCUT_TT,
+    PROBCUT_INIT,
+    PROBCUT,
+    // quiescence search stages (entered when not in check and depth <= 0)
+    QSEARCH_TT,
+    QCAPTURE_INIT,
+    QCAPTURE
+};
+// Sorts, in descending order of value, the moves whose value is at least
+// `limit`, gathering them at the front of [begin, end). Moves scoring below
+// the limit end up behind them in unspecified order.
+void partial_insertion_sort(ExtMove* begin, ExtMove* end, int limit) {
+    ExtMove* lastSorted = begin;  // last element of the sorted prefix
+    for (ExtMove* scan = begin + 1; scan < end; ++scan)
+    {
+        if (scan->value < limit)
+            continue;
+        const ExtMove picked = *scan;
+        // Grow the sorted prefix by one slot, moving its old occupant to
+        // the position being scanned.
+        ++lastSorted;
+        *scan = *lastSorted;
+        // Shift smaller elements right until the insertion point is found
+        ExtMove* slot = lastSorted;
+        while (slot != begin && *(slot - 1) < picked)
+        {
+            *slot = *(slot - 1);
+            --slot;
+        }
+        *slot = picked;
+    }
+}
+}  // namespace
+// Constructors of the MovePicker class. The arguments tell the picker which
+// class of moves to emit and supply the history tables used to order them.
+// MovePicker constructor for the main search and for the quiescence search.
+// The starting stage depends on whether the side to move is in check and on
+// the depth; the *_TT stage is skipped when there is no TT move or it is not
+// pseudo-legal in this position.
+MovePicker::MovePicker(const Position&              p,
+                       Move                         ttm,
+                       Depth                        d,
+                       const ButterflyHistory*      mh,
+                       const LowPlyHistory*         lph,
+                       const CapturePieceToHistory* cph,
+                       const PieceToHistory**       ch,
+                       const SharedHistories*       sh,
+                       int                          pl) :
+    pos(p),
+    mainHistory(mh),
+    lowPlyHistory(lph),
+    captureHistory(cph),
+    continuationHistory(ch),
+    sharedHistory(sh),
+    ttMove(ttm),
+    depth(d),
+    ply(pl) {
+    int firstStage;
+    if (pos.checkers())
+        firstStage = EVASION_TT;
+    else if (depth > 0)
+        firstStage = MAIN_TT;
+    else
+        firstStage = QSEARCH_TT;
+    const bool ttMoveUsable = ttm && pos.pseudo_legal(ttm);
+    // Each *_TT stage is immediately followed by its *_INIT stage
+    stage = firstStage + !ttMoveUsable;
+}
+// MovePicker constructor for ProbCut: only captures whose Static Exchange
+// Evaluation (SEE) is greater than or equal to the given threshold are
+// emitted. Must not be used when in check. The TT move is emitted first only
+// if it is a capture-stage move and pseudo-legal in this position.
+MovePicker::MovePicker(const Position& p, Move ttm, int th, const CapturePieceToHistory* cph) :
+    pos(p),
+    captureHistory(cph),
+    ttMove(ttm),
+    threshold(th) {
+    assert(!pos.checkers());
+    const bool ttMoveUsable = ttm && pos.capture_stage(ttm) && pos.pseudo_legal(ttm);
+    // PROBCUT_INIT directly follows PROBCUT_TT
+    stage = PROBCUT_TT + !ttMoveUsable;
+}
+// Assigns a numerical value to each move in a list, used for sorting, and returns
+// the end of the scored range, which starts at cur. Captures are scored by capture history
+// plus the captured piece's value. Quiet moves are scored using the history tables.
+template<GenType Type>
+ExtMove* MovePicker::score(MoveList<Type>& ml) {
+    static_assert(Type == CAPTURES || Type == QUIETS || Type == EVASIONS, "Wrong type");
+    Color us = pos.side_to_move();
+    [[maybe_unused]] Bitboard threatByLesser[KING + 1];  // indexed by piece type; filled for QUIETS only
+    if constexpr (Type == QUIETS)
+    {
+        threatByLesser[PAWN]   = 0;
+        threatByLesser[KNIGHT] = threatByLesser[BISHOP] = pos.attacks_by<PAWN>(~us);
+        threatByLesser[ROOK] =
+          pos.attacks_by<KNIGHT>(~us) | pos.attacks_by<BISHOP>(~us) | threatByLesser[KNIGHT];
+        threatByLesser[QUEEN] = pos.attacks_by<ROOK>(~us) | threatByLesser[ROOK];
+        threatByLesser[KING]  = 0;
+    }
+    ExtMove* it = cur;  // scored moves are written starting at the current cursor
+    for (auto move : ml)
+    {
+        ExtMove& m = *it++;
+        m          = move;  // ExtMove::operator=(Move) copies the encoding; value is set below
+        const Square    from          = m.from_sq();
+        const Square    to            = m.to_sq();
+        const Piece     pc            = pos.moved_piece(m);
+        const PieceType pt            = type_of(pc);
+        const Piece     capturedPiece = pos.piece_on(to);
+        if constexpr (Type == CAPTURES)
+            m.value = (*captureHistory)[pc][to][type_of(capturedPiece)]
+                    + 7 * int(PieceValue[capturedPiece]);
+        else if constexpr (Type == QUIETS)
+        {
+            // histories
+            m.value = 2 * (*mainHistory)[us][m.raw()];
+            m.value += 2 * sharedHistory->pawn_entry(pos)[pc][to];
+            m.value += (*continuationHistory[0])[pc][to];
+            m.value += (*continuationHistory[1])[pc][to];
+            m.value += (*continuationHistory[2])[pc][to];
+            m.value += (*continuationHistory[3])[pc][to];
+            m.value += (*continuationHistory[5])[pc][to];  // note: index 4 is not used here
+            // bonus for checks
+            m.value += (bool(pos.check_squares(pt) & to) && pos.see_ge(m, -75)) * 16384;
+            // penalty for moving to a square threatened by a lesser piece
+            // or bonus for escaping an attack by a lesser piece.
+            int v = 20 * (bool(threatByLesser[pt] & from) - bool(threatByLesser[pt] & to));  // -20, 0 or +20
+            m.value += PieceValue[pt] * v;
+            if (ply < LOW_PLY_HISTORY_SIZE)
+                m.value += 8 * (*lowPlyHistory)[ply][m.raw()] / (1 + ply);  // weight shrinks as ply grows
+        }
+        else  // Type == EVASIONS
+        {
+            if (pos.capture_stage(m))
+                m.value = PieceValue[capturedPiece] + (1 << 28);  // large offset ranks captures above history-scored evasions
+            else
+                m.value = (*mainHistory)[us][m.raw()] + (*continuationHistory[0])[pc][to];
+        }
+    }
+    return it;
+}
+// Advances the cursor through [cur, endCur) and returns the first move that
+// is not the TT move (it was already emitted) and for which filter() returns
+// true, leaving the cursor just past it. The filter is invoked with `cur`
+// pointing at the candidate. Returns Move::none() when the range is exhausted.
+template<typename Pred>
+Move MovePicker::select(Pred filter) {
+    while (cur < endCur)
+    {
+        if (*cur != ttMove && filter())
+        {
+            ExtMove* chosen = cur;
+            ++cur;
+            return *chosen;
+        }
+        ++cur;
+    }
+    return Move::none();
+}
+// This is the most important method of the MovePicker class. We emit one
+// new pseudo-legal move on every call until there are no more moves left (then Move::none()),
+// walking through the stages of the Stages enum; the TT move, if any, is emitted first.
+Move MovePicker::next_move() {
+    constexpr int goodQuietThreshold = -14000;  // quiets above this go out in GOOD_QUIET, the rest in BAD_QUIET
+top:
+    switch (stage)
+    {
+    case MAIN_TT :
+    case EVASION_TT :
+    case QSEARCH_TT :
+    case PROBCUT_TT :
+        ++stage;  // the matching *_INIT stage comes next
+        return ttMove;
+    case CAPTURE_INIT :
+    case PROBCUT_INIT :
+    case QCAPTURE_INIT : {
+        MoveList<CAPTURES> ml(pos);
+        cur = endBadCaptures = moves;  // bad captures will be collected at the front of the array
+        endCur = endCaptures = score<CAPTURES>(ml);
+        partial_insertion_sort(cur, endCur, std::numeric_limits<int>::min());  // limit is INT_MIN: every capture gets sorted
+        ++stage;
+        goto top;  // re-dispatch: the stage after *_INIT differs between the three families
+    }
+    case GOOD_CAPTURE :
+        if (select([&]() {
+                if (pos.see_ge(*cur, -cur->value / 18))  // SEE threshold derived from the move's score
+                    return true;
+                std::swap(*endBadCaptures++, *cur);  // failed: set aside for the BAD_CAPTURE stage
+                return false;
+            }))
+            return *(cur - 1);  // select() left cur one past the chosen move
+        ++stage;
+        [[fallthrough]];
+    case QUIET_INIT :
+        if (!skipQuiets)
+        {
+            MoveList<QUIETS> ml(pos);
+            endCur = endGenerated = score<QUIETS>(ml);  // quiets are appended right after the captures
+            partial_insertion_sort(cur, endCur, -3560 * depth);  // the sort limit gets lower as depth grows
+        }
+        ++stage;
+        [[fallthrough]];
+    case GOOD_QUIET :
+        if (!skipQuiets && select([&]() { return cur->value > goodQuietThreshold; }))
+            return *(cur - 1);
+        // Prepare the pointers to loop over the bad captures
+        cur    = moves;
+        endCur = endBadCaptures;
+        ++stage;
+        [[fallthrough]];
+    case BAD_CAPTURE :
+        if (select([]() { return true; }))
+            return *(cur - 1);
+        // Prepare the pointers to loop over quiets again
+        cur    = endCaptures;
+        endCur = endGenerated;
+        ++stage;
+        [[fallthrough]];
+    case BAD_QUIET :
+        if (!skipQuiets)
+            return select([&]() { return cur->value <= goodQuietThreshold; });
+        return Move::none();
+    case EVASION_INIT : {
+        MoveList<EVASIONS> ml(pos);
+        cur    = moves;
+        endCur = endGenerated = score<EVASIONS>(ml);
+        partial_insertion_sort(cur, endCur, std::numeric_limits<int>::min());  // limit is INT_MIN: every evasion gets sorted
+        ++stage;
+        [[fallthrough]];
+    }
+    case EVASION :
+    case QCAPTURE :
+        return select([]() { return true; });  // emit everything that is left, in sorted order
+    case PROBCUT :
+        return select([&]() { return pos.see_ge(*cur, threshold); });  // only captures meeting the ProbCut SEE threshold
+    }
+    assert(false);  // every stage value is handled by the switch above
+    return Move::none();  // Silence warning
+}
+// Tells the picker to stop emitting quiet moves from now on; next_move()
+// checks this flag in its quiet stages.
+void MovePicker::skip_quiet_moves() {
+    skipQuiets = true;
+}
+}  // namespace Stockfish

src/movepick.h ADDED Viewed

	@@ -0,0 +1,80 @@

+/*
+  Stockfish, a UCI chess playing engine derived from Glaurung 2.1
+  Copyright (C) 2004-2026 The Stockfish developers (see AUTHORS file)
+  Stockfish is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+  Stockfish is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+  You should have received a copy of the GNU General Public License
+  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+#ifndef MOVEPICK_H_INCLUDED
+#define MOVEPICK_H_INCLUDED
+#include "history.h"
+#include "movegen.h"
+#include "types.h"
+namespace Stockfish {
+class Position;
+// The MovePicker class is used to pick one pseudo-legal move at a time from the
+// current position. The most important method is next_move(), which emits one
+// new pseudo-legal move on every call, until there are no moves left, when
+// Move::none() is returned. In order to improve the efficiency of the alpha-beta
+// algorithm, MovePicker attempts to return the moves which are most likely to get
+// a cut-off first.
+class MovePicker {
+   public:
+    MovePicker(const MovePicker&)            = delete;  // non-copyable: the cursors point into the member array moves[]
+    MovePicker& operator=(const MovePicker&) = delete;
+    MovePicker(const Position&,  // constructor for the main search and the quiescence search
+               Move,
+               Depth,
+               const ButterflyHistory*,
+               const LowPlyHistory*,
+               const CapturePieceToHistory*,
+               const PieceToHistory**,
+               const SharedHistories*,
+               int);
+    MovePicker(const Position&, Move, int, const CapturePieceToHistory*);  // constructor for ProbCut
+    Move next_move();
+    void skip_quiet_moves();
+   private:
+    template<typename Pred>
+    Move select(Pred);
+    template<GenType T>
+    ExtMove* score(MoveList<T>&);
+    ExtMove* begin() { return cur; }
+    ExtMove* end() { return endCur; }
+    const Position&              pos;
+    const ButterflyHistory*      mainHistory;
+    const LowPlyHistory*         lowPlyHistory;
+    const CapturePieceToHistory* captureHistory;
+    const PieceToHistory**       continuationHistory;
+    const SharedHistories*       sharedHistory;
+    Move                         ttMove;
+    ExtMove *                    cur, *endCur, *endBadCaptures, *endCaptures, *endGenerated;  // cursors into moves[]
+    int                          stage;      // current value from the Stages enum
+    int                          threshold;  // SEE threshold; set only by the ProbCut constructor
+    Depth                        depth;      // set only by the main/qsearch constructor
+    int                          ply;        // set only by the main/qsearch constructor
+    bool                         skipQuiets = false;
+    ExtMove                      moves[MAX_MOVES];  // storage for the generated and scored moves
+};
+}  // namespace Stockfish
+#endif  // #ifndef MOVEPICK_H_INCLUDED

src/nnue/features/full_threats.cpp ADDED Viewed

	@@ -0,0 +1,343 @@

+/*
+  Stockfish, a UCI chess playing engine derived from Glaurung 2.1
+  Copyright (C) 2004-2026 The Stockfish developers (see AUTHORS file)
+  Stockfish is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+  Stockfish is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+  You should have received a copy of the GNU General Public License
+  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+// Definition of the FullThreats input features of the NNUE evaluation function
+#include "full_threats.h"
+#include <array>
+#include <cstddef>
+#include <cstdint>
+#include <initializer_list>
+#include <utility>
+#include "../../bitboard.h"
+#include "../../misc.h"
+#include "../../position.h"
+#include "../../types.h"
+#include "../nnue_common.h"
+namespace Stockfish::Eval::NNUE::Features {
+// Per-piece bookkeeping produced while laying out the threat feature indices.
+struct HelperOffsets {
+    // Sum of the attack counts over all origin squares for one piece
+    int cumulativePieceOffset;
+    // Running feature offset accumulated over the pieces processed before this one
+    int cumulativeOffset;
+};
+constexpr std::array<Piece, 12> AllPieces = {  // iteration order used when accumulating the feature offsets
+  W_PAWN, W_KNIGHT, W_BISHOP, W_ROOK, W_QUEEN, W_KING,
+  B_PAWN, B_KNIGHT, B_BISHOP, B_ROOK, B_QUEEN, B_KING,
+};
+// Builds, for a non-pawn piece type, the table [from][to] holding the number
+// of squares attacked from `from` (per PseudoAttacks) whose index is lower
+// than `to`.
+template<PieceType PT>
+constexpr auto make_piece_indices_type() {
+    static_assert(PT != PieceType::PAWN);
+    std::array<std::array<uint8_t, SQUARE_NB>, SQUARE_NB> table{};
+    for (Square from = SQ_A1; from <= SQ_H8; ++from)
+    {
+        const Bitboard attacks = PseudoAttacks[PT][from];
+        for (Square to = SQ_A1; to <= SQ_H8; ++to)
+        {
+            const auto belowTo = (1ULL << to) - 1;  // all squares with an index below `to`
+            table[from][to]    = constexpr_popcount(belowTo & attacks);
+        }
+    }
+    return table;
+}
+// Pawn counterpart of make_piece_indices_type(): builds the table [from][to]
+// holding the number of squares in PseudoAttacks[color of P][from] whose
+// index is lower than `to`.
+template<Piece P>
+constexpr auto make_piece_indices_piece() {
+    static_assert(type_of(P) == PieceType::PAWN);
+    constexpr Color C = color_of(P);
+    std::array<std::array<uint8_t, SQUARE_NB>, SQUARE_NB> table{};
+    for (Square from = SQ_A1; from <= SQ_H8; ++from)
+    {
+        const Bitboard attacks = PseudoAttacks[C][from];
+        for (Square to = SQ_A1; to <= SQ_H8; ++to)
+        {
+            const auto belowTo = (1ULL << to) - 1;  // all squares with an index below `to`
+            table[from][to]    = constexpr_popcount(belowTo & attacks);
+        }
+    }
+    return table;
+}
+// Assembles the [piece][from][to] lookup table. Pawns get a table per colour;
+// every other piece type shares one table between its white and black piece.
+constexpr auto index_lut2_array() {
+    constexpr auto KnightTable = make_piece_indices_type<PieceType::KNIGHT>();
+    constexpr auto BishopTable = make_piece_indices_type<PieceType::BISHOP>();
+    constexpr auto RookTable   = make_piece_indices_type<PieceType::ROOK>();
+    constexpr auto QueenTable  = make_piece_indices_type<PieceType::QUEEN>();
+    constexpr auto KingTable   = make_piece_indices_type<PieceType::KING>();
+    std::array<std::array<std::array<uint8_t, SQUARE_NB>, SQUARE_NB>, PIECE_NB> lut{};
+    // Colour-specific pawn tables
+    lut[W_PAWN] = make_piece_indices_piece<W_PAWN>();
+    lut[B_PAWN] = make_piece_indices_piece<B_PAWN>();
+    // Colour-independent tables, shared by both sides
+    lut[W_KNIGHT] = lut[B_KNIGHT] = KnightTable;
+    lut[W_BISHOP] = lut[B_BISHOP] = BishopTable;
+    lut[W_ROOK] = lut[B_ROOK] = RookTable;
+    lut[W_QUEEN] = lut[B_QUEEN] = QueenTable;
+    lut[W_KING] = lut[B_KING] = KingTable;
+    return lut;
+}
+// Computes two tables in one pass over AllPieces and returns them as a pair:
+//   first:  per piece, its total attack count over all origin squares and
+//           the running feature offset accumulated before it
+//   second: per piece and origin square, the attack count accumulated over
+//           the lower-indexed origin squares of that piece
+// Pawns contribute attacks only from squares SQ_A2..SQ_H7.
+constexpr auto init_threat_offsets() {
+    std::array<HelperOffsets, PIECE_NB>                    perPiece{};
+    std::array<std::array<IndexType, SQUARE_NB>, PIECE_NB> perSquare{};
+    int runningOffset = 0;
+    for (Piece piece : AllPieces)
+    {
+        const int  pieceIdx    = piece;
+        const bool isPawn      = type_of(piece) == PAWN;
+        int        withinPiece = 0;
+        for (Square from = SQ_A1; from <= SQ_H8; ++from)
+        {
+            perSquare[pieceIdx][from] = withinPiece;
+            if (!isPawn)
+            {
+                const Bitboard attacks = PseudoAttacks[type_of(piece)][from];
+                withinPiece += constexpr_popcount(attacks);
+            }
+            else if (from >= SQ_A2 && from <= SQ_H7)
+            {
+                // Piece values below 8 take the WHITE pawn attack pattern
+                const Bitboard attacks = (pieceIdx < 8) ? pawn_attacks_bb<WHITE>(square_bb(from))
+                                                        : pawn_attacks_bb<BLACK>(square_bb(from));
+                withinPiece += constexpr_popcount(attacks);
+            }
+        }
+        perPiece[pieceIdx] = {withinPiece, runningOffset};
+        runningOffset += numValidTargets[pieceIdx] * withinPiece;
+    }
+    return std::pair{perPiece, perSquare};
+}
+constexpr auto helper_offsets = init_threat_offsets().first;  // [piece] -> HelperOffsets
+// Lookup array for indexing threats: [piece][from] -> offset of `from` within that piece's block
+constexpr auto offsets = init_threat_offsets().second;
+// Builds the [attacker][attacked][from < to] table of base feature indices.
+// A pair whose FullThreats::map entry is negative is excluded in both slots;
+// a pair of equal piece types (enemy pieces, or any non-pawn pair) is
+// excluded only in slot 1. Excluded slots hold FullThreats::Dimensions.
+constexpr auto init_index_luts() {
+    std::array<std::array<std::array<uint32_t, 2>, PIECE_NB>, PIECE_NB> lut{};
+    for (Piece attacker : AllPieces)
+    {
+        const PieceType     attackerType = type_of(attacker);
+        const HelperOffsets helper       = helper_offsets[attacker];
+        for (Piece attacked : AllPieces)
+        {
+            const PieceType attackedType = type_of(attacked);
+            const bool      enemy        = (attacker ^ attacked) == 8;
+            const int       mapped = FullThreats::map[attackerType - 1][attackedType - 1];
+            const bool      excluded = mapped < 0;
+            const bool      semiExcluded =
+              attackerType == attackedType && (enemy || attackerType != PAWN);
+            const IndexType feature =
+              helper.cumulativeOffset
+              + (color_of(attacked) * (numValidTargets[attacker] / 2) + mapped)
+                  * helper.cumulativePieceOffset;
+            lut[attacker][attacked][0] = excluded ? FullThreats::Dimensions : feature;
+            lut[attacker][attacked][1] =
+              (excluded || semiExcluded) ? FullThreats::Dimensions : feature;
+        }
+    }
+    return lut;
+}
+// The final index is calculated by summing the entries found in these two LUTs
+// and offsets[attacker][from] (see make_index()).
+// index_lut1 is indexed as [attacker][attacked][from < to]
+constexpr auto index_lut1 = init_index_luts();
+// index_lut2 is indexed as [attacker][from][to]
+constexpr auto index_lut2 = index_lut2_array();
+// Computes the feature index of one threat (attacker on `from` hitting `attacked`
+// on `to`) as seen from `perspective` with that side's king on `ksq`.
+// Squares are XOR-ed with an orientation mask and piece codes with a colour mask
+// before the three lookup tables are summed.
+inline sf_always_inline IndexType FullThreats::make_index(
+  Color perspective, Piece attacker, Square from, Square to, Piece attacked, Square ksq) {
+    // Square mask: king-dependent table entry combined with the perspective flip.
+    const std::int8_t squareMask = OrientTBL[ksq] ^ (56 * perspective);
+    // Piece mask: toggles the colour bit of both piece codes for the second perspective.
+    const std::int8_t pieceMask = 8 * perspective;
+    const unsigned src    = uint8_t(from) ^ squareMask;
+    const unsigned dst    = uint8_t(to) ^ squareMask;
+    const unsigned hitter = attacker ^ pieceMask;
+    const unsigned victim = attacked ^ pieceMask;
+    const auto pairPart   = index_lut1[hitter][victim][src < dst];
+    const auto originPart = offsets[hitter][src];
+    const auto targetPart = index_lut2[hitter][src][dst];
+    return pairPart + originPart + targetPart;
+}
+// Appends to `active` the index of every threat feature present in `pos`, seen
+// from `perspective`. Pieces of `perspective` are visited before the other side's,
+// piece types from PAWN up to (not including) KING.
+void FullThreats::append_active_indices(Color perspective, const Position& pos, IndexList& active) {
+    const Square   ksq      = pos.square<KING>(perspective);
+    const Bitboard occupied = pos.pieces();
+    // Common tail of every loop below: identify the attacked piece, build the index
+    // and keep it only when it is below Dimensions.
+    const auto emit = [&](Piece attacker, Square from, Square to) {
+        const Piece     attacked = pos.piece_on(to);
+        const IndexType index    = make_index(perspective, attacker, from, to, attacked, ksq);
+        if (index < Dimensions)
+            active.push_back(index);
+    };
+    for (Color color : {WHITE, BLACK})
+    {
+        const Color side = Color(perspective ^ color);
+        for (PieceType pt = PAWN; pt < KING; ++pt)
+        {
+            const Piece attacker = make_piece(side, pt);
+            Bitboard    pieces   = pos.pieces(side, pt);
+            if (pt != PAWN)
+            {
+                // Non-pawn pieces: one attack set per piece, restricted to occupied squares.
+                while (pieces)
+                {
+                    const Square from    = pop_lsb(pieces);
+                    Bitboard     targets = attacks_bb(pt, from, occupied) & occupied;
+                    while (targets)
+                        emit(attacker, from, pop_lsb(targets));
+                }
+                continue;
+            }
+            // Pawns: shift the whole pawn set once per capture direction, then recover
+            // each origin square by stepping back along that direction.
+            const bool white      = side == WHITE;
+            const auto firstStep  = white ? NORTH_EAST : SOUTH_WEST;
+            const auto secondStep = white ? NORTH_WEST : SOUTH_EAST;
+            auto       firstHits =
+              (white ? shift<NORTH_EAST>(pieces) : shift<SOUTH_WEST>(pieces)) & occupied;
+            auto secondHits =
+              (white ? shift<NORTH_WEST>(pieces) : shift<SOUTH_EAST>(pieces)) & occupied;
+            while (firstHits)
+            {
+                const Square to = pop_lsb(firstHits);
+                emit(attacker, to - firstStep, to);
+            }
+            while (secondHits)
+            {
+                const Square to = pop_lsb(secondHits);
+                emit(attacker, to - secondStep, to);
+            }
+        }
+    }
+}
+// Translates the threat changes recorded in `diff` into feature indices, pushing
+// them to `added` or `removed` according to each entry's add() flag.
+// When `fusedData` is given, threats whose origin or target is the square
+// fusedData->dp2removed are tracked in its two bitboards: on the `first` call an
+// added threat is recorded there and not emitted; a removed threat already
+// recorded there is not emitted either.
+// When `prefetchBase` is non-null, the address prefetchBase + index * prefetchStride
+// is prefetched for every emitted index.
+void FullThreats::append_changed_indices(Color                   perspective,
+                                         Square                  ksq,
+                                         const DiffType&         diff,
+                                         IndexList&              removed,
+                                         IndexList&              added,
+                                         FusedUpdateData*        fusedData,
+                                         bool                    first,
+                                         const ThreatWeightType* prefetchBase,
+                                         IndexType               prefetchStride) {
+    for (const auto& dirty : diff.list)
+    {
+        const auto attacker = dirty.pc();
+        const auto attacked = dirty.threatened_pc();
+        const auto from     = dirty.pc_sq();
+        const auto to       = dirty.threatened_sq();
+        const auto add      = dirty.add();
+        if (fusedData)
+        {
+            // True when this entry is handled through `board` and must not be emitted.
+            const auto absorbed = [&](Bitboard& board, auto other) {
+                if (add)
+                {
+                    if (!first)
+                        return false;
+                    board |= other;
+                    return true;
+                }
+                return bool(board & other);
+            };
+            if (from == fusedData->dp2removed && absorbed(fusedData->dp2removedOriginBoard, to))
+                continue;
+            if (to != SQ_NONE && to == fusedData->dp2removed
+                && absorbed(fusedData->dp2removedTargetBoard, from))
+                continue;
+        }
+        const IndexType index = make_index(perspective, attacker, from, to, attacked, ksq);
+        if (!(index < Dimensions))
+            continue;
+        if (prefetchBase)
+            prefetch<PrefetchRw::READ, PrefetchLoc::LOW>(
+              prefetchBase + static_cast<std::ptrdiff_t>(index) * prefetchStride);
+        if (add)
+            added.push_back(index);
+        else
+            removed.push_back(index);
+    }
+}
+// A refresh is needed only for the side recorded in diff.us, and only when bit 2 of
+// its king square index changed, i.e. the bit on which each row of OrientTBL
+// switches between its two values.
+bool FullThreats::requires_refresh(const DiffType& diff, Color perspective) {
+    if (perspective != diff.us)
+        return false;
+    const auto currentHalf  = int8_t(diff.ksq) & 0b100;
+    const auto previousHalf = int8_t(diff.prevKsq) & 0b100;
+    return currentHalf != previousHalf;
+}
+}  // namespace Stockfish::Eval::NNUE::Features

src/nnue/features/full_threats.h ADDED Viewed

	@@ -0,0 +1,106 @@

+/*
+  Stockfish, a UCI chess playing engine derived from Glaurung 2.1
+  Copyright (C) 2004-2026 The Stockfish developers (see AUTHORS file)
+  Stockfish is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+  Stockfish is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+  You should have received a copy of the GNU General Public License
+  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+// Definition of input features Full_Threats of NNUE evaluation function
+#ifndef NNUE_FEATURES_FULL_THREATS_INCLUDED
+#define NNUE_FEATURES_FULL_THREATS_INCLUDED
+#include <cstdint>
+#include "../../misc.h"
+#include "../../types.h"
+#include "../nnue_common.h"
+namespace Stockfish {
+class Position;
+}
+namespace Stockfish::Eval::NNUE::Features {
+static constexpr int numValidTargets[PIECE_NB] = {0, 6, 10, 8, 8, 10, 0, 0,
+                                                  0, 6, 10, 8, 8, 10, 0, 0};  // indexed by Piece; each value equals twice the count of non-negative entries in that attacker type's FullThreats::map row
+class FullThreats {
+   public:
+    // Feature name
+    static constexpr const char* Name = "Full_Threats(Friend)";
+    // Hash value embedded in the evaluation file
+    static constexpr std::uint32_t HashValue = 0x8f234cb8u;
+    // Number of feature dimensions; indices >= Dimensions are treated as "no feature" by the append_* functions
+    static constexpr IndexType Dimensions = 60144;
+    // clang-format off
+    // XOR mask selected by the king square: SQ_H1 when the king is on one of the last four squares of its row, SQ_A1 otherwise (make_index also XORs in 56 * perspective)
+    static constexpr std::int8_t OrientTBL[SQUARE_NB] = {
+        SQ_A1, SQ_A1, SQ_A1, SQ_A1, SQ_H1, SQ_H1, SQ_H1, SQ_H1,
+        SQ_A1, SQ_A1, SQ_A1, SQ_A1, SQ_H1, SQ_H1, SQ_H1, SQ_H1,
+        SQ_A1, SQ_A1, SQ_A1, SQ_A1, SQ_H1, SQ_H1, SQ_H1, SQ_H1,
+        SQ_A1, SQ_A1, SQ_A1, SQ_A1, SQ_H1, SQ_H1, SQ_H1, SQ_H1,
+        SQ_A1, SQ_A1, SQ_A1, SQ_A1, SQ_H1, SQ_H1, SQ_H1, SQ_H1,
+        SQ_A1, SQ_A1, SQ_A1, SQ_A1, SQ_H1, SQ_H1, SQ_H1, SQ_H1,
+        SQ_A1, SQ_A1, SQ_A1, SQ_A1, SQ_H1, SQ_H1, SQ_H1, SQ_H1,
+        SQ_A1, SQ_A1, SQ_A1, SQ_A1, SQ_H1, SQ_H1, SQ_H1, SQ_H1,
+    };
+    static constexpr int map[PIECE_TYPE_NB-2][PIECE_TYPE_NB-2] = {  // read as map[attacker type - 1][attacked type - 1]; a negative entry excludes the pairing
+      { 0,  1, -1,  2, -1, -1},
+      { 0,  1,  2,  3,  4, -1},
+      { 0,  1,  2,  3, -1, -1},
+      { 0,  1,  2,  3, -1, -1},
+      { 0,  1,  2,  3,  4, -1},
+      {-1, -1, -1, -1, -1, -1}
+    };
+    // clang-format on
+    struct FusedUpdateData {  // scratch state shared between the calls of append_changed_indices that receive the same object
+        Bitboard dp2removedOriginBoard = 0;  // targets of added threats that originate on dp2removed
+        Bitboard dp2removedTargetBoard = 0;  // origins of added threats that point at dp2removed
+        Square dp2removed;  // NOTE(review): no default initializer; presumably always set by the caller before use - confirm
+    };
+    // Maximum number of simultaneously active features.
+    static constexpr IndexType MaxActiveDimensions = 128;
+    using IndexList                                = ValueList<IndexType, MaxActiveDimensions>;
+    using DiffType                                 = DirtyThreats;
+    static IndexType
+    make_index(Color perspective, Piece attkr, Square from, Square to, Piece attkd, Square ksq);  // index of the threat "attkr on from hits attkd on to" for the given perspective and king square
+    // Get a list of indices for active features
+    static void append_active_indices(Color perspective, const Position& pos, IndexList& active);
+    // Get a list of indices for recently changed features
+    static void append_changed_indices(Color                   perspective,
+                                       Square                  ksq,
+                                       const DiffType&         diff,
+                                       IndexList&              removed,
+                                       IndexList&              added,
+                                       FusedUpdateData*        fd             = nullptr,
+                                       bool                    first          = false,
+                                       const ThreatWeightType* prefetchBase   = nullptr,
+                                       IndexType               prefetchStride = 0);
+    // Returns whether the change stored in this DirtyThreats means
+    // that a full accumulator refresh is required.
+    static bool requires_refresh(const DiffType& diff, Color perspective);
+};
+}  // namespace Stockfish::Eval::NNUE::Features
+#endif  // #ifndef NNUE_FEATURES_FULL_THREATS_INCLUDED

src/nnue/features/half_ka_v2_hm.cpp ADDED Viewed

	@@ -0,0 +1,69 @@

+/*
+  Stockfish, a UCI chess playing engine derived from Glaurung 2.1
+  Copyright (C) 2004-2026 The Stockfish developers (see AUTHORS file)
+  Stockfish is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+  Stockfish is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+  You should have received a copy of the GNU General Public License
+  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+//Definition of input features HalfKAv2_hm of NNUE evaluation function
+#include "half_ka_v2_hm.h"
+#include "../../bitboard.h"
+#include "../../position.h"
+#include "../../types.h"
+#include "../nnue_common.h"
+namespace Stockfish::Eval::NNUE::Features {
+// Feature index of piece `pc` on square `s`, seen from `perspective` whose king
+// stands on `ksq`: oriented square + piece-type base + king-bucket base.
+IndexType HalfKAv2_hm::make_index(Color perspective, Square s, Piece pc, Square ksq) {
+    const IndexType flip       = 56 * perspective;
+    const IndexType orientedSq = IndexType(s) ^ OrientTBL[ksq] ^ flip;
+    const IndexType pieceBase  = PieceSquareIndex[perspective][pc];
+    const IndexType bucketBase = KingBuckets[int(ksq) ^ flip];
+    return orientedSq + pieceBase + bucketBase;
+}
+// Appends one feature index per occupied square of `pos`, computed relative to
+// the king of `perspective`.
+void HalfKAv2_hm::append_active_indices(Color perspective, const Position& pos, IndexList& active) {
+    const Square ksq = pos.square<KING>(perspective);
+    for (Bitboard remaining = pos.pieces(); remaining;)
+    {
+        const Square s  = pop_lsb(remaining);
+        const Piece  pc = pos.piece_on(s);
+        active.push_back(make_index(perspective, s, pc, ksq));
+    }
+}
+// Converts the piece changes in `diff` into feature indices: the moved piece
+// always leaves `from`; its destination and the optional extra removed/added
+// pieces are emitted only when their square is not SQ_NONE.
+void HalfKAv2_hm::append_changed_indices(
+  Color perspective, Square ksq, const DiffType& diff, IndexList& removed, IndexList& added) {
+    const auto index_of = [&](auto sq, auto pc) { return make_index(perspective, sq, pc, ksq); };
+    removed.push_back(index_of(diff.from, diff.pc));
+    const bool hasDestination = diff.to != SQ_NONE;
+    if (hasDestination)
+        added.push_back(index_of(diff.to, diff.pc));
+    const bool hasExtraRemoval = diff.remove_sq != SQ_NONE;
+    if (hasExtraRemoval)
+        removed.push_back(index_of(diff.remove_sq, diff.remove_pc));
+    const bool hasExtraAddition = diff.add_sq != SQ_NONE;
+    if (hasExtraAddition)
+        added.push_back(index_of(diff.add_sq, diff.add_pc));
+}
+// A full refresh is required exactly when the moved piece is the king of `perspective`.
+bool HalfKAv2_hm::requires_refresh(const DiffType& diff, Color perspective) {
+    const Piece ownKing = make_piece(perspective, KING);
+    return diff.pc == ownKing;
+}
+}  // namespace Stockfish::Eval::NNUE::Features

src/nnue/features/half_ka_v2_hm.h ADDED Viewed

	@@ -0,0 +1,128 @@

+/*
+  Stockfish, a UCI chess playing engine derived from Glaurung 2.1
+  Copyright (C) 2004-2026 The Stockfish developers (see AUTHORS file)
+  Stockfish is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+  Stockfish is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+  You should have received a copy of the GNU General Public License
+  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+// Definition of input features HalfKAv2_hm of NNUE evaluation function
+#ifndef NNUE_FEATURES_HALF_KA_V2_HM_H_INCLUDED
+#define NNUE_FEATURES_HALF_KA_V2_HM_H_INCLUDED
+#include <cstdint>
+#include "../../misc.h"
+#include "../../types.h"
+#include "../nnue_common.h"
+namespace Stockfish {
+class Position;
+}
+namespace Stockfish::Eval::NNUE::Features {
+// Feature HalfKAv2_hm: Combination of the position of own king and the
+// position of pieces. Position mirrored such that king is always on e..h files.
+class HalfKAv2_hm {
+    // Unique number for each piece type on each square
+    enum {
+        PS_NONE     = 0,  // shares the value 0 with PS_W_PAWN
+        PS_W_PAWN   = 0,
+        PS_B_PAWN   = 1 * SQUARE_NB,
+        PS_W_KNIGHT = 2 * SQUARE_NB,
+        PS_B_KNIGHT = 3 * SQUARE_NB,
+        PS_W_BISHOP = 4 * SQUARE_NB,
+        PS_B_BISHOP = 5 * SQUARE_NB,
+        PS_W_ROOK   = 6 * SQUARE_NB,
+        PS_B_ROOK   = 7 * SQUARE_NB,
+        PS_W_QUEEN  = 8 * SQUARE_NB,
+        PS_B_QUEEN  = 9 * SQUARE_NB,
+        PS_KING     = 10 * SQUARE_NB,  // one block shared by both kings
+        PS_NB       = 11 * SQUARE_NB
+    };
+    static constexpr IndexType PieceSquareIndex[COLOR_NB][PIECE_NB] = {  // read as PieceSquareIndex[perspective][piece] in make_index
+      // Convention: W - us, B - them
+      // Viewed from other side, W and B are reversed
+      {PS_NONE, PS_W_PAWN, PS_W_KNIGHT, PS_W_BISHOP, PS_W_ROOK, PS_W_QUEEN, PS_KING, PS_NONE,
+       PS_NONE, PS_B_PAWN, PS_B_KNIGHT, PS_B_BISHOP, PS_B_ROOK, PS_B_QUEEN, PS_KING, PS_NONE},
+      {PS_NONE, PS_B_PAWN, PS_B_KNIGHT, PS_B_BISHOP, PS_B_ROOK, PS_B_QUEEN, PS_KING, PS_NONE,
+       PS_NONE, PS_W_PAWN, PS_W_KNIGHT, PS_W_BISHOP, PS_W_ROOK, PS_W_QUEEN, PS_KING, PS_NONE}};
+   public:
+    // Feature name
+    static constexpr const char* Name = "HalfKAv2_hm(Friend)";
+    // Hash value embedded in the evaluation file
+    static constexpr std::uint32_t HashValue = 0x7f234cb8u;
+    // Number of feature dimensions
+    static constexpr IndexType Dimensions =
+      static_cast<IndexType>(SQUARE_NB) * static_cast<IndexType>(PS_NB) / 2;  // SQUARE_NB / 2 king buckets (KingBuckets spans B(0)..B(31)) times PS_NB
+#define B(v) (v * PS_NB)
+    // clang-format off
+    static constexpr IndexType KingBuckets[SQUARE_NB] = {  // bucket base offset per king square; each row is symmetric left-to-right
+        B(28), B(29), B(30), B(31), B(31), B(30), B(29), B(28),
+        B(24), B(25), B(26), B(27), B(27), B(26), B(25), B(24),
+        B(20), B(21), B(22), B(23), B(23), B(22), B(21), B(20),
+        B(16), B(17), B(18), B(19), B(19), B(18), B(17), B(16),
+        B(12), B(13), B(14), B(15), B(15), B(14), B(13), B(12),
+        B( 8), B( 9), B(10), B(11), B(11), B(10), B( 9), B( 8),
+        B( 4), B( 5), B( 6), B( 7), B( 7), B( 6), B( 5), B( 4),
+        B( 0), B( 1), B( 2), B( 3), B( 3), B( 2), B( 1), B( 0),
+    };
+    // clang-format on
+#undef B
+    // clang-format off
+    // XOR mask selected by the king square: SQ_H1 when the king is on one of the first four squares of its row, SQ_A1 otherwise (make_index also XORs in 56 * perspective)
+    static constexpr IndexType OrientTBL[SQUARE_NB] = {
+        SQ_H1, SQ_H1, SQ_H1, SQ_H1, SQ_A1, SQ_A1, SQ_A1, SQ_A1,
+        SQ_H1, SQ_H1, SQ_H1, SQ_H1, SQ_A1, SQ_A1, SQ_A1, SQ_A1,
+        SQ_H1, SQ_H1, SQ_H1, SQ_H1, SQ_A1, SQ_A1, SQ_A1, SQ_A1,
+        SQ_H1, SQ_H1, SQ_H1, SQ_H1, SQ_A1, SQ_A1, SQ_A1, SQ_A1,
+        SQ_H1, SQ_H1, SQ_H1, SQ_H1, SQ_A1, SQ_A1, SQ_A1, SQ_A1,
+        SQ_H1, SQ_H1, SQ_H1, SQ_H1, SQ_A1, SQ_A1, SQ_A1, SQ_A1,
+        SQ_H1, SQ_H1, SQ_H1, SQ_H1, SQ_A1, SQ_A1, SQ_A1, SQ_A1,
+        SQ_H1, SQ_H1, SQ_H1, SQ_H1, SQ_A1, SQ_A1, SQ_A1, SQ_A1 ,
+    };
+    // clang-format on
+    // Maximum number of simultaneously active features.
+    static constexpr IndexType MaxActiveDimensions = 32;
+    using IndexList                                = ValueList<IndexType, MaxActiveDimensions>;
+    using DiffType                                 = DirtyPiece;
+    // Index of a feature for a given king position and another piece on some square
+    static IndexType make_index(Color perspective, Square s, Piece pc, Square ksq);
+    // Get a list of indices for active features
+    static void append_active_indices(Color perspective, const Position& pos, IndexList& active);
+    // Get a list of indices for recently changed features
+    static void append_changed_indices(
+      Color perspective, Square ksq, const DiffType& diff, IndexList& removed, IndexList& added);
+    // Returns whether the change stored in this DirtyPiece means
+    // that a full accumulator refresh is required.
+    static bool requires_refresh(const DiffType& diff, Color perspective);
+};
+}  // namespace Stockfish::Eval::NNUE::Features
+#endif  // #ifndef NNUE_FEATURES_HALF_KA_V2_HM_H_INCLUDED

src/nnue/layers/affine_transform.h ADDED Viewed

	@@ -0,0 +1,312 @@

+/*
+  Stockfish, a UCI chess playing engine derived from Glaurung 2.1
+  Copyright (C) 2004-2026 The Stockfish developers (see AUTHORS file)
+  Stockfish is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+  Stockfish is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+  You should have received a copy of the GNU General Public License
+  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+// Definition of layer AffineTransform of NNUE evaluation function
+#ifndef NNUE_LAYERS_AFFINE_TRANSFORM_H_INCLUDED
+#define NNUE_LAYERS_AFFINE_TRANSFORM_H_INCLUDED
+#include <cstdint>
+#include <iostream>
+#include "../../memory.h"
+#include "../nnue_common.h"
+#include "../simd.h"
+/*
+  This file contains the definition for a fully connected layer (aka affine transform).
+    - expected use-case is for when PaddedInputDimensions == 32 and InputDimensions <= 32.
+      - that's why AVX512 is hard to implement
+    - expected use-case is small layers
+    - inputs are processed in chunks of 4, weights are respectively transposed
+    - accumulation happens directly to int32s
+*/
+namespace Stockfish::Eval::NNUE::Layers {
+#if defined(USE_SSSE3) || defined(USE_NEON_DOTPROD)
+    #define ENABLE_SEQ_OPT
+#endif
+// Fallback implementation for older/other architectures (compiled only when ENABLE_SEQ_OPT is not defined).
+// Requires the input to be padded to at least 16 values. Computes output[i] = biases[i] + dot(row i of weights, input) for every output i.
+#ifndef ENABLE_SEQ_OPT
+template<IndexType InputDimensions, IndexType PaddedInputDimensions, IndexType OutputDimensions>
+static void affine_transform_non_ssse3(std::int32_t*       output,
+                                       const std::int8_t*  weights,
+                                       const std::int32_t* biases,
+                                       const std::uint8_t* input) {
+    #if defined(USE_SSE2) || defined(USE_NEON)
+        #if defined(USE_SSE2)
+    // At least a multiple of 16, with SSE2.
+    constexpr IndexType NumChunks   = ceil_to_multiple<IndexType>(InputDimensions, 16) / 16;
+    const __m128i       Zeros       = _mm_setzero_si128();
+    const auto          inputVector = reinterpret_cast<const __m128i*>(input);
+        #elif defined(USE_NEON)
+    constexpr IndexType NumChunks   = ceil_to_multiple<IndexType>(InputDimensions, 16) / 16;
+    const auto          inputVector = reinterpret_cast<const int8x8_t*>(input);  // 8-byte vectors, so each 16-input chunk is two of them
+        #endif
+    for (IndexType i = 0; i < OutputDimensions; ++i)
+    {
+        const IndexType offset = i * PaddedInputDimensions;  // start of row i in the row-major weight array
+        #if defined(USE_SSE2)
+        __m128i    sumLo = _mm_cvtsi32_si128(biases[i]);  // bias seeds lane 0 of the accumulator
+        __m128i    sumHi = Zeros;
+        const auto row   = reinterpret_cast<const __m128i*>(&weights[offset]);
+        for (IndexType j = 0; j < NumChunks; ++j)
+        {
+            __m128i row_j           = _mm_load_si128(&row[j]);
+            __m128i input_j         = _mm_load_si128(&inputVector[j]);
+            __m128i extendedRowLo   = _mm_srai_epi16(_mm_unpacklo_epi8(row_j, row_j), 8);  // sign-extend the int8 weights to int16
+            __m128i extendedRowHi   = _mm_srai_epi16(_mm_unpackhi_epi8(row_j, row_j), 8);
+            __m128i extendedInputLo = _mm_unpacklo_epi8(input_j, Zeros);  // zero-extend the uint8 inputs to int16
+            __m128i extendedInputHi = _mm_unpackhi_epi8(input_j, Zeros);
+            __m128i productLo       = _mm_madd_epi16(extendedRowLo, extendedInputLo);
+            __m128i productHi       = _mm_madd_epi16(extendedRowHi, extendedInputHi);
+            sumLo                   = _mm_add_epi32(sumLo, productLo);
+            sumHi                   = _mm_add_epi32(sumHi, productHi);
+        }
+        __m128i sum           = _mm_add_epi32(sumLo, sumHi);  // from here on: horizontal sum of the four 32-bit lanes
+        __m128i sumHigh_64    = _mm_shuffle_epi32(sum, _MM_SHUFFLE(1, 0, 3, 2));
+        sum                   = _mm_add_epi32(sum, sumHigh_64);
+        __m128i sum_second_32 = _mm_shufflelo_epi16(sum, _MM_SHUFFLE(1, 0, 3, 2));
+        sum                   = _mm_add_epi32(sum, sum_second_32);
+        output[i]             = _mm_cvtsi128_si32(sum);
+        #elif defined(USE_NEON)
+        int32x4_t  sum = {biases[i]};  // bias in lane 0, remaining lanes zero
+        const auto row = reinterpret_cast<const SIMD::vec_i8x8_t*>(&weights[offset]);
+        for (IndexType j = 0; j < NumChunks; ++j)
+        {
+            int16x8_t product = vmull_s8(inputVector[j * 2], row[j * 2]);
+            product           = vmlal_s8(product, inputVector[j * 2 + 1], row[j * 2 + 1]);
+            sum               = vpadalq_s16(sum, product);
+        }
+        output[i] = SIMD::neon_m128_reduce_add_epi32(sum);
+        #endif
+    }
+    #else  // neither SSE2 nor NEON: portable scalar path
+    std::memcpy(output, biases, sizeof(std::int32_t) * OutputDimensions);  // NOTE(review): <cstring> is not included by this header directly; presumably it arrives via another include - confirm
+    // Traverse weights in transpose order to take advantage of input sparsity
+    for (IndexType i = 0; i < InputDimensions; ++i)
+        if (input[i])
+        {
+            const std::int8_t* w  = &weights[i];
+            const int          in = input[i];
+            for (IndexType j = 0; j < OutputDimensions; ++j)
+                output[j] += w[j * PaddedInputDimensions] * in;
+        }
+    #endif
+}
+#endif  // !ENABLE_SEQ_OPT
+template<IndexType InDims, IndexType OutDims>
+class AffineTransform {
+   public:
+    // Input/output type
+    using InputType  = std::uint8_t;
+    using OutputType = std::int32_t;
+    // Number of input/output dimensions
+    static constexpr IndexType InputDimensions  = InDims;
+    static constexpr IndexType OutputDimensions = OutDims;
+    static constexpr IndexType PaddedInputDimensions =
+      ceil_to_multiple<IndexType>(InputDimensions, MaxSimdWidth);
+    static constexpr IndexType PaddedOutputDimensions =
+      ceil_to_multiple<IndexType>(OutputDimensions, MaxSimdWidth);
+    using OutputBuffer = OutputType[PaddedOutputDimensions];
+    // Hash value embedded in the evaluation file; mixes a layer constant, OutputDimensions and the previous layer's hash
+    static constexpr std::uint32_t get_hash_value(std::uint32_t prevHash) {
+        std::uint32_t hashValue = 0xCC03DAE4u;
+        hashValue += OutputDimensions;
+        hashValue ^= prevHash >> 1;
+        hashValue ^= prevHash << 31;
+        return hashValue;
+    }
+    static constexpr IndexType get_weight_index_scrambled(IndexType i) {  // maps row-major index i (output * PaddedInputDimensions + input) to the layout propagate() reads: groups of 4 inputs, all outputs of a group stored together
+        return (i / 4) % (PaddedInputDimensions / 4) * OutputDimensions * 4
+             + i / PaddedInputDimensions * 4 + i % 4;
+    }
+    static constexpr IndexType get_weight_index(IndexType i) {  // storage position of the i-th weight as it appears in the file
+#ifdef ENABLE_SEQ_OPT
+        return get_weight_index_scrambled(i);
+#else
+        return i;
+#endif
+    }
+    // Read network parameters: biases first, then the weights in file order; returns false if the stream failed
+    bool read_parameters(std::istream& stream) {
+        read_little_endian<BiasType>(stream, biases, OutputDimensions);
+        for (IndexType i = 0; i < OutputDimensions * PaddedInputDimensions; ++i)
+            weights[get_weight_index(i)] = read_little_endian<WeightType>(stream);
+        return !stream.fail();
+    }
+    // Write network parameters in the same order read_parameters consumes them
+    bool write_parameters(std::ostream& stream) const {
+        write_little_endian<BiasType>(stream, biases, OutputDimensions);
+        for (IndexType i = 0; i < OutputDimensions * PaddedInputDimensions; ++i)
+            write_little_endian<WeightType>(stream, weights[get_weight_index(i)]);
+        return !stream.fail();
+    }
+    std::size_t get_content_hash() const {  // combines hashes of the raw biases, the raw weights and get_hash_value(0)
+        std::size_t h = 0;
+        hash_combine(h, get_raw_data_hash(biases));
+        hash_combine(h, get_raw_data_hash(weights));
+        hash_combine(h, get_hash_value(0));
+        return h;
+    }
+    // Forward propagation
+    void propagate(const InputType* input, OutputType* output) const {
+#ifdef ENABLE_SEQ_OPT
+        if constexpr (OutputDimensions > 1)
+        {
+    #if defined(USE_AVX512)
+            using vec_t = __m512i;
+        #define vec_set_32 _mm512_set1_epi32
+        #define vec_add_dpbusd_32 SIMD::m512_add_dpbusd_epi32
+    #elif defined(USE_AVX2)
+            using vec_t = __m256i;
+        #define vec_set_32 _mm256_set1_epi32
+        #define vec_add_dpbusd_32 SIMD::m256_add_dpbusd_epi32
+    #elif defined(USE_SSSE3)
+            using vec_t = __m128i;
+        #define vec_set_32 _mm_set1_epi32
+        #define vec_add_dpbusd_32 SIMD::m128_add_dpbusd_epi32
+    #elif defined(USE_NEON_DOTPROD)
+            using vec_t = int32x4_t;
+        #define vec_set_32 vdupq_n_s32
+        #define vec_add_dpbusd_32(acc, a, b) \
+            SIMD::dotprod_m128_add_dpbusd_epi32(acc, vreinterpretq_s8_s32(a), \
+                                                vreinterpretq_s8_s32(b))
+    #endif
+            static constexpr IndexType OutputSimdWidth = sizeof(vec_t) / sizeof(OutputType);
+            static_assert(OutputDimensions % OutputSimdWidth == 0);
+            constexpr IndexType NumChunks = ceil_to_multiple<IndexType>(InputDimensions, 8) / 4;  // number of 4-byte input groups processed
+            constexpr IndexType NumRegs   = OutputDimensions / OutputSimdWidth;
+            const vec_t* biasvec = reinterpret_cast<const vec_t*>(biases);
+            vec_t        acc[NumRegs];
+            for (IndexType k = 0; k < NumRegs; ++k)
+                acc[k] = biasvec[k];  // accumulators start from the biases
+            for (IndexType i = 0; i < NumChunks; ++i)
+            {
+                const vec_t in0 =
+                  vec_set_32(load_as<std::int32_t>(input + i * sizeof(std::int32_t)));  // the i-th group of 4 input bytes replicated into every 32-bit lane
+                const auto col0 =
+                  reinterpret_cast<const vec_t*>(&weights[i * OutputDimensions * 4]);  // weights of all outputs for this input group (scrambled layout)
+                for (IndexType k = 0; k < NumRegs; ++k)
+                    vec_add_dpbusd_32(acc[k], in0, col0[k]);
+            }
+            vec_t* outptr = reinterpret_cast<vec_t*>(output);
+            for (IndexType k = 0; k < NumRegs; ++k)
+                outptr[k] = acc[k];
+    #undef vec_set_32
+    #undef vec_add_dpbusd_32
+        }
+        else if constexpr (OutputDimensions == 1)
+        {
+    // We cannot use AVX512 for the last layer because there are only 32 inputs
+    // and the buffer is not padded to 64 elements.
+    #if defined(USE_AVX2)
+            using vec_t = __m256i;
+        #define vec_setzero() _mm256_setzero_si256()
+        #define vec_add_dpbusd_32 SIMD::m256_add_dpbusd_epi32
+        #define vec_hadd SIMD::m256_hadd
+    #elif defined(USE_SSSE3)
+            using vec_t = __m128i;
+        #define vec_setzero() _mm_setzero_si128()
+        #define vec_add_dpbusd_32 SIMD::m128_add_dpbusd_epi32
+        #define vec_hadd SIMD::m128_hadd
+    #elif defined(USE_NEON_DOTPROD)
+            using vec_t = int32x4_t;
+        #define vec_setzero() vdupq_n_s32(0)
+        #define vec_add_dpbusd_32(acc, a, b) \
+            SIMD::dotprod_m128_add_dpbusd_epi32(acc, vreinterpretq_s8_s32(a), \
+                                                vreinterpretq_s8_s32(b))
+        #define vec_hadd SIMD::neon_m128_hadd
+    #endif
+            const auto inputVector = reinterpret_cast<const vec_t*>(input);
+            static constexpr IndexType InputSimdWidth = sizeof(vec_t) / sizeof(InputType);
+            static_assert(PaddedInputDimensions % InputSimdWidth == 0);
+            constexpr IndexType NumChunks = PaddedInputDimensions / InputSimdWidth;
+            vec_t               sum0      = vec_setzero();
+            const auto          row0      = reinterpret_cast<const vec_t*>(&weights[0]);
+            for (int j = 0; j < int(NumChunks); ++j)
+            {
+                const vec_t in = inputVector[j];
+                vec_add_dpbusd_32(sum0, in, row0[j]);
+            }
+            output[0] = vec_hadd(sum0, biases[0]);  // presumably the horizontal sum of sum0 plus the bias - confirm against the SIMD helpers
+    #undef vec_setzero
+    #undef vec_add_dpbusd_32
+    #undef vec_hadd
+        }
+#else
+        // Use old implementation for the other architectures.
+        affine_transform_non_ssse3<InputDimensions, PaddedInputDimensions, OutputDimensions>(
+          output, weights, biases, input);
+#endif
+    }
+   private:
+    using BiasType   = OutputType;
+    using WeightType = std::int8_t;
+    alignas(CacheLineSize) BiasType biases[OutputDimensions];
+    alignas(CacheLineSize) WeightType weights[OutputDimensions * PaddedInputDimensions];  // scrambled order when ENABLE_SEQ_OPT is defined, row-major otherwise (see get_weight_index)
+};
+}  // namespace Stockfish::Eval::NNUE::Layers
+#endif  // #ifndef NNUE_LAYERS_AFFINE_TRANSFORM_H_INCLUDED

src/nnue/layers/affine_transform_sparse_input.h ADDED Viewed

	@@ -0,0 +1,379 @@

+/*
+  Stockfish, a UCI chess playing engine derived from Glaurung 2.1
+  Copyright (C) 2004-2026 The Stockfish developers (see AUTHORS file)
+  Stockfish is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+  Stockfish is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+  You should have received a copy of the GNU General Public License
+  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+// Definition of layer AffineTransformSparseInput of NNUE evaluation function
+#ifndef NNUE_LAYERS_AFFINE_TRANSFORM_SPARSE_INPUT_H_INCLUDED
+#define NNUE_LAYERS_AFFINE_TRANSFORM_SPARSE_INPUT_H_INCLUDED
+#include <algorithm>
+#include <cstddef>
+#include <cstdint>
+#include <iostream>
+#include "../../bitboard.h"
+#include "../../memory.h"
+#include "../simd.h"
+#include "../nnue_common.h"
+/*
+  This file contains the definition for a fully connected layer (aka affine transform) with block sparse input.
+*/
+namespace Stockfish::Eval::NNUE::Layers {
+#if (USE_SSSE3 | (USE_NEON >= 8))
+static constexpr int lsb_index64[64] = {  // bit-index table, addressed by the 6-bit key that constexpr_lsb() derives from a bitboard
+  0,  47, 1,  56, 48, 27, 2,  60, 57, 49, 41, 37, 28, 16, 3,  61, 54, 58, 35, 52, 50, 42,
+  21, 44, 38, 32, 29, 23, 17, 11, 4,  62, 46, 55, 26, 59, 40, 36, 15, 53, 34, 51, 20, 43,
+  31, 22, 10, 45, 25, 39, 14, 33, 19, 30, 9,  24, 13, 18, 8,  12, 7,  6,  5,  63};
+// Index of the least significant set bit of bb, usable in constant expressions.
+// bb must be nonzero (asserted).
+constexpr int constexpr_lsb(uint64_t bb) {
+    assert(bb != 0);
+    // bb ^ (bb - 1) keeps the lowest set bit of bb together with every bit below it
+    const uint64_t lowBits = bb ^ (bb - 1);
+    // The top 6 bits of the product select the matching lsb_index64 entry
+    const uint64_t slot = (lowBits * 0x03F79D71B4CB0A89ULL) >> 58;
+    return lsb_index64[slot];
+}
+// Compile-time table used by find_nnz(): row m lists the positions of the set bits of the
+// 8-bit mask m, lowest bit first, with the unused trailing slots filled with zero.
+alignas(CacheLineSize) static constexpr struct OffsetIndices {
+    std::uint16_t offset_indices[256][8];
+    constexpr OffsetIndices() :
+        offset_indices() {
+        for (int mask = 0; mask < 256; ++mask)
+        {
+            std::uint64_t slot = 0;
+            // Peel off the lowest set bit on every pass until the mask is exhausted
+            for (std::uint64_t bits = mask; bits != 0; bits &= bits - 1)
+                offset_indices[mask][slot++] = constexpr_lsb(bits);
+            // Pad the remainder of the row
+            for (; slot < 8; ++slot)
+                offset_indices[mask][slot] = 0;
+        }
+    }
+} Lookup;
+    #if defined(__GNUC__) || defined(__clang__)  // RESTRICT: per-compiler spelling of the restrict pointer qualifier
+        #define RESTRICT __restrict__
+    #elif defined(_MSC_VER)
+        #define RESTRICT __restrict
+    #else  // any other compiler: RESTRICT expands to nothing
+        #define RESTRICT
+    #endif
+/* Find indices of nonzero 32-bit values in a packed byte buffer.
+   InputDimensions : number of 32-bit blocks to scan (every branch below derives its
+                     loop count from it in units of such blocks)
+   input           : the bytes to scan, viewed as consecutive 32-bit blocks
+   out             : receives the 16-bit index of every nonzero block
+   count_out       : set to the number of indices produced
+   Every iteration stores a whole vector at out + count, so entries beyond count_out
+   can be overwritten with filler; only the first count_out entries are meaningful. */
+// The input pointer addresses a sequence of 32-bit blocks stored in a
+// std::uint8_t array.
+template<const IndexType InputDimensions>
+void find_nnz(const std::uint8_t* RESTRICT input,
+              std::uint16_t* RESTRICT      out,
+              IndexType&                   count_out) {
+    #if defined(USE_AVX512ICL)  // 32 blocks per iteration, working on 16-bit lanes
+    constexpr IndexType SimdWidthIn  = 64;  // 512 bits
+    constexpr IndexType SimdWidthOut = 32;  // 512 bits / 16 bits
+    constexpr IndexType NumChunks    = InputDimensions / SimdWidthOut;
+    const __m512i       increment    = _mm512_set1_epi16(SimdWidthOut);
+    __m512i             base = _mm512_set_epi16(  // Same permute order as _mm512_packus_epi32()
+      31, 30, 29, 28, 15, 14, 13, 12, 27, 26, 25, 24, 11, 10, 9, 8, 23, 22, 21, 20, 7, 6, 5, 4, 19,
+      18, 17, 16, 3, 2, 1, 0);
+    IndexType count = 0;
+    for (IndexType i = 0; i < NumChunks; ++i)
+    {
+        const __m512i inputV0 = _mm512_load_si512(input + i * 2 * SimdWidthIn);
+        const __m512i inputV1 = _mm512_load_si512(input + i * 2 * SimdWidthIn + SimdWidthIn);
+        // Get a bitmask and gather non zero indices
+        const __m512i   inputV01 = _mm512_packus_epi32(inputV0, inputV1);
+        const __mmask32 nnzMask  = _mm512_test_epi16_mask(inputV01, inputV01);
+        // Avoid _mm512_mask_compressstoreu_epi16() as it's 256 uOps on Zen4
+        __m512i nnz = _mm512_maskz_compress_epi16(nnzMask, base);
+        _mm512_storeu_si512(out + count, nnz);
+        count += popcount(nnzMask);  // advance by the number of nonzero blocks just found
+        base = _mm512_add_epi16(base, increment);  // block indices covered by the next iteration
+    }
+    count_out = count;
+    #elif defined(USE_AVX512)  // 16 blocks per iteration, working on 32-bit lanes
+    constexpr IndexType SimdWidth = 16;  // 512 bits / 32 bits
+    constexpr IndexType NumChunks = InputDimensions / SimdWidth;
+    const __m512i       increment = _mm512_set1_epi32(SimdWidth);
+    __m512i base = _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+    IndexType count = 0;
+    for (IndexType i = 0; i < NumChunks; ++i)
+    {
+        const __m512i inputV = _mm512_load_si512(input + i * SimdWidth * sizeof(std::uint32_t));
+        // Get a bitmask and gather non zero indices
+        const __mmask16 nnzMask = _mm512_test_epi32_mask(inputV, inputV);
+        const __m512i   nnzV    = _mm512_maskz_compress_epi32(nnzMask, base);
+        _mm512_mask_cvtepi32_storeu_epi16(out + count, 0xFFFF, nnzV);  // narrow the 32-bit indices to 16 bits while storing
+        count += popcount(nnzMask);
+        base = _mm512_add_epi32(base, increment);
+    }
+    count_out = count;
+    #else  // generic path: eight blocks per iteration through the Lookup offset table
+    using namespace SIMD;
+    constexpr IndexType InputSimdWidth = sizeof(vec_uint_t) / sizeof(std::int32_t);
+    // Outputs are processed 8 elements at a time, even if the SIMD width is narrower
+    constexpr IndexType ChunkSize      = 8;
+    constexpr IndexType NumChunks      = InputDimensions / ChunkSize;
+    constexpr IndexType InputsPerChunk = ChunkSize / InputSimdWidth;
+    static_assert(InputsPerChunk > 0 && "SIMD width too wide");
+    const auto     inputVector = reinterpret_cast<const vec_uint_t*>(input);
+    IndexType      count       = 0;
+    vec128_t       base        = vec128_zero;
+    const vec128_t increment   = vec128_set_16(8);
+    for (IndexType i = 0; i < NumChunks; ++i)
+    {
+        // bitmask of nonzero values in this chunk
+        unsigned nnz = 0;
+        for (IndexType j = 0; j < InputsPerChunk; ++j)
+        {
+            const vec_uint_t inputChunk = inputVector[i * InputsPerChunk + j];
+            nnz |= unsigned(vec_nnz(inputChunk)) << (j * InputSimdWidth);
+        }
+        const vec128_t offsets =
+          vec128_load(reinterpret_cast<const vec128_t*>(&Lookup.offset_indices[nnz]));  // positions of the set bits of nnz
+        vec128_storeu(reinterpret_cast<vec128_t*>(out + count), vec128_add(base, offsets));  // base turns the offsets into block indices
+        count += popcount(nnz);
+        base = vec128_add(base, increment);
+    }
+    count_out = count;
+    #endif
+}
+#endif
+// Sparse input implementation
+// Fully connected layer computing output = biases + weights * input for 8-bit inputs,
+// 8-bit weights and 32-bit outputs. On SSSE3/NEON builds propagate() only visits the
+// 4-byte input blocks that find_nnz() reports as nonzero; all other builds fall back to
+// the dense affine_transform_non_ssse3() routine.
+template<IndexType InDims, IndexType OutDims>
+class AffineTransformSparseInput {
+   public:
+    // Input/output type
+    using InputType  = std::uint8_t;
+    using OutputType = std::int32_t;
+    // Number of input/output dimensions
+    static constexpr IndexType InputDimensions  = InDims;
+    static constexpr IndexType OutputDimensions = OutDims;
+    static_assert(OutputDimensions % 16 == 0,
+                  "Only implemented for OutputDimensions divisible by 16.");
+    static constexpr IndexType PaddedInputDimensions =
+      ceil_to_multiple<IndexType>(InputDimensions, MaxSimdWidth);
+    static constexpr IndexType PaddedOutputDimensions =
+      ceil_to_multiple<IndexType>(OutputDimensions, MaxSimdWidth);
+    // Number of consecutive input bytes handled as one unit: a 32-bit block on the SIMD
+    // paths, a single byte otherwise
+#if (USE_SSSE3 | (USE_NEON >= 8))
+    static constexpr IndexType ChunkSize = 4;
+#else
+    static constexpr IndexType ChunkSize = 1;
+#endif
+    using OutputBuffer = OutputType[PaddedOutputDimensions];
+    // Hash value embedded in the evaluation file
+    // Mixes a layer constant, OutputDimensions and prevHash rotated right by one bit.
+    static constexpr std::uint32_t get_hash_value(std::uint32_t prevHash) {
+        std::uint32_t hashValue = 0xCC03DAE4u;
+        hashValue += OutputDimensions;
+        hashValue ^= prevHash >> 1;
+        hashValue ^= prevHash << 31;
+        return hashValue;
+    }
+    // Maps the row-major index i = output * PaddedInputDimensions + input to the in-memory
+    // layout used by the SIMD paths: one column of OutputDimensions * ChunkSize bytes per
+    // input chunk, inside which every output owns ChunkSize consecutive bytes.
+    static constexpr IndexType get_weight_index_scrambled(IndexType i) {
+        return (i / ChunkSize) % (PaddedInputDimensions / ChunkSize) * OutputDimensions * ChunkSize
+             + i / PaddedInputDimensions * ChunkSize + i % ChunkSize;
+    }
+    // Position inside weights[] of the weight whose row-major index is i
+    // (scrambled on the SIMD paths, identity otherwise).
+    static constexpr IndexType get_weight_index(IndexType i) {
+#if (USE_SSSE3 | (USE_NEON >= 8))
+        return get_weight_index_scrambled(i);
+#else
+        return i;
+#endif
+    }
+    // Read network parameters
+    // Biases come first, then the weights in row-major order; each weight is stored at
+    // get_weight_index(i). Returns false if the stream reports a failure.
+    bool read_parameters(std::istream& stream) {
+        read_little_endian<BiasType>(stream, biases, OutputDimensions);
+        for (IndexType i = 0; i < OutputDimensions * PaddedInputDimensions; ++i)
+            weights[get_weight_index(i)] = read_little_endian<WeightType>(stream);
+        return !stream.fail();
+    }
+    // Write network parameters
+    // Mirror image of read_parameters(): the weights leave in row-major order regardless
+    // of the in-memory layout. Returns false if the stream reports a failure.
+    bool write_parameters(std::ostream& stream) const {
+        write_little_endian<BiasType>(stream, biases, OutputDimensions);
+        for (IndexType i = 0; i < OutputDimensions * PaddedInputDimensions; ++i)
+            write_little_endian<WeightType>(stream, weights[get_weight_index(i)]);
+        return !stream.fail();
+    }
+    // Hash over the raw biases and weights plus the layer hash seeded with 0
+    std::size_t get_content_hash() const {
+        std::size_t h = 0;
+        hash_combine(h, get_raw_data_hash(biases));
+        hash_combine(h, get_raw_data_hash(weights));
+        hash_combine(h, get_hash_value(0));
+        return h;
+    }
+    // Forward propagation
+    // input  : the layer input; the SIMD paths scan ceil_to_multiple(InputDimensions, 8) bytes
+    // output : receives OutputDimensions 32-bit sums
+    void propagate(const InputType* input, OutputType* output) const {
+#if (USE_SSSE3 | (USE_NEON >= 8))
+        // Select vector types and helper macros for the widest available instruction set
+    #if defined(USE_AVX512)
+        using invec_t  = __m512i;
+        using outvec_t = __m512i;
+        #define vec_add_32 _mm512_add_epi32
+        #define vec_set_32 _mm512_set1_epi32
+        #define vec_add_dpbusd_32 SIMD::m512_add_dpbusd_epi32
+    #elif defined(USE_AVX2)
+        using invec_t  = __m256i;
+        using outvec_t = __m256i;
+        #define vec_add_32 _mm256_add_epi32
+        #define vec_set_32 _mm256_set1_epi32
+        #define vec_add_dpbusd_32 SIMD::m256_add_dpbusd_epi32
+    #elif defined(USE_SSSE3)
+        using invec_t  = __m128i;
+        using outvec_t = __m128i;
+        #define vec_set_32 _mm_set1_epi32
+        #define vec_add_dpbusd_32 SIMD::m128_add_dpbusd_epi32
+    #elif defined(USE_NEON_DOTPROD)
+        using invec_t  = int8x16_t;
+        using outvec_t = int32x4_t;
+        #define vec_set_32(a) vreinterpretq_s8_u32(vdupq_n_u32(a))
+        #define vec_add_dpbusd_32 SIMD::dotprod_m128_add_dpbusd_epi32
+    #elif defined(USE_NEON)
+        using invec_t  = int8x16_t;
+        using outvec_t = int32x4_t;
+        #define vec_set_32(a) vreinterpretq_s8_u32(vdupq_n_u32(a))
+        #define vec_add_dpbusd_32 SIMD::neon_m128_add_dpbusd_epi32
+    #endif
+        constexpr IndexType OutputSimdWidth = sizeof(outvec_t) / sizeof(OutputType);
+        constexpr IndexType NumChunks = ceil_to_multiple<IndexType>(InputDimensions, 8) / ChunkSize;
+        constexpr IndexType NumAccums = OutputDimensions / OutputSimdWidth;
+        // If we're using high-latency dot product instructions, split the accumulators
+        // to create 3 separate dependency chains and merge at the end
+        constexpr IndexType NumRegs =
+    #if defined(USE_VNNI)
+          3 * NumAccums;
+    #else
+          NumAccums;
+    #endif
+        std::uint16_t nnz[NumChunks];
+        IndexType     count;
+        // Find indices of nonzero 32-bit blocks
+        find_nnz<NumChunks>(input, nnz, count);
+        // The first NumAccums accumulators start out as the biases
+        const outvec_t* biasvec = reinterpret_cast<const outvec_t*>(biases);
+        outvec_t        acc[NumRegs];
+        for (IndexType k = 0; k < NumAccums; ++k)
+            acc[k] = biasvec[k];
+        const auto* start = nnz;
+        const auto* end   = nnz + count;
+        // convince GCC to not do weird pointer arithmetic in the following loop
+        const std::int8_t* weights_cp = weights;
+    #if defined(USE_VNNI)
+        for (IndexType k = NumAccums; k < NumRegs; ++k)
+            acc[k] = vec_zero();
+        // Consume three nonzero blocks per pass, one per dependency chain.
+        // The remaining count is compared instead of forming end - 2, which would point
+        // before the start of nnz (undefined behaviour) when fewer than two indices exist.
+        while (end - start > 2)
+        {
+            const std::ptrdiff_t i0 = *start++;
+            const std::ptrdiff_t i1 = *start++;
+            const std::ptrdiff_t i2 = *start++;
+            const invec_t        in0 =
+              vec_set_32(load_as<std::int32_t>(input + i0 * sizeof(std::int32_t)));
+            const invec_t in1 =
+              vec_set_32(load_as<std::int32_t>(input + i1 * sizeof(std::int32_t)));
+            const invec_t in2 =
+              vec_set_32(load_as<std::int32_t>(input + i2 * sizeof(std::int32_t)));
+            const auto col0 =
+              reinterpret_cast<const invec_t*>(&weights_cp[i0 * OutputDimensions * ChunkSize]);
+            const auto col1 =
+              reinterpret_cast<const invec_t*>(&weights_cp[i1 * OutputDimensions * ChunkSize]);
+            const auto col2 =
+              reinterpret_cast<const invec_t*>(&weights_cp[i2 * OutputDimensions * ChunkSize]);
+            for (IndexType k = 0; k < NumAccums; ++k)
+            {
+                vec_add_dpbusd_32(acc[k], in0, col0[k]);
+                vec_add_dpbusd_32(acc[k + NumAccums], in1, col1[k]);
+                vec_add_dpbusd_32(acc[k + 2 * NumAccums], in2, col2[k]);
+            }
+        }
+        // Merge the three chains back into the first NumAccums accumulators
+        for (IndexType k = 0; k < NumAccums; ++k)
+            acc[k] = vec_add_32(vec_add_32(acc[k], acc[k + NumAccums]), acc[k + 2 * NumAccums]);
+    #endif
+        // Remaining nonzero blocks (all of them when USE_VNNI is not defined): broadcast
+        // the 4 input bytes and accumulate against the matching weight column
+        while (start < end)
+        {
+            const std::ptrdiff_t i = *start++;
+            const invec_t in = vec_set_32(load_as<std::int32_t>(input + i * sizeof(std::int32_t)));
+            const auto    col =
+              reinterpret_cast<const invec_t*>(&weights_cp[i * OutputDimensions * ChunkSize]);
+            for (IndexType k = 0; k < NumAccums; ++k)
+                vec_add_dpbusd_32(acc[k], in, col[k]);
+        }
+        outvec_t* outptr = reinterpret_cast<outvec_t*>(output);
+        for (IndexType k = 0; k < NumAccums; ++k)
+            outptr[k] = acc[k];
+    #undef vec_set_32
+    #undef vec_add_dpbusd_32
+    #ifdef vec_add_32
+        #undef vec_add_32
+    #endif
+#else
+        // Use dense implementation for the other architectures.
+        affine_transform_non_ssse3<InputDimensions, PaddedInputDimensions, OutputDimensions>(
+          output, weights, biases, input);
+#endif
+    }
+   private:
+    using BiasType   = OutputType;
+    using WeightType = std::int8_t;
+    alignas(CacheLineSize) BiasType biases[OutputDimensions];
+    alignas(CacheLineSize) WeightType weights[OutputDimensions * PaddedInputDimensions];
+};
+}  // namespace Stockfish::Eval::NNUE::Layers
+#endif  // #ifndef NNUE_LAYERS_AFFINE_TRANSFORM_SPARSE_INPUT_H_INCLUDED

src/nnue/layers/clipped_relu.h ADDED Viewed

	@@ -0,0 +1,170 @@

+/*
+  Stockfish, a UCI chess playing engine derived from Glaurung 2.1
+  Copyright (C) 2004-2026 The Stockfish developers (see AUTHORS file)
+  Stockfish is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+  Stockfish is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+  You should have received a copy of the GNU General Public License
+  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+// Definition of layer ClippedReLU of NNUE evaluation function
+#ifndef NNUE_LAYERS_CLIPPED_RELU_H_INCLUDED
+#define NNUE_LAYERS_CLIPPED_RELU_H_INCLUDED
+#include <algorithm>
+#include <cstdint>
+#include <iosfwd>
+#include "../nnue_common.h"
+namespace Stockfish::Eval::NNUE::Layers {
+/* Clipped ReLU
+   Activation layer without stored parameters: every 32-bit input is shifted right by
+   WeightScaleBits and clamped to [0, 127] before being written as one byte (the scalar
+   loop at the end of propagate spells out this formula). */
+template<IndexType InDims>
+class ClippedReLU {
+   public:
+    // Input/output type
+    using InputType  = std::int32_t;
+    using OutputType = std::uint8_t;
+    // Number of input/output dimensions
+    static constexpr IndexType InputDimensions  = InDims;
+    static constexpr IndexType OutputDimensions = InputDimensions;
+    static constexpr IndexType PaddedOutputDimensions =
+      ceil_to_multiple<IndexType>(OutputDimensions, 32);
+    using OutputBuffer = OutputType[PaddedOutputDimensions];
+    // Hash value embedded in the evaluation file (layer constant plus prevHash)
+    static constexpr std::uint32_t get_hash_value(std::uint32_t prevHash) {
+        std::uint32_t hashValue = 0x538D24C7u;
+        hashValue += prevHash;
+        return hashValue;
+    }
+    // Read network parameters (nothing is consumed from the stream; always succeeds)
+    bool read_parameters(std::istream&) { return true; }
+    // Write network parameters (nothing is emitted to the stream; always succeeds)
+    bool write_parameters(std::ostream&) const { return true; }
+    std::size_t get_content_hash() const {  // only the layer hash seeded with 0 contributes
+        std::size_t h = 0;
+        hash_combine(h, get_hash_value(0));
+        return h;
+    }
+    /* Forward propagation
+       Writes InputDimensions bytes to output. Each SIMD branch converts as many whole
+       chunks as fit and leaves Start at the first index it did not convert; the scalar
+       loop at the end finishes the remainder. The SIMD branches use aligned load and
+       store intrinsics - NOTE(review): presumably callers pass suitably aligned buffers,
+       confirm against the call sites. */
+    void propagate(const InputType* input, OutputType* output) const {
+#if defined(USE_AVX2)
+        if constexpr (InputDimensions % SimdWidth == 0)
+        {
+            constexpr IndexType NumChunks = InputDimensions / SimdWidth;
+            const __m256i       Offsets   = _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0);  // restores element order after the per-128-bit-lane packs
+            const auto          in        = reinterpret_cast<const __m256i*>(input);
+            const auto          out       = reinterpret_cast<__m256i*>(output);
+            for (IndexType i = 0; i < NumChunks; ++i)  // four 256-bit input vectors in, one 256-bit byte vector out
+            {
+                const __m256i words0 =
+                  _mm256_srli_epi16(_mm256_packus_epi32(_mm256_load_si256(&in[i * 4 + 0]),
+                                                        _mm256_load_si256(&in[i * 4 + 1])),
+                                    WeightScaleBits);
+                const __m256i words1 =
+                  _mm256_srli_epi16(_mm256_packus_epi32(_mm256_load_si256(&in[i * 4 + 2]),
+                                                        _mm256_load_si256(&in[i * 4 + 3])),
+                                    WeightScaleBits);
+                _mm256_store_si256(&out[i], _mm256_permutevar8x32_epi32(
+                                              _mm256_packs_epi16(words0, words1), Offsets));
+            }
+        }
+        else  // width is not a multiple of SimdWidth: same scheme with 128-bit vectors
+        {
+            constexpr IndexType NumChunks = InputDimensions / (SimdWidth / 2);
+            const auto          in        = reinterpret_cast<const __m128i*>(input);
+            const auto          out       = reinterpret_cast<__m128i*>(output);
+            for (IndexType i = 0; i < NumChunks; ++i)
+            {
+                const __m128i words0 = _mm_srli_epi16(
+                  _mm_packus_epi32(_mm_load_si128(&in[i * 4 + 0]), _mm_load_si128(&in[i * 4 + 1])),
+                  WeightScaleBits);
+                const __m128i words1 = _mm_srli_epi16(
+                  _mm_packus_epi32(_mm_load_si128(&in[i * 4 + 2]), _mm_load_si128(&in[i * 4 + 3])),
+                  WeightScaleBits);
+                _mm_store_si128(&out[i], _mm_packs_epi16(words0, words1));
+            }
+        }
+        constexpr IndexType Start = InputDimensions % SimdWidth == 0
+                                    ? InputDimensions / SimdWidth * SimdWidth
+                                    : InputDimensions / (SimdWidth / 2) * (SimdWidth / 2);
+#elif defined(USE_SSE2)
+        constexpr IndexType NumChunks = InputDimensions / SimdWidth;
+    #ifndef USE_SSE41
+        const __m128i k0x80s = _mm_set1_epi8(-128);  // bias for the saturating add/sub pair that clamps negatives to zero
+    #endif
+        const auto in  = reinterpret_cast<const __m128i*>(input);
+        const auto out = reinterpret_cast<__m128i*>(output);
+        for (IndexType i = 0; i < NumChunks; ++i)
+        {
+    #if defined(USE_SSE41)
+            const __m128i words0 = _mm_srli_epi16(
+              _mm_packus_epi32(_mm_load_si128(&in[i * 4 + 0]), _mm_load_si128(&in[i * 4 + 1])),
+              WeightScaleBits);
+            const __m128i words1 = _mm_srli_epi16(
+              _mm_packus_epi32(_mm_load_si128(&in[i * 4 + 2]), _mm_load_si128(&in[i * 4 + 3])),
+              WeightScaleBits);
+            _mm_store_si128(&out[i], _mm_packs_epi16(words0, words1));
+    #else
+            const __m128i words0 = _mm_srai_epi16(
+              _mm_packs_epi32(_mm_load_si128(&in[i * 4 + 0]), _mm_load_si128(&in[i * 4 + 1])),
+              WeightScaleBits);
+            const __m128i words1 = _mm_srai_epi16(
+              _mm_packs_epi32(_mm_load_si128(&in[i * 4 + 2]), _mm_load_si128(&in[i * 4 + 3])),
+              WeightScaleBits);
+            const __m128i packedbytes = _mm_packs_epi16(words0, words1);
+            _mm_store_si128(&out[i], _mm_subs_epi8(_mm_adds_epi8(packedbytes, k0x80s), k0x80s));  // negative bytes saturate at -128 and come back as 0
+    #endif
+        }
+        constexpr IndexType Start = NumChunks * SimdWidth;
+#elif defined(USE_NEON)
+        constexpr IndexType    NumChunks = InputDimensions / (SimdWidth / 2);
+        const SIMD::vec_i8x8_t Zero      = {0};
+        const auto             in        = reinterpret_cast<const SIMD::vec_i32x4_t*>(input);
+        const auto             out       = reinterpret_cast<SIMD::vec_i8x8_t*>(output);
+        for (IndexType i = 0; i < NumChunks; ++i)
+        {
+            int16x8_t  shifted;
+            const auto pack = reinterpret_cast<int16x4_t*>(&shifted);  // fill the two halves of shifted separately
+            pack[0]         = vqshrn_n_s32(in[i * 2 + 0], WeightScaleBits);
+            pack[1]         = vqshrn_n_s32(in[i * 2 + 1], WeightScaleBits);
+            out[i]          = vmax_s8(vqmovn_s16(shifted), Zero);
+        }
+        constexpr IndexType Start = NumChunks * (SimdWidth / 2);
+#else
+        constexpr IndexType Start = 0;
+#endif
+        for (IndexType i = Start; i < InputDimensions; ++i)  // scalar reference path for whatever the SIMD code left over
+        {
+            output[i] = static_cast<OutputType>(std::clamp(input[i] >> WeightScaleBits, 0, 127));
+        }
+    }
+};
+}  // namespace Stockfish::Eval::NNUE::Layers
+#endif  // NNUE_LAYERS_CLIPPED_RELU_H_INCLUDED

src/nnue/layers/sqr_clipped_relu.h ADDED Viewed

	@@ -0,0 +1,109 @@

+/*
+  Stockfish, a UCI chess playing engine derived from Glaurung 2.1
+  Copyright (C) 2004-2026 The Stockfish developers (see AUTHORS file)
+  Stockfish is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+  Stockfish is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+  You should have received a copy of the GNU General Public License
+  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+// Definition of layer SqrClippedReLU of NNUE evaluation function
+#ifndef NNUE_LAYERS_SQR_CLIPPED_RELU_H_INCLUDED
+#define NNUE_LAYERS_SQR_CLIPPED_RELU_H_INCLUDED
+#include <algorithm>
+#include <cstdint>
+#include <iosfwd>
+#include "../nnue_common.h"
+namespace Stockfish::Eval::NNUE::Layers {
+/* Squared clipped ReLU
+   Activation layer without stored parameters: every 32-bit input is squared, shifted
+   right by 2 * WeightScaleBits + 7 and capped at 127 before being written as one byte
+   (the scalar loop at the end of propagate spells out this formula). */
+template<IndexType InDims>
+class SqrClippedReLU {
+   public:
+    // Input/output type
+    using InputType  = std::int32_t;
+    using OutputType = std::uint8_t;
+    // Number of input/output dimensions
+    static constexpr IndexType InputDimensions  = InDims;
+    static constexpr IndexType OutputDimensions = InputDimensions;
+    static constexpr IndexType PaddedOutputDimensions =
+      ceil_to_multiple<IndexType>(OutputDimensions, 32);
+    using OutputBuffer = OutputType[PaddedOutputDimensions];
+    // Hash value embedded in the evaluation file (layer constant plus prevHash)
+    static constexpr std::uint32_t get_hash_value(std::uint32_t prevHash) {
+        std::uint32_t hashValue = 0x538D24C7u;  // NOTE(review): same constant as ClippedReLU uses - confirm this is intended before changing it
+        hashValue += prevHash;
+        return hashValue;
+    }
+    // Read network parameters (nothing is consumed from the stream; always succeeds)
+    bool read_parameters(std::istream&) { return true; }
+    // Write network parameters (nothing is emitted to the stream; always succeeds)
+    bool write_parameters(std::ostream&) const { return true; }
+    std::size_t get_content_hash() const {  // only the layer hash seeded with 0 contributes
+        std::size_t h = 0;
+        hash_combine(h, get_hash_value(0));
+        return h;
+    }
+    /* Forward propagation
+       Writes InputDimensions bytes to output. The SSE2 branch converts whole groups of
+       16 inputs and leaves Start at the first index it did not convert; the scalar loop
+       at the end finishes the remainder (or does everything without SSE2). */
+    void propagate(const InputType* input, OutputType* output) const {
+#if defined(USE_SSE2)
+        constexpr IndexType NumChunks = InputDimensions / 16;
+        static_assert(WeightScaleBits == 6);  // the fixed shift of 3 below is derived for this value
+        const auto in  = reinterpret_cast<const __m128i*>(input);
+        const auto out = reinterpret_cast<__m128i*>(output);
+        for (IndexType i = 0; i < NumChunks; ++i)
+        {
+            __m128i words0 =
+              _mm_packs_epi32(_mm_load_si128(&in[i * 4 + 0]), _mm_load_si128(&in[i * 4 + 1]));
+            __m128i words1 =
+              _mm_packs_epi32(_mm_load_si128(&in[i * 4 + 2]), _mm_load_si128(&in[i * 4 + 3]));
+            // We shift by WeightScaleBits * 2 = 12 and divide by 128
+            // which is an additional shift-right of 7, meaning 19 in total.
+            // MulHi strips the lower 16 bits so we need to shift out 3 more to match.
+            words0 = _mm_srli_epi16(_mm_mulhi_epi16(words0, words0), 3);
+            words1 = _mm_srli_epi16(_mm_mulhi_epi16(words1, words1), 3);
+            _mm_store_si128(&out[i], _mm_packs_epi16(words0, words1));  // the signed saturating pack caps each byte at 127
+        }
+        constexpr IndexType Start = NumChunks * 16;
+#else
+        constexpr IndexType Start = 0;
+#endif
+        for (IndexType i = Start; i < InputDimensions; ++i)
+        {
+            output[i] = static_cast<OutputType>(
+              // Really should be /127 but we need to make it fast so we right-shift
+              // by an extra 7 bits instead. Needs to be accounted for in the trainer.
+              std::min(127ll, ((long long) (input[i]) * input[i]) >> (2 * WeightScaleBits + 7)));
+        }
+    }
+};
+}  // namespace Stockfish::Eval::NNUE::Layers
+#endif  // NNUE_LAYERS_SQR_CLIPPED_RELU_H_INCLUDED

src/nnue/network.cpp ADDED Viewed

	@@ -0,0 +1,415 @@

+/*
+  Stockfish, a UCI chess playing engine derived from Glaurung 2.1
+  Copyright (C) 2004-2026 The Stockfish developers (see AUTHORS file)
+  Stockfish is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+  Stockfish is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+  You should have received a copy of the GNU General Public License
+  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+#include "network.h"
+#include <cstdlib>
+#include <fstream>
+#include <iostream>
+#include <optional>
+#include <type_traits>
+#include <vector>
+#define INCBIN_SILENCE_BITCODE_WARNING
+#include "../incbin/incbin.h"
+#include "../evaluate.h"
+#include "../misc.h"
+#include "../position.h"
+#include "../types.h"
+#include "nnue_architecture.h"
+#include "nnue_common.h"
+#include "nnue_misc.h"
+// Macro to embed the default efficiently updatable neural network (NNUE) file
+// data in the engine binary (using incbin.h, by Dale Weiler).
+/* Each macro invocation below declares three variables. They are listed here for
+   EmbeddedNNUEBig; EmbeddedNNUESmall yields the matching gEmbeddedNNUESmall names. */
+//     const unsigned char        gEmbeddedNNUEBigData[];  // a pointer to the embedded data
+//     const unsigned char *const gEmbeddedNNUEBigEnd;     // a marker to the end
+//     const unsigned int         gEmbeddedNNUEBigSize;    // the size of the embedded file
+// Note that this does not work in Microsoft Visual Studio.
+#if !defined(_MSC_VER) && !defined(NNUE_EMBEDDING_OFF)
+INCBIN(EmbeddedNNUEBig, EvalFileDefaultNameBig);
+INCBIN(EmbeddedNNUESmall, EvalFileDefaultNameSmall);
+#else  // MSVC or NNUE_EMBEDDING_OFF: one-byte placeholders keep the same symbols defined
+const unsigned char        gEmbeddedNNUEBigData[1]   = {0x0};
+const unsigned char* const gEmbeddedNNUEBigEnd       = &gEmbeddedNNUEBigData[1];
+const unsigned int         gEmbeddedNNUEBigSize      = 1;
+const unsigned char        gEmbeddedNNUESmallData[1] = {0x0};
+const unsigned char* const gEmbeddedNNUESmallEnd     = &gEmbeddedNNUESmallData[1];
+const unsigned int         gEmbeddedNNUESmallSize    = 1;
+#endif
+namespace {
+// Non-owning description of one embedded network image, exactly as handed to the constructor.
+struct EmbeddedNNUE {
+    const unsigned char* data;  // start of the embedded bytes
+    const unsigned char* end;   // end marker supplied by the caller
+    const unsigned int   size;  // byte count supplied by the caller
+    EmbeddedNNUE(const unsigned char* embeddedData,
+                 const unsigned char* embeddedEnd,
+                 const unsigned int   embeddedSize) :
+        data{embeddedData},
+        end{embeddedEnd},
+        size{embeddedSize} {}
+};
+using namespace Stockfish::Eval::NNUE;
+// Select the embedded image for the requested net: the big one for EmbeddedNNUEType::BIG,
+// the small one for every other value.
+EmbeddedNNUE get_embedded(EmbeddedNNUEType type) {
+    if (type != EmbeddedNNUEType::BIG)
+        return EmbeddedNNUE(gEmbeddedNNUESmallData, gEmbeddedNNUESmallEnd, gEmbeddedNNUESmallSize);
+    return EmbeddedNNUE(gEmbeddedNNUEBigData, gEmbeddedNNUEBigEnd, gEmbeddedNNUEBigSize);
+}
+}
+namespace Stockfish::Eval::NNUE {
+namespace Detail {
+// Read evaluation function parameters
+// Consumes the 32-bit header that precedes the component in the stream, then delegates to
+// reference.read_parameters(). Returns false if the stream is in a failed state after the
+// header read, otherwise whatever the component reports.
+// NOTE(review): the header is skipped without being compared against the hash that
+// write_parameters() emits, so a file written for a different component is not rejected
+// here - confirm that this is intentional.
+template<typename T>
+bool read_parameters(std::istream& stream, T& reference) {
+    // Only advancing the stream matters here; the value itself is not inspected
+    [[maybe_unused]] const std::uint32_t header = read_little_endian<std::uint32_t>(stream);
+    if (!stream)
+        return false;
+    return reference.read_parameters(stream);
+}
+// Write evaluation function parameters
+// Emits T::get_hash_value() as a 32-bit header and then lets the component append its own
+// payload; the result is whatever reference.write_parameters() reports.
+template<typename T>
+bool write_parameters(std::ostream& stream, const T& reference) {
+    const std::uint32_t componentHash = T::get_hash_value();
+    write_little_endian<std::uint32_t>(stream, componentHash);
+    return reference.write_parameters(stream);
+}
+}  // namespace Detail
+// Try to make evalfilePath (the default net name when empty) the current network.
+// Candidates are probed in order: the embedded copy, an empty directory prefix,
+// rootDirectory and, when configured at build time, DEFAULT_NNUE_DIRECTORY. A candidate
+// is only tried while evalFile.current still differs from the requested name.
+template<typename Arch, typename Transformer>
+void Network<Arch, Transformer>::load(const std::string& rootDirectory, std::string evalfilePath) {
+    std::vector<std::string> dirs = {"<internal>", "", rootDirectory};
+#if defined(DEFAULT_NNUE_DIRECTORY)
+    dirs.push_back(stringify(DEFAULT_NNUE_DIRECTORY));
+#endif
+    if (evalfilePath.empty())
+        evalfilePath = evalFile.defaultName;
+    for (const auto& directory : dirs)
+    {
+        // Nothing to do for this candidate once the requested net is the current one
+        if (std::string(evalFile.current) == evalfilePath)
+            continue;
+        const bool embedded = directory == "<internal>";
+        if (!embedded)
+            load_user_net(directory, evalfilePath);
+        // The embedded copy only qualifies when the default net was asked for
+        else if (evalfilePath == std::string(evalFile.defaultName))
+            load_internal();
+    }
+}
+// Export the current network to a file and report the outcome on the synchronized output.
+// Without an explicit filename the default net name is used, which is only permitted while
+// the current net is the default one. Returns true when the export succeeded.
+template<typename Arch, typename Transformer>
+bool Network<Arch, Transformer>::save(const std::optional<std::string>& filename) const {
+    // Refuse to guess a target name for a net that is not the default one
+    if (!filename.has_value()
+        && std::string(evalFile.current) != std::string(evalFile.defaultName))
+    {
+        const std::string failure =
+          "Failed to export a net. "
+          "A non-embedded net can only be saved if the filename is specified";
+        sync_cout << failure << sync_endl;
+        return false;
+    }
+    std::string actualFilename;
+    if (filename.has_value())
+        actualFilename = filename.value();
+    else
+        actualFilename = evalFile.defaultName;
+    std::ofstream stream(actualFilename, std::ios_base::binary);
+    const bool    saved = save(stream, evalFile.current, evalFile.netDescription);
+    std::string   msg;
+    if (saved)
+        msg = "Network saved successfully to " + actualFilename;
+    else
+        msg = "Failed to export a net";
+    sync_cout << msg << sync_endl;
+    return saved;
+}
+// Evaluate pos with the layer stack chosen by piece count, (count - 1) / 4.
+// Returns the transformer result and the layer stack result, each divided by OutputScale.
+template<typename Arch, typename Transformer>
+NetworkOutput
+Network<Arch, Transformer>::evaluate(const Position&                         pos,
+                                     AccumulatorStack&                       accumulatorStack,
+                                     AccumulatorCaches::Cache<FTDimensions>& cache) const {
+    constexpr uint64_t bufferAlignment = CacheLineSize;
+    alignas(bufferAlignment)
+      TransformedFeatureType transformedFeatures[FeatureTransformer<FTDimensions>::BufferSize];
+    ASSERT_ALIGNED(transformedFeatures, bufferAlignment);
+    const int layerStack = (pos.count<ALL_PIECES>() - 1) / 4;
+    // The transformer fills transformedFeatures, which the layer stack then consumes
+    const auto psqt =
+      featureTransformer.transform(pos, accumulatorStack, cache, transformedFeatures, layerStack);
+    const auto  positional      = network[layerStack].propagate(transformedFeatures);
+    const Value psqtValue       = static_cast<Value>(psqt / OutputScale);
+    const Value positionalValue = static_cast<Value>(positional / OutputScale);
+    return {psqtValue, positionalValue};
+}
+// Check that evalfilePath (the default net name when empty) is the network currently loaded.
+// If it is not, an error report is passed to f (when f is set) and the process exits with
+// EXIT_FAILURE. Otherwise a one-line summary of the net is passed to f (when f is set).
+template<typename Arch, typename Transformer>
+void Network<Arch, Transformer>::verify(std::string                                  evalfilePath,
+                                        const std::function<void(std::string_view)>& f) const {
+    if (evalfilePath.empty())
+        evalfilePath = evalFile.defaultName;
+    const bool loaded = std::string(evalFile.current) == evalfilePath;
+    if (!loaded)
+    {
+        if (f)
+        {
+            const std::string lines[] = {
+              "Network evaluation parameters compatible with the engine must be available.",
+              "The network file " + evalfilePath + " was not loaded successfully.",
+              "The UCI option EvalFile might need to specify the full path, "
+              "including the directory name, to the network file.",
+              "The default net can be downloaded from: "
+              "https://tests.stockfishchess.org/api/nn/"
+                + std::string(evalFile.defaultName),
+              "The engine will be terminated now."};
+            // Every line of the report carries the same prefix and ends with a newline
+            std::string msg;
+            for (const std::string& line : lines)
+            {
+                msg += "ERROR: ";
+                msg += line;
+                msg += '\n';
+            }
+            f(msg);
+        }
+        exit(EXIT_FAILURE);
+    }
+    if (!f)
+        return;
+    const size_t size = sizeof(featureTransformer) + sizeof(Arch) * LayerStacks;
+    std::string  info = "NNUE evaluation using " + evalfilePath + " (";
+    info += std::to_string(size / (1024 * 1024));
+    info += "MiB, (";
+    info += std::to_string(featureTransformer.TotalInputDimensions);
+    info += ", ";
+    info += std::to_string(network[0].TransformedFeatureDimensions);
+    info += ", ";
+    info += std::to_string(network[0].FC_0_OUTPUTS);
+    info += ", ";
+    info += std::to_string(network[0].FC_1_OUTPUTS);
+    info += ", 1))";
+    f(info);
+}
+// Evaluates the position with every layer stack and records the per-bucket
+// psqt and positional values (each divided by OutputScale), together with
+// the bucket that the piece count selects.
+template<typename Arch, typename Transformer>
+NnueEvalTrace
+Network<Arch, Transformer>::trace_evaluate(const Position&                         pos,
+                                           AccumulatorStack&                       accumulatorStack,
+                                           AccumulatorCaches::Cache<FTDimensions>& cache) const {
+    // Scratch buffer receiving the feature transformer output.
+    constexpr uint64_t bufferAlignment = CacheLineSize;
+    alignas(bufferAlignment)
+      TransformedFeatureType featureBuffer[FeatureTransformer<FTDimensions>::BufferSize];
+    ASSERT_ALIGNED(featureBuffer, bufferAlignment);
+    NnueEvalTrace trace{};
+    trace.correctBucket = (pos.count<ALL_PIECES>() - 1) / 4;
+    for (IndexType stack = 0; stack < LayerStacks; ++stack)
+    {
+        const auto rawPsqt =
+          featureTransformer.transform(pos, accumulatorStack, cache, featureBuffer, stack);
+        const auto rawPositional = network[stack].propagate(featureBuffer);
+        trace.psqt[stack]        = static_cast<Value>(rawPsqt / OutputScale);
+        trace.positional[stack]  = static_cast<Value>(rawPositional / OutputScale);
+    }
+    return trace;
+}
+// Tries to load the network from the file dir + evalfilePath. evalFile is
+// updated only when the load succeeds.
+template<typename Arch, typename Transformer>
+void Network<Arch, Transformer>::load_user_net(const std::string& dir,
+                                               const std::string& evalfilePath) {
+    std::ifstream netFile(dir + evalfilePath, std::ios::binary);
+    if (auto description = load(netFile))
+    {
+        evalFile.current        = evalfilePath;
+        evalFile.netDescription = *description;
+    }
+}
+// Loads the network from the embedded data returned by get_embedded().
+// evalFile is updated (current set to the default name) only on success.
+template<typename Arch, typename Transformer>
+void Network<Arch, Transformer>::load_internal() {
+    // Stream buffer whose get and put areas both span an existing memory range
+    class MemoryBuffer: public std::basic_streambuf<char> {
+       public:
+        MemoryBuffer(char* begin, size_t length) {
+            char* const end = begin + length;
+            setg(begin, begin, end);
+            setp(begin, end);
+        }
+    };
+    const auto embedded = get_embedded(embeddedType);
+    // The streambuf interface takes non-const char*, hence the const_cast.
+    char* const  bytes = const_cast<char*>(reinterpret_cast<const char*>(embedded.data));
+    MemoryBuffer embeddedBuffer(bytes, size_t(embedded.size));
+    std::istream embeddedStream(&embeddedBuffer);
+    if (auto description = load(embeddedStream))
+    {
+        evalFile.current        = evalFile.defaultName;
+        evalFile.netDescription = *description;
+    }
+}
+// Marks the network as initialized. get_content_hash() returns 0 until this
+// has been called; load(std::istream&) calls it before reading parameters.
+template<typename Arch, typename Transformer>
+void Network<Arch, Transformer>::initialize() {
+    initialized = true;
+}
+// Writes the network to stream. Nothing is written and false is returned
+// when name is empty or equal to "None"; otherwise returns the result of
+// write_parameters().
+template<typename Arch, typename Transformer>
+bool Network<Arch, Transformer>::save(std::ostream&      stream,
+                                      const std::string& name,
+                                      const std::string& netDescription) const {
+    const bool hasNamedNet = !name.empty() && name != "None";
+    return hasNamedNet && write_parameters(stream, netDescription);
+}
+// Reads the network from stream. Returns the description string stored in
+// the file on success and std::nullopt on failure. The network is marked as
+// initialized before any parameter is read.
+template<typename Arch, typename Transformer>
+std::optional<std::string> Network<Arch, Transformer>::load(std::istream& stream) {
+    initialize();
+    std::string netDescription;
+    if (!read_parameters(stream, netDescription))
+        return std::nullopt;
+    return netDescription;
+}
+// Returns a hash combining the feature transformer, every layer stack, the
+// eval file info and the embedded type. Returns 0 for a network that has not
+// been initialized.
+template<typename Arch, typename Transformer>
+std::size_t Network<Arch, Transformer>::get_content_hash() const {
+    std::size_t seed = 0;
+    if (initialized)
+    {
+        hash_combine(seed, featureTransformer);
+        for (const Arch& stack : network)
+            hash_combine(seed, stack);
+        hash_combine(seed, evalFile);
+        hash_combine(seed, static_cast<int>(embeddedType));
+    }
+    return seed;
+}
+// Reads the network header: three 32-bit values (version magic, hash and
+// description length) followed by the description bytes. The hash is stored
+// in *hashValue and the description in *desc. Returns false when the stream
+// is in a failed state or the magic differs from Version.
+template<typename Arch, typename Transformer>
+bool Network<Arch, Transformer>::read_header(std::istream&  stream,
+                                             std::uint32_t* hashValue,
+                                             std::string*   desc) const {
+    const std::uint32_t magic      = read_little_endian<std::uint32_t>(stream);
+    *hashValue                     = read_little_endian<std::uint32_t>(stream);
+    const std::uint32_t descLength = read_little_endian<std::uint32_t>(stream);
+    if (!stream || magic != Version)
+        return false;
+    desc->resize(descLength);
+    stream.read(desc->data(), descLength);
+    return !stream.fail();
+}
+// Writes the network header: Version, hashValue and the description length
+// as 32-bit values, followed by the description bytes. Returns false when
+// the stream ends up in a failed state.
+template<typename Arch, typename Transformer>
+bool Network<Arch, Transformer>::write_header(std::ostream&      stream,
+                                              std::uint32_t      hashValue,
+                                              const std::string& desc) const {
+    const std::uint32_t descLength = std::uint32_t(desc.size());
+    write_little_endian<std::uint32_t>(stream, Version);
+    write_little_endian<std::uint32_t>(stream, hashValue);
+    write_little_endian<std::uint32_t>(stream, descLength);
+    stream.write(desc.data(), desc.size());
+    return !stream.fail();
+}
+// Reads the header, the feature transformer and every layer stack from
+// stream. Succeeds only if all parts were read and the stream is positioned
+// exactly at end-of-file afterwards.
+template<typename Arch, typename Transformer>
+bool Network<Arch, Transformer>::read_parameters(std::istream& stream,
+                                                 std::string&  netDescription) {
+    std::uint32_t fileHash;
+    if (!read_header(stream, &fileHash, &netDescription))
+        return false;
+    // NOTE(review): the leading `false &&` disables this check, so a file
+    // whose header hash differs from Network::hash is still accepted.
+    // Confirm that this is intentional.
+    if (false && fileHash != Network::hash)
+        return false;
+    if (!Detail::read_parameters(stream, featureTransformer))
+        return false;
+    for (Arch& stack : network)
+    {
+        if (!Detail::read_parameters(stream, stack))
+            return false;
+    }
+    // Trailing bytes after the last layer stack make the load fail.
+    return stream && stream.peek() == std::ios::traits_type::eof();
+}
+// Writes the header (with Network::hash), the feature transformer and every
+// layer stack to stream. Returns false as soon as one part fails to write.
+template<typename Arch, typename Transformer>
+bool Network<Arch, Transformer>::write_parameters(std::ostream&      stream,
+                                                  const std::string& netDescription) const {
+    if (!write_header(stream, Network::hash, netDescription))
+        return false;
+    if (!Detail::write_parameters(stream, featureTransformer))
+        return false;
+    for (const Arch& stack : network)
+    {
+        if (!Detail::write_parameters(stream, stack))
+            return false;
+    }
+    return bool(stream);
+}
+// Explicit template instantiations of Network for the big and the small
+// architecture/feature-transformer pairs.
+template class Network<NetworkArchitecture<TransformedFeatureDimensionsBig, L2Big, L3Big>,
+                       FeatureTransformer<TransformedFeatureDimensionsBig>>;
+template class Network<NetworkArchitecture<TransformedFeatureDimensionsSmall, L2Small, L3Small>,
+                       FeatureTransformer<TransformedFeatureDimensionsSmall>>;
+}  // namespace Stockfish::Eval::NNUE

src/nnue/network.h ADDED Viewed

	@@ -0,0 +1,161 @@

+/*
+  Stockfish, a UCI chess playing engine derived from Glaurung 2.1
+  Copyright (C) 2004-2026 The Stockfish developers (see AUTHORS file)
+  Stockfish is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+  Stockfish is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+  You should have received a copy of the GNU General Public License
+  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+#ifndef NETWORK_H_INCLUDED
+#define NETWORK_H_INCLUDED
+#include <cstddef>
+#include <cstdint>
+#include <functional>
+#include <iostream>
+#include <memory>
+#include <optional>
+#include <string>
+#include <string_view>
+#include <tuple>
+#include "../misc.h"
+#include "../types.h"
+#include "nnue_accumulator.h"
+#include "nnue_architecture.h"
+#include "nnue_common.h"
+#include "nnue_feature_transformer.h"
+#include "nnue_misc.h"
+namespace Stockfish {
+class Position;
+}
+namespace Stockfish::Eval::NNUE {
+// Identifies which embedded network a Network instance is associated with.
+// The value is passed to get_embedded() when loading the built-in net.
+enum class EmbeddedNNUEType {
+    BIG,
+    SMALL,
+};
+// Result of Network::evaluate(): (psqt, positional).
+using NetworkOutput = std::tuple<Value, Value>;
+// The network must be a trivial type, i.e. the memory must be in-line.
+// This is required to allow sharing the network via shared memory, as
+// there is no way to run destructors.
+//
+// Arch is the layer stack type (one instance per bucket, LayerStacks in
+// total) and Transformer is the input feature transformer type. Both are
+// stored directly inside the object.
+template<typename Arch, typename Transformer>
+class Network {
+    // Feature transformer output width, taken from the architecture
+    static constexpr IndexType FTDimensions = Arch::TransformedFeatureDimensions;
+   public:
+    Network(EvalFile file, EmbeddedNNUEType type) :
+        evalFile(file),
+        embeddedType(type) {}
+    Network(const Network& other) = default;
+    Network(Network&& other)      = default;
+    Network& operator=(const Network& other) = default;
+    Network& operator=(Network&& other)      = default;
+    // Loads a network for the given root directory and eval file path
+    void load(const std::string& rootDirectory, std::string evalfilePath);
+    // Saves the network to a file; returns whether saving succeeded
+    bool save(const std::optional<std::string>& filename) const;
+    // Hash over the network contents; 0 while the network is uninitialized
+    std::size_t get_content_hash() const;
+    // Evaluates pos and returns (psqt, positional)
+    NetworkOutput evaluate(const Position&                         pos,
+                           AccumulatorStack&                       accumulatorStack,
+                           AccumulatorCaches::Cache<FTDimensions>& cache) const;
+    // Reports the loaded network through the callback, or reports an error
+    // and terminates the process when the expected net is not loaded
+    void verify(std::string evalfilePath, const std::function<void(std::string_view)>&) const;
+    // Like evaluate(), but records the outputs of every layer stack
+    NnueEvalTrace trace_evaluate(const Position&                         pos,
+                                 AccumulatorStack&                       accumulatorStack,
+                                 AccumulatorCaches::Cache<FTDimensions>& cache) const;
+   private:
+    // Loads from the file at (directory + path); updates evalFile on success
+    void load_user_net(const std::string&, const std::string&);
+    // Loads from the embedded network data; updates evalFile on success
+    void load_internal();
+    // Sets the initialized flag
+    void initialize();
+    // Stream-level save/load used by the public entry points
+    bool                       save(std::ostream&, const std::string&, const std::string&) const;
+    std::optional<std::string> load(std::istream&);
+    // Serialization of the file header and of the network parameters
+    bool read_header(std::istream&, std::uint32_t*, std::string*) const;
+    bool write_header(std::ostream&, std::uint32_t, const std::string&) const;
+    bool read_parameters(std::istream&, std::string&);
+    bool write_parameters(std::ostream&, const std::string&) const;
+    // Input feature converter
+    Transformer featureTransformer;
+    // Evaluation function
+    Arch network[LayerStacks];
+    // Names and description of the eval file backing this network
+    EvalFile         evalFile;
+    EmbeddedNNUEType embeddedType;
+    // Set by initialize(); checked by get_content_hash()
+    bool initialized = false;
+    // Hash value of evaluation function structure
+    static constexpr std::uint32_t hash = Transformer::get_hash_value() ^ Arch::get_hash_value();
+    template<IndexType Size>
+    friend struct AccumulatorCaches::Cache;
+};
+// Definitions of the network types: the feature transformer and layer
+// architecture for the small and the big configuration, and the Network
+// specializations built from each pair.
+using SmallFeatureTransformer = FeatureTransformer<TransformedFeatureDimensionsSmall>;
+using SmallNetworkArchitecture =
+  NetworkArchitecture<TransformedFeatureDimensionsSmall, L2Small, L3Small>;
+using BigFeatureTransformer  = FeatureTransformer<TransformedFeatureDimensionsBig>;
+using BigNetworkArchitecture = NetworkArchitecture<TransformedFeatureDimensionsBig, L2Big, L3Big>;
+using NetworkBig   = Network<BigNetworkArchitecture, BigFeatureTransformer>;
+using NetworkSmall = Network<SmallNetworkArchitecture, SmallFeatureTransformer>;
+// Holds the big and the small network side by side. Each one is constructed
+// from its own EvalFile and tagged with the matching EmbeddedNNUEType.
+struct Networks {
+    Networks(EvalFile bigFile, EvalFile smallFile) :
+        big(bigFile, EmbeddedNNUEType::BIG),
+        small(smallFile, EmbeddedNNUEType::SMALL) {}
+    NetworkBig   big;
+    NetworkSmall small;
+};
+}  // namespace Stockfish::Eval::NNUE
+// std::hash support for Network: forwards to Network::get_content_hash().
+template<typename ArchT, typename FeatureTransformerT>
+struct std::hash<Stockfish::Eval::NNUE::Network<ArchT, FeatureTransformerT>> {
+    std::size_t operator()(
+      const Stockfish::Eval::NNUE::Network<ArchT, FeatureTransformerT>& network) const noexcept {
+        return network.get_content_hash();
+    }
+};
+// std::hash support for Networks: combines the hash of the big network with
+// the hash of the small network, in that order.
+template<>
+struct std::hash<Stockfish::Eval::NNUE::Networks> {
+    std::size_t operator()(const Stockfish::Eval::NNUE::Networks& networks) const noexcept {
+        std::size_t seed = 0;
+        Stockfish::hash_combine(seed, networks.big);
+        Stockfish::hash_combine(seed, networks.small);
+        return seed;
+    }
+};
+#endif

src/nnue/nnue_accumulator.cpp ADDED Viewed

	@@ -0,0 +1,952 @@

+/*
+  Stockfish, a UCI chess playing engine derived from Glaurung 2.1
+  Copyright (C) 2004-2026 The Stockfish developers (see AUTHORS file)
+  Stockfish is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+  Stockfish is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+  You should have received a copy of the GNU General Public License
+  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+#include "nnue_accumulator.h"
+#include <cassert>
+#include <cstdint>
+#include <new>
+#include <type_traits>
+#include "../bitboard.h"
+#include "../misc.h"
+#include "../position.h"
+#include "../types.h"
+#include "features/half_ka_v2_hm.h"
+#include "nnue_architecture.h"
+#include "nnue_common.h"
+#include "nnue_feature_transformer.h"  // IWYU pragma: keep
+#include "simd.h"
+namespace Stockfish::Eval::NNUE {
+using namespace SIMD;
+// Forward declarations of the file-local update helpers that the
+// AccumulatorStack member functions below call.
+namespace {
+// Updates two consecutive PSQ accumulator states (middle_state, then
+// target_state) starting from the already computed state.
+template<IndexType TransformedFeatureDimensions>
+void double_inc_update(Color                                                   perspective,
+                       const FeatureTransformer<TransformedFeatureDimensions>& featureTransformer,
+                       const Square                                            ksq,
+                       AccumulatorState<PSQFeatureSet>&                        middle_state,
+                       AccumulatorState<PSQFeatureSet>&                        target_state,
+                       const AccumulatorState<PSQFeatureSet>&                  computed);
+// Threat-feature overload; additionally receives the DirtyPiece of the
+// second move (dp2).
+template<IndexType TransformedFeatureDimensions>
+void double_inc_update(Color                                                   perspective,
+                       const FeatureTransformer<TransformedFeatureDimensions>& featureTransformer,
+                       const Square                                            ksq,
+                       AccumulatorState<ThreatFeatureSet>&                     middle_state,
+                       AccumulatorState<ThreatFeatureSet>&                     target_state,
+                       const AccumulatorState<ThreatFeatureSet>&               computed,
+                       const DirtyPiece&                                       dp2);
+// Single-step incremental update of target_state from a computed neighbour.
+// Callers pass Forward = true when the computed state precedes the target on
+// the stack and Forward = false when it follows it.
+template<bool Forward, typename FeatureSet, IndexType TransformedFeatureDimensions>
+void update_accumulator_incremental(
+  Color                                                   perspective,
+  const FeatureTransformer<TransformedFeatureDimensions>& featureTransformer,
+  const Square                                            ksq,
+  AccumulatorState<FeatureSet>&                           target_state,
+  const AccumulatorState<FeatureSet>&                     computed);
+// Refresh of a PSQ accumulator state from the position, using the cache.
+template<IndexType Dimensions>
+void update_accumulator_refresh_cache(Color                                 perspective,
+                                      const FeatureTransformer<Dimensions>& featureTransformer,
+                                      const Position&                       pos,
+                                      AccumulatorState<PSQFeatureSet>&      accumulatorState,
+                                      AccumulatorCaches::Cache<Dimensions>& cache);
+// Full recomputation of a threat accumulator state from the position.
+template<IndexType Dimensions>
+void update_threats_accumulator_full(Color                                 perspective,
+                                     const FeatureTransformer<Dimensions>& featureTransformer,
+                                     const Position&                       pos,
+                                     AccumulatorState<ThreatFeatureSet>&   accumulatorState);
+}
+// Returns the accumulator state at the top of the stack (index size - 1) for
+// feature set T.
+template<typename T>
+const AccumulatorState<T>& AccumulatorStack::latest() const noexcept {
+    return accumulators<T>()[size - 1];
+}
+// Explicit template instantiations of latest() for both feature sets
+template const AccumulatorState<PSQFeatureSet>&    AccumulatorStack::latest() const noexcept;
+template const AccumulatorState<ThreatFeatureSet>& AccumulatorStack::latest() const noexcept;
+// Mutable counterpart of latest(): returns the top-of-stack accumulator
+// state for feature set T.
+template<typename T>
+AccumulatorState<T>& AccumulatorStack::mut_latest() noexcept {
+    return mut_accumulators<T>()[size - 1];
+}
+// Returns the read-only accumulator array that belongs to feature set T.
+// Only PSQFeatureSet and ThreatFeatureSet are accepted.
+template<typename T>
+const std::array<AccumulatorState<T>, AccumulatorStack::MaxSize>&
+AccumulatorStack::accumulators() const noexcept {
+    constexpr bool IsPsq    = std::is_same_v<T, PSQFeatureSet>;
+    constexpr bool IsThreat = std::is_same_v<T, ThreatFeatureSet>;
+    static_assert(IsPsq || IsThreat, "Invalid Feature Set Type");
+    if constexpr (IsPsq)
+        return psq_accumulators;
+    else
+        return threat_accumulators;
+}
+// Returns the mutable accumulator array that belongs to feature set T.
+// Only PSQFeatureSet and ThreatFeatureSet are accepted.
+template<typename T>
+std::array<AccumulatorState<T>, AccumulatorStack::MaxSize>&
+AccumulatorStack::mut_accumulators() noexcept {
+    constexpr bool IsPsq    = std::is_same_v<T, PSQFeatureSet>;
+    constexpr bool IsThreat = std::is_same_v<T, ThreatFeatureSet>;
+    static_assert(IsPsq || IsThreat, "Invalid Feature Set Type");
+    if constexpr (IsPsq)
+        return psq_accumulators;
+    else
+        return threat_accumulators;
+}
+// Resets the stack to a single entry: slot 0 of both accumulator arrays is
+// reset with a value-initialized argument and size is set to 1.
+void AccumulatorStack::reset() noexcept {
+    psq_accumulators[0].reset({});
+    threat_accumulators[0].reset({});
+    size = 1;
+}
+// Pushes a new entry onto the stack. Returns references to the dirty-piece
+// and dirty-threats records of the new slot; the DirtyThreats object is
+// re-created in place with placement new before it is returned.
+std::pair<DirtyPiece&, DirtyThreats&> AccumulatorStack::push() noexcept {
+    assert(size < MaxSize);
+    auto& dirtyPiece   = psq_accumulators[size].reset();
+    auto& dirtyThreats = threat_accumulators[size].reset();
+    new (&dirtyThreats) DirtyThreats;
+    ++size;
+    return {dirtyPiece, dirtyThreats};
+}
+// Drops the top entry. The entry at index 0 must stay on the stack, which
+// the assert checks.
+void AccumulatorStack::pop() noexcept {
+    assert(size > 1);
+    size--;
+}
+// Updates the accumulators for both perspectives, WHITE first, then BLACK.
+// PSQ features are always evaluated. Threat features are evaluated only when
+// Dimensions equals TransformedFeatureDimensionsBig.
+template<IndexType Dimensions>
+void AccumulatorStack::evaluate(const Position&                       pos,
+                                const FeatureTransformer<Dimensions>& featureTransformer,
+                                AccumulatorCaches::Cache<Dimensions>& cache) noexcept {
+    constexpr bool UseThreats = (Dimensions == TransformedFeatureDimensionsBig);
+    evaluate_side<PSQFeatureSet>(WHITE, pos, featureTransformer, cache);
+    // UseThreats is a compile-time constant: `if constexpr` discards the
+    // threat path entirely when it is false, so the threat update code is
+    // not instantiated for dimensions that never use it.
+    if constexpr (UseThreats)
+        evaluate_side<ThreatFeatureSet>(WHITE, pos, featureTransformer, cache);
+    evaluate_side<PSQFeatureSet>(BLACK, pos, featureTransformer, cache);
+    if constexpr (UseThreats)
+        evaluate_side<ThreatFeatureSet>(BLACK, pos, featureTransformer, cache);
+}
+// Brings the latest accumulator of FeatureSet up to date for one
+// perspective. When the accumulator found by find_last_usable_accumulator()
+// is already computed, the stack is updated forwards from it. Otherwise the
+// latest accumulator is recomputed (through the refresh cache for PSQ
+// features, in full for threat features) and the update is propagated
+// backwards down to that accumulator.
+template<typename FeatureSet, IndexType Dimensions>
+void AccumulatorStack::evaluate_side(Color                                 perspective,
+                                     const Position&                       pos,
+                                     const FeatureTransformer<Dimensions>& featureTransformer,
+                                     AccumulatorCaches::Cache<Dimensions>& cache) noexcept {
+    const auto anchorIdx = find_last_usable_accumulator<FeatureSet, Dimensions>(perspective);
+    const auto& anchorAcc = accumulators<FeatureSet>()[anchorIdx].template acc<Dimensions>();
+    if (anchorAcc.computed[perspective])
+    {
+        forward_update_incremental<FeatureSet>(perspective, pos, featureTransformer, anchorIdx);
+        return;
+    }
+    if constexpr (std::is_same_v<FeatureSet, PSQFeatureSet>)
+        update_accumulator_refresh_cache(perspective, featureTransformer, pos,
+                                         mut_latest<PSQFeatureSet>(), cache);
+    else
+        update_threats_accumulator_full(perspective, featureTransformer, pos,
+                                        mut_latest<ThreatFeatureSet>());
+    backward_update_incremental<FeatureSet>(perspective, pos, featureTransformer, anchorIdx);
+}
+// Scans the stack from the top entry downwards and returns the index of the
+// first entry that is either already computed for the perspective or whose
+// diff requires a full refresh. Returns 0 when no such entry exists above
+// index 0.
+template<typename FeatureSet, IndexType Dimensions>
+std::size_t AccumulatorStack::find_last_usable_accumulator(Color perspective) const noexcept {
+    for (std::size_t idx = size - 1; idx > 0; --idx)
+    {
+        const auto& state = accumulators<FeatureSet>()[idx];
+        if ((state.template acc<Dimensions>()).computed[perspective]
+            || FeatureSet::requires_refresh(state.diff, perspective))
+            return idx;
+    }
+    return 0;
+}
+// Updates every accumulator above index `begin` for `perspective`, walking
+// up the stack and deriving each entry from its predecessor. The entry at
+// `begin` must already be computed for `perspective`.
+//
+// Two consecutive entries are handled by a single double_inc_update() call
+// when:
+//   - PSQ features: entry `next` has a valid `to` square that equals the
+//     `remove_sq` of entry `next + 1`;
+//   - threat features: entry `next + 1` has a valid `remove_sq` that is
+//     contained in the threateningSqs of entry `next`.
+template<typename FeatureSet, IndexType Dimensions>
+void AccumulatorStack::forward_update_incremental(
+  Color                                 perspective,
+  const Position&                       pos,
+  const FeatureTransformer<Dimensions>& featureTransformer,
+  const std::size_t                     begin) noexcept {
+    assert(begin < accumulators<FeatureSet>().size());
+    assert((accumulators<FeatureSet>()[begin].template acc<Dimensions>()).computed[perspective]);
+    const Square ksq = pos.square<KING>(perspective);
+    for (std::size_t next = begin + 1; next < size; next++)
+    {
+        // A combined update needs a following entry on the stack
+        if (next + 1 < size)
+        {
+            // The piece diffs always come from the PSQ accumulator states
+            DirtyPiece& dp1 = mut_accumulators<PSQFeatureSet>()[next].diff;
+            DirtyPiece& dp2 = mut_accumulators<PSQFeatureSet>()[next + 1].diff;
+            auto& accumulators = mut_accumulators<FeatureSet>();
+            if constexpr (std::is_same_v<FeatureSet, ThreatFeatureSet>)
+            {
+                if (dp2.remove_sq != SQ_NONE
+                    && (accumulators[next].diff.threateningSqs & square_bb(dp2.remove_sq)))
+                {
+                    double_inc_update(perspective, featureTransformer, ksq, accumulators[next],
+                                      accumulators[next + 1], accumulators[next - 1], dp2);
+                    // Entry next + 1 has been handled as well
+                    next++;
+                    continue;
+                }
+            }
+            if constexpr (std::is_same_v<FeatureSet, PSQFeatureSet>)
+            {
+                if (dp1.to != SQ_NONE && dp1.to == dp2.remove_sq)
+                {
+                    // The shared square is cleared in both diffs for the
+                    // duration of the combined update and restored afterwards
+                    const Square captureSq = dp1.to;
+                    dp1.to = dp2.remove_sq = SQ_NONE;
+                    double_inc_update(perspective, featureTransformer, ksq, accumulators[next],
+                                      accumulators[next + 1], accumulators[next - 1]);
+                    dp1.to = dp2.remove_sq = captureSq;
+                    // Entry next + 1 has been handled as well
+                    next++;
+                    continue;
+                }
+            }
+        }
+        update_accumulator_incremental<true>(perspective, featureTransformer, ksq,
+                                             mut_accumulators<FeatureSet>()[next],
+                                             accumulators<FeatureSet>()[next - 1]);
+    }
+    // Check the feature set that was updated here, matching the checks in
+    // backward_update_incremental(), instead of always checking PSQ
+    assert((latest<FeatureSet>().template acc<Dimensions>()).computed[perspective]);
+}
+// Starting from the already computed latest accumulator, updates the entries
+// below it in descending order, from index size - 2 down to and including
+// `end`, each one derived from its successor on the stack.
+template<typename FeatureSet, IndexType Dimensions>
+void AccumulatorStack::backward_update_incremental(
+  Color perspective,
+  const Position&                       pos,
+  const FeatureTransformer<Dimensions>& featureTransformer,
+  const std::size_t                     end) noexcept {
+    assert(end < accumulators<FeatureSet>().size());
+    assert(end < size);
+    assert((latest<FeatureSet>().template acc<Dimensions>()).computed[perspective]);
+    const Square ksq = pos.square<KING>(perspective);
+    // Signed indices so that the loop also terminates when `end` is 0
+    const std::int64_t firstIdx = std::int64_t(size) - 2;
+    const std::int64_t lastIdx  = std::int64_t(end);
+    for (std::int64_t idx = firstIdx; idx >= lastIdx; --idx)
+    {
+        update_accumulator_incremental<false>(perspective, featureTransformer, ksq,
+                                              mut_accumulators<FeatureSet>()[idx],
+                                              accumulators<FeatureSet>()[idx + 1]);
+    }
+    assert((accumulators<FeatureSet>()[end].template acc<Dimensions>()).computed[perspective]);
+}
+// Explicit template instantiations of evaluate() for the big and the small
+// feature transformer dimensions
+template void AccumulatorStack::evaluate<TransformedFeatureDimensionsBig>(
+  const Position&                                            pos,
+  const FeatureTransformer<TransformedFeatureDimensionsBig>& featureTransformer,
+  AccumulatorCaches::Cache<TransformedFeatureDimensionsBig>& cache) noexcept;
+template void AccumulatorStack::evaluate<TransformedFeatureDimensionsSmall>(
+  const Position&                                              pos,
+  const FeatureTransformer<TransformedFeatureDimensionsSmall>& featureTransformer,
+  AccumulatorCaches::Cache<TransformedFeatureDimensionsSmall>& cache) noexcept;
+namespace {
+// Combines the input row with the given weight rows, one vector at a time:
+// out[i] = fused<VectorWrapper, ops...>(in[i], rows[i]...). Width is the row
+// length in elements. Every row pointer must have the same element type as
+// in/out, which the enable_if enforces.
+template<typename VectorWrapper,
+         IndexType Width,
+         UpdateOperation... ops,
+         typename ElementType,
+         typename... Ts,
+         std::enable_if_t<is_all_same_v<ElementType, Ts...>, bool> = true>
+void fused_row_reduce(const ElementType* in, ElementType* out, const Ts* const... rows) {
+    using Vec = typename VectorWrapper::type;
+    // Number of whole vectors that make up one row
+    constexpr IndexType vecCount = Width * sizeof(ElementType) / sizeof(Vec);
+    const Vec* const    src      = reinterpret_cast<const Vec*>(in);
+    Vec* const          dst      = reinterpret_cast<Vec*>(out);
+    for (IndexType v = 0; v < vecCount; ++v)
+        dst[v] = fused<VectorWrapper, ops...>(src[v], reinterpret_cast<const Vec*>(rows)[v]...);
+}
+// Bundles what is needed to derive one accumulator state from another for a
+// single perspective: the feature transformer holding the weights, the
+// source state (`from`) and the destination state (`to`).
+template<typename FeatureSet, IndexType Dimensions>
+struct AccumulatorUpdateContext {
+    Color                                 perspective;
+    const FeatureTransformer<Dimensions>& featureTransformer;
+    const AccumulatorState<FeatureSet>&   from;
+    AccumulatorState<FeatureSet>&         to;
+    AccumulatorUpdateContext(Color                                 persp,
+                             const FeatureTransformer<Dimensions>& ft,
+                             const AccumulatorState<FeatureSet>&   accF,
+                             AccumulatorState<FeatureSet>&         accT) noexcept :
+        perspective{persp},
+        featureTransformer{ft},
+        from{accF},
+        to{accT} {}
+    // Applies the fused operations `ops` using one weight row per feature
+    // index. Both the accumulation and the psqt accumulation of `to` are
+    // computed from those of `from`, with rows taken from
+    // featureTransformer.weights and featureTransformer.psqtWeights.
+    template<UpdateOperation... ops,
+             typename... Ts,
+             std::enable_if_t<is_all_same_v<IndexType, Ts...>, bool> = true>
+    void apply(const Ts... indices) {
+        // Start of the weight row for a feature index
+        auto to_weight_vector = [&](const IndexType index) {
+            return &featureTransformer.weights[index * Dimensions];
+        };
+        // Start of the psqt weight row for a feature index
+        auto to_psqt_weight_vector = [&](const IndexType index) {
+            return &featureTransformer.psqtWeights[index * PSQTBuckets];
+        };
+        fused_row_reduce<Vec16Wrapper, Dimensions, ops...>(
+          (from.template acc<Dimensions>()).accumulation[perspective].data(),
+          (to.template acc<Dimensions>()).accumulation[perspective].data(),
+          to_weight_vector(indices)...);
+        fused_row_reduce<Vec32Wrapper, PSQTBuckets, ops...>(
+          (from.template acc<Dimensions>()).psqtAccumulation[perspective].data(),
+          (to.template acc<Dimensions>()).psqtAccumulation[perspective].data(),
+          to_psqt_weight_vector(indices)...);
+    }
+    // Applies lists of added and removed feature indices using the threat
+    // weights: `to` = `from` - sum(removed rows) + sum(added rows), for both
+    // the accumulation and the psqt accumulation.
+    void apply(const typename FeatureSet::IndexList& added,
+               const typename FeatureSet::IndexList& removed) {
+        const auto& fromAcc = from.template acc<Dimensions>().accumulation[perspective];
+        auto&       toAcc   = to.template acc<Dimensions>().accumulation[perspective];
+        const auto& fromPsqtAcc = from.template acc<Dimensions>().psqtAccumulation[perspective];
+        auto&       toPsqtAcc   = to.template acc<Dimensions>().psqtAccumulation[perspective];
+#ifdef VECTOR
+        // Vectorized path: the accumulator is processed tile by tile, with
+        // one tile held in the `acc` registers at a time
+        using Tiling = SIMDTiling<Dimensions, Dimensions, PSQTBuckets>;
+        vec_t      acc[Tiling::NumRegs];
+        psqt_vec_t psqt[Tiling::NumPsqtRegs];
+        // Base pointer into the threat weights; advanced by one tile per
+        // outer iteration
+        const auto* threatWeights = &featureTransformer.threatWeights[0];
+        for (IndexType j = 0; j < Dimensions / Tiling::TileHeight; ++j)
+        {
+            auto* fromTile = reinterpret_cast<const vec_t*>(&fromAcc[j * Tiling::TileHeight]);
+            auto* toTile   = reinterpret_cast<vec_t*>(&toAcc[j * Tiling::TileHeight]);
+            // Load the source tile
+            for (IndexType k = 0; k < Tiling::NumRegs; ++k)
+                acc[k] = fromTile[k];
+            // Subtract the weights of every removed feature
+            for (int i = 0; i < removed.ssize(); ++i)
+            {
+                size_t       index  = removed[i];
+                const size_t offset = Dimensions * index;
+                auto*        column = reinterpret_cast<const vec_i8_t*>(&threatWeights[offset]);
+    #ifdef USE_NEON
+                // Each 8-bit weight vector is widened into two 16-bit
+                // registers (low half, then high half)
+                for (IndexType k = 0; k < Tiling::NumRegs; k += 2)
+                {
+                    acc[k]     = vec_sub_16(acc[k], vmovl_s8(vget_low_s8(column[k / 2])));
+                    acc[k + 1] = vec_sub_16(acc[k + 1], vmovl_high_s8(column[k / 2]));
+                }
+    #else
+                for (IndexType k = 0; k < Tiling::NumRegs; ++k)
+                    acc[k] = vec_sub_16(acc[k], vec_convert_8_16(column[k]));
+    #endif
+            }
+            // Add the weights of every added feature
+            for (int i = 0; i < added.ssize(); ++i)
+            {
+                size_t       index  = added[i];
+                const size_t offset = Dimensions * index;
+                auto*        column = reinterpret_cast<const vec_i8_t*>(&threatWeights[offset]);
+    #ifdef USE_NEON
+                for (IndexType k = 0; k < Tiling::NumRegs; k += 2)
+                {
+                    acc[k]     = vec_add_16(acc[k], vmovl_s8(vget_low_s8(column[k / 2])));
+                    acc[k + 1] = vec_add_16(acc[k + 1], vmovl_high_s8(column[k / 2]));
+                }
+    #else
+                for (IndexType k = 0; k < Tiling::NumRegs; ++k)
+                    acc[k] = vec_add_16(acc[k], vec_convert_8_16(column[k]));
+    #endif
+            }
+            // Store the finished tile into the destination accumulator
+            for (IndexType k = 0; k < Tiling::NumRegs; k++)
+                vec_store(&toTile[k], acc[k]);
+            // Advance to the next tile, so that Dimensions * index addresses
+            // the same feature row within that tile
+            threatWeights += Tiling::TileHeight;
+        }
+        // Same scheme for the psqt accumulation, using the threat psqt weights
+        for (IndexType j = 0; j < PSQTBuckets / Tiling::PsqtTileHeight; ++j)
+        {
+            auto* fromTilePsqt =
+              reinterpret_cast<const psqt_vec_t*>(&fromPsqtAcc[j * Tiling::PsqtTileHeight]);
+            auto* toTilePsqt =
+              reinterpret_cast<psqt_vec_t*>(&toPsqtAcc[j * Tiling::PsqtTileHeight]);
+            for (IndexType k = 0; k < Tiling::NumPsqtRegs; ++k)
+                psqt[k] = fromTilePsqt[k];
+            for (int i = 0; i < removed.ssize(); ++i)
+            {
+                size_t       index      = removed[i];
+                const size_t offset     = PSQTBuckets * index + j * Tiling::PsqtTileHeight;
+                auto*        columnPsqt = reinterpret_cast<const psqt_vec_t*>(
+                  &featureTransformer.threatPsqtWeights[offset]);
+                for (std::size_t k = 0; k < Tiling::NumPsqtRegs; ++k)
+                    psqt[k] = vec_sub_psqt_32(psqt[k], columnPsqt[k]);
+            }
+            for (int i = 0; i < added.ssize(); ++i)
+            {
+                size_t       index      = added[i];
+                const size_t offset     = PSQTBuckets * index + j * Tiling::PsqtTileHeight;
+                auto*        columnPsqt = reinterpret_cast<const psqt_vec_t*>(
+                  &featureTransformer.threatPsqtWeights[offset]);
+                for (std::size_t k = 0; k < Tiling::NumPsqtRegs; ++k)
+                    psqt[k] = vec_add_psqt_32(psqt[k], columnPsqt[k]);
+            }
+            for (IndexType k = 0; k < Tiling::NumPsqtRegs; ++k)
+                vec_store_psqt(&toTilePsqt[k], psqt[k]);
+        }
+#else
+        // Scalar path: copy the source state, then adjust it element by
+        // element
+        toAcc     = fromAcc;
+        toPsqtAcc = fromPsqtAcc;
+        for (const auto index : removed)
+        {
+            const IndexType offset = Dimensions * index;
+            for (IndexType j = 0; j < Dimensions; ++j)
+                toAcc[j] -= featureTransformer.threatWeights[offset + j];
+            for (std::size_t k = 0; k < PSQTBuckets; ++k)
+                toPsqtAcc[k] -= featureTransformer.threatPsqtWeights[index * PSQTBuckets + k];
+        }
+        for (const auto index : added)
+        {
+            const IndexType offset = Dimensions * index;
+            for (IndexType j = 0; j < Dimensions; ++j)
+                toAcc[j] += featureTransformer.threatWeights[offset + j];
+            for (std::size_t k = 0; k < PSQTBuckets; ++k)
+                toPsqtAcc[k] += featureTransformer.threatPsqtWeights[index * PSQTBuckets + k];
+        }
+#endif
+    }
+};
+// Convenience factory for AccumulatorUpdateContext. Call sites pass the source
+// state (accumulatorFrom) and the destination state (accumulatorTo) and let
+// FeatureSet and Dimensions be deduced from the argument types.
+template<typename FeatureSet, IndexType Dimensions>
+auto make_accumulator_update_context(Color                                 perspective,
+                                     const FeatureTransformer<Dimensions>& featureTransformer,
+                                     const AccumulatorState<FeatureSet>&   accumulatorFrom,
+                                     AccumulatorState<FeatureSet>&         accumulatorTo) noexcept {
+    using Context = AccumulatorUpdateContext<FeatureSet, Dimensions>;
+    return Context{perspective, featureTransformer, accumulatorFrom, accumulatorTo};
+}
+// Fused two-ply update for the piece-square feature set.
+// Starting from `computed`, the diffs stored in middle_state and target_state
+// are collected into one added/removed list and applied in a single pass that
+// writes target_state. This function does not write middle_state's accumulator;
+// only target_state is flagged as computed for `perspective`.
+template<IndexType TransformedFeatureDimensions>
+void double_inc_update(Color                                                   perspective,
+                       const FeatureTransformer<TransformedFeatureDimensions>& featureTransformer,
+                       const Square                                            ksq,
+                       AccumulatorState<PSQFeatureSet>&                        middle_state,
+                       AccumulatorState<PSQFeatureSet>&                        target_state,
+                       const AccumulatorState<PSQFeatureSet>&                  computed) {
+    // Preconditions: the source is valid, neither of the two later states is.
+    assert(computed.acc<TransformedFeatureDimensions>().computed[perspective]);
+    assert(!middle_state.acc<TransformedFeatureDimensions>().computed[perspective]);
+    assert(!target_state.acc<TransformedFeatureDimensions>().computed[perspective]);
+
+    PSQFeatureSet::IndexList dropped, gained;
+
+    // First ply.
+    PSQFeatureSet::append_changed_indices(perspective, ksq, middle_state.diff, dropped, gained);
+    // you can't capture a piece that was just involved in castling since the rook ends up
+    // in a square that the king passed
+    assert(gained.size() < 2);
+    // Second ply, appended to the same lists.
+    PSQFeatureSet::append_changed_indices(perspective, ksq, target_state.diff, dropped, gained);
+
+    [[maybe_unused]] const int numGained  = gained.ssize();
+    [[maybe_unused]] const int numDropped = dropped.ssize();
+    assert(numGained == 1);
+    assert(numDropped == 2 || numDropped == 3);
+
+    // Workaround compiler warning for uninitialized variables, replicated on
+    // profile builds on windows with gcc 14.2.0.
+    // Also helps with optimizations on some compilers.
+    sf_assume(numGained == 1);
+    sf_assume(numDropped == 2 || numDropped == 3);
+
+    auto ctx =
+      make_accumulator_update_context(perspective, featureTransformer, computed, target_state);
+
+    // Exactly one feature is added; two or three are removed.
+    if (numDropped != 2)
+        ctx.template apply<Add, Sub, Sub, Sub>(gained[0], dropped[0], dropped[1], dropped[2]);
+    else
+        ctx.template apply<Add, Sub, Sub>(gained[0], dropped[0], dropped[1]);
+
+    target_state.acc<TransformedFeatureDimensions>().computed[perspective] = true;
+}
+// Fused two-ply update for the threat feature set.
+// Collects the changed threat indices of middle_state.diff and then of
+// target_state.diff into one added/removed pair (sharing a FusedUpdateData
+// record seeded with dp2.remove_sq) and applies them on top of `computed`,
+// writing target_state. Only target_state is flagged as computed.
+template<IndexType TransformedFeatureDimensions>
+void double_inc_update(Color                                                   perspective,
+                       const FeatureTransformer<TransformedFeatureDimensions>& featureTransformer,
+                       const Square                                            ksq,
+                       AccumulatorState<ThreatFeatureSet>&                     middle_state,
+                       AccumulatorState<ThreatFeatureSet>&                     target_state,
+                       const AccumulatorState<ThreatFeatureSet>&               computed,
+                       const DirtyPiece&                                       dp2) {
+    // Preconditions: the source is valid, neither of the two later states is.
+    assert(computed.acc<TransformedFeatureDimensions>().computed[perspective]);
+    assert(!middle_state.acc<TransformedFeatureDimensions>().computed[perspective]);
+    assert(!target_state.acc<TransformedFeatureDimensions>().computed[perspective]);
+
+    ThreatFeatureSet::FusedUpdateData fusion;
+    fusion.dp2removed = dp2.remove_sq;
+
+    ThreatFeatureSet::IndexList dropped, gained;
+
+    // Weight base pointer and per-feature stride handed to the feature set.
+    // NOTE(review): presumably used for prefetching weight columns - confirm
+    // against ThreatFeatureSet::append_changed_indices.
+    const auto* weightBase   = &featureTransformer.threatWeights[0];
+    auto        weightStride = static_cast<IndexType>(TransformedFeatureDimensions);
+
+    // The boolean argument is true for the first ply and false for the second.
+    ThreatFeatureSet::append_changed_indices(perspective, ksq, middle_state.diff, dropped, gained,
+                                             &fusion, true, weightBase, weightStride);
+    ThreatFeatureSet::append_changed_indices(perspective, ksq, target_state.diff, dropped, gained,
+                                             &fusion, false, weightBase, weightStride);
+
+    auto ctx =
+      make_accumulator_update_context(perspective, featureTransformer, computed, target_state);
+    ctx.apply(gained, dropped);
+
+    target_state.acc<TransformedFeatureDimensions>().computed[perspective] = true;
+}
+// Single-ply incremental update of target_state from `computed`.
+// Forward == true : apply target_state.diff (move made after `computed`).
+// Forward == false: undo computed.diff, i.e. the same index lists are collected
+//                   with the added/removed roles swapped.
+// On return target_state is flagged as computed for `perspective`.
+template<bool Forward, typename FeatureSet, IndexType TransformedFeatureDimensions>
+void update_accumulator_incremental(
+  Color                                                   perspective,
+  const FeatureTransformer<TransformedFeatureDimensions>& featureTransformer,
+  const Square                                            ksq,
+  AccumulatorState<FeatureSet>&                           target_state,
+  const AccumulatorState<FeatureSet>&                     computed) {
+    constexpr bool UsesThreats = std::is_same_v<FeatureSet, ThreatFeatureSet>;
+
+    assert((computed.template acc<TransformedFeatureDimensions>()).computed[perspective]);
+    assert(!(target_state.template acc<TransformedFeatureDimensions>()).computed[perspective]);
+
+    // The lists must be large enough to contain the largest possible update.
+    // That depends on the feature set and relies on the feature set's update
+    // cost calculation never allowing more added/removed features than the
+    // list can hold. For the piece-square set a single move adds and removes
+    // at most 2 features each (asserted below).
+    typename FeatureSet::IndexList dropped, gained;
+
+    if constexpr (UsesThreats)
+    {
+        const auto* weightBase   = &featureTransformer.threatWeights[0];
+        auto        weightStride = static_cast<IndexType>(TransformedFeatureDimensions);
+
+        if constexpr (!Forward)
+            FeatureSet::append_changed_indices(perspective, ksq, computed.diff, gained, dropped,
+                                               nullptr, false, weightBase, weightStride);
+        else
+            FeatureSet::append_changed_indices(perspective, ksq, target_state.diff, dropped,
+                                               gained, nullptr, false, weightBase, weightStride);
+    }
+    else
+    {
+        if constexpr (!Forward)
+            FeatureSet::append_changed_indices(perspective, ksq, computed.diff, gained, dropped);
+        else
+            FeatureSet::append_changed_indices(perspective, ksq, target_state.diff, dropped,
+                                               gained);
+    }
+
+    auto ctx =
+      make_accumulator_update_context(perspective, featureTransformer, computed, target_state);
+
+    if constexpr (UsesThreats)
+        ctx.apply(gained, dropped);
+    else
+    {
+        [[maybe_unused]] const int numGained  = gained.ssize();
+        [[maybe_unused]] const int numDropped = dropped.ssize();
+
+        assert(numGained == 1 || numGained == 2);
+        assert(numDropped == 1 || numDropped == 2);
+        assert((Forward && numGained <= numDropped) || (!Forward && numGained >= numDropped));
+
+        // Workaround compiler warning for uninitialized variables, replicated
+        // on profile builds on windows with gcc 14.2.0.
+        // Also helps with optimizations on some compilers.
+        sf_assume(numGained == 1 || numGained == 2);
+        sf_assume(numDropped == 1 || numDropped == 2);
+        if (!(numDropped == 1 || numDropped == 2) || !(numGained == 1 || numGained == 2))
+            sf_unreachable();
+
+        // Dispatch on the (added, removed) counts to a fixed-arity fused apply.
+        if constexpr (Forward)
+        {
+            if (numDropped == 1)
+            {
+                assert(numGained == 1 && numDropped == 1);
+                ctx.template apply<Add, Sub>(gained[0], dropped[0]);
+            }
+            else if (numGained == 1)
+            {
+                assert(numDropped == 2);
+                ctx.template apply<Add, Sub, Sub>(gained[0], dropped[0], dropped[1]);
+            }
+            else
+            {
+                assert(numGained == 2 && numDropped == 2);
+                ctx.template apply<Add, Add, Sub, Sub>(gained[0], gained[1], dropped[0],
+                                                       dropped[1]);
+            }
+        }
+        else
+        {
+            if (numGained == 1)
+            {
+                assert(numGained == 1 && numDropped == 1);
+                ctx.template apply<Add, Sub>(gained[0], dropped[0]);
+            }
+            else if (numDropped == 1)
+            {
+                assert(numGained == 2);
+                ctx.template apply<Add, Add, Sub>(gained[0], gained[1], dropped[0]);
+            }
+            else
+            {
+                assert(numGained == 2 && numDropped == 2);
+                ctx.template apply<Add, Add, Sub, Sub>(gained[0], gained[1], dropped[0],
+                                                       dropped[1]);
+            }
+        }
+    }
+
+    (target_state.template acc<TransformedFeatureDimensions>()).computed[perspective] = true;
+}
+// Returns a bitboard with one bit set for every square whose piece differs
+// between oldPieces and newPieces (bit index == square index).
+// Three implementations: AVX2/AVX512, NEON, and a portable scalar loop.
+Bitboard get_changed_pieces(const std::array<Piece, SQUARE_NB>& oldPieces,
+                            const std::array<Piece, SQUARE_NB>& newPieces) {
+#if defined(USE_AVX512) || defined(USE_AVX2)
+    // The byte-wise compare below requires one byte per piece.
+    static_assert(sizeof(Piece) == 1);
+
+    Bitboard unchanged = 0;
+    // Two 32-byte halves cover the 64 squares.
+    for (int base = 0; base < 64; base += 32)
+    {
+        const auto* oldChunk = reinterpret_cast<const __m256i*>(&oldPieces[base]);
+        const auto* newChunk = reinterpret_cast<const __m256i*>(&newPieces[base]);
+
+        const __m256i       equalBytes = _mm256_cmpeq_epi8(_mm256_loadu_si256(oldChunk),
+                                                           _mm256_loadu_si256(newChunk));
+        const std::uint32_t equalBits  = _mm256_movemask_epi8(equalBytes);
+
+        unchanged |= static_cast<Bitboard>(equalBits) << base;
+    }
+    return ~unchanged;
+#elif defined(USE_NEON)
+    // Load both boards as four de-interleaved 16-byte lanes, compare lane by
+    // lane, then use shift-right-insert steps to pack the comparison results
+    // into a 64-bit "same" mask, which is finally inverted.
+    uint8x16x4_t oldLanes = vld4q_u8(reinterpret_cast<const uint8_t*>(oldPieces.data()));
+    uint8x16x4_t newLanes = vld4q_u8(reinterpret_cast<const uint8_t*>(newPieces.data()));
+    auto laneEq = [=](const int lane) { return vceqq_u8(oldLanes.val[lane], newLanes.val[lane]); };
+
+    uint8x16_t eq01   = vsriq_n_u8(laneEq(1), laneEq(0), 1);
+    uint8x16_t eq23   = vsriq_n_u8(laneEq(3), laneEq(2), 1);
+    uint8x16_t packed = vsriq_n_u8(eq23, eq01, 2);
+    packed            = vsriq_n_u8(packed, packed, 4);
+    uint8x8_t sameBits = vshrn_n_u16(vreinterpretq_u16_u8(packed), 4);
+    return ~vget_lane_u64(vreinterpret_u64_u8(sameBits), 0);
+#else
+    Bitboard diffBB = 0;
+    for (Square sq = SQUARE_ZERO; sq < SQUARE_NB; ++sq)
+    {
+        const bool differs = oldPieces[sq] != newPieces[sq];
+        diffBB |= static_cast<Bitboard>(differs) << sq;
+    }
+    return diffBB;
+#endif
+}
+template<IndexType Dimensions>  // Refreshes one perspective's PSQ accumulator via the cache entry for (king square, perspective).
+void update_accumulator_refresh_cache(Color                                 perspective,
+                                      const FeatureTransformer<Dimensions>& featureTransformer,
+                                      const Position&                       pos,
+                                      AccumulatorState<PSQFeatureSet>&      accumulatorState,  // receives a copy of the refreshed entry
+                                      AccumulatorCaches::Cache<Dimensions>& cache) {  // the selected entry is updated in place
+    using Tiling [[maybe_unused]] = SIMDTiling<Dimensions, Dimensions, PSQTBuckets>;
+    const Square             ksq   = pos.square<KING>(perspective);
+    auto&                    entry = cache[ksq][perspective];
+    PSQFeatureSet::IndexList removed, added;
+    const Bitboard changedBB = get_changed_pieces(entry.pieces, pos.piece_array());  // squares that differ between entry and pos
+    Bitboard       removedBB = changedBB & entry.pieceBB;  // changed squares that were occupied in the cached board
+    Bitboard       addedBB   = changedBB & pos.pieces();   // changed squares that are occupied in the current board
+    while (removedBB)
+    {
+        Square sq = pop_lsb(removedBB);
+        removed.push_back(PSQFeatureSet::make_index(perspective, sq, entry.pieces[sq], ksq));
+    }
+    while (addedBB)
+    {
+        Square sq = pop_lsb(addedBB);
+        added.push_back(PSQFeatureSet::make_index(perspective, sq, pos.piece_on(sq), ksq));
+    }
+    entry.pieceBB = pos.pieces();       // the entry's board snapshot now matches pos
+    entry.pieces  = pos.piece_array();
+    auto& accumulator                 = accumulatorState.acc<Dimensions>();
+    accumulator.computed[perspective] = true;  // flagged before the values are written below
+#ifdef VECTOR
+    vec_t      acc[Tiling::NumRegs];
+    psqt_vec_t psqt[Tiling::NumPsqtRegs];
+    const auto* weights = &featureTransformer.weights[0];  // advanced by one tile at the end of every j iteration
+    for (IndexType j = 0; j < Dimensions / Tiling::TileHeight; ++j)
+    {
+        auto* accTile =
+          reinterpret_cast<vec_t*>(&accumulator.accumulation[perspective][j * Tiling::TileHeight]);
+        auto* entryTile = reinterpret_cast<vec_t*>(&entry.accumulation[j * Tiling::TileHeight]);
+        for (IndexType k = 0; k < Tiling::NumRegs; ++k)
+            acc[k] = entryTile[k];  // start from the cached values of this tile
+        int i = 0;  // shared by the three loops below: fused pairs first, then whichever list has leftovers
+        for (; i < std::min(removed.ssize(), added.ssize()); ++i)
+        {
+            size_t       indexR  = removed[i];
+            const size_t offsetR = Dimensions * indexR;
+            auto*        columnR = reinterpret_cast<const vec_t*>(&weights[offsetR]);
+            size_t       indexA  = added[i];
+            const size_t offsetA = Dimensions * indexA;
+            auto*        columnA = reinterpret_cast<const vec_t*>(&weights[offsetA]);
+            for (IndexType k = 0; k < Tiling::NumRegs; ++k)
+                acc[k] = fused<Vec16Wrapper, Add, Sub>(acc[k], columnA[k], columnR[k]);  // one add and one sub per register
+        }
+        for (; i < removed.ssize(); ++i)  // leftover removals (runs only if removed is the longer list)
+        {
+            size_t       index  = removed[i];
+            const size_t offset = Dimensions * index;
+            auto*        column = reinterpret_cast<const vec_t*>(&weights[offset]);
+            for (IndexType k = 0; k < Tiling::NumRegs; ++k)
+                acc[k] = vec_sub_16(acc[k], column[k]);
+        }
+        for (; i < added.ssize(); ++i)  // leftover additions (runs only if added is the longer list)
+        {
+            size_t       index  = added[i];
+            const size_t offset = Dimensions * index;
+            auto*        column = reinterpret_cast<const vec_t*>(&weights[offset]);
+            for (IndexType k = 0; k < Tiling::NumRegs; ++k)
+                acc[k] = vec_add_16(acc[k], column[k]);
+        }
+        for (IndexType k = 0; k < Tiling::NumRegs; k++)
+            vec_store(&entryTile[k], acc[k]);  // write the tile back to the cache entry ...
+        for (IndexType k = 0; k < Tiling::NumRegs; k++)
+            vec_store(&accTile[k], acc[k]);  // ... and to the accumulator being refreshed
+        weights += Tiling::TileHeight;  // column offsets above are relative to the current tile
+    }
+    for (IndexType j = 0; j < PSQTBuckets / Tiling::PsqtTileHeight; ++j)  // same scheme for the PSQT part, without fused pairs
+    {
+        auto* accTilePsqt = reinterpret_cast<psqt_vec_t*>(
+          &accumulator.psqtAccumulation[perspective][j * Tiling::PsqtTileHeight]);
+        auto* entryTilePsqt =
+          reinterpret_cast<psqt_vec_t*>(&entry.psqtAccumulation[j * Tiling::PsqtTileHeight]);
+        for (IndexType k = 0; k < Tiling::NumPsqtRegs; ++k)
+            psqt[k] = entryTilePsqt[k];
+        for (int i = 0; i < removed.ssize(); ++i)
+        {
+            size_t       index  = removed[i];
+            const size_t offset = PSQTBuckets * index + j * Tiling::PsqtTileHeight;
+            auto*        columnPsqt =
+              reinterpret_cast<const psqt_vec_t*>(&featureTransformer.psqtWeights[offset]);
+            for (std::size_t k = 0; k < Tiling::NumPsqtRegs; ++k)
+                psqt[k] = vec_sub_psqt_32(psqt[k], columnPsqt[k]);
+        }
+        for (int i = 0; i < added.ssize(); ++i)
+        {
+            size_t       index  = added[i];
+            const size_t offset = PSQTBuckets * index + j * Tiling::PsqtTileHeight;
+            auto*        columnPsqt =
+              reinterpret_cast<const psqt_vec_t*>(&featureTransformer.psqtWeights[offset]);
+            for (std::size_t k = 0; k < Tiling::NumPsqtRegs; ++k)
+                psqt[k] = vec_add_psqt_32(psqt[k], columnPsqt[k]);
+        }
+        for (IndexType k = 0; k < Tiling::NumPsqtRegs; ++k)
+            vec_store_psqt(&entryTilePsqt[k], psqt[k]);
+        for (IndexType k = 0; k < Tiling::NumPsqtRegs; ++k)
+            vec_store_psqt(&accTilePsqt[k], psqt[k]);
+    }
+#else  // scalar fallback: update the cache entry in place, then copy it out
+    for (const auto index : removed)
+    {
+        const IndexType offset = Dimensions * index;
+        for (IndexType j = 0; j < Dimensions; ++j)
+            entry.accumulation[j] -= featureTransformer.weights[offset + j];
+        for (std::size_t k = 0; k < PSQTBuckets; ++k)
+            entry.psqtAccumulation[k] -= featureTransformer.psqtWeights[index * PSQTBuckets + k];
+    }
+    for (const auto index : added)
+    {
+        const IndexType offset = Dimensions * index;
+        for (IndexType j = 0; j < Dimensions; ++j)
+            entry.accumulation[j] += featureTransformer.weights[offset + j];
+        for (std::size_t k = 0; k < PSQTBuckets; ++k)
+            entry.psqtAccumulation[k] += featureTransformer.psqtWeights[index * PSQTBuckets + k];
+    }
+    // The accumulator of the refresh entry is now up to date for this position.
+    // Copy its content to the actual accumulator we were asked to refresh.
+    accumulator.accumulation[perspective]     = entry.accumulation;
+    accumulator.psqtAccumulation[perspective] = entry.psqtAccumulation;
+#endif
+}
+// Rebuilds the threat accumulator of one perspective from scratch.
+// Starts from zero and adds, for every active threat feature reported by
+// ThreatFeatureSet::append_active_indices, that feature's weight column and
+// its PSQT weights. The accumulator is flagged as computed for `perspective`.
+template<IndexType Dimensions>
+void update_threats_accumulator_full(Color                                 perspective,
+                                     const FeatureTransformer<Dimensions>& featureTransformer,
+                                     const Position&                       pos,
+                                     AccumulatorState<ThreatFeatureSet>&   accumulatorState) {
+    using Tiling [[maybe_unused]] = SIMDTiling<Dimensions, Dimensions, PSQTBuckets>;
+
+    ThreatFeatureSet::IndexList activeFeatures;
+    ThreatFeatureSet::append_active_indices(perspective, pos, activeFeatures);
+
+    auto& target                 = accumulatorState.acc<Dimensions>();
+    target.computed[perspective] = true;
+
+#ifdef VECTOR
+    vec_t      tileRegs[Tiling::NumRegs];
+    psqt_vec_t psqtRegs[Tiling::NumPsqtRegs];
+
+    // Points at the current tile within the first weight column; bumped by one
+    // tile height after each pass so per-feature offsets stay tile-relative.
+    const auto* tileWeights = &featureTransformer.threatWeights[0];
+
+    for (IndexType tile = 0; tile < Dimensions / Tiling::TileHeight; ++tile)
+    {
+        auto* dstTile = reinterpret_cast<vec_t*>(
+          &target.accumulation[perspective][tile * Tiling::TileHeight]);
+
+        for (IndexType r = 0; r < Tiling::NumRegs; ++r)
+            tileRegs[r] = vec_zero();
+
+        for (int i = 0; i < activeFeatures.ssize(); ++i)
+        {
+            const size_t featureIdx = activeFeatures[i];
+            const size_t colOffset  = Dimensions * featureIdx;
+            // The column is read through 8-bit vectors and widened to 16 bit
+            // before being added.
+            auto* column = reinterpret_cast<const vec_i8_t*>(&tileWeights[colOffset]);
+    #ifdef USE_NEON
+            // Each 8-bit vector feeds two 16-bit registers (low half, high half).
+            for (IndexType r = 0; r < Tiling::NumRegs; r += 2)
+            {
+                tileRegs[r] = vec_add_16(tileRegs[r], vmovl_s8(vget_low_s8(column[r / 2])));
+                tileRegs[r + 1] = vec_add_16(tileRegs[r + 1], vmovl_high_s8(column[r / 2]));
+            }
+    #else
+            for (IndexType r = 0; r < Tiling::NumRegs; ++r)
+                tileRegs[r] = vec_add_16(tileRegs[r], vec_convert_8_16(column[r]));
+    #endif
+        }
+
+        for (IndexType r = 0; r < Tiling::NumRegs; r++)
+            vec_store(&dstTile[r], tileRegs[r]);
+
+        tileWeights += Tiling::TileHeight;
+    }
+
+    for (IndexType tile = 0; tile < PSQTBuckets / Tiling::PsqtTileHeight; ++tile)
+    {
+        auto* dstTilePsqt = reinterpret_cast<psqt_vec_t*>(
+          &target.psqtAccumulation[perspective][tile * Tiling::PsqtTileHeight]);
+
+        for (IndexType r = 0; r < Tiling::NumPsqtRegs; ++r)
+            psqtRegs[r] = vec_zero_psqt();
+
+        for (int i = 0; i < activeFeatures.ssize(); ++i)
+        {
+            const size_t featureIdx = activeFeatures[i];
+            const size_t psqtOffset = PSQTBuckets * featureIdx + tile * Tiling::PsqtTileHeight;
+            auto*        columnPsqt = reinterpret_cast<const psqt_vec_t*>(
+              &featureTransformer.threatPsqtWeights[psqtOffset]);
+            for (std::size_t r = 0; r < Tiling::NumPsqtRegs; ++r)
+                psqtRegs[r] = vec_add_psqt_32(psqtRegs[r], columnPsqt[r]);
+        }
+
+        for (IndexType r = 0; r < Tiling::NumPsqtRegs; ++r)
+            vec_store_psqt(&dstTilePsqt[r], psqtRegs[r]);
+    }
+#else
+    auto& sideAcc  = target.accumulation[perspective];
+    auto& sidePsqt = target.psqtAccumulation[perspective];
+
+    // Threat accumulation starts from zero.
+    sideAcc.fill(0);
+    sidePsqt.fill(0);
+
+    for (const auto featureIdx : activeFeatures)
+    {
+        const IndexType colOffset = Dimensions * featureIdx;
+        for (IndexType j = 0; j < Dimensions; ++j)
+            sideAcc[j] += featureTransformer.threatWeights[colOffset + j];
+        for (std::size_t k = 0; k < PSQTBuckets; ++k)
+            sidePsqt[k] += featureTransformer.threatPsqtWeights[featureIdx * PSQTBuckets + k];
+    }
+#endif
+}
+}
+}

src/nnue/nnue_accumulator.h ADDED Viewed

	@@ -0,0 +1,206 @@

+/*
+  Stockfish, a UCI chess playing engine derived from Glaurung 2.1
+  Copyright (C) 2004-2026 The Stockfish developers (see AUTHORS file)
+  Stockfish is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+  Stockfish is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+  You should have received a copy of the GNU General Public License
+  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+// Class for difference calculation of NNUE evaluation function
+#ifndef NNUE_ACCUMULATOR_H_INCLUDED
+#define NNUE_ACCUMULATOR_H_INCLUDED
+#include <array>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+#include <utility>
+#include "../types.h"
+#include "nnue_architecture.h"
+#include "nnue_common.h"
+namespace Stockfish {
+class Position;
+}
+namespace Stockfish::Eval::NNUE {
+template<IndexType Size>
+struct alignas(CacheLineSize) Accumulator;  // forward declaration; defined just below
+template<IndexType TransformedFeatureDimensions>
+class FeatureTransformer;  // forward declaration; only referenced by const reference in this header
+// Holds the result of the affine transformation of the input features,
+// stored separately for each perspective (indexed by color).
+template<IndexType Size>
+struct alignas(CacheLineSize) Accumulator {
+    std::array<std::array<std::int16_t, Size>, COLOR_NB>        accumulation;      // Size 16-bit values per perspective
+    std::array<std::array<std::int32_t, PSQTBuckets>, COLOR_NB> psqtAccumulation;  // PSQTBuckets 32-bit values per perspective
+    std::array<bool, COLOR_NB>                                  computed = {};     // per-perspective validity flag; starts false
+};
+// AccumulatorCaches struct provides per-thread accumulator caches, where each
+// cache contains multiple entries for each of the possible king squares.
+// When the accumulator needs to be refreshed, the cached entry is used to more
+// efficiently update the accumulator, instead of rebuilding it from scratch.
+// This idea was first described by Luecx (author of Koivisto) and
+// is commonly referred to as "Finny Tables".
+struct AccumulatorCaches {
+
+    // Constructs both caches and immediately initializes every entry from the
+    // feature-transformer biases of the given networks.
+    template<typename Networks>
+    AccumulatorCaches(const Networks& networks) {
+        clear(networks);
+    }
+
+    // Cache for one network size: one entry per (king square, color).
+    template<IndexType Size>
+    struct alignas(CacheLineSize) Cache {
+
+        // Accumulator values together with the board snapshot they were
+        // computed for.
+        struct alignas(CacheLineSize) Entry {
+            std::array<BiasType, Size>              accumulation;
+            std::array<PSQTWeightType, PSQTBuckets> psqtAccumulation;
+            std::array<Piece, SQUARE_NB>            pieces;
+            Bitboard                                pieceBB;
+
+            // Resets the entry to "empty board": the accumulation holds just
+            // the biases, and every member from psqtAccumulation to the end of
+            // the struct (PSQT values, pieces, pieceBB) is zeroed in one go.
+            void clear(const std::array<BiasType, Size>& biases) {
+                const std::size_t zeroFrom = offsetof(Entry, psqtAccumulation);
+                auto* const       rawBytes = reinterpret_cast<std::byte*>(this);
+
+                accumulation = biases;
+                std::memset(rawBytes + zeroFrom, 0, sizeof(Entry) - zeroFrom);
+            }
+        };
+
+        // Re-initializes every entry with the biases of `network`.
+        template<typename Network>
+        void clear(const Network& network) {
+            for (auto& perSquare : entries)
+                for (auto& slot : perSquare)
+                    slot.clear(network.featureTransformer.biases);
+        }
+
+        // Both per-color entries for the given king square.
+        std::array<Entry, COLOR_NB>& operator[](Square sq) { return entries[sq]; }
+
+        std::array<std::array<Entry, COLOR_NB>, SQUARE_NB> entries;
+    };
+
+    // Re-initializes the big and the small cache from their networks.
+    template<typename Networks>
+    void clear(const Networks& networks) {
+        big.clear(networks.big);
+        small.clear(networks.small);
+    }
+
+    Cache<TransformedFeatureDimensionsBig>   big;
+    Cache<TransformedFeatureDimensionsSmall> small;
+};
+// One accumulator slot for a given feature set: the big- and small-network
+// accumulators plus the feature-set specific diff describing the move that
+// led to this slot.
+template<typename FeatureSet>
+struct AccumulatorState {
+    Accumulator<TransformedFeatureDimensionsBig>   accumulatorBig;
+    Accumulator<TransformedFeatureDimensionsSmall> accumulatorSmall;
+    typename FeatureSet::DiffType                  diff;
+
+    // Mutable access to the accumulator whose width equals Size.
+    // Any other Size is rejected at compile time.
+    template<IndexType Size>
+    auto& acc() noexcept {
+        constexpr bool IsBig   = Size == TransformedFeatureDimensionsBig;
+        constexpr bool IsSmall = Size == TransformedFeatureDimensionsSmall;
+        static_assert(IsBig || IsSmall, "Invalid size for accumulator");
+
+        if constexpr (IsBig)
+            return accumulatorBig;
+        else if constexpr (IsSmall)
+            return accumulatorSmall;
+    }
+
+    // Read-only counterpart of the accessor above.
+    template<IndexType Size>
+    const auto& acc() const noexcept {
+        constexpr bool IsBig   = Size == TransformedFeatureDimensionsBig;
+        constexpr bool IsSmall = Size == TransformedFeatureDimensionsSmall;
+        static_assert(IsBig || IsSmall, "Invalid size for accumulator");
+
+        if constexpr (IsBig)
+            return accumulatorBig;
+        else if constexpr (IsSmall)
+            return accumulatorSmall;
+    }
+
+    // Stores a copy of dp as this slot's diff and marks both accumulators as
+    // not computed for either perspective.
+    void reset(const typename FeatureSet::DiffType& dp) noexcept {
+        accumulatorSmall.computed.fill(false);
+        accumulatorBig.computed.fill(false);
+        diff = dp;
+    }
+
+    // Marks both accumulators as not computed and hands back the diff by
+    // reference so the caller can fill it in place.
+    typename FeatureSet::DiffType& reset() noexcept {
+        accumulatorSmall.computed.fill(false);
+        accumulatorBig.computed.fill(false);
+        return diff;
+    }
+};
+class AccumulatorStack {  // Fixed-capacity stack of accumulator states, one array per feature set (PSQ and threats).
+   public:
+    static constexpr std::size_t MaxSize = MAX_PLY + 1;  // capacity of both state arrays below
+    template<typename T>
+    [[nodiscard]] const AccumulatorState<T>& latest() const noexcept;  // T is a feature set; NOTE(review): presumably the top-of-stack state - confirm in the .cpp
+    void                                  reset() noexcept;
+    std::pair<DirtyPiece&, DirtyThreats&> push() noexcept;  // NOTE(review): presumably returns the new slot's diffs for the caller to fill - confirm in the .cpp
+    void                                  pop() noexcept;
+    template<IndexType Dimensions>
+    void evaluate(const Position&                       pos,
+                  const FeatureTransformer<Dimensions>& featureTransformer,
+                  AccumulatorCaches::Cache<Dimensions>& cache) noexcept;
+   private:
+    template<typename T>
+    [[nodiscard]] AccumulatorState<T>& mut_latest() noexcept;  // mutable counterpart of latest()
+    template<typename T>
+    [[nodiscard]] const std::array<AccumulatorState<T>, MaxSize>& accumulators() const noexcept;  // NOTE(review): presumably selects psq_accumulators or threat_accumulators by T - confirm
+    template<typename T>
+    [[nodiscard]] std::array<AccumulatorState<T>, MaxSize>& mut_accumulators() noexcept;  // mutable counterpart of accumulators()
+    template<typename FeatureSet, IndexType Dimensions>
+    void evaluate_side(Color                                 perspective,
+                       const Position&                       pos,
+                       const FeatureTransformer<Dimensions>& featureTransformer,
+                       AccumulatorCaches::Cache<Dimensions>& cache) noexcept;
+    template<typename FeatureSet, IndexType Dimensions>
+    [[nodiscard]] std::size_t find_last_usable_accumulator(Color perspective) const noexcept;  // returns a stack index; exact criterion is defined in the .cpp
+    template<typename FeatureSet, IndexType Dimensions>
+    void forward_update_incremental(Color                                 perspective,
+                                    const Position&                       pos,
+                                    const FeatureTransformer<Dimensions>& featureTransformer,
+                                    const std::size_t                     begin) noexcept;  // begin: stack index the forward pass starts from
+    template<typename FeatureSet, IndexType Dimensions>
+    void backward_update_incremental(Color                                 perspective,
+                                     const Position&                       pos,
+                                     const FeatureTransformer<Dimensions>& featureTransformer,
+                                     const std::size_t                     end) noexcept;  // end: stack index bounding the backward pass
+    std::array<AccumulatorState<PSQFeatureSet>, MaxSize>    psq_accumulators;     // piece-square states, one per slot
+    std::array<AccumulatorState<ThreatFeatureSet>, MaxSize> threat_accumulators;  // threat states, one per slot
+    std::size_t                                             size = 1;  // NOTE(review): presumably the number of slots in use; starts at 1 - confirm
+};
+}  // namespace Stockfish::Eval::NNUE
+#endif  // NNUE_ACCUMULATOR_H_INCLUDED

src/nnue/nnue_architecture.h ADDED Viewed

	@@ -0,0 +1,165 @@

+/*
+  Stockfish, a UCI chess playing engine derived from Glaurung 2.1
+  Copyright (C) 2004-2026 The Stockfish developers (see AUTHORS file)
+  Stockfish is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+  Stockfish is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+  You should have received a copy of the GNU General Public License
+  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+// Input features and network structure used in NNUE evaluation function
+#ifndef NNUE_ARCHITECTURE_H_INCLUDED
+#define NNUE_ARCHITECTURE_H_INCLUDED
+#include <cstdint>
+#include <cstring>
+#include <iosfwd>
+#include "features/half_ka_v2_hm.h"
+#include "features/full_threats.h"
+#include "layers/affine_transform.h"
+#include "layers/affine_transform_sparse_input.h"
+#include "layers/clipped_relu.h"
+#include "layers/sqr_clipped_relu.h"
+#include "nnue_common.h"
+namespace Stockfish::Eval::NNUE {
+// Input feature sets used by the evaluation function
+using ThreatFeatureSet = Features::FullThreats;
+using PSQFeatureSet    = Features::HalfKAv2_hm;
+// Number of input feature dimensions after conversion
+constexpr IndexType TransformedFeatureDimensionsBig = 256;  // accumulator width of the big network
+constexpr int       L2Big                           = 31;   // NOTE(review): presumably the L2 argument of NetworkArchitecture for the big net - confirm
+constexpr int       L3Big                           = 32;   // NOTE(review): presumably the L3 argument of NetworkArchitecture for the big net - confirm
+constexpr IndexType TransformedFeatureDimensionsSmall = 128;  // accumulator width of the small network
+constexpr int       L2Small                           = 15;   // NOTE(review): presumably the L2 argument for the small net - confirm
+constexpr int       L3Small                           = 32;   // NOTE(review): presumably the L3 argument for the small net - confirm
+constexpr IndexType PSQTBuckets = 8;  // number of PSQT accumulator values kept per perspective
+constexpr IndexType LayerStacks = 8;  // NOTE(review): not used in the visible part of this header - confirm meaning at the use site
+// If vector instructions are enabled, we update and refresh the
+// accumulator tile by tile such that each tile fits in the CPU's
+// vector registers.
+static_assert(PSQTBuckets % 8 == 0,
+              "Per feature PSQT values cannot be processed at granularity lower than 8 at a time.");
+// Network layers that follow the feature transformer: a sparse-input affine
+// layer whose output feeds both a squared and a plain clipped ReLU, followed
+// by two further affine layers. L1 is the transformed feature width, L2 and
+// L3 are the output widths of the first two affine layers.
+template<IndexType L1, int L2, int L3>
+struct NetworkArchitecture {
+    // Layer widths, taken verbatim from the template arguments
+    static constexpr IndexType TransformedFeatureDimensions = L1;
+    static constexpr int       FC_0_OUTPUTS                 = L2;
+    static constexpr int       FC_1_OUTPUTS                 = L3;
+    // fc_0 has one output more than FC_0_OUTPUTS; propagate() rescales that
+    // extra output and adds it directly to the final result.
+    Layers::AffineTransformSparseInput<TransformedFeatureDimensions, FC_0_OUTPUTS + 1> fc_0;
+    Layers::SqrClippedReLU<FC_0_OUTPUTS + 1>                                           ac_sqr_0;
+    Layers::ClippedReLU<FC_0_OUTPUTS + 1>                                              ac_0;
+    Layers::AffineTransform<FC_0_OUTPUTS * 2, FC_1_OUTPUTS>                            fc_1;
+    Layers::ClippedReLU<FC_1_OUTPUTS>                                                  ac_1;
+    Layers::AffineTransform<FC_1_OUTPUTS, 1>                                           fc_2;
+    // Hash value embedded in the evaluation file. The input slice hash is
+    // threaded through the layers in order; ac_sqr_0 does not take part.
+    static constexpr std::uint32_t get_hash_value() {
+        const std::uint32_t inputSlice = 0xEC42E90Du ^ (TransformedFeatureDimensions * 2);
+        const std::uint32_t afterFc0   = decltype(fc_0)::get_hash_value(inputSlice);
+        const std::uint32_t afterAc0   = decltype(ac_0)::get_hash_value(afterFc0);
+        const std::uint32_t afterFc1   = decltype(fc_1)::get_hash_value(afterAc0);
+        const std::uint32_t afterAc1   = decltype(ac_1)::get_hash_value(afterFc1);
+        return decltype(fc_2)::get_hash_value(afterAc1);
+    }
+    // Read network parameters, layer by layer; stops at the first layer that fails
+    bool read_parameters(std::istream& stream) {
+        if (!fc_0.read_parameters(stream))
+            return false;
+        if (!ac_0.read_parameters(stream))
+            return false;
+        if (!fc_1.read_parameters(stream))
+            return false;
+        if (!ac_1.read_parameters(stream))
+            return false;
+        if (!fc_2.read_parameters(stream))
+            return false;
+        return true;
+    }
+    // Write network parameters in the same layer order; stops at the first failure
+    bool write_parameters(std::ostream& stream) const {
+        if (!fc_0.write_parameters(stream))
+            return false;
+        if (!ac_0.write_parameters(stream))
+            return false;
+        if (!fc_1.write_parameters(stream))
+            return false;
+        if (!ac_1.write_parameters(stream))
+            return false;
+        if (!fc_2.write_parameters(stream))
+            return false;
+        return true;
+    }
+    // Forward pass over the transformed features. Returns the single output of
+    // fc_2 plus the rescaled extra output of fc_0.
+    std::int32_t propagate(const TransformedFeatureType* transformedFeatures) const {
+        // Scratch space for every intermediate result, zero-filled on construction
+        struct alignas(CacheLineSize) Buffer {
+            alignas(CacheLineSize) typename decltype(fc_0)::OutputBuffer fc_0_out;
+            alignas(CacheLineSize) typename decltype(ac_sqr_0)::OutputType
+              ac_sqr_0_out[ceil_to_multiple<IndexType>(FC_0_OUTPUTS * 2, 32)];
+            alignas(CacheLineSize) typename decltype(ac_0)::OutputBuffer ac_0_out;
+            alignas(CacheLineSize) typename decltype(fc_1)::OutputBuffer fc_1_out;
+            alignas(CacheLineSize) typename decltype(ac_1)::OutputBuffer ac_1_out;
+            alignas(CacheLineSize) typename decltype(fc_2)::OutputBuffer fc_2_out;
+            Buffer() { std::memset(this, 0, sizeof(*this)); }
+        };
+#if defined(__clang__) && (__APPLE__)
+        // workaround for a bug reported with xcode 12
+        static thread_local auto tlsBuffer = std::make_unique<Buffer>();
+        // Access TLS only once, cache result.
+        Buffer& scratch = *tlsBuffer;
+#else
+        alignas(CacheLineSize) static thread_local Buffer scratch;
+#endif
+        using PlainActivation = typename decltype(ac_0)::OutputType;
+        fc_0.propagate(transformedFeatures, scratch.fc_0_out);
+        ac_sqr_0.propagate(scratch.fc_0_out, scratch.ac_sqr_0_out);
+        ac_0.propagate(scratch.fc_0_out, scratch.ac_0_out);
+        // Place the plain activations right behind the squared ones, so that
+        // fc_1 sees both as one contiguous input
+        std::memcpy(scratch.ac_sqr_0_out + FC_0_OUTPUTS, scratch.ac_0_out,
+                    FC_0_OUTPUTS * sizeof(PlainActivation));
+        fc_1.propagate(scratch.ac_sqr_0_out, scratch.fc_1_out);
+        ac_1.propagate(scratch.fc_1_out, scratch.ac_1_out);
+        fc_2.propagate(scratch.ac_1_out, scratch.fc_2_out);
+        // scratch.fc_0_out[FC_0_OUTPUTS] is such that 1.0 is equal to 127*(1<<WeightScaleBits)
+        // in quantized form, but we want 1.0 to be equal to 600*OutputScale
+        const std::int32_t forwarded = (scratch.fc_0_out[FC_0_OUTPUTS]) * (600 * OutputScale)
+                                     / (127 * (1 << WeightScaleBits));
+        return scratch.fc_2_out[0] + forwarded;
+    }
+    // Hash over the contents of all layers, finished off with the structural hash
+    std::size_t get_content_hash() const {
+        std::size_t digest = 0;
+        hash_combine(digest, fc_0.get_content_hash());
+        hash_combine(digest, ac_sqr_0.get_content_hash());
+        hash_combine(digest, ac_0.get_content_hash());
+        hash_combine(digest, fc_1.get_content_hash());
+        hash_combine(digest, ac_1.get_content_hash());
+        hash_combine(digest, fc_2.get_content_hash());
+        hash_combine(digest, get_hash_value());
+        return digest;
+    }
+};
+}  // namespace Stockfish::Eval::NNUE
+// std::hash support: a network architecture hashes to its content hash
+template<Stockfish::Eval::NNUE::IndexType L1, int L2, int L3>
+struct std::hash<Stockfish::Eval::NNUE::NetworkArchitecture<L1, L2, L3>> {
+    std::size_t operator()(
+      const Stockfish::Eval::NNUE::NetworkArchitecture<L1, L2, L3>& network) const noexcept {
+        const std::size_t contentHash = network.get_content_hash();
+        return contentHash;
+    }
+};
+#endif  // #ifndef NNUE_ARCHITECTURE_H_INCLUDED

src/nnue/nnue_common.h ADDED Viewed

	@@ -0,0 +1,298 @@

+/*
+  Stockfish, a UCI chess playing engine derived from Glaurung 2.1
+  Copyright (C) 2004-2026 The Stockfish developers (see AUTHORS file)
+  Stockfish is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+  Stockfish is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+  You should have received a copy of the GNU General Public License
+  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+// Constants used in NNUE evaluation function
+#ifndef NNUE_COMMON_H_INCLUDED
+#define NNUE_COMMON_H_INCLUDED
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <cstring>
+#include <iostream>
+#include <type_traits>
+#include "../misc.h"
+#if defined(USE_AVX2)
+    #include <immintrin.h>
+#elif defined(USE_SSE41)
+    #include <smmintrin.h>
+#elif defined(USE_SSSE3)
+    #include <tmmintrin.h>
+#elif defined(USE_SSE2)
+    #include <emmintrin.h>
+#elif defined(USE_NEON)
+    #include <arm_neon.h>
+#endif
+namespace Stockfish::Eval::NNUE {
+using BiasType         = std::int16_t;   // feature transformer biases
+using ThreatWeightType = std::int8_t;    // feature transformer threat weights
+using WeightType       = std::int16_t;   // feature transformer weights
+using PSQTWeightType   = std::int32_t;   // PSQT weights
+using IndexType        = std::uint32_t;  // dimensions and indices
+// Version of the evaluation file
+constexpr std::uint32_t Version = 0x7AF32F20u;
+// Constants used in evaluation value calculation
+constexpr int OutputScale     = 16;
+constexpr int WeightScaleBits = 6;
+// Size of cache line (in bytes)
+constexpr std::size_t CacheLineSize = 64;
+constexpr const char        Leb128MagicString[]   = "COMPRESSED_LEB128";  // precedes every LEB128 section
+constexpr const std::size_t Leb128MagicStringSize = sizeof(Leb128MagicString) - 1;  // without the NUL
+// SIMD width (in bytes); SimdWidth stays undeclared when none of the macros below is defined
+#if defined(USE_AVX2)
+constexpr std::size_t SimdWidth = 32;
+#elif defined(USE_SSE2)
+constexpr std::size_t SimdWidth = 16;
+#elif defined(USE_NEON)
+constexpr std::size_t SimdWidth = 16;
+#endif
+constexpr std::size_t MaxSimdWidth = 32;
+// Type of input feature after conversion
+using TransformedFeatureType = std::uint8_t;
+// Returns the smallest multiple of base that is not less than n
+// (for non-negative n and positive base).
+template<typename IntType>
+constexpr IntType ceil_to_multiple(IntType n, IntType base) {
+    // Number of base-sized blocks needed to hold n, rounded up
+    const auto blocks = (n + base - 1) / base;
+    return blocks * base;
+}
+// Utility to read an integer (signed or unsigned, any size)
+// from a stream in little-endian order. We swap the byte order after the read if
+// necessary to return a result with the byte ordering of the compiling machine.
+// The function does not report errors itself: callers must inspect the stream
+// state. If the stream could not supply any byte, zero is returned.
+template<typename IntType>
+inline IntType read_little_endian(std::istream& stream) {
+    // Value-initialised so that a failed or short read never hands an
+    // indeterminate value back to the caller.
+    IntType result{};
+    if (IsLittleEndian)
+        stream.read(reinterpret_cast<char*>(&result), sizeof(IntType));
+    else
+    {
+        // Read the raw bytes, then assemble them starting from the most
+        // significant one, which is stored last in the stream.
+        std::uint8_t                  u[sizeof(IntType)] = {};
+        std::make_unsigned_t<IntType> v                  = 0;
+        stream.read(reinterpret_cast<char*>(u), sizeof(IntType));
+        for (std::size_t i = 0; i < sizeof(IntType); ++i)
+            v = (v << 8) | u[sizeof(IntType) - i - 1];
+        std::memcpy(&result, &v, sizeof(IntType));
+    }
+    return result;
+}
+// Writes one integer (signed or unsigned, any size) to the stream in
+// little-endian order, whatever the byte ordering of the compiling machine.
+template<typename IntType>
+inline void write_little_endian(std::ostream& stream, IntType value) {
+    // Fast path: the in-memory representation already is little-endian
+    if (IsLittleEndian)
+    {
+        stream.write(reinterpret_cast<const char*>(&value), sizeof(IntType));
+        return;
+    }
+    constexpr std::size_t         Size = sizeof(IntType);
+    std::uint8_t                  bytes[Size];
+    std::make_unsigned_t<IntType> rest = value;
+    std::size_t                   pos  = 0;
+    // Peel off the low byte and shift for every byte but the last one.
+    // if constexpr to silence the warning about shift by 8
+    if constexpr (Size > 1)
+    {
+        while (pos + 1 < Size)
+        {
+            bytes[pos] = std::uint8_t(rest);
+            rest >>= 8;
+            ++pos;
+        }
+    }
+    bytes[pos] = std::uint8_t(rest);
+    stream.write(reinterpret_cast<char*>(bytes), Size);
+}
+// Bulk variant: reads count little-endian integers from the stream into out.
+template<typename IntType>
+inline void read_little_endian(std::istream& stream, IntType* out, std::size_t count) {
+    if (!IsLittleEndian)
+    {
+        // Byte order differs from the stream: convert the values one at a time
+        for (IntType* dst = out; dst != out + count; ++dst)
+            *dst = read_little_endian<IntType>(stream);
+        return;
+    }
+    // Same byte order: the whole array can be filled with a single read
+    stream.read(reinterpret_cast<char*>(out), sizeof(IntType) * count);
+}
+// Bulk variant: writes count integers taken from values to the stream in
+// little-endian order.
+template<typename IntType>
+inline void write_little_endian(std::ostream& stream, const IntType* values, std::size_t count) {
+    if (!IsLittleEndian)
+    {
+        // Byte order differs from the stream: convert the values one at a time
+        for (const IntType* src = values; src != values + count; ++src)
+            write_little_endian<IntType>(stream, *src);
+        return;
+    }
+    // Same byte order: the whole array can be emitted with a single write
+    stream.write(reinterpret_cast<const char*>(values), sizeof(IntType) * count);
+}
+// Read Count signed integers from the stream, putting them in the array out.
+// The stream is assumed to be compressed using the signed LEB128 format.
+// See https://en.wikipedia.org/wiki/LEB128 for a description of the compression scheme.
+//
+// bytes_left is the number of payload bytes not consumed yet, buf is the staging
+// buffer the payload is read through and buf_pos the next unread position in it;
+// all three are updated so that consecutive calls can share one payload.
+// On malformed input (payload exhausted before Count values were decoded, or the
+// stream unable to deliver the announced bytes) failbit is set on the stream and
+// the function returns early, leaving the remaining elements of out untouched.
+template<typename BufType, typename IntType, std::size_t Count>
+inline void read_leb_128_detail(std::istream&               stream,
+                                std::array<IntType, Count>& out,
+                                std::uint32_t&              bytes_left,
+                                BufType&                    buf,
+                                std::uint32_t&              buf_pos) {
+    static_assert(std::is_signed_v<IntType>, "Not implemented for unsigned types");
+    static_assert(sizeof(IntType) <= 4, "Not implemented for types larger than 32 bit");
+    // Accumulate in an unsigned word: shifting bits into the sign bit of a
+    // signed integer is undefined behaviour before C++20.
+    std::uint32_t result = 0;
+    std::size_t   shift = 0, i = 0;
+    while (i < Count)
+    {
+        // The announced payload ended although values are still missing. Bail
+        // out instead of decoding stale buffer contents and wrapping bytes_left.
+        if (bytes_left == 0)
+        {
+            stream.setstate(std::ios::failbit);
+            return;
+        }
+        if (buf_pos == buf.size())
+        {
+            stream.read(reinterpret_cast<char*>(buf.data()),
+                        std::min(std::size_t(bytes_left), buf.size()));
+            // A short read leaves stale bytes in buf; read() has set failbit already
+            if (stream.fail())
+                return;
+            buf_pos = 0;
+        }
+        const std::uint8_t byte = buf[buf_pos++];
+        --bytes_left;
+        result |= std::uint32_t(byte & 0x7f) << (shift % 32);
+        shift += 7;
+        // A cleared high bit marks the last byte of a value
+        if ((byte & 0x80) == 0)
+        {
+            // Sign-extend if bit 6 of the last byte is set and the groups read
+            // so far do not cover all 32 bits yet
+            if (shift < 32 && (byte & 0x40) != 0)
+                result |= ~((std::uint32_t(1) << shift) - 1);
+            out[i++] = static_cast<IntType>(result);
+            result   = 0;
+            shift    = 0;
+        }
+    }
+}
+// Reads one LEB128-compressed section (magic string, payload size, payload)
+// and decodes it into the given arrays, in order. Sets failbit on the stream
+// if the magic string is missing or the payload size does not match.
+template<typename... Arrays>
+inline void read_leb_128(std::istream& stream, Arrays&... outs) {
+    // Every compressed section starts with our LEB128 magic string
+    char header[Leb128MagicStringSize];
+    stream.read(header, Leb128MagicStringSize);
+    const bool headerOk =
+      !stream.fail() && strncmp(Leb128MagicString, header, Leb128MagicStringSize) == 0;
+    if (!headerOk)
+    {
+        stream.setstate(std::ios::failbit);
+        return;
+    }
+    // Payload size in bytes, then the payload itself, which is consumed through
+    // one staging buffer shared by all output arrays
+    std::uint32_t                  remaining = read_little_endian<std::uint32_t>(stream);
+    std::array<std::uint8_t, 8192> staging;
+    std::uint32_t                  cursor = std::uint32_t(staging.size());
+    (read_leb_128_detail(stream, outs, remaining, staging, cursor), ...);
+    // Leftover payload bytes mean the section did not match the arrays
+    if (remaining != 0)
+        stream.setstate(std::ios::failbit);
+}
+// Writes the Count signed integers of values to the stream as one
+// LEB128-compressed section: magic string, payload size in bytes, payload.
+// See https://en.wikipedia.org/wiki/LEB128 for a description of the compression scheme.
+template<typename IntType, std::size_t Count>
+inline void write_leb_128(std::ostream& stream, const std::array<IntType, Count>& values) {
+    static_assert(std::is_signed_v<IntType>, "Not implemented for unsigned types");
+    // Write our LEB128 magic string
+    stream.write(Leb128MagicString, Leb128MagicStringSize);
+    // Splits one value into 7-bit groups, least significant first, and hands
+    // every encoded byte to sink. Encoding stops once the rest of the value is
+    // pure sign extension of the group just produced.
+    const auto encode = [](IntType value, auto&& sink) {
+        for (;;)
+        {
+            const std::uint8_t group = value & 0x7f;
+            value >>= 7;
+            const bool last = (group & 0x40) == 0 ? value == 0 : value == -1;
+            if (last)
+            {
+                sink(group);
+                return;
+            }
+            sink(std::uint8_t(group | 0x80));
+        }
+    };
+    // First pass: only count the bytes, the size precedes the payload
+    std::uint32_t payloadSize = 0;
+    for (const IntType v : values)
+        encode(v, [&payloadSize](std::uint8_t) { ++payloadSize; });
+    write_little_endian(stream, payloadSize);
+    // Second pass: emit the bytes through a small staging buffer
+    constexpr std::uint32_t Capacity = 4096;
+    std::uint8_t            staged[Capacity];
+    std::uint32_t           used = 0;
+    const auto drain = [&]() {
+        if (used == 0)
+            return;
+        stream.write(reinterpret_cast<char*>(staged), used);
+        used = 0;
+    };
+    for (const IntType v : values)
+        encode(v, [&](std::uint8_t b) {
+            staged[used++] = b;
+            if (used == Capacity)
+                drain();
+        });
+    drain();
+}
+}  // namespace Stockfish::Eval::NNUE
+#endif  // #ifndef NNUE_COMMON_H_INCLUDED

src/nnue/nnue_feature_transformer.h ADDED Viewed

	@@ -0,0 +1,456 @@

+/*
+  Stockfish, a UCI chess playing engine derived from Glaurung 2.1
+  Copyright (C) 2004-2026 The Stockfish developers (see AUTHORS file)
+  Stockfish is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+  Stockfish is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+  You should have received a copy of the GNU General Public License
+  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+// A class that converts the input features of the NNUE evaluation function
+#ifndef NNUE_FEATURE_TRANSFORMER_H_INCLUDED
+#define NNUE_FEATURE_TRANSFORMER_H_INCLUDED
+#include <algorithm>
+#include <cstdint>
+#include <cstring>
+#include <iosfwd>
+#include <iterator>
+#include "../position.h"
+#include "../types.h"
+#include "nnue_accumulator.h"
+#include "nnue_architecture.h"
+#include "nnue_common.h"
+#include "simd.h"
+namespace Stockfish::Eval::NNUE {
+// Builds the inverse of a permutation: if order maps position i to element
+// order[i], the result maps order[i] back to i.
+template<std::size_t Len>
+constexpr std::array<std::size_t, Len>
+invert_permutation(const std::array<std::size_t, Len>& order) {
+    std::array<std::size_t, Len> result{};
+    std::size_t                  position = 0;
+    for (const std::size_t target : order)
+        result[target] = position++;
+    return result;
+}
+// Treats data as a byte region that is split into groups of OrderSize blocks,
+// BlockSize bytes each, and rearranges the blocks of every group in place:
+// block j of a group receives the former block order[j] of that group.
+template<std::size_t BlockSize, typename T, std::size_t N, std::size_t OrderSize>
+void permute(std::array<T, N>& data, const std::array<std::size_t, OrderSize>& order) {
+    constexpr std::size_t TotalSize = N * sizeof(T);
+    constexpr std::size_t GroupSize = BlockSize * OrderSize;
+    static_assert(TotalSize % GroupSize == 0,
+                  "ChunkSize * OrderSize must perfectly divide TotalSize");
+    std::byte* const                 region = reinterpret_cast<std::byte*>(data.data());
+    std::array<std::byte, GroupSize> scratch{};
+    std::size_t                      groupStart = 0;
+    while (groupStart < TotalSize)
+    {
+        std::byte* const group = &region[groupStart];
+        // Collect the blocks in their new order, then write the group back
+        std::size_t slot = 0;
+        for (const std::size_t source : order)
+        {
+            std::copy_n(&group[source * BlockSize], BlockSize, &scratch[slot * BlockSize]);
+            ++slot;
+        }
+        std::copy_n(scratch.data(), GroupSize, group);
+        groupStart += GroupSize;
+    }
+}
+// Input feature converter
+template<IndexType TransformedFeatureDimensions>
+class FeatureTransformer {
+    static constexpr bool UseThreats =
+      (TransformedFeatureDimensions == TransformedFeatureDimensionsBig);  // threat features: big width only
+    // Number of output dimensions for one side
+    static constexpr IndexType HalfDimensions = TransformedFeatureDimensions;
+   public:
+    // Output type
+    using OutputType = TransformedFeatureType;
+    // Number of input/output dimensions
+    static constexpr IndexType InputDimensions       = PSQFeatureSet::Dimensions;
+    static constexpr IndexType ThreatInputDimensions = ThreatFeatureSet::Dimensions;
+    static constexpr IndexType TotalInputDimensions =
+      InputDimensions + (UseThreats ? ThreatInputDimensions : 0);  // threat inputs count only if used
+    static constexpr IndexType OutputDimensions = HalfDimensions;
+    // Size of forward propagation buffer
+    static constexpr std::size_t BufferSize = OutputDimensions * sizeof(OutputType);
+    // Store the order by which 128-bit blocks of a 1024-bit data must
+    // be permuted so that calling packus on adjacent vectors of 16-bit
+    // integers loaded from the data results in the pre-permutation order
+    static constexpr auto PackusEpi16Order = []() -> std::array<std::size_t, 8> {
+#if defined(USE_AVX512)
+        // _mm512_packus_epi16 after permutation:
+        // |   0   |   2   |   4   |   6   | // Vector 0
+        // |   1   |   3   |   5   |   7   | // Vector 1
+        // | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | // Packed Result
+        return {0, 2, 4, 6, 1, 3, 5, 7};
+#elif defined(USE_AVX2)
+        // _mm256_packus_epi16 after permutation:
+        // |   0   |   2   |  |   4   |   6   | // Vector 0, 2
+        // |   1   |   3   |  |   5   |   7   | // Vector 1, 3
+        // | 0 | 1 | 2 | 3 |  | 4 | 5 | 6 | 7 | // Packed Result
+        return {0, 2, 1, 3, 4, 6, 5, 7};
+#else
+        return {0, 1, 2, 3, 4, 5, 6, 7};  // no reordering for any other target
+#endif
+    }();
+    static constexpr auto InversePackusEpi16Order = invert_permutation(PackusEpi16Order);  // for unpermute_weights()
+    // Folds the component hashes into one value: before each component is
+    // XOR-ed in, the running value is rotated left by one bit.
+    static constexpr std::uint32_t combine_hash(std::initializer_list<std::uint32_t> hashes) {
+        std::uint32_t combined = 0;
+        for (const std::uint32_t* it = hashes.begin(); it != hashes.end(); ++it)
+        {
+            const std::uint32_t rotated = (combined << 1) | (combined >> 31);
+            combined                    = rotated ^ *it;
+        }
+        return combined;
+    }
+    // Hash value embedded in the evaluation file: the hash of the feature
+    // set(s) in use, mixed with twice the output width
+    static constexpr std::uint32_t get_hash_value() {
+        const std::uint32_t featureHash =
+          UseThreats ? combine_hash({ThreatFeatureSet::HashValue, PSQFeatureSet::HashValue})
+                     : PSQFeatureSet::HashValue;
+        return featureHash ^ (OutputDimensions * 2);
+    }
+    // Brings the parameters into the block order given by PackusEpi16Order.
+    // Threat weights use blocks of half the size (8 instead of 16 bytes).
+    void permute_weights() {
+        constexpr auto& order = PackusEpi16Order;
+        if constexpr (UseThreats)
+            permute<8>(threatWeights, order);
+        permute<16>(weights, order);
+        permute<16>(biases, order);
+    }
+    // Undoes permute_weights() by applying the inverse block order
+    void unpermute_weights() {
+        constexpr auto& order = InversePackusEpi16Order;
+        if constexpr (UseThreats)
+            permute<8>(threatWeights, order);
+        permute<16>(weights, order);
+        permute<16>(biases, order);
+    }
+    // Read network parameters. Returns false if the stream is in a failed state
+    // afterwards; on success the parameters are permuted for the SIMD code.
+    bool read_parameters(std::istream& stream) {
+        // Remembered so that the threat layout can be retried as plain layout
+        const std::streampos start = stream.tellg();
+        if constexpr (!UseThreats)
+        {
+            read_leb_128(stream, biases);
+            read_leb_128(stream, weights);
+            read_leb_128(stream, psqtWeights);
+        }
+        else
+        {
+            // Primary path: Full_Threats + HalfKAv2_hm^ export layout
+            read_leb_128(stream, biases);
+            read_little_endian<ThreatWeightType>(stream, threatWeights.data(),
+                                                 ThreatInputDimensions * HalfDimensions);
+            read_leb_128(stream, weights);
+            read_leb_128(stream, threatPsqtWeights);
+            read_leb_128(stream, psqtWeights);
+            if (stream.fail())
+            {
+                // Fallback path: HalfKAv2_hm^ only export layout (no threat tensors).
+                // Rewind, zero all threat parameters and read the plain layout.
+                stream.clear();
+                stream.seekg(start);
+                threatWeights.fill(0);
+                threatPsqtWeights.fill(0);
+                read_leb_128(stream, biases);
+                read_leb_128(stream, weights);
+                read_leb_128(stream, psqtWeights);
+            }
+        }
+        const bool loaded = !stream.fail();
+        if (loaded)
+            permute_weights();
+        return loaded;
+    }
+    // Write network parameters. The sections are emitted in exactly the order
+    // and framing that read_parameters() consumes, so that a written file can
+    // be read back. Returns false if the stream is in a failed state afterwards.
+    bool write_parameters(std::ostream& stream) const {
+        // Work on an unpermuted heap copy; *this keeps its permuted parameters
+        std::unique_ptr<FeatureTransformer> copy = std::make_unique<FeatureTransformer>(*this);
+        copy->unpermute_weights();
+        write_leb_128<BiasType>(stream, copy->biases);
+        if constexpr (UseThreats)
+        {
+            write_little_endian<ThreatWeightType>(stream, copy->threatWeights.data(),
+                                                  ThreatInputDimensions * HalfDimensions);
+            write_leb_128<WeightType>(stream, copy->weights);
+            // read_parameters() reads the threat PSQT weights and the PSQ PSQT
+            // weights as two separate LEB128 sections, each with its own magic
+            // string and size. Writing them as one combined section would make
+            // the reader fail on the size check of the first section.
+            write_leb_128<PSQTWeightType>(stream, copy->threatPsqtWeights);
+            write_leb_128<PSQTWeightType>(stream, copy->psqtWeights);
+        }
+        else
+        {
+            write_leb_128<WeightType>(stream, copy->weights);
+            write_leb_128<PSQTWeightType>(stream, copy->psqtWeights);
+        }
+        return !stream.fail();
+    }
+    // Hash over the raw parameter data (threat parameters only when they are
+    // in use), finished off with the structural hash value
+    std::size_t get_content_hash() const {
+        std::size_t digest = 0;
+        hash_combine(digest, get_raw_data_hash(biases));
+        hash_combine(digest, get_raw_data_hash(weights));
+        hash_combine(digest, get_raw_data_hash(psqtWeights));
+        if constexpr (UseThreats)
+        {
+            hash_combine(digest, get_raw_data_hash(threatWeights));
+            hash_combine(digest, get_raw_data_hash(threatPsqtWeights));
+        }
+        hash_combine(digest, get_hash_value());
+        return digest;
+    }
+    // Convert input features: evaluates the accumulators for pos, writes the clipped pairwise products of both perspectives to output and returns the halved PSQT difference of the given bucket
+    std::int32_t transform(const Position&                           pos,
+                           AccumulatorStack&                         accumulatorStack,
+                           AccumulatorCaches::Cache<HalfDimensions>& cache,
+                           OutputType*                               output,
+                           int                                       bucket) const {
+        using namespace SIMD;
+        accumulatorStack.evaluate(pos, *this, cache);
+        const auto& accumulatorState       = accumulatorStack.latest<PSQFeatureSet>();
+        const auto& threatAccumulatorState = accumulatorStack.latest<ThreatFeatureSet>();
+        const Color perspectives[2]  = {pos.side_to_move(), ~pos.side_to_move()};  // side to move first
+        const auto& psqtAccumulation = (accumulatorState.acc<HalfDimensions>()).psqtAccumulation;
+        auto        psqt =
+          (psqtAccumulation[perspectives[0]][bucket] - psqtAccumulation[perspectives[1]][bucket]);
+        if constexpr (UseThreats)
+        {
+            const auto& threatPsqtAccumulation =
+              (threatAccumulatorState.acc<HalfDimensions>()).psqtAccumulation;
+            psqt = (psqt + threatPsqtAccumulation[perspectives[0]][bucket]
+                    - threatPsqtAccumulation[perspectives[1]][bucket])
+                 / 2;
+        }
+        else
+            psqt /= 2;
+        const auto& accumulation = (accumulatorState.acc<HalfDimensions>()).accumulation;
+        const auto& threatAccumulation =
+          (threatAccumulatorState.acc<HalfDimensions>()).accumulation;
+        for (IndexType p = 0; p < 2; ++p)
+        {
+            const IndexType offset = (HalfDimensions / 2) * p;  // each perspective fills one half of output
+#if defined(VECTOR)
+            constexpr IndexType OutputChunkSize = MaxChunkSize;
+            static_assert((HalfDimensions / 2) % OutputChunkSize == 0);
+            constexpr IndexType NumOutputChunks = HalfDimensions / 2 / OutputChunkSize;
+            const vec_t Zero = vec_zero();
+            const vec_t One  = vec_set_16(255);  // upper clip bound; NOTE(review): the text below says 254 - confirm
+            const vec_t* in0 = reinterpret_cast<const vec_t*>(&(accumulation[perspectives[p]][0]));
+            const vec_t* in1 =
+              reinterpret_cast<const vec_t*>(&(accumulation[perspectives[p]][HalfDimensions / 2]));
+            vec_t* out = reinterpret_cast<vec_t*>(output + offset);
+            // Per the NNUE architecture, here we want to multiply pairs of
+            // clipped elements and divide the product by 128. To do this,
+            // we can naively perform min/max operation to clip each of the
+            // four int16 vectors, mullo pairs together, then pack them into
+            // one int8 vector. However, there exists a faster way.
+            // The idea here is to use the implicit clipping from packus to
+            // save us two vec_max_16 instructions. This clipping works due
+            // to the fact that any int16 integer below zero will be zeroed
+            // on packus.
+            // Consider the case where the second element is negative.
+            // If we do standard clipping, that element will be zero, which
+            // means our pairwise product is zero. If we perform packus and
+            // remove the lower-side clip for the second element, then our
+            // product before packus will be negative, and is zeroed on pack.
+            // The two operation produce equivalent results, but the second
+            // one (using packus) saves one max operation per pair.
+            // But here we run into a problem: mullo does not preserve the
+            // sign of the multiplication. We can get around this by doing
+            // mulhi, which keeps the sign. But that requires an additional
+            // tweak.
+            // mulhi cuts off the last 16 bits of the resulting product,
+            // which is the same as performing a rightward shift of 16 bits.
+            // We can use this to our advantage. Recall that we want to
+            // divide the final product by 128, which is equivalent to a
+            // 7-bit right shift. Intuitively, if we shift the clipped
+            // value left by 9, and perform mulhi, which shifts the product
+            // right by 16 bits, then we will net a right shift of 7 bits.
+            // However, this won't work as intended. Since we clip the
+            // values to have a maximum value of 127, shifting it by 9 bits
+            // might occupy the signed bit, resulting in some positive
+            // values being interpreted as negative after the shift.
+            // There is a way, however, to get around this limitation. When
+            // loading the network, scale accumulator weights and biases by
+            // 2. To get the same pairwise multiplication result as before,
+            // we need to divide the product by 128 * 2 * 2 = 512, which
+            // amounts to a right shift of 9 bits. So now we only have to
+            // shift left by 7 bits, perform mulhi (shifts right by 16 bits)
+            // and net a 9 bit right shift. Since we scaled everything by
+            // two, the values are clipped at 127 * 2 = 254, which occupies
+            // 8 bits. Shifting it by 7 bits left will no longer occupy the
+            // signed bit, so we are safe.
+            // Note that on NEON processors, we shift left by 6 instead
+            // because the instruction "vqdmulhq_s16" also doubles the
+            // return value after the multiplication, adding an extra shift
+            // to the left by 1, so we compensate by shifting less before
+            // the multiplication.
+            constexpr int shift =
+    #if defined(USE_SSE2)
+              7;
+    #else
+              6;
+    #endif
+            if constexpr (UseThreats)
+            {
+                const vec_t* tin0 =
+                  reinterpret_cast<const vec_t*>(&(threatAccumulation[perspectives[p]][0]));
+                const vec_t* tin1 = reinterpret_cast<const vec_t*>(
+                  &(threatAccumulation[perspectives[p]][HalfDimensions / 2]));
+                for (IndexType j = 0; j < NumOutputChunks; ++j)
+                {
+                    const vec_t acc0a = vec_add_16(in0[j * 2 + 0], tin0[j * 2 + 0]);  // PSQ + threat accumulators
+                    const vec_t acc0b = vec_add_16(in0[j * 2 + 1], tin0[j * 2 + 1]);
+                    const vec_t acc1a = vec_add_16(in1[j * 2 + 0], tin1[j * 2 + 0]);
+                    const vec_t acc1b = vec_add_16(in1[j * 2 + 1], tin1[j * 2 + 1]);
+                    const vec_t sum0a =
+                      vec_slli_16(vec_max_16(vec_min_16(acc0a, One), Zero), shift);
+                    const vec_t sum0b =
+                      vec_slli_16(vec_max_16(vec_min_16(acc0b, One), Zero), shift);
+                    const vec_t sum1a = vec_min_16(acc1a, One);
+                    const vec_t sum1b = vec_min_16(acc1b, One);
+                    const vec_t pa = vec_mulhi_16(sum0a, sum1a);
+                    const vec_t pb = vec_mulhi_16(sum0b, sum1b);
+                    out[j] = vec_packus_16(pa, pb);
+                }
+            }
+            else
+            {
+                for (IndexType j = 0; j < NumOutputChunks; ++j)
+                {
+                    const vec_t sum0a =
+                      vec_slli_16(vec_max_16(vec_min_16(in0[j * 2 + 0], One), Zero), shift);
+                    const vec_t sum0b =
+                      vec_slli_16(vec_max_16(vec_min_16(in0[j * 2 + 1], One), Zero), shift);
+                    const vec_t sum1a = vec_min_16(in1[j * 2 + 0], One);
+                    const vec_t sum1b = vec_min_16(in1[j * 2 + 1], One);
+                    const vec_t pa = vec_mulhi_16(sum0a, sum1a);
+                    const vec_t pb = vec_mulhi_16(sum0b, sum1b);
+                    out[j] = vec_packus_16(pa, pb);
+                }
+            }
+#else
+            for (IndexType j = 0; j < HalfDimensions / 2; ++j)
+            {
+                BiasType sum0 = accumulation[static_cast<int>(perspectives[p])][j + 0];
+                BiasType sum1 =
+                  accumulation[static_cast<int>(perspectives[p])][j + HalfDimensions / 2];
+                if constexpr (UseThreats)
+                {
+                    sum0 += threatAccumulation[static_cast<int>(perspectives[p])][j + 0];
+                    sum1 +=
+                      threatAccumulation[static_cast<int>(perspectives[p])][j + HalfDimensions / 2];
+                }
+                sum0 = std::clamp<BiasType>(sum0, 0, 255);
+                sum1 = std::clamp<BiasType>(sum1, 0, 255);
+                output[offset + j] = static_cast<OutputType>(unsigned(sum0 * sum1) / 512);  // at most 255 * 255 / 512 = 127
+            }
+#endif
+        }
+        return psqt;
+    }  // end of function transform()
+    alignas(CacheLineSize) std::array<BiasType, HalfDimensions> biases;
+    alignas(CacheLineSize) std::array<WeightType, HalfDimensions * InputDimensions> weights;
+    alignas(CacheLineSize)
+      std::array<ThreatWeightType,
+                 UseThreats ? HalfDimensions * ThreatInputDimensions : 0> threatWeights;
+    alignas(CacheLineSize) std::array<PSQTWeightType, InputDimensions * PSQTBuckets> psqtWeights;
+    alignas(CacheLineSize)
+      std::array<PSQTWeightType,
+                 UseThreats ? ThreatInputDimensions * PSQTBuckets : 0> threatPsqtWeights;
+};
+}  // namespace Stockfish::Eval::NNUE
+// std::hash support for FeatureTransformer: the hash value is simply what the
+// transformer's own get_content_hash() reports.
+template<Stockfish::Eval::NNUE::IndexType TransformedFeatureDimensions>
+struct std::hash<Stockfish::Eval::NNUE::FeatureTransformer<TransformedFeatureDimensions>> {
+    auto
+    operator()(const Stockfish::Eval::NNUE::FeatureTransformer<TransformedFeatureDimensions>& ft)
+      const noexcept -> std::size_t {
+        const std::size_t contentHash = ft.get_content_hash();
+        return contentHash;
+    }
+};
+#endif  // #ifndef NNUE_FEATURE_TRANSFORMER_H_INCLUDED

src/nnue/nnue_misc.cpp ADDED Viewed

	@@ -0,0 +1,193 @@

+/*
+  Stockfish, a UCI chess playing engine derived from Glaurung 2.1
+  Copyright (C) 2004-2026 The Stockfish developers (see AUTHORS file)
+  Stockfish is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+  Stockfish is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+  You should have received a copy of the GNU General Public License
+  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+// Code for calculating NNUE evaluation function
+#include "nnue_misc.h"
+#include <cmath>
+#include <cstdlib>
+#include <cstring>
+#include <iomanip>
+#include <iosfwd>
+#include <iostream>
+#include <sstream>
+#include <string_view>
+#include <tuple>
+#include "../position.h"
+#include "../types.h"
+#include "../uci.h"
+#include "network.h"
+#include "nnue_accumulator.h"
+namespace Stockfish::Eval::NNUE {
+constexpr std::string_view PieceToChar(" PNBRQK  pnbrqk");  // indexed by Piece in trace()
+namespace {
+// Writes a compact rendering of v, converted to centipawns by UCIEngine::to_cp,
+// into buffer[0..4]: one sign character followed by four characters whose layout
+// depends on the magnitude. No terminator is written.
+// The buffer must have capacity for at least 5 chars.
+void format_cp_compact(Value v, char* buffer, const Position& pos) {
+    char sign = ' ';
+    if (v < 0)
+        sign = '-';
+    else if (v > 0)
+        sign = '+';
+    buffer[0] = sign;
+    const int cp = std::abs(UCIEngine::to_cp(v, pos));
+    if (cp >= 10000)
+    {
+        // Three digits of whole pawns, then a blank
+        buffer[1] = '0' + cp / 10000;
+        buffer[2] = '0' + (cp % 10000) / 1000;
+        buffer[3] = '0' + (cp % 1000) / 100;
+        buffer[4] = ' ';
+        return;
+    }
+    if (cp >= 1000)
+    {
+        // Two digits of whole pawns, a dot, one decimal
+        buffer[1] = '0' + cp / 1000;
+        buffer[2] = '0' + (cp % 1000) / 100;
+        buffer[3] = '.';
+        buffer[4] = '0' + (cp % 100) / 10;
+        return;
+    }
+    // One digit of whole pawns, a dot, two decimals
+    buffer[1] = '0' + cp / 100;
+    buffer[2] = '.';
+    buffer[3] = '0' + (cp % 100) / 10;
+    buffer[4] = '0' + cp % 10;
+}
+// Streams v as pawns (UCIEngine::to_cp scaled by 0.01): a sign character, then
+// the magnitude in fixed notation with field width 6, always keeping two decimals
+void format_cp_aligned_dot(Value v, std::stringstream& stream, const Position& pos) {
+    const char   sign      = v < 0 ? '-' : (v > 0 ? '+' : ' ');
+    const double magnitude = std::abs(0.01 * UCIEngine::to_cp(v, pos));
+    stream << sign;
+    stream << std::setiosflags(std::ios::fixed) << std::setw(6) << std::setprecision(2);
+    stream << magnitude;
+}
+}
+// Returns a string with the value of each piece on a board,
+// and a table for (PSQT, Layers) values bucket by bucket.
+//
+// Both parts are computed with networks.big and caches.big:
+//   - the board annotates every non-king piece with the difference between the
+//     base evaluation and the evaluation obtained with that piece removed;
+//   - the table lists psqt, positional and their sum for each of the LayerStacks
+//     buckets and flags the one reported as correctBucket.
+// pos is modified while the piece values are computed, but every removed piece
+// is put back on its square before the next one is examined.
+std::string
+trace(Position& pos, const Eval::NNUE::Networks& networks, Eval::NNUE::AccumulatorCaches& caches) {
+    std::stringstream ss;
+    // 8x8 grid of boxes, each 3 rows by 8 columns, plus the closing border row
+    // and column; the last column of every row holds the string terminator.
+    char board[3 * 8 + 1][8 * 8 + 2];
+    std::memset(board, ' ', sizeof(board));
+    for (int row = 0; row < 3 * 8 + 1; ++row)
+        board[row][8 * 8 + 1] = '\0';
+    // A lambda to output one box of the board
+    auto writeSquare = [&board, &pos](File file, Rank rank, Piece pc, Value value) {
+        const int x = int(file) * 8;
+        const int y = (7 - int(rank)) * 3;  // rank 8 is drawn at the top
+        for (int i = 1; i < 8; ++i)
+            board[y][x + i] = board[y + 3][x + i] = '-';
+        for (int i = 1; i < 3; ++i)
+            board[y + i][x] = board[y + i][x + 8] = '|';
+        board[y][x] = board[y][x + 8] = board[y + 3][x + 8] = board[y + 3][x] = '+';
+        if (pc != NO_PIECE)
+            board[y + 1][x + 4] = PieceToChar[pc];
+        if (is_valid(value))
+            format_cp_compact(value, &board[y + 2][x + 2], pos);
+    };
+    auto accumulators = std::make_unique<AccumulatorStack>();
+    // We estimate the value of each piece by doing a differential evaluation from
+    // the current base eval, simulating the removal of the piece from its square.
+    auto [psqt, positional] = networks.big.evaluate(pos, *accumulators, caches.big);
+    Value base              = psqt + positional;
+    // The sign is flipped when Black is to move
+    base                    = pos.side_to_move() == WHITE ? base : -base;
+    for (File f = FILE_A; f <= FILE_H; ++f)
+        for (Rank r = RANK_1; r <= RANK_8; ++r)
+        {
+            Square sq = make_square(f, r);
+            Piece  pc = pos.piece_on(sq);
+            Value  v  = VALUE_NONE;
+            // Kings and empty squares keep VALUE_NONE
+            if (pc != NO_PIECE && type_of(pc) != KING)
+            {
+                pos.remove_piece(sq);
+                accumulators->reset();
+                std::tie(psqt, positional) = networks.big.evaluate(pos, *accumulators, caches.big);
+                Value eval                 = psqt + positional;
+                eval                       = pos.side_to_move() == WHITE ? eval : -eval;
+                v                          = base - eval;
+                pos.put_piece(pc, sq);  // restore the position
+            }
+            writeSquare(f, r, pc, v);
+        }
+    ss << " NNUE derived piece values:\n";
+    for (int row = 0; row < 3 * 8 + 1; ++row)
+        ss << board[row] << '\n';
+    ss << '\n';
+    accumulators->reset();
+    auto t = networks.big.trace_evaluate(pos, *accumulators, caches.big);
+    // '\n' rather than std::endl: flushing an in-memory stringstream does nothing
+    // useful, and the rest of this function already uses '\n'.
+    ss << " NNUE network contributions "
+       << (pos.side_to_move() == WHITE ? "(White to move)" : "(Black to move)") << '\n'
+       << "+------------+------------+------------+------------+\n"
+       << "|   Bucket   |  Material  | Positional |   Total    |\n"
+       << "|            |   (PSQT)   |  (Layers)  |            |\n"
+       << "+------------+------------+------------+------------+\n";
+    for (std::size_t bucket = 0; bucket < LayerStacks; ++bucket)
+    {
+        ss << "|  " << bucket << "        "  //
+           << " |  ";
+        format_cp_aligned_dot(t.psqt[bucket], ss, pos);
+        ss << "  "  //
+           << " |  ";
+        format_cp_aligned_dot(t.positional[bucket], ss, pos);
+        ss << "  "  //
+           << " |  ";
+        format_cp_aligned_dot(t.psqt[bucket] + t.positional[bucket], ss, pos);
+        ss << "  "  //
+           << " |";
+        if (bucket == t.correctBucket)
+            ss << " <-- this bucket is used";
+        ss << '\n';
+    }
+    ss << "+------------+------------+------------+------------+\n";
+    return ss.str();
+}
+}  // namespace Stockfish::Eval::NNUE

src/nnue/nnue_misc.h ADDED Viewed

	@@ -0,0 +1,74 @@

+/*
+  Stockfish, a UCI chess playing engine derived from Glaurung 2.1
+  Copyright (C) 2004-2026 The Stockfish developers (see AUTHORS file)
+  Stockfish is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+  Stockfish is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+  You should have received a copy of the GNU General Public License
+  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+#ifndef NNUE_MISC_H_INCLUDED
+#define NNUE_MISC_H_INCLUDED
+#include <cstddef>
+#include <memory>
+#include <string>
+#include "../misc.h"
+#include "../types.h"
+#include "nnue_architecture.h"
+namespace Stockfish {
+class Position;
+namespace Eval::NNUE {
+// EvalFile uses fixed string types because it's part of the network structure which must be trivial.
+struct EvalFile {
+    // Default net name, will use one of the EvalFileDefaultName* macros defined
+    // in evaluate.h
+    FixedString<256> defaultName;
+    // Selected net name, either via UCI option or default
+    FixedString<256> current;
+    // Net description extracted from the net file
+    FixedString<256> netDescription;
+};  // all three fields are combined by the std::hash<EvalFile> specialization
+struct NnueEvalTrace {  // per-bucket values; trace() declared below returns a std::string
+    static_assert(LayerStacks == PSQTBuckets);  // both arrays below use LayerStacks as length
+    Value       psqt[LayerStacks];  // one psqt value per bucket
+    Value       positional[LayerStacks];  // one positional value per bucket
+    std::size_t correctBucket;  // NOTE(review): presumably an index into the arrays above
+};
+struct Networks;
+struct AccumulatorCaches;
+std::string trace(Position& pos, const Networks& networks, AccumulatorCaches& caches);
+}  // namespace Stockfish::Eval::NNUE
+}  // namespace Stockfish
+// std::hash support for EvalFile: folds defaultName, current and netDescription,
+// in that order, into one value with Stockfish::hash_combine, starting from 0.
+template<>
+struct std::hash<Stockfish::Eval::NNUE::EvalFile> {
+    std::size_t operator()(const Stockfish::Eval::NNUE::EvalFile& evalFile) const noexcept {
+        namespace SF = Stockfish;
+        std::size_t seed = 0;
+        SF::hash_combine(seed, evalFile.defaultName);
+        SF::hash_combine(seed, evalFile.current);
+        SF::hash_combine(seed, evalFile.netDescription);
+        return seed;
+    }
+};
+#endif  // #ifndef NNUE_MISC_H_INCLUDED

src/nnue/simd.h ADDED Viewed

	@@ -0,0 +1,440 @@

+/*
+  Stockfish, a UCI chess playing engine derived from Glaurung 2.1
+  Copyright (C) 2004-2026 The Stockfish developers (see AUTHORS file)
+  Stockfish is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+  Stockfish is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+  You should have received a copy of the GNU General Public License
+  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+#ifndef NNUE_SIMD_H_INCLUDED
+#define NNUE_SIMD_H_INCLUDED
+#if defined(USE_AVX2)
+    #include <immintrin.h>
+#elif defined(USE_SSE41)
+    #include <smmintrin.h>
+#elif defined(USE_SSSE3)
+    #include <tmmintrin.h>
+#elif defined(USE_SSE2)
+    #include <emmintrin.h>
+#elif defined(USE_NEON)
+    #include <arm_neon.h>
+#endif
+#include "../types.h"
+#include "nnue_common.h"
+namespace Stockfish::Eval::NNUE::SIMD {
+// If vector instructions are enabled, we update and refresh the
+// accumulator tile by tile such that each tile fits in the CPU's
+// vector registers.
+#define VECTOR  // undefined again by the final #else branch of the chain below
+#ifdef USE_AVX512  // 512-bit main registers, 256-bit PSQT registers
+using vec_t      = __m512i;  // register type behind Vec16Wrapper
+using vec_i8_t   = __m256i;
+using vec128_t   = __m128i;
+using psqt_vec_t = __m256i;  // register type behind Vec32Wrapper
+using vec_uint_t = __m512i;
+    #define vec_load(a) _mm512_load_si512(a)
+    #define vec_store(a, b) _mm512_store_si512(a, b)
+    #define vec_convert_8_16(a) _mm512_cvtepi8_epi16(a)
+    #define vec_add_16(a, b) _mm512_add_epi16(a, b)
+    #define vec_sub_16(a, b) _mm512_sub_epi16(a, b)
+    #define vec_mulhi_16(a, b) _mm512_mulhi_epi16(a, b)
+    #define vec_zero() _mm512_setzero_epi32()
+    #define vec_set_16(a) _mm512_set1_epi16(a)
+    #define vec_max_16(a, b) _mm512_max_epi16(a, b)
+    #define vec_min_16(a, b) _mm512_min_epi16(a, b)
+    #define vec_slli_16(a, b) _mm512_slli_epi16(a, b)
+    // Inverse permuted at load time
+    #define vec_packus_16(a, b) _mm512_packus_epi16(a, b)
+    #define vec_load_psqt(a) _mm256_load_si256(a)
+    #define vec_store_psqt(a, b) _mm256_store_si256(a, b)
+    #define vec_add_psqt_32(a, b) _mm256_add_epi32(a, b)
+    #define vec_sub_psqt_32(a, b) _mm256_sub_epi32(a, b)
+    #define vec_zero_psqt() _mm256_setzero_si256()
+    #ifdef USE_SSSE3
+        #define vec_nnz(a) _mm512_cmpgt_epi32_mask(a, _mm512_setzero_si512())
+    #endif
+    #define vec128_zero _mm_setzero_si128()
+    #define vec128_set_16(a) _mm_set1_epi16(a)
+    #define vec128_load(a) _mm_load_si128(a)
+    #define vec128_storeu(a, b) _mm_storeu_si128(a, b)
+    #define vec128_add(a, b) _mm_add_epi16(a, b)
+    #define NumRegistersSIMD 16  // upper bound checked in SIMDTiling::BestRegisterCount
+    #define MaxChunkSize 64
+#elif USE_AVX2  // 256-bit main and PSQT registers
+using vec_t      = __m256i;  // register type behind Vec16Wrapper
+using vec_i8_t   = __m128i;
+using vec128_t   = __m128i;
+using psqt_vec_t = __m256i;  // register type behind Vec32Wrapper
+using vec_uint_t = __m256i;
+    #define vec_load(a) _mm256_load_si256(a)
+    #define vec_store(a, b) _mm256_store_si256(a, b)
+    #define vec_convert_8_16(a) _mm256_cvtepi8_epi16(a)
+    #define vec_add_16(a, b) _mm256_add_epi16(a, b)
+    #define vec_sub_16(a, b) _mm256_sub_epi16(a, b)
+    #define vec_mulhi_16(a, b) _mm256_mulhi_epi16(a, b)
+    #define vec_zero() _mm256_setzero_si256()
+    #define vec_set_16(a) _mm256_set1_epi16(a)
+    #define vec_max_16(a, b) _mm256_max_epi16(a, b)
+    #define vec_min_16(a, b) _mm256_min_epi16(a, b)
+    #define vec_slli_16(a, b) _mm256_slli_epi16(a, b)
+    // Inverse permuted at load time
+    #define vec_packus_16(a, b) _mm256_packus_epi16(a, b)
+    #define vec_load_psqt(a) _mm256_load_si256(a)
+    #define vec_store_psqt(a, b) _mm256_store_si256(a, b)
+    #define vec_add_psqt_32(a, b) _mm256_add_epi32(a, b)
+    #define vec_sub_psqt_32(a, b) _mm256_sub_epi32(a, b)
+    #define vec_zero_psqt() _mm256_setzero_si256()
+    #ifdef USE_SSSE3
+        #if defined(USE_VNNI) && !defined(USE_AVXVNNI)  // mask-returning compare in this case
+            #define vec_nnz(a) _mm256_cmpgt_epi32_mask(a, _mm256_setzero_si256())
+        #else
+            #define vec_nnz(a) \
+                _mm256_movemask_ps( \
+                  _mm256_castsi256_ps(_mm256_cmpgt_epi32(a, _mm256_setzero_si256())))
+        #endif
+    #endif
+    #define vec128_zero _mm_setzero_si128()
+    #define vec128_set_16(a) _mm_set1_epi16(a)
+    #define vec128_load(a) _mm_load_si128(a)
+    #define vec128_storeu(a, b) _mm_storeu_si128(a, b)
+    #define vec128_add(a, b) _mm_add_epi16(a, b)
+    #define NumRegistersSIMD 12  // upper bound checked in SIMDTiling::BestRegisterCount
+    #define MaxChunkSize 32
+#elif USE_SSE2  // 128-bit main and PSQT registers
+using vec_t      = __m128i;  // register type behind Vec16Wrapper
+using vec_i8_t   = std::uint64_t;  // for the correct size -- will be loaded into an xmm reg
+using vec128_t   = __m128i;
+using psqt_vec_t = __m128i;  // register type behind Vec32Wrapper
+using vec_uint_t = __m128i;
+    #define vec_load(a) (*(a))
+    #define vec_store(a, b) *(a) = (b)
+    #define vec_add_16(a, b) _mm_add_epi16(a, b)
+    #define vec_sub_16(a, b) _mm_sub_epi16(a, b)
+    #define vec_mulhi_16(a, b) _mm_mulhi_epi16(a, b)
+    #define vec_zero() _mm_setzero_si128()
+    #define vec_set_16(a) _mm_set1_epi16(a)
+    #define vec_max_16(a, b) _mm_max_epi16(a, b)
+    #define vec_min_16(a, b) _mm_min_epi16(a, b)
+    #define vec_slli_16(a, b) _mm_slli_epi16(a, b)
+    #define vec_packus_16(a, b) _mm_packus_epi16(a, b)
+    #define vec_load_psqt(a) (*(a))
+    #define vec_store_psqt(a, b) *(a) = (b)
+    #define vec_add_psqt_32(a, b) _mm_add_epi32(a, b)
+    #define vec_sub_psqt_32(a, b) _mm_sub_epi32(a, b)
+    #define vec_zero_psqt() _mm_setzero_si128()
+    #ifdef USE_SSSE3
+        #define vec_nnz(a) \
+            _mm_movemask_ps(_mm_castsi128_ps(_mm_cmpgt_epi32(a, _mm_setzero_si128())))
+    #endif
+    #ifdef __i386__  // local definition of _mm_cvtsi64_si128 for 32-bit x86 builds
+inline __m128i _mm_cvtsi64_si128(int64_t val) {
+    return _mm_loadl_epi64(reinterpret_cast<const __m128i*>(&val));  // loads the 64 bits of val
+}
+    #endif
+    #ifdef USE_SSE41
+        #define vec_convert_8_16(a) _mm_cvtepi8_epi16(_mm_cvtsi64_si128(static_cast<int64_t>(a)))
+    #else
+// Credit: Yoshie2000
+inline __m128i vec_convert_8_16(uint64_t x) {  // replacement used when USE_SSE41 is not defined
+    __m128i v8   = _mm_cvtsi64_si128(static_cast<int64_t>(x));
+    __m128i sign = _mm_cmpgt_epi8(_mm_setzero_si128(), v8);  // mask of the bytes below zero
+    return _mm_unpacklo_epi8(v8, sign);  // interleave each low byte with its sign mask
+}
+    #endif
+    #define vec128_zero _mm_setzero_si128()
+    #define vec128_set_16(a) _mm_set1_epi16(a)
+    #define vec128_load(a) _mm_load_si128(a)
+    #define vec128_storeu(a, b) _mm_storeu_si128(a, b)
+    #define vec128_add(a, b) _mm_add_epi16(a, b)
+    #define NumRegistersSIMD (Is64Bit ? 12 : 6)  // upper bound checked in SIMDTiling::BestRegisterCount
+    #define MaxChunkSize 16
+#elif USE_NEON  // all aliases below are declared may_alias
+using vec_i8x8_t __attribute__((may_alias))  = int8x8_t;
+using vec_i16x8_t __attribute__((may_alias)) = int16x8_t;
+using vec_i8x16_t __attribute__((may_alias)) = int8x16_t;
+using vec_u16x8_t __attribute__((may_alias)) = uint16x8_t;
+using vec_i32x4_t __attribute__((may_alias)) = int32x4_t;
+using vec_t __attribute__((may_alias))      = int16x8_t;  // register type behind Vec16Wrapper
+using vec_i8_t __attribute__((may_alias))   = int8x16_t;
+using psqt_vec_t __attribute__((may_alias)) = int32x4_t;  // register type behind Vec32Wrapper
+using vec128_t __attribute__((may_alias))   = uint16x8_t;
+using vec_uint_t __attribute__((may_alias)) = uint32x4_t;
+    #define vec_load(a) (*(a))
+    #define vec_store(a, b) *(a) = (b)
+    #define vec_add_16(a, b) vaddq_s16(a, b)
+    #define vec_sub_16(a, b) vsubq_s16(a, b)
+    #define vec_mulhi_16(a, b) vqdmulhq_s16(a, b)
+    #define vec_zero() vec_t{0}
+    #define vec_set_16(a) vdupq_n_s16(a)
+    #define vec_max_16(a, b) vmaxq_s16(a, b)
+    #define vec_min_16(a, b) vminq_s16(a, b)
+    #define vec_slli_16(a, b) vshlq_s16(a, vec_set_16(b))
+    #define vec_packus_16(a, b) reinterpret_cast<vec_t>(vcombine_u8(vqmovun_s16(a), vqmovun_s16(b)))
+    #define vec_load_psqt(a) (*(a))
+    #define vec_store_psqt(a, b) *(a) = (b)
+    #define vec_add_psqt_32(a, b) vaddq_s32(a, b)
+    #define vec_sub_psqt_32(a, b) vsubq_s32(a, b)
+    #define vec_zero_psqt() psqt_vec_t{0}
+static constexpr std::uint32_t Mask[4] = {1, 2, 4, 8};  // per-lane constants loaded by vec_nnz
+    #define vec_nnz(a) vaddvq_u32(vandq_u32(vtstq_u32(a, a), vld1q_u32(Mask)))
+    #define vec128_zero vdupq_n_u16(0)
+    #define vec128_set_16(a) vdupq_n_u16(a)
+    #define vec128_load(a) vld1q_u16(reinterpret_cast<const std::uint16_t*>(a))
+    #define vec128_storeu(a, b) vst1q_u16(reinterpret_cast<std::uint16_t*>(a), b)
+    #define vec128_add(a, b) vaddq_u16(a, b)
+    #define NumRegistersSIMD 16  // upper bound checked in SIMDTiling::BestRegisterCount
+    #define MaxChunkSize 16
+    #ifndef __aarch64__
+// Single instruction doesn't exist on 32-bit ARM
+inline int16x8_t vmovl_high_s8(int8x16_t val) { return vmovl_s8(vget_high_s8(val)); }
+    #endif
+#else
+    #undef VECTOR  // no branch above matched: the #ifdef VECTOR code below takes its #else paths
+#endif
+// Add/sub behind one interface: vec_t with the vec_add_16/vec_sub_16 macros
+// when VECTOR is defined, plain BiasType arithmetic otherwise.
+struct Vec16Wrapper {
+#ifdef VECTOR
+    using type = vec_t;
+#else
+    using type = BiasType;
+#endif
+    static type add(const type& lhs, const type& rhs) {
+#ifdef VECTOR
+        return vec_add_16(lhs, rhs);
+#else
+        return lhs + rhs;
+#endif
+    }
+    static type sub(const type& lhs, const type& rhs) {
+#ifdef VECTOR
+        return vec_sub_16(lhs, rhs);
+#else
+        return lhs - rhs;
+#endif
+    }
+};
+// Add/sub behind one interface: psqt_vec_t with the vec_add_psqt_32 and
+// vec_sub_psqt_32 macros when VECTOR is defined, plain PSQTWeightType
+// arithmetic otherwise.
+struct Vec32Wrapper {
+#ifdef VECTOR
+    using type = psqt_vec_t;
+#else
+    using type = PSQTWeightType;
+#endif
+    static type add(const type& lhs, const type& rhs) {
+#ifdef VECTOR
+        return vec_add_psqt_32(lhs, rhs);
+#else
+        return lhs + rhs;
+#endif
+    }
+    static type sub(const type& lhs, const type& rhs) {
+#ifdef VECTOR
+        return vec_sub_psqt_32(lhs, rhs);
+#else
+        return lhs - rhs;
+#endif
+    }
+};
+enum UpdateOperation {  // template arguments selecting each step performed by fused()
+    Add,  // fused() applies VecWrapper::add
+    Sub  // fused() applies VecWrapper::sub
+};
+// Terminal overload of fused(): with no operations left to apply, the running
+// value is returned unchanged.
+template<typename VecWrapper,
+         UpdateOperation... ops,
+         std::enable_if_t<sizeof...(ops) == 0, bool> = true>
+typename VecWrapper::type fused(const typename VecWrapper::type& in) {
+    return in;
+}
+// Applies update_op to (in, operand) through VecWrapper, then hands the result
+// and the remaining operands to fused() for the remaining ops. Every operand
+// must have the type VecWrapper::type, and the number of operands must match
+// the number of operations.
+template<typename VecWrapper,
+         UpdateOperation update_op,
+         UpdateOperation... ops,
+         typename T,
+         typename... Ts,
+         std::enable_if_t<is_all_same_v<typename VecWrapper::type, T, Ts...>, bool> = true,
+         std::enable_if_t<sizeof...(ops) == sizeof...(Ts), bool>                    = true>
+typename VecWrapper::type
+fused(const typename VecWrapper::type& in, const T& operand, const Ts&... operands) {
+    static_assert(update_op == Add || update_op == Sub,
+                  "Only Add and Sub are currently supported.");
+    if constexpr (update_op == Add)
+        return fused<VecWrapper, ops...>(VecWrapper::add(in, operand), operands...);
+    else
+        return fused<VecWrapper, ops...>(VecWrapper::sub(in, operand), operands...);
+}
+#if defined(USE_AVX512)
+// Returns _mm512_reduce_add_epi32(sum) plus bias.
+[[maybe_unused]] static int m512_hadd(__m512i sum, int bias) {
+    const int lanesTotal = _mm512_reduce_add_epi32(sum);
+    return lanesTotal + bias;
+}
+// Accumulates into acc: a single dpbusd when USE_VNNI is defined, otherwise
+// maddubs, then madd against a vector of ones, then a 32-bit add.
+[[maybe_unused]] static void m512_add_dpbusd_epi32(__m512i& acc, __m512i a, __m512i b) {
+    #if defined(USE_VNNI)
+    acc = _mm512_dpbusd_epi32(acc, a, b);
+    #else
+    const __m512i pairSums = _mm512_maddubs_epi16(a, b);
+    const __m512i quadSums = _mm512_madd_epi16(pairSums, _mm512_set1_epi16(1));
+    acc                    = _mm512_add_epi32(acc, quadSums);
+    #endif
+}
+#endif
+#if defined(USE_AVX2)
+// Folds the two 128-bit halves of sum together, reduces the result with two
+// shuffle-and-add steps, and returns the low 32-bit element plus bias.
+[[maybe_unused]] static int m256_hadd(__m256i sum, int bias) {
+    const __m128i lowHalf  = _mm256_castsi256_si128(sum);
+    const __m128i highHalf = _mm256_extracti128_si256(sum, 1);
+    __m128i       folded   = _mm_add_epi32(lowHalf, highHalf);
+    folded                 = _mm_add_epi32(folded, _mm_shuffle_epi32(folded, _MM_PERM_BADC));
+    folded                 = _mm_add_epi32(folded, _mm_shuffle_epi32(folded, _MM_PERM_CDAB));
+    return _mm_cvtsi128_si32(folded) + bias;
+}
+// Accumulates into acc: a single dpbusd when USE_VNNI is defined, otherwise
+// maddubs, then madd against a vector of ones, then a 32-bit add.
+[[maybe_unused]] static void m256_add_dpbusd_epi32(__m256i& acc, __m256i a, __m256i b) {
+    #if defined(USE_VNNI)
+    acc = _mm256_dpbusd_epi32(acc, a, b);
+    #else
+    const __m256i pairSums = _mm256_maddubs_epi16(a, b);
+    const __m256i quadSums = _mm256_madd_epi16(pairSums, _mm256_set1_epi16(1));
+    acc                    = _mm256_add_epi32(acc, quadSums);
+    #endif
+}
+#endif
+#if defined(USE_SSSE3)
+// Reduces sum with two shuffle-and-add steps and returns the low 32-bit element
+// plus bias.
+[[maybe_unused]] static int m128_hadd(__m128i sum, int bias) {
+    const __m128i swappedHalves = _mm_shuffle_epi32(sum, 0x4E);  //_MM_PERM_BADC
+    sum                         = _mm_add_epi32(sum, swappedHalves);
+    const __m128i swappedPairs  = _mm_shuffle_epi32(sum, 0xB1);  //_MM_PERM_CDAB
+    sum                         = _mm_add_epi32(sum, swappedPairs);
+    return _mm_cvtsi128_si32(sum) + bias;
+}
+// Accumulates into acc: maddubs, then madd against a vector of ones, then a
+// 32-bit add.
+[[maybe_unused]] static void m128_add_dpbusd_epi32(__m128i& acc, __m128i a, __m128i b) {
+    const __m128i pairSums = _mm_maddubs_epi16(a, b);
+    const __m128i quadSums = _mm_madd_epi16(pairSums, _mm_set1_epi16(1));
+    acc                    = _mm_add_epi32(acc, quadSums);
+}
+#endif
+#if defined(USE_NEON_DOTPROD)  // helper only exists in dot-product builds
+[[maybe_unused]] static void
+dotprod_m128_add_dpbusd_epi32(int32x4_t& acc, int8x16_t a, int8x16_t b) {
+    acc = vdotq_s32(acc, a, b);  // acc is updated in place with the result of vdotq_s32
+}
+#endif
+#if defined(USE_NEON)
+// Sums the four 32-bit lanes of s: one vaddvq_s32 when USE_NEON >= 8, a manual
+// lane-by-lane sum otherwise.
+[[maybe_unused]] static int neon_m128_reduce_add_epi32(int32x4_t s) {
+    #if USE_NEON >= 8
+    return vaddvq_s32(s);
+    #else
+    const int lane0 = s[0];
+    const int lane1 = s[1];
+    const int lane2 = s[2];
+    const int lane3 = s[3];
+    return lane0 + lane1 + lane2 + lane3;
+    #endif
+}
+// Lane sum of sum (see neon_m128_reduce_add_epi32) plus bias.
+[[maybe_unused]] static int neon_m128_hadd(int32x4_t sum, int bias) {
+    const int lanesTotal = neon_m128_reduce_add_epi32(sum);
+    return lanesTotal + bias;
+}
+#endif
+#if USE_NEON >= 8
+// Accumulates into acc: widening multiplies of the low and high halves of a and
+// b, a pairwise add of the two products, then a pairwise add-accumulate into acc.
+[[maybe_unused]] static void neon_m128_add_dpbusd_epi32(int32x4_t& acc, int8x16_t a, int8x16_t b) {
+    const int16x8_t lowProducts  = vmull_s8(vget_low_s8(a), vget_low_s8(b));
+    const int16x8_t highProducts = vmull_high_s8(a, b);
+    const int16x8_t pairSums     = vpaddq_s16(lowProducts, highProducts);
+    acc                          = vpadalq_s16(acc, pairSums);
+}
+#endif
+// Chooses how many SIMD registers the feature transformer accumulation uses per
+// tile and derives the tile heights from that. The class has no members when
+// VECTOR is not defined.
+template<IndexType TransformedFeatureWidth, IndexType HalfDimensions, IndexType PSQTBuckets>
+class SIMDTiling {
+#ifdef VECTOR
+        // The __m* types are passed as template arguments, which makes GCC warn
+        // about dropped attribute information. Only their size is used here, so
+        // the warning is silenced around the helper below.
+    #if defined(__GNUC__)
+        #pragma GCC diagnostic push
+        #pragma GCC diagnostic ignored "-Wignored-attributes"
+    #endif
+    // Returns the number of registers needed to hold NumLanes lanes when that
+    // does not exceed MaxRegisters; otherwise the largest divisor of that number
+    // which is not greater than MaxRegisters (1 if no other divisor qualifies).
+    template<typename SIMDRegisterType, typename LaneType, int NumLanes, int MaxRegisters>
+    static constexpr int BestRegisterCount() {
+        constexpr std::size_t BytesPerRegister = sizeof(SIMDRegisterType);
+        constexpr std::size_t BytesPerLane     = sizeof(LaneType);
+        static_assert(NumRegistersSIMD > 0);
+        static_assert(MaxRegisters > 0);
+        static_assert(MaxRegisters <= NumRegistersSIMD);
+        static_assert(BytesPerRegister >= BytesPerLane);
+        static_assert(BytesPerRegister % BytesPerLane == 0);
+        static_assert((NumLanes * BytesPerLane) % BytesPerRegister == 0);
+        const int ideal = (NumLanes * BytesPerLane) / BytesPerRegister;
+        if (ideal <= MaxRegisters)
+            return ideal;
+        // Walk down from MaxRegisters until a divisor of ideal is found
+        int candidate = MaxRegisters;
+        while (candidate > 1 && ideal % candidate != 0)
+            --candidate;
+        return candidate;
+    }
+    #if defined(__GNUC__)
+        #pragma GCC diagnostic pop
+    #endif
+   public:
+    // Registers per tile for the accumulator and for the PSQT accumulator
+    static constexpr int NumRegs =
+      BestRegisterCount<vec_t, WeightType, TransformedFeatureWidth, NumRegistersSIMD>();
+    static constexpr int NumPsqtRegs =
+      BestRegisterCount<psqt_vec_t, PSQTWeightType, PSQTBuckets, NumRegistersSIMD>();
+    // Tile heights: register bytes divided by 2 (accumulator) or 4 (PSQT)
+    static constexpr IndexType TileHeight     = NumRegs * sizeof(vec_t) / 2;
+    static constexpr IndexType PsqtTileHeight = NumPsqtRegs * sizeof(psqt_vec_t) / 4;
+    static_assert(HalfDimensions % TileHeight == 0, "TileHeight must divide HalfDimensions");
+    static_assert(PSQTBuckets % PsqtTileHeight == 0, "PsqtTileHeight must divide PSQTBuckets");
+#endif
+};
+}
+#endif

src/numa.h ADDED Viewed

	@@ -0,0 +1,1718 @@

+/*
+  Stockfish, a UCI chess playing engine derived from Glaurung 2.1
+  Copyright (C) 2004-2026 The Stockfish developers (see AUTHORS file)
+  Stockfish is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+  Stockfish is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+  You should have received a copy of the GNU General Public License
+  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+#ifndef NUMA_H_INCLUDED
+#define NUMA_H_INCLUDED
+#include <algorithm>
+#include <atomic>
+#include <cstdint>
+#include <cstdlib>
+#include <functional>
+#include <iostream>
+#include <limits>
+#include <map>
+#include <memory>
+#include <mutex>
+#include <set>
+#include <sstream>
+#include <string>
+#include <thread>
+#include <utility>
+#include <vector>
+#include <cstring>
+#include "shm.h"
+// We support Linux very well, but we explicitly do NOT support Android,
+// because there are no affected systems, so it is not worth maintaining.
+#if defined(__linux__) && !defined(__ANDROID__)
+    #if !defined(_GNU_SOURCE)
+        #define _GNU_SOURCE
+    #endif
+    #include <sched.h>
+#elif defined(_WIN64)
+    #if _WIN32_WINNT < 0x0601
+        #undef _WIN32_WINNT
+        #define _WIN32_WINNT 0x0601  // Force to include needed API prototypes
+    #endif
+// On Windows each processor group can have up to 64 processors.
+// https://learn.microsoft.com/en-us/windows/win32/procthread/processor-groups
+static constexpr size_t WIN_PROCESSOR_GROUP_SIZE = 64;
+    #if !defined(NOMINMAX)
+        #define NOMINMAX
+    #endif
+    #include <windows.h>
+    #if defined small
+        #undef small
+    #endif
+// https://learn.microsoft.com/en-us/windows/win32/api/processthreadsapi/nf-processthreadsapi-setthreadselectedcpusetmasks
+using SetThreadSelectedCpuSetMasks_t = BOOL (*)(HANDLE, PGROUP_AFFINITY, USHORT);
+// https://learn.microsoft.com/en-us/windows/win32/api/processthreadsapi/nf-processthreadsapi-getthreadselectedcpusetmasks
+using GetThreadSelectedCpuSetMasks_t = BOOL (*)(HANDLE, PGROUP_AFFINITY, USHORT, PUSHORT);
+#endif
+#include "misc.h"
+namespace Stockfish {
+using CpuIndex  = size_t;
+using NumaIndex = size_t;
+// Returns the processor count reported by std::thread::hardware_concurrency().
+// On 64-bit Windows that value only covers the first processor group (the only
+// one available to std::thread), so the larger of it and the active processor
+// count across all processor groups is returned instead.
+inline CpuIndex get_hardware_concurrency() {
+    const CpuIndex fromStdThread = std::thread::hardware_concurrency();
+#ifdef _WIN64
+    const CpuIndex acrossAllGroups = GetActiveProcessorCount(ALL_PROCESSOR_GROUPS);
+    return std::max<CpuIndex>(fromStdThread, acrossAllGroups);
+#else
+    return fromStdThread;
+#endif
+}
+inline const CpuIndex SYSTEM_THREADS_NB = std::max<CpuIndex>(1, get_hardware_concurrency());  // at least 1
+#if defined(_WIN64)
+// Process affinity as seen through the two Windows APIs; nullopt means that no
+// affinity is set for the corresponding API.
+struct WindowsAffinity {
+    std::optional<std::set<CpuIndex>> oldApi;
+    std::optional<std::set<CpuIndex>> newApi;
+    // When an affinity above is nullopt, these flags tell whether that is
+    // because it could not be determined. An indeterminate affinity is best
+    // treated as not set at all, which is consistent with what nullopt means.
+    bool isNewDeterminate = true;
+    bool isOldDeterminate = true;
+    // Merges both views: the intersection when both APIs report an affinity,
+    // otherwise whichever one is set (nullopt when neither is).
+    std::optional<std::set<CpuIndex>> get_combined() const {
+        if (oldApi.has_value() && newApi.has_value())
+        {
+            std::set<CpuIndex> common;
+            std::set_intersection(oldApi->begin(), oldApi->end(), newApi->begin(), newApi->end(),
+                                  std::inserter(common, common.begin()));
+            return common;
+        }
+        return oldApi.has_value() ? oldApi : newApi;
+    }
+    // Since Windows 11 and Windows Server 2022, thread affinities can span
+    // processor groups and can be set as such by a new WinAPI function. We may
+    // still need to force the old API when the process appears to have its
+    // affinity set through it already and we want to override that. The old API
+    // does not let us detect its use reliably: we will sometimes detect no use
+    // although it has been used, and vice versa.
+    bool likely_used_old_api() const {
+        if (!isOldDeterminate)
+            return true;
+        return oldApi.has_value();
+    }
+};
+// Queries GetProcessGroupAffinity for the current process. On success returns
+// the call's non-zero status together with the reported group numbers; on
+// failure returns 0 and an empty vector.
+inline std::pair<BOOL, std::vector<USHORT>> get_process_group_affinity() {
+    // GetProcessGroupAffinity requires the GroupArray argument to be
+    // aligned to 4 bytes instead of just 2.
+    static constexpr size_t GroupArrayMinimumAlignment = 4;
+    static_assert(GroupArrayMinimumAlignment >= alignof(USHORT));
+    // Extra elements that leave room for aligning the start of the array upwards
+    static constexpr size_t AlignmentSlack = GroupArrayMinimumAlignment / alignof(USHORT) - 1;
+    // The second call should succeed, but it may still fail if the group
+    // affinity changed between the two GetProcessGroupAffinity calls. That is
+    // treated as a hard error, since we cannot work with unstable affinities
+    // anyway.
+    static constexpr int MAX_TRIES  = 2;
+    USHORT               GroupCount = 1;
+    for (int attempt = 0; attempt < MAX_TRIES; ++attempt)
+    {
+        const auto storage = std::make_unique<USHORT[]>(GroupCount + AlignmentSlack);
+        USHORT*    aligned = align_ptr_up<GroupArrayMinimumAlignment>(storage.get());
+        const BOOL status  = GetProcessGroupAffinity(GetCurrentProcess(), &GroupCount, aligned);
+        if (status != 0)
+            return {status, std::vector<USHORT>(aligned, aligned + GroupCount)};
+        // Only an undersized buffer is worth another attempt
+        if (GetLastError() != ERROR_INSUFFICIENT_BUFFER)
+            break;
+    }
+    return {0, std::vector<USHORT>{}};
+}
+// Reads the affinity of the current process into a WindowsAffinity. On Windows there are two ways to set affinity, and therefore two ways to get it.
+// These are not consistent, so we have to check both. In some cases it is actually
+// not possible to determine affinity. For example when two different threads have
+// affinity on different processor groups, set using SetThreadAffinityMask, we cannot
+// retrieve the actual affinities.
+// From documentation on GetProcessAffinityMask:
+//     > If the calling process contains threads in multiple groups,
+//     > the function returns zero for both affinity masks.
+// In such cases we just give up and assume we have affinity for all processors.
+// nullopt means no affinity is set, that is, all processors are allowed. The isNewDeterminate/isOldDeterminate flags are cleared when the respective query fails.
+inline WindowsAffinity get_process_affinity() {
+    HMODULE k32                            = GetModuleHandle(TEXT("Kernel32.dll"));
+    auto    GetThreadSelectedCpuSetMasks_f = GetThreadSelectedCpuSetMasks_t(
+      (void (*)()) GetProcAddress(k32, "GetThreadSelectedCpuSetMasks"));  // looked up at run time; checked against nullptr below
+    BOOL status = 0;
+    WindowsAffinity affinity;
+    if (GetThreadSelectedCpuSetMasks_f != nullptr)
+    {
+        USHORT RequiredMaskCount;
+        status = GetThreadSelectedCpuSetMasks_f(GetCurrentThread(), nullptr, 0, &RequiredMaskCount);  // size query: no buffer, count is written to RequiredMaskCount
+        // We expect ERROR_INSUFFICIENT_BUFFER from GetThreadSelectedCpuSetMasks,
+        // but other failure is an actual error.
+        if (status == 0 && GetLastError() != ERROR_INSUFFICIENT_BUFFER)
+        {
+            affinity.isNewDeterminate = false;
+        }
+        else if (RequiredMaskCount > 0)
+        {
+            // NOTE(review): presumably a RequiredMaskCount of 0 means these affinities were never set (that case skips this branch),
+            // but it's not consistent so GetProcessAffinityMask may still return some affinity.
+            auto groupAffinities = std::make_unique<GROUP_AFFINITY[]>(RequiredMaskCount);
+            status = GetThreadSelectedCpuSetMasks_f(GetCurrentThread(), groupAffinities.get(),
+                                                    RequiredMaskCount, &RequiredMaskCount);
+            if (status == 0)
+            {
+                affinity.isNewDeterminate = false;
+            }
+            else
+            {
+                std::set<CpuIndex> cpus;
+                for (USHORT i = 0; i < RequiredMaskCount; ++i)
+                {
+                    const size_t procGroupIndex = groupAffinities[i].Group;
+                    for (size_t j = 0; j < WIN_PROCESSOR_GROUP_SIZE; ++j)
+                    {
+                        if (groupAffinities[i].Mask & (KAFFINITY(1) << j))
+                            cpus.insert(procGroupIndex * WIN_PROCESSOR_GROUP_SIZE + j);  // index = group * group size + bit position
+                    }
+                }
+                affinity.newApi = std::move(cpus);
+            }
+        }
+    }
+    // NOTE: There is no way to determine full affinity using the old API if
+    //       individual threads set affinity on different processor groups.
+    DWORD_PTR proc, sys;
+    status = GetProcessAffinityMask(GetCurrentProcess(), &proc, &sys);
+    // If proc == 0 then we cannot determine affinity because it spans processor groups.
+    // On Windows 11 and Server 2022 it will instead
+    //     > If, however, hHandle specifies a handle to the current process, the function
+    //     > always uses the calling thread's primary group (which by default is the same
+    //     > as the process' primary group) in order to set the
+    //     > lpProcessAffinityMask and lpSystemAffinityMask.
+    // So it will never be indeterminate here. We can only make assumptions later.
+    if (status == 0 || proc == 0)
+    {
+        affinity.isOldDeterminate = false;
+        return affinity;
+    }
+    // If SetProcessAffinityMask was never called the affinity must span
+    // all processor groups, but if it was called it must only span one.
+    std::vector<USHORT> groupAffinity;  // We need to capture this later and capturing
+                                        // from structured bindings requires c++20.
+    std::tie(status, groupAffinity) = get_process_group_affinity();
+    if (status == 0)
+    {
+        affinity.isOldDeterminate = false;
+        return affinity;
+    }
+    if (groupAffinity.size() == 1)
+    {
+        // We detect the case when affinity is set to all processors and correctly
+        // leave affinity.oldApi as nullopt.
+        if (GetActiveProcessorGroupCount() != 1 || proc != sys)
+        {
+            std::set<CpuIndex> cpus;
+            const size_t procGroupIndex = groupAffinity[0];
+            const uint64_t mask = static_cast<uint64_t>(proc);
+            for (size_t j = 0; j < WIN_PROCESSOR_GROUP_SIZE; ++j)
+            {
+                if (mask & (KAFFINITY(1) << j))
+                    cpus.insert(procGroupIndex * WIN_PROCESSOR_GROUP_SIZE + j);
+            }
+            affinity.oldApi = std::move(cpus);
+        }
+    }
+    else
+    {
+        // If we got here it means that either SetProcessAffinityMask was never set
+        // or we're on Windows 11/Server 2022.
+        // Since Windows 11 and Windows Server 2022 the behaviour of
+        // GetProcessAffinityMask changed:
+        //     > If, however, hHandle specifies a handle to the current process,
+        //     > the function always uses the calling thread's primary group
+        //     > (which by default is the same as the process' primary group)
+        //     > in order to set the lpProcessAffinityMask and lpSystemAffinityMask.
+        // In which case we can actually retrieve the full affinity.
+        if (GetThreadSelectedCpuSetMasks_f != nullptr)
+        {
+            std::thread th([&]() {  // helper thread: the group affinity changes below apply to it, presumably to leave the caller's affinity untouched
+                std::set<CpuIndex> cpus;
+                bool               isAffinityFull = true;
+                for (auto procGroupIndex : groupAffinity)
+                {
+                    const int numActiveProcessors =
+                      GetActiveProcessorCount(static_cast<WORD>(procGroupIndex));
+                    // We have to schedule to two different processors
+                    // and & the affinities we get. Otherwise our processor
+                    // choice could influence the resulting affinity.
+                    // We assume the processor IDs within the group are
+                    // filled sequentially from 0.
+                    uint64_t procCombined = std::numeric_limits<uint64_t>::max();
+                    uint64_t sysCombined  = std::numeric_limits<uint64_t>::max();
+                    for (int i = 0; i < std::min(numActiveProcessors, 2); ++i)
+                    {
+                        GROUP_AFFINITY GroupAffinity;
+                        std::memset(&GroupAffinity, 0, sizeof(GROUP_AFFINITY));
+                        GroupAffinity.Group = static_cast<WORD>(procGroupIndex);
+                        GroupAffinity.Mask = static_cast<KAFFINITY>(1) << i;
+                        status =
+                          SetThreadGroupAffinity(GetCurrentThread(), &GroupAffinity, nullptr);
+                        if (status == 0)
+                        {
+                            affinity.isOldDeterminate = false;
+                            return;
+                        }
+                        SwitchToThread();
+                        DWORD_PTR proc2, sys2;
+                        status = GetProcessAffinityMask(GetCurrentProcess(), &proc2, &sys2);
+                        if (status == 0)
+                        {
+                            affinity.isOldDeterminate = false;
+                            return;
+                        }
+                        procCombined &= static_cast<uint64_t>(proc2);
+                        sysCombined &= static_cast<uint64_t>(sys2);
+                    }
+                    if (procCombined != sysCombined)
+                        isAffinityFull = false;
+                    for (size_t j = 0; j < WIN_PROCESSOR_GROUP_SIZE; ++j)
+                    {
+                        if (procCombined & (KAFFINITY(1) << j))
+                            cpus.insert(procGroupIndex * WIN_PROCESSOR_GROUP_SIZE + j);
+                    }
+                }
+                // We have to detect the case where the affinity was not set,
+                // or is set to all processors, so that we correctly produce a
+                // std::nullopt result.
+                if (!isAffinityFull)
+                {
+                    affinity.oldApi = std::move(cpus);
+                }
+            });
+            th.join();  // the lambda's writes to `affinity` and `status` are complete after this
+        }
+    }
+    return affinity;
+}
+// Detection trait used to emulate a check for Cache->GroupCount: it derives
+// from std::true_type only when the expression `t.Cache.GroupCount` is valid
+// for a T, and from std::false_type otherwise.
+template<typename T, typename Enable = void>
+struct HasGroupCount: std::false_type {};
+template<typename T>
+struct HasGroupCount<T, std::void_t<decltype(std::declval<T>().Cache.GroupCount)>>:
+  std::true_type {};
+// Overload for info types that expose Cache.GroupCount / Cache.GroupMasks.
+// Collects every CPU whose bit is set in one of the cache's group masks and
+// that is accepted by is_cpu_allowed.
+template<typename T, typename Pred, std::enable_if_t<HasGroupCount<T>::value, bool> = true>
+std::set<CpuIndex> readCacheMembers(const T* info, Pred&& is_cpu_allowed) {
+    std::set<CpuIndex> members;
+    // On Windows 10 this will read a 0 because GroupCount doesn't exist,
+    // hence at least one group is always processed.
+    const int numGroups = std::max(info->Cache.GroupCount, WORD(1));
+    for (WORD g = 0; g < numGroups; ++g)
+    {
+        const auto&    groupMask = info->Cache.GroupMasks[g];
+        const WORD     groupNumber = groupMask.Group;
+        for (BYTE bit = 0; bit < WIN_PROCESSOR_GROUP_SIZE; ++bit)
+        {
+            const CpuIndex c = static_cast<CpuIndex>(groupNumber) * WIN_PROCESSOR_GROUP_SIZE
+                             + static_cast<CpuIndex>(bit);
+            // The mask is tested first so the predicate only sees member CPUs.
+            if ((groupMask.Mask & (1ULL << bit)) && is_cpu_allowed(c))
+                members.insert(c);
+        }
+    }
+    return members;
+}
+// Overload for info types without Cache.GroupCount, which carry a single
+// Cache.GroupMask. Collects every CPU whose bit is set in that mask and that
+// is accepted by is_cpu_allowed.
+template<typename T, typename Pred, std::enable_if_t<!HasGroupCount<T>::value, bool> = true>
+std::set<CpuIndex> readCacheMembers(const T* info, Pred&& is_cpu_allowed) {
+    std::set<CpuIndex> members;
+    const auto&        groupMask   = info->Cache.GroupMask;
+    const WORD         groupNumber = groupMask.Group;
+    for (BYTE bit = 0; bit < WIN_PROCESSOR_GROUP_SIZE; ++bit)
+    {
+        const CpuIndex c = static_cast<CpuIndex>(groupNumber) * WIN_PROCESSOR_GROUP_SIZE
+                         + static_cast<CpuIndex>(bit);
+        // The mask is tested first so the predicate only sees member CPUs.
+        if ((groupMask.Mask & (1ULL << bit)) && is_cpu_allowed(c))
+            members.insert(c);
+    }
+    return members;
+}
+#endif
+#if defined(__linux__) && !defined(__ANDROID__)
+// Returns the set of CPU indices reported by sched_getaffinity for pid 0
+// (i.e. the caller). There is no soft-error fallback: the process is
+// terminated with EXIT_FAILURE if the CPU mask cannot be allocated or the
+// affinity cannot be queried.
+inline std::set<CpuIndex> get_process_affinity() {
+    // cpu_set_t by default holds 1024 entries. This may not be enough soon,
+    // but there is no easy way to determine how many threads there actually
+    // are. In this case we just choose a reasonable upper bound.
+    static constexpr CpuIndex MaxNumCpus = 1024 * 64;
+    cpu_set_t* mask = CPU_ALLOC(MaxNumCpus);
+    if (mask == nullptr)
+        std::exit(EXIT_FAILURE);
+    const size_t masksize = CPU_ALLOC_SIZE(MaxNumCpus);
+    CPU_ZERO_S(masksize, mask);
+    const int status = sched_getaffinity(0, masksize, mask);
+    if (status != 0)
+    {
+        CPU_FREE(mask);
+        std::exit(EXIT_FAILURE);
+    }
+    // Translate the bit mask into an ordered set of CPU indices.
+    std::set<CpuIndex> cpus;
+    for (CpuIndex c = 0; c < MaxNumCpus; ++c)
+        if (CPU_ISSET_S(c, masksize, mask))
+            cpus.insert(c);
+    CPU_FREE(mask);
+    return cpus;
+}
+#endif
+#if defined(__linux__) && !defined(__ANDROID__)
+inline static const auto STARTUP_PROCESSOR_AFFINITY = get_process_affinity();  // Linux: std::set<CpuIndex>, computed once by the initializer
+#elif defined(_WIN64)
+inline static const auto STARTUP_PROCESSOR_AFFINITY = get_process_affinity();  // Windows: WindowsAffinity, computed once by the initializer
+inline static const auto STARTUP_USE_OLD_AFFINITY_API =
+  STARTUP_PROCESSOR_AFFINITY.likely_used_old_api();  // heuristic result, see likely_used_old_api()
+#endif  // other platforms define neither constant here
+// Opaque wrapper around a NUMA node index. It exists so that users of NUMA
+// replicated memory can carry "which replica to use" around without having
+// to know anything about how the replication machinery works.
+class NumaReplicatedAccessToken {
+   public:
+    // The default token refers to node 0.
+    NumaReplicatedAccessToken() :
+        n{0} {}
+    // Creates a token referring to node `idx`.
+    explicit NumaReplicatedAccessToken(NumaIndex idx) :
+        n{idx} {}
+    // Returns the node index this token was created with.
+    NumaIndex get_numa_index() const { return n; }
+   private:
+    NumaIndex n;
+};
+struct L3Domain {  // plain aggregate describing one L3 domain
+    NumaIndex          systemNumaIndex{};  // NOTE(review): presumably the system NUMA node the domain belongs to — confirm at the use sites
+    std::set<CpuIndex> cpus{};  // CPU indices of the domain; empty by default
+};
+// Policy tag: use the system NUMA nodes.
+struct SystemNumaPolicy {};
+// Policy tag: use the system-reported L3 domains.
+struct L3DomainsPolicy {};
+// Policy: group system-reported L3 domains until they reach bundleSize.
+struct BundledL3Policy {
+    size_t bundleSize;  // NOTE(review): unit (presumably CPUs per bundle) is not visible here — confirm
+};
+using NumaAutoPolicy = std::variant<SystemNumaPolicy, L3DomainsPolicy, BundledL3Policy>;  // holds exactly one of the policies above
+// Designed as immutable, because there is no good reason to alter an already
+// existing config in a way that doesn't require recreating it completely, and
+// it would be complex and expensive to maintain class invariants.
+// The CPU (processor) numbers always correspond to the actual numbering used
+// by the system. The NUMA node numbers MAY NOT correspond to the system's
+// numbering of the NUMA nodes. In particular, by default, if the processor has
+// non-uniform cache access within a NUMA node (i.e., a non-unified L3 cache structure),
+// then L3 domains within a system NUMA node will be used to subdivide it
+// into multiple logical NUMA nodes in the config. Additionally, empty nodes may
+// be removed, or the user may create custom nodes.
+//
+// As a special case, when performing system-wide replication of read-only data
+// (i.e., LazyNumaReplicatedSystemWide), the system NUMA node is used, rather than
+// custom or L3-aware nodes. See that class's get_discriminator() function.
+//
+// It is guaranteed that NUMA nodes are NOT empty: every node exposed by NumaConfig
+// has at least one processor assigned.
+//
+// We use startup affinities so as not to modify its own behaviour in time.
+//
+// Since Stockfish doesn't support exceptions all places where an exception
+// should be thrown are replaced by std::exit.
+class NumaConfig {
+   public:
+    // Default configuration: not a custom affinity, with the CPU range
+    // [0, SYSTEM_THREADS_NB - 1] handed to add_cpu_range_to_node for node 0.
+    NumaConfig() :
+        highestCpuIndex(0),
+        customAffinity(false) {
+        add_cpu_range_to_node(NumaIndex{0}, CpuIndex{0}, SYSTEM_THREADS_NB - 1);
+    }
+    // Builds a NumaConfig from what the system reports. Unless `policy` is SystemNumaPolicy an L3-aware config is tried first, falling back to the system NUMA nodes.
+    // When `respectProcessAffinity` is false the result is marked as custom affinity. The available policies are documented above.
+    static NumaConfig from_system([[maybe_unused]] const NumaAutoPolicy& policy,
+                                  bool respectProcessAffinity = true) {
+        NumaConfig cfg = empty();
+#if !((defined(__linux__) && !defined(__ANDROID__)) || defined(_WIN64))
+        // Fallback for unsupported systems.
+        for (CpuIndex c = 0; c < SYSTEM_THREADS_NB; ++c)
+            cfg.add_cpu_to_node(NumaIndex{0}, c);
+#else
+    #if defined(_WIN64)
+        std::optional<std::set<CpuIndex>> allowedCpus;
+        if (respectProcessAffinity)
+            allowedCpus = STARTUP_PROCESSOR_AFFINITY.get_combined();
+        // The affinity cannot be determined in all cases on Windows,
+        // but we at least guarantee that the number of allowed processors
+        // is >= number of processors in the affinity mask. In case the user
+        // is not satisfied they must set the processor numbers explicitly.
+        auto is_cpu_allowed = [&allowedCpus](CpuIndex c) {
+            return !allowedCpus.has_value() || allowedCpus->count(c) == 1;  // nullopt allows every CPU
+        };
+    #elif defined(__linux__) && !defined(__ANDROID__)
+        std::set<CpuIndex> allowedCpus;
+        if (respectProcessAffinity)
+            allowedCpus = STARTUP_PROCESSOR_AFFINITY;
+        auto is_cpu_allowed = [respectProcessAffinity, &allowedCpus](CpuIndex c) {
+            return !respectProcessAffinity || allowedCpus.count(c) == 1;
+        };
+    #endif
+        bool l3Success = false;
+        if (!std::holds_alternative<SystemNumaPolicy>(policy))
+        {
+            size_t l3BundleSize = 0;  // stays 0 unless the policy is BundledL3Policy
+            if (const auto* v = std::get_if<BundledL3Policy>(&policy))
+            {
+                l3BundleSize = v->bundleSize;
+            }
+            if (auto l3Cfg =
+                  try_get_l3_aware_config(respectProcessAffinity, l3BundleSize, is_cpu_allowed))
+            {
+                cfg       = std::move(*l3Cfg);
+                l3Success = true;
+            }
+        }
+        if (!l3Success)
+            cfg = from_system_numa(respectProcessAffinity, is_cpu_allowed);
+    #if defined(_WIN64)
+        // Split the NUMA nodes to be contained within a group if necessary.
+        // This is needed between Windows 10 Build 20348 and Windows 11, because
+        // the new NUMA allocation behaviour was introduced while there was
+        // still no way to set thread affinity spanning multiple processor groups.
+        // See https://learn.microsoft.com/en-us/windows/win32/procthread/numa-support
+        // We also do this if we need to force the old API for some reason.
+        //
+        // 2024-08-26: It appears that we need to actually always force this behaviour.
+        // While Windows allows this to work now, such assignments have bad interaction
+        // with the scheduler - in particular it still prefers scheduling on the thread's
+        // "primary" node, even if it means scheduling SMT processors first.
+        // See https://github.com/official-stockfish/Stockfish/issues/5551
+        // See https://learn.microsoft.com/en-us/windows/win32/procthread/processor-groups
+        //
+        //     Each process is assigned a primary group at creation, and by default all
+        //     of its threads' primary group is the same. Each thread's ideal processor
+        //     is in the thread's primary group, so threads will preferentially be
+        //     scheduled to processors on their primary group, but they are able to
+        //     be scheduled to processors on any other group.
+        //
+        // used to be guarded by if (STARTUP_USE_OLD_AFFINITY_API)
+        {
+            NumaConfig splitCfg = empty();
+            NumaIndex splitNodeIndex = 0;
+            for (const auto& cpus : cfg.nodes)
+            {
+                if (cpus.empty())
+                    continue;
+                size_t lastProcGroupIndex = *(cpus.begin()) / WIN_PROCESSOR_GROUP_SIZE;
+                for (CpuIndex c : cpus)
+                {
+                    const size_t procGroupIndex = c / WIN_PROCESSOR_GROUP_SIZE;
+                    if (procGroupIndex != lastProcGroupIndex)  // crossed into another processor group: start a new node
+                    {
+                        splitNodeIndex += 1;
+                        lastProcGroupIndex = procGroupIndex;
+                    }
+                    splitCfg.add_cpu_to_node(splitNodeIndex, c);
+                }
+                splitNodeIndex += 1;  // the next source node never shares a split node with this one
+            }
+            cfg = std::move(splitCfg);
+        }
+    #endif
+#endif
+        // We have to ensure no empty NUMA nodes persist.
+        cfg.remove_empty_numa_nodes();
+        // If the user explicitly opts out from respecting the current process affinity
+        // then it may be inconsistent with the current affinity (obviously), so we
+        // consider it custom.
+        if (!respectProcessAffinity)
+            cfg.customAffinity = true;
+        return cfg;
+    }
+    // Parses a textual NUMA layout into a config marked as custom affinity.
+    //   - nodes are separated by ':'
+    //   - cpu indices within a node are separated by ','
+    //   - "first-last" denotes a range of cpu indices
+    // For example "0-15,128-143:16-31,144-159:32-47,160-175:48-63,176-191"
+    // Node strings yielding no indices are skipped without consuming a node
+    // index. The process exits if a cpu cannot be added to its node.
+    static NumaConfig from_string(const std::string& s) {
+        NumaConfig cfg      = empty();
+        NumaIndex  nextNode = 0;
+        for (auto&& nodeStr : split(s, ":"))
+        {
+            auto cpuIndices = indices_from_shortened_string(std::string(nodeStr));
+            if (cpuIndices.empty())
+                continue;
+            for (auto idx : cpuIndices)
+                if (!cfg.add_cpu_to_node(nextNode, CpuIndex(idx)))
+                    std::exit(EXIT_FAILURE);
+            ++nextNode;
+        }
+        cfg.customAffinity = true;
+        return cfg;
+    }
+    NumaConfig(const NumaConfig&)            = delete;
+    NumaConfig(NumaConfig&&)                 = default;
+    NumaConfig& operator=(const NumaConfig&) = delete;
+    NumaConfig& operator=(NumaConfig&&)      = default;
+    // True when cpu `n` has an entry in nodeByCpu.
+    bool is_cpu_assigned(CpuIndex n) const { return nodeByCpu.find(n) != nodeByCpu.end(); }
+    // Number of nodes in this config.
+    NumaIndex num_numa_nodes() const { return nodes.size(); }
+    // Number of cpus in node `n`; `n` must be a valid node index.
+    CpuIndex num_cpus_in_numa_node(NumaIndex n) const {
+        assert(n < nodes.size());
+        const auto& cpus = nodes[n];
+        return cpus.size();
+    }
+    // Total number of cpus recorded in nodeByCpu.
+    CpuIndex num_cpus() const { return nodeByCpu.size(); }
+    // Replication is required with more than one node or a custom affinity.
+    bool requires_memory_replication() const { return nodes.size() > 1 || customAffinity; }
+    // Serializes the config: nodes are joined with ':', and within a node
+    // each run of consecutive cpu indices is written as "first-last" (or as a
+    // single number for a run of length one), with runs joined by ','.
+    std::string to_string() const {
+        std::string out;
+        for (size_t nodeIdx = 0; nodeIdx < nodes.size(); ++nodeIdx)
+        {
+            if (nodeIdx != 0)
+                out += ":";
+            const auto& cpus     = nodes[nodeIdx];
+            auto        runBegin = cpus.begin();
+            while (runBegin != cpus.end())
+            {
+                // Advance runLast for as long as the next index is consecutive.
+                auto runLast = runBegin;
+                for (auto next = std::next(runLast);
+                     next != cpus.end() && *next == *runLast + 1; ++next)
+                    runLast = next;
+                if (runBegin != cpus.begin())
+                    out += ",";
+                out += std::to_string(*runBegin);
+                if (runLast != runBegin)
+                {
+                    out += "-";
+                    out += std::to_string(*runLast);
+                }
+                runBegin = std::next(runLast);
+            }
+        }
+        return out;
+    }
+    // Advises whether `numThreads` threads should be distributed among the
+    // nodes and bound to them.
+    bool suggests_binding_threads(CpuIndex numThreads) const {
+        // Unbound threads can only use NUMA replicated objects from the first
+        // node, so whenever the OS is likely to schedule outside of it we lose
+        // performance and recommend binding. Binding is also recommended when
+        // there are enough threads to spread over the nodes with minimal
+        // disparity; small nodes (in particular empty ones) are ignored for
+        // that estimate.
+        // A user-provided affinity may differ from the one given by the OS, in
+        // which case binding is the only way to run on the correct processors.
+        if (customAffinity)
+            return true;
+        // A single thread cannot be distributed, so it is never bound.
+        if (numThreads <= 1)
+            return false;
+        size_t largestNodeSize = 0;
+        for (const auto& cpus : nodes)
+            largestNodeSize = std::max<size_t>(largestNodeSize, cpus.size());
+        static constexpr double SmallNodeThreshold = 0.6;
+        size_t                  numNotSmallNodes   = 0;
+        for (const auto& cpus : nodes)
+        {
+            const double relativeSize =
+              static_cast<double>(cpus.size()) / static_cast<double>(largestNodeSize);
+            const bool isSmall = relativeSize <= SmallNodeThreshold;
+            if (!isSmall)
+                ++numNotSmallNodes;
+        }
+        const bool overflowsLargestNode = numThreads > largestNodeSize / 2;
+        const bool enoughToDistribute   = numThreads >= numNotSmallNodes * 4;
+        return nodes.size() > 1 && (overflowsLargestNode || enoughToDistribute);
+    }
+    // Returns, for each of `numThreads` threads in order, the index of the
+    // node it should go to. Each thread is given to the node that would have
+    // the lowest fill ratio (threads / cpus) after receiving it; ties go to
+    // the lowest node index.
+    std::vector<NumaIndex> distribute_threads_among_numa_nodes(CpuIndex numThreads) const {
+        std::vector<NumaIndex> assignment;
+        // Special case for when there's no NUMA nodes. This doesn't buy us
+        // much, but it keeps the general path below simple.
+        if (nodes.size() == 1)
+        {
+            assignment.assign(numThreads, NumaIndex{0});
+            return assignment;
+        }
+        std::vector<size_t> occupation(nodes.size(), 0);
+        for (CpuIndex t = 0; t < numThreads; ++t)
+        {
+            NumaIndex bestNode{0};
+            float     bestNodeFill = std::numeric_limits<float>::max();
+            // NOTE: Filling the first available node up to 50% before looking
+            //       at the others was considered, but it would interfere with
+            //       running multiple instances. No node should be favored.
+            for (NumaIndex n = 0; n < nodes.size(); ++n)
+            {
+                const float fillIfChosen =
+                  static_cast<float>(occupation[n] + 1) / static_cast<float>(nodes[n].size());
+                if (fillIfChosen < bestNodeFill)
+                {
+                    bestNodeFill = fillIfChosen;
+                    bestNode     = n;
+                }
+            }
+            assignment.push_back(bestNode);
+            ++occupation[bestNode];
+        }
+        return assignment;
+    }
+    NumaReplicatedAccessToken bind_current_thread_to_numa_node(NumaIndex n) const {  // Binds the calling thread to the cpus of node n and returns a token for n; exits the process on failure
+        if (n >= nodes.size() || nodes[n].size() == 0)  // unknown or empty node
+            std::exit(EXIT_FAILURE);
+#if defined(__linux__) && !defined(__ANDROID__)
+        cpu_set_t* mask = CPU_ALLOC(highestCpuIndex + 1);  // sized to hold every cpu index recorded in this config
+        if (mask == nullptr)
+            std::exit(EXIT_FAILURE);
+        const size_t masksize = CPU_ALLOC_SIZE(highestCpuIndex + 1);
+        CPU_ZERO_S(masksize, mask);
+        for (CpuIndex c : nodes[n])
+            CPU_SET_S(c, masksize, mask);
+        const int status = sched_setaffinity(0, masksize, mask);
+        CPU_FREE(mask);
+        if (status != 0)
+            std::exit(EXIT_FAILURE);
+        // We yield this thread just to be sure it gets rescheduled.
+        // This is defensive, allowed because this code is not performance critical.
+        sched_yield();
+#elif defined(_WIN64)
+        // Requires Windows 11. No good way to set thread affinity spanning
+        // processor groups before that.
+        HMODULE k32                            = GetModuleHandle(TEXT("Kernel32.dll"));
+        auto    SetThreadSelectedCpuSetMasks_f = SetThreadSelectedCpuSetMasks_t(
+          (void (*)()) GetProcAddress(k32, "SetThreadSelectedCpuSetMasks"));  // looked up at run time; checked against nullptr below
+        // We ALWAYS set affinity with the new API if available, because
+        // there are no downsides, and we forcibly keep it consistent with
+        // the old API should we need to use it. I.e. we always keep this
+        // as a superset of what we set with SetThreadGroupAffinity.
+        if (SetThreadSelectedCpuSetMasks_f != nullptr)
+        {
+            // Only available on Windows 11 and Windows Server 2022 onwards
+            const USHORT numProcGroups = USHORT(
+              ((highestCpuIndex + 1) + WIN_PROCESSOR_GROUP_SIZE - 1) / WIN_PROCESSOR_GROUP_SIZE);  // ceil((highestCpuIndex + 1) / group size)
+            auto groupAffinities = std::make_unique<GROUP_AFFINITY[]>(numProcGroups);
+            std::memset(groupAffinities.get(), 0, sizeof(GROUP_AFFINITY) * numProcGroups);
+            for (WORD i = 0; i < numProcGroups; ++i)
+                groupAffinities[i].Group = i;
+            for (CpuIndex c : nodes[n])
+            {
+                const size_t procGroupIndex     = c / WIN_PROCESSOR_GROUP_SIZE;
+                const size_t idxWithinProcGroup = c % WIN_PROCESSOR_GROUP_SIZE;
+                groupAffinities[procGroupIndex].Mask |= KAFFINITY(1) << idxWithinProcGroup;
+            }
+            HANDLE hThread = GetCurrentThread();
+            const BOOL status =
+              SetThreadSelectedCpuSetMasks_f(hThread, groupAffinities.get(), numProcGroups);
+            if (status == 0)
+                std::exit(EXIT_FAILURE);
+            // We yield this thread just to be sure it gets rescheduled.
+            // This is defensive, allowed because this code is not performance critical.
+            SwitchToThread();
+        }
+        // Sometimes we need to force the old API, but do not use it unless necessary.
+        if (SetThreadSelectedCpuSetMasks_f == nullptr || STARTUP_USE_OLD_AFFINITY_API)
+        {
+            // On earlier windows version (since windows 7) we cannot run a single thread
+            // on multiple processor groups, so we need to restrict the group.
+            // We assume the group of the first processor listed for this node.
+            // Processors from outside this group will not be assigned for this thread.
+            // Normally this won't be an issue because windows used to assign NUMA nodes
+            // such that they cannot span processor groups. However, since Windows 10
+            // Build 20348 the behaviour changed, so there's a small window of versions
+            // between this and Windows 11 that might exhibit problems with not all
+            // processors being utilized.
+            //
+            // We handle this in NumaConfig::from_system by manually splitting the
+            // nodes so that no node spans more than one processor group.
+            // This is required because otherwise our thread distribution
+            // code may produce suboptimal results.
+            //
+            // See https://learn.microsoft.com/en-us/windows/win32/procthread/numa-support
+            GROUP_AFFINITY affinity;
+            std::memset(&affinity, 0, sizeof(GROUP_AFFINITY));
+            // We use an ordered set to be sure to get the smallest cpu number here.
+            const size_t forcedProcGroupIndex = *(nodes[n].begin()) / WIN_PROCESSOR_GROUP_SIZE;
+            affinity.Group                    = static_cast<WORD>(forcedProcGroupIndex);
+            for (CpuIndex c : nodes[n])
+            {
+                const size_t procGroupIndex     = c / WIN_PROCESSOR_GROUP_SIZE;
+                const size_t idxWithinProcGroup = c % WIN_PROCESSOR_GROUP_SIZE;
+                // We skip processors that are not in the same processor group.
+                // If everything was set up correctly this will never be an issue,
+                // but we have to account for bad NUMA node specification.
+                if (procGroupIndex != forcedProcGroupIndex)
+                    continue;
+                affinity.Mask |= KAFFINITY(1) << idxWithinProcGroup;
+            }
+            HANDLE hThread = GetCurrentThread();
+            const BOOL status = SetThreadGroupAffinity(hThread, &affinity, nullptr);
+            if (status == 0)
+                std::exit(EXIT_FAILURE);
+            // We yield this thread just to be sure it gets rescheduled. This is
+            // defensive, allowed because this code is not performance critical.
+            SwitchToThread();
+        }
+#endif
+        return NumaReplicatedAccessToken(n);  // on platforms matching neither branch above, no binding is performed
+    }
+    // Runs f synchronously on a temporary thread that is first bound to NUMA
+    // node n; this call returns only after that thread has been joined.
+    template<typename FuncT>
+    void execute_on_numa_node(NumaIndex n, FuncT&& f) const {
+        auto boundTask = [this, &f, n]() {
+            bind_current_thread_to_numa_node(n);
+            std::forward<FuncT>(f)();
+        };
+        std::thread worker(std::move(boundTask));
+        worker.join();
+    }
+    std::vector<std::set<CpuIndex>> nodes;  // nodes[n] is the ordered set of cpus assigned to node n
+    std::map<CpuIndex, NumaIndex>   nodeByCpu;  // for every assigned cpu, the node index it was added to
+   private:
+    CpuIndex highestCpuIndex;  // largest cpu index added so far; 0 for an empty config
+    bool customAffinity;  // false for an empty config; NOTE(review): meaning is established outside this view - confirm
+    static NumaConfig empty() { return NumaConfig(EmptyNodeTag{}); }  // a config without any nodes or cpus
+    struct EmptyNodeTag {};  // tag type that selects the private constructor below
+    NumaConfig(EmptyNodeTag) :
+        highestCpuIndex(0),
+        customAffinity(false) {}
+    // Drops every node whose cpu set is empty. The remaining nodes keep their
+    // relative order, so their indices shift down accordingly.
+    // NOTE(review): nodeByCpu is not re-indexed here - confirm that no caller
+    // relies on it matching the compacted node indices afterwards.
+    void remove_empty_numa_nodes() {
+        std::vector<std::set<CpuIndex>> kept;
+        for (auto& cpuSet : nodes)
+        {
+            if (cpuSet.empty())
+                continue;
+            kept.emplace_back(std::move(cpuSet));
+        }
+        nodes.swap(kept);
+    }
+    // Assigns cpu c to node n, creating any missing nodes up to index n.
+    // Returns false when c is already assigned to some node; in that case the
+    // structure is left unmodified (strong guarantee). Returns true otherwise.
+    bool add_cpu_to_node(NumaIndex n, CpuIndex c) {
+        const bool alreadyAssigned = is_cpu_assigned(c);
+        if (alreadyAssigned)
+            return false;
+        // Grow the node list until index n is valid.
+        while (!(n < nodes.size()))
+            nodes.emplace_back();
+        auto& target = nodes[n];
+        target.insert(c);
+        nodeByCpu[c] = n;
+        if (highestCpuIndex < c)
+            highestCpuIndex = c;
+        return true;
+    }
+    // Assigns every cpu of the inclusive range [cfirst, clast] to node n.
+    // Returns false when any cpu of the range is already assigned; in that case
+    // the structure is left unmodified (strong guarantee). Returns true otherwise.
+    bool add_cpu_range_to_node(NumaIndex n, CpuIndex cfirst, CpuIndex clast) {
+        // First pass: validate the whole range before touching anything.
+        CpuIndex probe = cfirst;
+        while (probe <= clast)
+        {
+            if (is_cpu_assigned(probe))
+                return false;
+            ++probe;
+        }
+        // Grow the node list until index n is valid.
+        while (!(n < nodes.size()))
+            nodes.emplace_back();
+        // Second pass: record the assignment in both lookup structures.
+        for (CpuIndex cpu = cfirst; cpu <= clast; ++cpu)
+        {
+            nodes[n].insert(cpu);
+            nodeByCpu[cpu] = n;
+        }
+        if (highestCpuIndex < clast)
+            highestCpuIndex = clast;
+        return true;
+    }
+    // Expands a comma separated list whose items are either a single index or
+    // an inclusive "first-last" range into the flat list of indices.
+    // Empty items are skipped; items that split on '-' into anything other than
+    // one or two parts contribute nothing.
+    // NOTE(review): parsing relies on split() and str_to_size_t(), which are
+    // defined elsewhere - their handling of malformed input is not visible here.
+    static std::vector<size_t> indices_from_shortened_string(const std::string& s) {
+        std::vector<size_t> result;
+        if (s.empty())
+            return result;
+        for (const auto& item : split(s, ","))
+        {
+            if (item.empty())
+                continue;
+            auto       bounds     = split(item, "-");
+            const auto boundCount = bounds.size();
+            if (boundCount != 1 && boundCount != 2)
+                continue;
+            const CpuIndex lower = CpuIndex{str_to_size_t(std::string(bounds[0]))};
+            if (boundCount == 1)
+            {
+                result.emplace_back(lower);
+                continue;
+            }
+            const CpuIndex upper = CpuIndex{str_to_size_t(std::string(bounds[1]))};
+            for (size_t idx = lower; idx <= upper; ++idx)
+                result.emplace_back(idx);
+        }
+        return result;
+    }
+    // Queries the system for the mapping of processors to NUMA nodes and returns it as a
+    // NumaConfig that contains only the cpus accepted by is_cpu_allowed. On Linux we read
+    // from standardized kernel sysfs, with a fallback to a single NUMA node. On Windows we utilize
+    // GetNumaProcessorNodeEx, which has its quirks, see comment for Windows implementation of get_process_affinity.
+    template<typename Pred>
+    static NumaConfig from_system_numa([[maybe_unused]] bool   respectProcessAffinity,  // not referenced in this body
+                                       [[maybe_unused]] Pred&& is_cpu_allowed) {
+        NumaConfig cfg = empty();  // filled in by the platform specific branch below
+#if defined(__linux__) && !defined(__ANDROID__)
+        // On Linux things are straightforward, since there's no processor groups and
+        // any thread can be scheduled on all processors.
+        // We try to gather this information from the sysfs first
+        // https://www.kernel.org/doc/Documentation/ABI/stable/sysfs-devices-node
+        bool useFallback = false;
+        auto fallback    = [&]() {  // discards whatever was gathered so far and requests the fallback below
+            useFallback = true;
+            cfg         = empty();
+        };
+        // /sys/devices/system/node/online contains information about active NUMA nodes
+        auto nodeIdsStr = read_file_to_string("/sys/devices/system/node/online");
+        if (!nodeIdsStr.has_value() || nodeIdsStr->empty())
+        {
+            fallback();
+        }
+        else
+        {
+            remove_whitespace(*nodeIdsStr);
+            for (size_t n : indices_from_shortened_string(*nodeIdsStr))  // n: id of a node listed as online
+            {
+                // /sys/devices/system/node/node.../cpulist
+                std::string path =
+                  std::string("/sys/devices/system/node/node") + std::to_string(n) + "/cpulist";
+                auto cpuIdsStr = read_file_to_string(path);
+                // Now, we only bail if the file does not exist. Some nodes may be
+                // empty, that's fine. An empty node still has a file that appears
+                // to have some whitespace, so we need to handle that.
+                if (!cpuIdsStr.has_value())
+                {
+                    fallback();
+                    break;
+                }
+                else
+                {
+                    remove_whitespace(*cpuIdsStr);
+                    for (size_t c : indices_from_shortened_string(*cpuIdsStr))
+                    {
+                        if (is_cpu_allowed(c))
+                            cfg.add_cpu_to_node(n, c);  // result ignored: an already assigned cpu is simply not added again
+                    }
+                }
+            }
+        }
+        if (useFallback)
+        {
+            for (CpuIndex c = 0; c < SYSTEM_THREADS_NB; ++c)  // single node 0 holding every allowed cpu below SYSTEM_THREADS_NB
+                if (is_cpu_allowed(c))
+                    cfg.add_cpu_to_node(NumaIndex{0}, c);
+        }
+#elif defined(_WIN64)
+        WORD numProcGroups = GetActiveProcessorGroupCount();
+        for (WORD procGroup = 0; procGroup < numProcGroups; ++procGroup)
+        {
+            for (BYTE number = 0; number < WIN_PROCESSOR_GROUP_SIZE; ++number)  // probe every possible slot of the group
+            {
+                PROCESSOR_NUMBER procnum;
+                procnum.Group    = procGroup;
+                procnum.Number   = number;
+                procnum.Reserved = 0;
+                USHORT nodeNumber;
+                const BOOL     status = GetNumaProcessorNodeEx(&procnum, &nodeNumber);
+                const CpuIndex c      = static_cast<CpuIndex>(procGroup) * WIN_PROCESSOR_GROUP_SIZE  // flat index: group * group size + number
+                                 + static_cast<CpuIndex>(number);
+                if (status != 0 && nodeNumber != std::numeric_limits<USHORT>::max()  // skip failed queries and the maximum-USHORT node number
+                    && is_cpu_allowed(c))
+                {
+                    cfg.add_cpu_to_node(nodeNumber, c);
+                }
+            }
+        }
+#else
+        abort();  // should not reach here
+#endif
+        return cfg;
+    }
+    template<typename Pred>  // Builds a config from groups of cpus sharing an L3 cache; std::nullopt when no such group was found
+    static std::optional<NumaConfig> try_get_l3_aware_config(
+      bool respectProcessAffinity, size_t bundleSize, [[maybe_unused]] Pred&& is_cpu_allowed) {
+        // Get the normal system configuration so we know to which NUMA node
+        // each L3 domain belongs (it is looked up per cpu through nodeByCpu below).
+        NumaConfig systemConfig =
+          NumaConfig::from_system(SystemNumaPolicy{}, respectProcessAffinity);
+        std::vector<L3Domain> l3Domains;  // stays empty on platforms covered by neither branch below
+#if defined(__linux__) && !defined(__ANDROID__)
+        std::set<CpuIndex> seenCpus;  // every cpu already listed in some sibling list, allowed or not
+        auto               nextUnseenCpu = [&seenCpus]() {  // smallest cpu index that is not yet in seenCpus
+            for (CpuIndex i = 0;; ++i)
+                if (!seenCpus.count(i))
+                    return i;
+        };
+        while (true)
+        {
+            CpuIndex next = nextUnseenCpu();
+            auto     siblingsStr =
+              read_file_to_string("/sys/devices/system/cpu/cpu" + std::to_string(next)
+                                  + "/cache/index3/shared_cpu_list");
+            if (!siblingsStr.has_value() || siblingsStr->empty())
+            {
+                break;  // we have read all available CPUs
+            }
+            L3Domain domain;
+            for (size_t c : indices_from_shortened_string(*siblingsStr))
+            {
+                if (is_cpu_allowed(c))
+                {
+                    domain.systemNumaIndex = systemConfig.nodeByCpu.at(c);  // std::map::at throws if c is missing from systemConfig
+                    domain.cpus.insert(c);
+                }
+                seenCpus.insert(c);  // marked even when not allowed, so the scan always advances
+            }
+            if (!domain.cpus.empty())
+            {
+                l3Domains.emplace_back(std::move(domain));
+            }
+        }
+#elif defined(_WIN64)
+        DWORD bufSize = 0;
+        GetLogicalProcessorInformationEx(RelationCache, nullptr, &bufSize);  // first call only obtains the required buffer size
+        if (GetLastError() != ERROR_INSUFFICIENT_BUFFER)
+            return std::nullopt;
+        std::vector<char> buffer(bufSize);
+        auto info = reinterpret_cast<PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX>(buffer.data());
+        if (!GetLogicalProcessorInformationEx(RelationCache, info, &bufSize))
+            return std::nullopt;
+        while (reinterpret_cast<char*>(info) < buffer.data() + bufSize)
+        {
+            info = std::launder(info);
+            if (info->Relationship == RelationCache && info->Cache.Level == 3)
+            {
+                L3Domain domain{};
+                domain.cpus = readCacheMembers(info, is_cpu_allowed);  // helper defined elsewhere; presumably the allowed cpus of this cache - confirm
+                if (!domain.cpus.empty())
+                {
+                    domain.systemNumaIndex = systemConfig.nodeByCpu.at(*domain.cpus.begin());  // node of the domain's smallest cpu
+                    l3Domains.push_back(std::move(domain));
+                }
+            }
+            // Variable length data structure, advance to next record using its own Size field
+            info = reinterpret_cast<PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX>(
+              reinterpret_cast<char*>(info) + info->Size);
+        }
+#endif
+        if (!l3Domains.empty())
+            return {NumaConfig::from_l3_info(std::move(l3Domains), bundleSize)};
+        return std::nullopt;
+    }
+    // Builds a NumaConfig in which every node is one L3 domain or a bundle of
+    // merged L3 domains. Domains are only merged with neighbours that belong to
+    // the same system NUMA node, and only while the combined cpu count does not
+    // exceed bundleSize. Resulting nodes are numbered consecutively from 0.
+    static NumaConfig from_l3_info(std::vector<L3Domain>&& domains, size_t bundleSize) {
+        assert(!domains.empty());
+        // Group the domains by the system NUMA node they were found on.
+        std::map<NumaIndex, std::vector<L3Domain>> domainsBySystemNode;
+        for (auto& domain : domains)
+            domainsBySystemNode[domain.systemNumaIndex].emplace_back(std::move(domain));
+        NumaConfig result    = empty();
+        NumaIndex  nextIndex = 0;
+        for (auto& [_, group] : domainsBySystemNode)
+        {
+            // Repeatedly sweep over adjacent pairs and merge those that fit.
+            // Every successful sweep shrinks the group, so this terminates.
+            for (bool mergedAny = true; mergedAny;)
+            {
+                mergedAny = false;
+                for (size_t i = 0; i + 1 < group.size(); ++i)
+                {
+                    if (group[i].cpus.size() + group[i + 1].cpus.size() > bundleSize)
+                        continue;
+                    group[i].cpus.merge(group[i + 1].cpus);
+                    group.erase(group.begin() + i + 1);
+                    mergedAny = true;
+                }
+            }
+            // Each surviving domain becomes its own node in the new config.
+            for (const L3Domain& domain : group)
+            {
+                for (CpuIndex cpu : domain.cpus)
+                    result.add_cpu_to_node(nextIndex, cpu);
+                ++nextIndex;
+            }
+        }
+        return result;
+    }
+};
+class NumaReplicationContext;
+// Base class of the NUMA replicated containers. Instances of this class are tracked by a NumaReplicationContext
+// instance, which informs all tracked instances (via on_numa_config_changed) when its NUMA configuration changes.
+class NumaReplicatedBase {
+   public:
+    NumaReplicatedBase(NumaReplicationContext& ctx);  // attaches the new object to ctx
+    NumaReplicatedBase(const NumaReplicatedBase&) = delete;
+    NumaReplicatedBase(NumaReplicatedBase&& other) noexcept;  // takes over other's registration with its context
+    NumaReplicatedBase& operator=(const NumaReplicatedBase&) = delete;
+    NumaReplicatedBase& operator=(NumaReplicatedBase&& other) noexcept;
+    virtual void on_numa_config_changed() = 0;  // called by the context from set_numa_config
+    virtual ~NumaReplicatedBase();  // detaches from the context if one is still set
+    const NumaConfig& get_numa_config() const;  // the configuration currently held by the context
+   private:
+    NumaReplicationContext* context;  // set to nullptr in an object that has been moved from
+};
+// Holds one copy of a T per NUMA node of the current configuration. Each copy
+// is created by a thread that has been bound to the corresponding node.
+// We force boxing with a unique_ptr. If this becomes an issue due to added
+// indirection we may need to add an option for a custom boxing type. When the
+// NUMA config changes the value stored at the index 0 is replicated to other nodes.
+template<typename T>
+class NumaReplicated: public NumaReplicatedBase {
+   public:
+    using ReplicatorFuncType = std::function<T(const T&)>;
+    // Registers with ctx and replicates a value-initialized T.
+    NumaReplicated(NumaReplicationContext& ctx) :
+        NumaReplicatedBase(ctx) {
+        replicate_from(T{});
+    }
+    // Registers with ctx and replicates source.
+    NumaReplicated(NumaReplicationContext& ctx, T&& source) :
+        NumaReplicatedBase(ctx) {
+        replicate_from(std::move(source));
+    }
+    NumaReplicated(const NumaReplicated&) = delete;
+    // Takes over both the context registration and the instances of other.
+    NumaReplicated(NumaReplicated&& other) noexcept :
+        NumaReplicatedBase(std::move(other)),
+        instances(std::exchange(other.instances, {})) {}
+    NumaReplicated& operator=(const NumaReplicated&) = delete;
+    NumaReplicated& operator=(NumaReplicated&& other) noexcept {
+        // The base move assignment is a member function that takes only the
+        // right-hand side. Only the base subobject of other is moved here, so
+        // other.instances is still intact for the exchange below.
+        NumaReplicatedBase::operator=(std::move(other));
+        instances = std::exchange(other.instances, {});
+        return *this;
+    }
+    // Replaces the stored value with source and replicates it again.
+    NumaReplicated& operator=(T&& source) {
+        replicate_from(std::move(source));
+        return *this;
+    }
+    ~NumaReplicated() override = default;
+    // Returns the copy that belongs to the NUMA node identified by token.
+    const T& operator[](NumaReplicatedAccessToken token) const {
+        assert(token.get_numa_index() < instances.size());
+        return *(instances[token.get_numa_index()]);
+    }
+    // Both accessors below refer to the copy stored at index 0.
+    const T& operator*() const { return *(instances[0]); }
+    const T* operator->() const { return instances[0].get(); }
+    // Applies f to the copy at index 0 and replicates the modified value.
+    template<typename FuncT>
+    void modify_and_replicate(FuncT&& f) {
+        // Take the value out first: replicate_from clears the instance list.
+        auto source = std::move(instances[0]);
+        std::forward<FuncT>(f)(*source);
+        replicate_from(std::move(*source));
+    }
+    void on_numa_config_changed() override {
+        // Use the first one as the source. It doesn't matter which one we use,
+        // because they all must be identical, but the first one is guaranteed to exist.
+        auto source = std::move(instances[0]);
+        replicate_from(std::move(*source));
+    }
+   private:
+    // One boxed copy per NUMA node, indexed by NumaIndex.
+    std::vector<std::unique_ptr<T>> instances;
+    // Rebuilds the instance list from source according to the current config.
+    void replicate_from(T&& source) {
+        instances.clear();
+        const NumaConfig& cfg = get_numa_config();
+        if (cfg.requires_memory_replication())
+        {
+            // One copy per node, each constructed on a thread bound to that node.
+            for (NumaIndex n = 0; n < cfg.num_numa_nodes(); ++n)
+            {
+                cfg.execute_on_numa_node(
+                  n, [this, &source]() { instances.emplace_back(std::make_unique<T>(source)); });
+            }
+        }
+        else
+        {
+            assert(cfg.num_numa_nodes() == 1);
+            // We take advantage of the fact that replication is not required
+            // and reuse the source value, avoiding one copy operation.
+            instances.emplace_back(std::make_unique<T>(std::move(source)));
+        }
+    }
+};
+// Like a per-NUMA-node replicated T, but only the copy at index 0 is created
+// eagerly; the copies for the other nodes are created on first access.
+// We force boxing with a unique_ptr. If this becomes an issue due to added
+// indirection we may need to add an option for a custom boxing type.
+template<typename T>
+class LazyNumaReplicated: public NumaReplicatedBase {
+   public:
+    using ReplicatorFuncType = std::function<T(const T&)>;
+    // Registers with ctx and prepares replication of a value-initialized T.
+    LazyNumaReplicated(NumaReplicationContext& ctx) :
+        NumaReplicatedBase(ctx) {
+        prepare_replicate_from(T{});
+    }
+    // Registers with ctx and prepares replication of source.
+    LazyNumaReplicated(NumaReplicationContext& ctx, T&& source) :
+        NumaReplicatedBase(ctx) {
+        prepare_replicate_from(std::move(source));
+    }
+    LazyNumaReplicated(const LazyNumaReplicated&) = delete;
+    // Takes over the context registration and the instances of other; the
+    // mutex is not transferred, every object has its own.
+    LazyNumaReplicated(LazyNumaReplicated&& other) noexcept :
+        NumaReplicatedBase(std::move(other)),
+        instances(std::exchange(other.instances, {})) {}
+    LazyNumaReplicated& operator=(const LazyNumaReplicated&) = delete;
+    LazyNumaReplicated& operator=(LazyNumaReplicated&& other) noexcept {
+        // The base move assignment is a member function that takes only the
+        // right-hand side. Only the base subobject of other is moved here, so
+        // other.instances is still intact for the exchange below.
+        NumaReplicatedBase::operator=(std::move(other));
+        instances = std::exchange(other.instances, {});
+        return *this;
+    }
+    // Replaces the stored value with source; other nodes are re-created lazily.
+    LazyNumaReplicated& operator=(T&& source) {
+        prepare_replicate_from(std::move(source));
+        return *this;
+    }
+    ~LazyNumaReplicated() override = default;
+    // Returns the copy for the node identified by token, creating it first if
+    // it does not exist yet.
+    const T& operator[](NumaReplicatedAccessToken token) const {
+        assert(token.get_numa_index() < instances.size());
+        ensure_present(token.get_numa_index());
+        return *(instances[token.get_numa_index()]);
+    }
+    // Both accessors below refer to the copy stored at index 0.
+    const T& operator*() const { return *(instances[0]); }
+    const T* operator->() const { return instances[0].get(); }
+    // Applies f to the copy at index 0 and prepares replication of the result.
+    template<typename FuncT>
+    void modify_and_replicate(FuncT&& f) {
+        // Take the value out first: prepare_replicate_from clears the list.
+        auto source = std::move(instances[0]);
+        std::forward<FuncT>(f)(*source);
+        prepare_replicate_from(std::move(*source));
+    }
+    void on_numa_config_changed() override {
+        // Use the first one as the source. It doesn't matter which one we use,
+        // because they all must be identical, but the first one is guaranteed to exist.
+        auto source = std::move(instances[0]);
+        prepare_replicate_from(std::move(*source));
+    }
+   private:
+    // One slot per NUMA node; a null slot means "not created yet". Mutable
+    // because the const accessor fills slots on demand.
+    mutable std::vector<std::unique_ptr<T>> instances;
+    // Serializes the lazy creation of missing instances.
+    mutable std::mutex                      mutex;
+    // Makes sure instances[idx] exists, copying it from instances[0] on a
+    // thread bound to node idx when it is missing.
+    // NOTE(review): the first check below reads instances[idx] without holding
+    // the mutex - confirm that this unsynchronized read is acceptable.
+    void ensure_present(NumaIndex idx) const {
+        assert(idx < instances.size());
+        if (instances[idx] != nullptr)
+            return;
+        // Index 0 is always created eagerly by prepare_replicate_from.
+        assert(idx != 0);
+        std::unique_lock<std::mutex> lock(mutex);
+        // Check again for races.
+        if (instances[idx] != nullptr)
+            return;
+        const NumaConfig& cfg = get_numa_config();
+        cfg.execute_on_numa_node(
+          idx, [this, idx]() { instances[idx] = std::make_unique<T>(*instances[0]); });
+    }
+    // Resets the instance list so that index 0 holds source and, when
+    // replication is required, every other node has an empty slot.
+    void prepare_replicate_from(T&& source) {
+        instances.clear();
+        const NumaConfig& cfg = get_numa_config();
+        if (cfg.requires_memory_replication())
+        {
+            assert(cfg.num_numa_nodes() > 0);
+            // We just need to make sure the first instance is there.
+            // Note that we cannot move here as we need to reallocate the data
+            // on the correct NUMA node.
+            cfg.execute_on_numa_node(
+              0, [this, &source]() { instances.emplace_back(std::make_unique<T>(source)); });
+            // Prepare others for lazy init.
+            instances.resize(cfg.num_numa_nodes());
+        }
+        else
+        {
+            assert(cfg.num_numa_nodes() == 1);
+            // We take advantage of the fact that replication is not required
+            // and reuse the source value, avoiding one copy operation.
+            instances.emplace_back(std::make_unique<T>(std::move(source)));
+        }
+    }
+};
+// Lazily replicated per-NUMA-node value whose copies are stored in
+// SystemWideSharedConstant<T> objects. Utilizes shared memory.
+// Only the instance at index 0 is created eagerly; the others are created on
+// first access.
+template<typename T>
+class LazyNumaReplicatedSystemWide: public NumaReplicatedBase {
+   public:
+    using ReplicatorFuncType = std::function<T(const T&)>;
+    // Registers with ctx and prepares replication of a default-constructed T.
+    LazyNumaReplicatedSystemWide(NumaReplicationContext& ctx) :
+        NumaReplicatedBase(ctx) {
+        prepare_replicate_from(std::make_unique<T>());
+    }
+    // Registers with ctx and prepares replication of *source.
+    LazyNumaReplicatedSystemWide(NumaReplicationContext& ctx, std::unique_ptr<T>&& source) :
+        NumaReplicatedBase(ctx) {
+        prepare_replicate_from(std::move(source));
+    }
+    LazyNumaReplicatedSystemWide(const LazyNumaReplicatedSystemWide&) = delete;
+    // Takes over the context registration and the instances of other; the
+    // mutex is not transferred, every object has its own.
+    LazyNumaReplicatedSystemWide(LazyNumaReplicatedSystemWide&& other) noexcept :
+        NumaReplicatedBase(std::move(other)),
+        instances(std::exchange(other.instances, {})) {}
+    LazyNumaReplicatedSystemWide& operator=(const LazyNumaReplicatedSystemWide&) = delete;
+    LazyNumaReplicatedSystemWide& operator=(LazyNumaReplicatedSystemWide&& other) noexcept {
+        // The base move assignment is a member function that takes only the
+        // right-hand side. Only the base subobject of other is moved here, so
+        // other.instances is still intact for the exchange below.
+        NumaReplicatedBase::operator=(std::move(other));
+        instances = std::exchange(other.instances, {});
+        return *this;
+    }
+    // Replaces the stored value with *source; other nodes are re-created lazily.
+    LazyNumaReplicatedSystemWide& operator=(std::unique_ptr<T>&& source) {
+        prepare_replicate_from(std::move(source));
+        return *this;
+    }
+    ~LazyNumaReplicatedSystemWide() override = default;
+    // Returns the value for the node identified by token, creating the
+    // instance first if it does not exist yet.
+    const T& operator[](NumaReplicatedAccessToken token) const {
+        assert(token.get_numa_index() < instances.size());
+        ensure_present(token.get_numa_index());
+        return *(instances[token.get_numa_index()]);
+    }
+    // Both accessors below refer to the instance stored at index 0.
+    const T& operator*() const { return *(instances[0]); }
+    const T* operator->() const { return &*instances[0]; }
+    // Returns, for every instance slot in index order, the pair of its
+    // get_status() and get_error_message() results.
+    std::vector<std::pair<SystemWideSharedConstantAllocationStatus, std::optional<std::string>>>
+    get_status_and_errors() const {
+        std::vector<std::pair<SystemWideSharedConstantAllocationStatus, std::optional<std::string>>>
+          status;
+        status.reserve(instances.size());
+        for (const auto& instance : instances)
+        {
+            status.emplace_back(instance.get_status(), instance.get_error_message());
+        }
+        return status;
+    }
+    // Applies f to a private copy of the value at index 0 and prepares
+    // replication of the result.
+    template<typename FuncT>
+    void modify_and_replicate(FuncT&& f) {
+        auto source = std::make_unique<T>(*instances[0]);
+        std::forward<FuncT>(f)(*source);
+        prepare_replicate_from(std::move(source));
+    }
+    void on_numa_config_changed() override {
+        // Use the first one as the source. It doesn't matter which one we use,
+        // because they all must be identical, but the first one is guaranteed to exist.
+        auto source = std::make_unique<T>(*instances[0]);
+        prepare_replicate_from(std::move(source));
+    }
+   private:
+    // One slot per NUMA node. Mutable because the const accessor fills slots
+    // on demand.
+    mutable std::vector<SystemWideSharedConstant<T>> instances;
+    // Serializes the lazy creation of missing instances.
+    mutable std::mutex                               mutex;
+    // Computes the value passed as second argument to SystemWideSharedConstant
+    // for node idx: a hash of the system NUMA configuration string combined
+    // with the system node index of the first cpu of node idx.
+    std::size_t get_discriminator(NumaIndex idx) const {
+        const NumaConfig& cfg     = get_numa_config();
+        const NumaConfig& cfg_sys = NumaConfig::from_system(SystemNumaPolicy{}, false);
+        // as a discriminator, locate the hardware/system numadomain this cpuindex belongs to
+        CpuIndex    cpu     = *cfg.nodes[idx].begin();  // get a CpuIndex from NumaIndex
+        // Falls back to system node 0 when the cpu is unknown to the system config.
+        NumaIndex   sys_idx = cfg_sys.is_cpu_assigned(cpu) ? cfg_sys.nodeByCpu.at(cpu) : 0;
+        std::string s       = cfg_sys.to_string() + "$" + std::to_string(sys_idx);
+        return static_cast<std::size_t>(hash_string(s));
+    }
+    // Makes sure instances[idx] exists, creating it from instances[0] on a
+    // thread bound to node idx when it is missing.
+    // NOTE(review): the first check below reads instances[idx] without holding
+    // the mutex - confirm that this unsynchronized read is acceptable.
+    void ensure_present(NumaIndex idx) const {
+        assert(idx < instances.size());
+        if (instances[idx] != nullptr)
+            return;
+        // Index 0 is always created eagerly by prepare_replicate_from.
+        assert(idx != 0);
+        std::unique_lock<std::mutex> lock(mutex);
+        // Check again for races.
+        if (instances[idx] != nullptr)
+            return;
+        const NumaConfig& cfg = get_numa_config();
+        cfg.execute_on_numa_node(idx, [this, idx]() {
+            instances[idx] = SystemWideSharedConstant<T>(*instances[0], get_discriminator(idx));
+        });
+    }
+    // Resets the instance list so that index 0 holds a shared constant built
+    // from *source and, when replication is required, every other node has a
+    // default-constructed slot.
+    void prepare_replicate_from(std::unique_ptr<T>&& source) {
+        instances.clear();
+        const NumaConfig& cfg = get_numa_config();
+        // We just need to make sure the first instance is there.
+        // Note that we cannot move here as we need to reallocate the data
+        // on the correct NUMA node.
+        // Even in the case of a single NUMA node we have to copy since it's shared memory.
+        if (cfg.requires_memory_replication())
+        {
+            assert(cfg.num_numa_nodes() > 0);
+            cfg.execute_on_numa_node(0, [this, &source]() {
+                instances.emplace_back(SystemWideSharedConstant<T>(*source, get_discriminator(0)));
+            });
+            // Prepare others for lazy init.
+            instances.resize(cfg.num_numa_nodes());
+        }
+        else
+        {
+            assert(cfg.num_numa_nodes() == 1);
+            instances.emplace_back(SystemWideSharedConstant<T>(*source, get_discriminator(0)));
+        }
+    }
+};
+// Owns the active NumaConfig and keeps track of every NumaReplicatedBase that
+// is attached to it, so that all of them can be notified when the
+// configuration is replaced. The context is neither copyable nor movable.
+class NumaReplicationContext {
+   public:
+    NumaReplicationContext(NumaConfig&& cfg) :
+        config(std::move(cfg)) {}
+    NumaReplicationContext(const NumaReplicationContext&) = delete;
+    NumaReplicationContext(NumaReplicationContext&&)      = delete;
+    NumaReplicationContext& operator=(const NumaReplicationContext&) = delete;
+    NumaReplicationContext& operator=(NumaReplicationContext&&)      = delete;
+    ~NumaReplicationContext() {
+        // The context must outlive replicated objects; terminate the process
+        // if any object is still registered at this point.
+        const bool stillTracking = !trackedReplicatedObjects.empty();
+        if (stillTracking)
+            std::exit(EXIT_FAILURE);
+    }
+    // Starts tracking obj, which must not be tracked yet.
+    void attach(NumaReplicatedBase* obj) {
+        assert(trackedReplicatedObjects.find(obj) == trackedReplicatedObjects.end());
+        trackedReplicatedObjects.insert(obj);
+    }
+    // Stops tracking obj, which must currently be tracked.
+    void detach(NumaReplicatedBase* obj) {
+        assert(trackedReplicatedObjects.find(obj) != trackedReplicatedObjects.end());
+        trackedReplicatedObjects.erase(obj);
+    }
+    // Replaces the registration of oldObj by one for newObj.
+    // oldObj may be invalid at this point, it is only used as a key.
+    void move_attached([[maybe_unused]] NumaReplicatedBase* oldObj, NumaReplicatedBase* newObj) {
+        assert(trackedReplicatedObjects.find(oldObj) != trackedReplicatedObjects.end());
+        assert(trackedReplicatedObjects.find(newObj) == trackedReplicatedObjects.end());
+        trackedReplicatedObjects.erase(oldObj);
+        trackedReplicatedObjects.insert(newObj);
+    }
+    // Installs cfg and then notifies every tracked object of the change.
+    void set_numa_config(NumaConfig&& cfg) {
+        config = std::move(cfg);
+        for (NumaReplicatedBase* tracked : trackedReplicatedObjects)
+            tracked->on_numa_config_changed();
+    }
+    const NumaConfig& get_numa_config() const { return config; }
+   private:
+    NumaConfig config;
+    // std::set uses std::less by default, which is required for pointer comparison
+    std::set<NumaReplicatedBase*> trackedReplicatedObjects;
+};
+// Remembers ctx and registers the new object with it.
+inline NumaReplicatedBase::NumaReplicatedBase(NumaReplicationContext& ctx) :
+    context(&ctx) {
+    ctx.attach(this);
+}
+// Takes over other's registration: afterwards the context tracks this object
+// in place of other, and other is left without a context.
+inline NumaReplicatedBase::NumaReplicatedBase(NumaReplicatedBase&& other) noexcept :
+    context(std::exchange(other.context, nullptr)) {
+    // other may itself have been moved from, in which case it has no context
+    // and there is no registration to take over.
+    if (context != nullptr)
+        context->move_attached(&other, this);
+}
+// Drops this object's own registration and takes over other's: afterwards the
+// context of other tracks this object in place of other, and other is left
+// without a context.
+inline NumaReplicatedBase& NumaReplicatedBase::operator=(NumaReplicatedBase&& other) noexcept {
+    if (this == &other)
+        return *this;
+    // This object is still registered with its previous context (unless it was
+    // moved from). Unregister it first, otherwise that context would keep
+    // tracking it and move_attached below would find it already present.
+    if (context != nullptr)
+        context->detach(this);
+    context = std::exchange(other.context, nullptr);
+    // other may itself have been moved from; then there is nothing to take over.
+    if (context != nullptr)
+        context->move_attached(&other, this);
+    return *this;
+}
+// Unregisters the object from its context. An object without a context (it
+// has been moved from) has nothing to unregister.
+inline NumaReplicatedBase::~NumaReplicatedBase() {
+    if (context == nullptr)
+        return;
+    context->detach(this);
+}
+// Returns the configuration currently held by the context this object is
+// attached to. The context pointer is dereferenced without a null check.
+inline const NumaConfig& NumaReplicatedBase::get_numa_config() const {
+    const NumaReplicationContext& owner = *context;
+    return owner.get_numa_config();
+}
+}  // namespace Stockfish
+#endif  // #ifndef NUMA_H_INCLUDED

src/perft.h ADDED Viewed

	@@ -0,0 +1,67 @@

+/*
+  Stockfish, a UCI chess playing engine derived from Glaurung 2.1
+  Copyright (C) 2004-2026 The Stockfish developers (see AUTHORS file)
+  Stockfish is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+  Stockfish is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+  You should have received a copy of the GNU General Public License
+  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+#ifndef PERFT_H_INCLUDED
+#define PERFT_H_INCLUDED
+#include <cstdint>
+#include "movegen.h"
+#include "position.h"
+#include "types.h"
+#include "uci.h"
+namespace Stockfish::Benchmark {
+// Utility to verify move generation. All the leaf nodes up
+// to the given depth are generated and counted, and the sum is returned.
+// When Root is true the count obtained for each move is also printed.
+template<bool Root>
+uint64_t perft(Position& pos, Depth depth) {
+    StateInfo st;
+    uint64_t  total = 0;
+    // At depth 2 the positions after each move are the last level, so the
+    // number of their legal moves is taken directly instead of recursing.
+    const bool childrenAreLeaves = (depth == 2);
+    for (const auto& m : MoveList<LEGAL>(pos))
+    {
+        uint64_t cnt;
+        if (Root && depth <= 1)
+        {
+            cnt = 1;
+            ++total;
+        }
+        else
+        {
+            pos.do_move(m, st);
+            if (childrenAreLeaves)
+                cnt = MoveList<LEGAL>(pos).size();
+            else
+                cnt = perft<false>(pos, depth - 1);
+            total += cnt;
+            pos.undo_move(m);
+        }
+        if (Root)
+            sync_cout << UCIEngine::move(m, pos.is_chess960()) << ": " << cnt << sync_endl;
+    }
+    return total;
+}
+// Sets up a position from fen and runs the root perft on it to the given depth.
+inline uint64_t perft(const std::string& fen, Depth depth, bool isChess960) {
+    StateInfo rootState;
+    Position  root;
+    root.set(fen, isChess960, &rootState);
+    const uint64_t leafCount = perft<true>(root, depth);
+    return leafCount;
+}
+}
+#endif  // PERFT_H_INCLUDED

src/position.cpp ADDED Viewed

	@@ -0,0 +1,1566 @@

+/*
+  Stockfish, a UCI chess playing engine derived from Glaurung 2.1
+  Copyright (C) 2004-2026 The Stockfish developers (see AUTHORS file)
+  Stockfish is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+  Stockfish is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+  You should have received a copy of the GNU General Public License
+  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+#include "position.h"
+#include <algorithm>
+#include <array>
+#include <cassert>
+#include <cctype>
+#include <cstddef>
+#include <cstring>
+#include <initializer_list>
+#include <iomanip>
+#include <iostream>
+#include <sstream>
+#include <string_view>
+#include <utility>
+#include "bitboard.h"
+#include "history.h"
+#include "misc.h"
+#include "movegen.h"
+#include "syzygy/tbprobe.h"
+#include "tt.h"
+#include "uci.h"
+using std::string;
+namespace Stockfish {
+namespace Zobrist {  // keys used to compute hash keys; all of them are filled in by Position::init()
+Key psq[PIECE_NB][SQUARE_NB];  // one key per (piece, square) pair
+Key enpassant[FILE_NB];  // one key per file
+Key castling[CASTLING_RIGHT_NB];  // one key per castling rights value
+Key side, noPawns;  // two single keys; side is also part of the cuckoo move keys
+}
+namespace {
+constexpr std::string_view PieceToChar(" PNBRQK  pnbrqk");  // character printed for a piece, indexed by the Piece value
+static constexpr Piece Pieces[] = {W_PAWN, W_KNIGHT, W_BISHOP, W_ROOK, W_QUEEN, W_KING,  // the twelve pieces iterated by Position::init()
+                                   B_PAWN, B_KNIGHT, B_BISHOP, B_ROOK, B_QUEEN, B_KING};
+}  // namespace
+// Writes an ASCII representation of the position to os: the board diagram,
+// followed by the FEN, the hash key in hexadecimal, the checking squares and,
+// when the piece count allows it and no castling is possible, the tablebase
+// WDL and DTZ probe results.
+std::ostream& operator<<(std::ostream& os, const Position& pos) {
+    os << "\n +---+---+---+---+---+---+---+---+\n";
+    // Ranks are printed from the eighth down to the first.
+    Rank r = RANK_8;
+    while (true)
+    {
+        for (File f = FILE_A; f <= FILE_H; ++f)
+        {
+            const Square sq = make_square(f, r);
+            os << " | " << PieceToChar[pos.piece_on(sq)];
+        }
+        os << " | " << (1 + r) << "\n +---+---+---+---+---+---+---+---+\n";
+        if (r == RANK_1)
+            break;
+        --r;
+    }
+    os << "   a   b   c   d   e   f   g   h\n";
+    os << "\nFen: " << pos.fen();
+    os << "\nKey: " << std::hex << std::uppercase << std::setfill('0') << std::setw(16)
+       << pos.key();
+    os << std::setfill(' ') << std::dec << "\nCheckers: ";
+    Bitboard checkers = pos.checkers();
+    while (checkers)
+        os << UCIEngine::square(pop_lsb(checkers)) << " ";
+    const bool fewEnoughPieces = Tablebases::MaxCardinality >= popcount(pos.pieces());
+    if (fewEnoughPieces && !pos.can_castle(ANY_CASTLING))
+    {
+        // Probe on a fresh copy of the position built from its FEN.
+        StateInfo st;
+        Position  p;
+        p.set(pos.fen(), pos.is_chess960(), &st);
+        Tablebases::ProbeState s1, s2;
+        Tablebases::WDLScore   wdl = Tablebases::probe_wdl(p, &s1);
+        int                    dtz = Tablebases::probe_dtz(p, &s2);
+        os << "\nTablebases WDL: " << std::setw(4) << wdl << " (" << s1 << ")";
+        os << "\nTablebases DTZ: " << std::setw(4) << dtz << " (" << s2 << ")";
+    }
+    return os;
+}
+// Support for Marcel van Kervinck's cuckoo algorithm, used to detect repetition
+// of positions for 3-fold repetition draws. Two hash functions index tables of
+// Zobrist hashes so that recurring positions can be detected quickly. See:
+// http://web.archive.org/web/20201107002606/https://marcelk.net/2013-04-06/paper/upcoming-rep-v2.pdf
+// First hash function: the low 13 bits of the key select one of the 8192 slots
+inline int H1(Key h) {
+    constexpr Key SlotMask = 0x1fff;
+    return int(h & SlotMask);
+}
+// Second hash function: 13 bits of the key taken from bit 16 upwards
+inline int H2(Key h) {
+    constexpr Key SlotMask = 0x1fff;
+    return int((h >> 16) & SlotMask);
+}
+// Cuckoo tables: Zobrist hashes of valid reversible moves, and the moves themselves
+std::array<Key, 8192>  cuckoo;
+std::array<Move, 8192> cuckooMove;
+// Fills the Zobrist key tables from a constant-seeded PRNG and builds the cuckoo tables; the order of rng calls determines every key.
+void Position::init() {
+    PRNG rng(1070372);  // constant seed
+    for (Piece pc : Pieces)
+        for (Square s = SQ_A1; s <= SQ_H8; ++s)
+            Zobrist::psq[pc][s] = rng.rand<Key>();  // one key per (piece, square)
+    // Pawns on these squares will promote: zero the 8 white-pawn keys from SQ_A8 and the first 8 black-pawn keys
+    std::fill_n(Zobrist::psq[W_PAWN] + SQ_A8, 8, 0);
+    std::fill_n(Zobrist::psq[B_PAWN], 8, 0);
+    for (File f = FILE_A; f <= FILE_H; ++f)
+        Zobrist::enpassant[f] = rng.rand<Key>();  // one key per en passant file
+    for (int cr = NO_CASTLING; cr <= ANY_CASTLING; ++cr)
+        Zobrist::castling[cr] = rng.rand<Key>();  // one key per castling-rights value
+    Zobrist::side    = rng.rand<Key>();
+    Zobrist::noPawns = rng.rand<Key>();
+    // Prepare the cuckoo tables: start from empty slots (key 0, Move::none())
+    cuckoo.fill(0);
+    cuckooMove.fill(Move::none());
+    [[maybe_unused]] int count = 0;  // only read by the assert at the end
+    for (Piece pc : Pieces)
+        for (Square s1 = SQ_A1; s1 <= SQ_H8; ++s1)
+            for (Square s2 = Square(s1 + 1); s2 <= SQ_H8; ++s2)  // s2 > s1: each square pair once
+                if ((type_of(pc) != PAWN) && (attacks_bb(type_of(pc), s1, 0) & s2))  // non-pawn reaching s2 from s1 with occupancy 0
+                {
+                    Move move = Move(s1, s2);
+                    Key  key  = Zobrist::psq[pc][s1] ^ Zobrist::psq[pc][s2] ^ Zobrist::side;  // key difference produced by this move
+                    int  i    = H1(key);
+                    while (true)
+                    {
+                        std::swap(cuckoo[i], key);  // store the entry, pick up whatever occupied the slot
+                        std::swap(cuckooMove[i], move);
+                        if (move == Move::none())  // Arrived at empty slot?
+                            break;
+                        i = (i == H1(key)) ? H2(key) : H1(key);  // Push victim to alternative slot
+                    }
+                    count++;
+                }
+    assert(count == 3668);  // number of entries inserted by the loops above
+}
+// Initializes the position object with the given FEN string and returns *this.
+// 'si' is zeroed and becomes the current state; 'isChess960' is stored as-is.
+// This function is not very robust - make sure that input FENs are correct,
+// this is assumed to be the responsibility of the GUI. Two kinds of malformed
+// input are nevertheless tolerated because they would otherwise access memory
+// outside of the board: piece letters that fall on an off-board square are
+// dropped, and a 'K'/'Q' castling flag without a matching rook on the back rank
+// is ignored. Both kings are still assumed to be present.
+Position& Position::set(const string& fenStr, bool isChess960, StateInfo* si) {
+    /*
+   A FEN string defines a particular position using only the ASCII character set.
+   A FEN string contains six fields separated by a space. The fields are:
+   1) Piece placement (from white's perspective). Each rank is described, starting
+      with rank 8 and ending with rank 1. Within each rank, the contents of each
+      square are described from file A through file H. Following the Standard
+      Algebraic Notation (SAN), each piece is identified by a single letter taken
+      from the standard English names. White pieces are designated using upper-case
+      letters ("PNBRQK") whilst Black uses lowercase ("pnbrqk"). Blank squares are
+      noted using digits 1 through 8 (the number of blank squares), and "/"
+      separates ranks.
+   2) Active color. "w" means white moves next, "b" means black.
+   3) Castling availability. If neither side can castle, this is "-". Otherwise,
+      this has one or more letters: "K" (White can castle kingside), "Q" (White
+      can castle queenside), "k" (Black can castle kingside), and/or "q" (Black
+      can castle queenside).
+   4) En passant target square (in algebraic notation). If there's no en passant
+      target square, this is "-". If a pawn has just made a 2-square move, this
+      is the position "behind" the pawn. Following X-FEN standard, this is recorded
+      only if there is a pawn in position to make an en passant capture, and if
+      there really is a pawn that might have advanced two squares.
+   5) Halfmove clock. This is the number of halfmoves since the last pawn advance
+      or capture. This is used to determine if a draw can be claimed under the
+      fifty-move rule.
+   6) Fullmove number. The number of the full move. It starts at 1, and is
+      incremented after Black's move.
+*/
+    unsigned char      col, row, token;
+    size_t             idx;
+    Square             sq = SQ_A8;
+    std::istringstream ss(fenStr);
+    std::memset(reinterpret_cast<char*>(this), 0, sizeof(Position));
+    std::memset(si, 0, sizeof(StateInfo));
+    st = si;
+    ss >> std::noskipws;
+    // 1. Piece placement
+    while ((ss >> token) && !isspace(token))
+    {
+        if (isdigit(token))
+            sq += (token - '0') * EAST;  // Advance the given number of files
+        else if (token == '/')
+            sq += 2 * SOUTH;
+        else if ((idx = PieceToChar.find(token)) != string::npos)
+        {
+            // A malformed placement field can walk off the board: never write
+            // outside of it. Well-formed FENs always pass this test.
+            if (sq >= SQ_A1 && sq <= SQ_H8)
+                put_piece(Piece(idx), sq);
+            ++sq;
+        }
+    }
+    // 2. Active color
+    ss >> token;
+    sideToMove = (token == 'w' ? WHITE : BLACK);
+    ss >> token;
+    // 3. Castling availability. Compatible with 3 standards: Normal FEN standard,
+    // Shredder-FEN that uses the letters of the columns on which the rooks began
+    // the game instead of KQkq and also X-FEN standard that, in case of Chess960,
+    // if an inner rook is associated with the castling right, the castling tag is
+    // replaced by the file letter of the involved rook, as for the Shredder-FEN.
+    while ((ss >> token) && !isspace(token))
+    {
+        Square rsq;
+        Color  c    = islower(token) ? BLACK : WHITE;
+        Piece  rook = make_piece(c, ROOK);
+        token = char(toupper(token));
+        if (token == 'K' || token == 'Q')
+        {
+            // Scan the back rank from the outer file inwards for the first rook,
+            // stopping at the opposite edge so the scan cannot leave the board.
+            const bool kingSide = token == 'K';
+            const File lastFile = kingSide ? FILE_A : FILE_H;
+            rsq                 = relative_square(c, kingSide ? SQ_H1 : SQ_A1);
+            while (piece_on(rsq) != rook && file_of(rsq) != lastFile)
+            {
+                if (kingSide)
+                    --rsq;
+                else
+                    ++rsq;
+            }
+            // No rook of this color on the back rank: the flag is bogus, drop it
+            if (piece_on(rsq) != rook)
+                continue;
+        }
+        else if (token >= 'A' && token <= 'H')
+            rsq = make_square(File(token - 'A'), relative_rank(c, RANK_1));
+        else
+            continue;
+        set_castling_right(c, rsq);
+    }
+    // 4. En passant square.
+    // Ignore if square is invalid or not on side to move relative rank 6.
+    bool enpassant = false, legalEP = false;
+    if (((ss >> col) && (col >= 'a' && col <= 'h'))
+        && ((ss >> row) && (row == (sideToMove == WHITE ? '6' : '3'))))
+    {
+        st->epSquare = make_square(File(col - 'a'), Rank(row - '1'));
+        Bitboard pawns  = attacks_bb<PAWN>(st->epSquare, ~sideToMove) & pieces(sideToMove, PAWN);
+        Bitboard target = (pieces(~sideToMove, PAWN) & (st->epSquare + pawn_push(~sideToMove)));
+        Bitboard occ    = pieces() ^ target ^ st->epSquare;
+        // En passant square will be considered only if
+        // a) side to move have a pawn threatening epSquare
+        // b) there is an enemy pawn in front of epSquare
+        // c) there is no piece on epSquare or behind epSquare
+        enpassant =
+          pawns && target && !(pieces() & (st->epSquare | (st->epSquare + pawn_push(sideToMove))));
+        // If no pawn can execute the en passant capture without leaving the king in check, don't record the epSquare
+        while (pawns)
+            legalEP |= !(attackers_to(square<KING>(sideToMove), occ ^ pop_lsb(pawns))
+                         & pieces(~sideToMove) & ~target);
+    }
+    if (!enpassant || !legalEP)
+        st->epSquare = SQ_NONE;
+    // 5-6. Halfmove clock and fullmove number
+    ss >> std::skipws >> st->rule50 >> gamePly;
+    // Convert from fullmove starting from 1 to gamePly starting from 0,
+    // handle also common incorrect FEN with fullmove = 0.
+    gamePly = std::max(2 * (gamePly - 1), 0) + (sideToMove == BLACK);
+    chess960 = isChess960;
+    set_state();
+    assert(pos_is_ok());
+    return *this;
+}
+// Registers one castling right, identified by the color and the square the
+// castling rook starts on, and fills the lookup tables derived from it.
+void Position::set_castling_right(Color c, Square rfrom) {
+    const Square         kfrom = square<KING>(c);
+    const CastlingRights cr    = c & (kfrom < rfrom ? KING_SIDE : QUEEN_SIDE);
+    const bool           onKingSide = bool(cr & KING_SIDE);
+    // Record the right and the squares whose use revokes it
+    st->castlingRights |= cr;
+    castlingRightsMask[kfrom] |= cr;
+    castlingRightsMask[rfrom] |= cr;
+    castlingRookSquare[cr] = rfrom;
+    // King and rook destinations are fixed per side, whatever the start squares
+    const Square kto = relative_square(c, onKingSide ? SQ_G1 : SQ_C1);
+    const Square rto = relative_square(c, onKingSide ? SQ_F1 : SQ_D1);
+    // Path = squares on the rook and king trips, minus the two start squares
+    const Bitboard trips = between_bb(rfrom, rto) | between_bb(kfrom, kto);
+    castlingPath[cr]     = trips & ~(kfrom | rfrom);
+}
+// Refreshes the data used to detect whether a move gives check: the slider
+// blockers of both kings and, per piece type, the squares from which that piece
+// type would attack the king of the side not to move.
+void Position::set_check_info() const {
+    for (Color c : {WHITE, BLACK})
+        update_slider_blockers(c);
+    const Color    defender = ~sideToMove;
+    const Square   kingSq   = square<KING>(defender);
+    const Bitboard occupied = pieces();
+    auto&          checks   = st->checkSquares;
+    checks[PAWN]   = attacks_bb<PAWN>(kingSq, defender);
+    checks[KNIGHT] = attacks_bb<KNIGHT>(kingSq);
+    checks[BISHOP] = attacks_bb<BISHOP>(kingSq, occupied);
+    checks[ROOK]   = attacks_bb<ROOK>(kingSq, occupied);
+    // A queen checks from every bishop or rook checking square
+    checks[QUEEN] = checks[BISHOP] | checks[ROOK];
+    // A king move is never a direct check
+    checks[KING] = 0;
+}
+// Computes from scratch the hash keys, the non-pawn material, the checkers and
+// the check info of the current state. These values are afterwards maintained
+// incrementally as moves are made, so this runs only when a position is set up.
+void Position::set_state() const {
+    StateInfo& state = *st;
+    state.key            = 0;
+    state.minorPieceKey  = 0;
+    state.nonPawnKey[WHITE] = state.nonPawnKey[BLACK] = 0;
+    state.pawnKey                                     = Zobrist::noPawns;
+    state.nonPawnMaterial[WHITE] = state.nonPawnMaterial[BLACK] = VALUE_ZERO;
+    state.checkersBB = attackers_to(square<KING>(sideToMove)) & pieces(~sideToMove);
+    set_check_info();
+    // Fold every piece on the board into the keys it contributes to
+    Bitboard remaining = pieces();
+    while (remaining)
+    {
+        const Square    sq       = pop_lsb(remaining);
+        const Piece     piece    = piece_on(sq);
+        const PieceType pt       = type_of(piece);
+        const Key       pieceKey = Zobrist::psq[piece][sq];
+        state.key ^= pieceKey;
+        if (pt == PAWN)
+        {
+            state.pawnKey ^= pieceKey;
+            continue;
+        }
+        const Color owner = color_of(piece);
+        state.nonPawnKey[owner] ^= pieceKey;
+        // Kings carry no material value and are not part of the minor piece key
+        if (pt == KING)
+            continue;
+        state.nonPawnMaterial[owner] += PieceValue[piece];
+        if (pt <= BISHOP)
+            state.minorPieceKey ^= pieceKey;
+    }
+    // Position-wide components: en passant file, side to move, castling rights
+    if (state.epSquare != SQ_NONE)
+        state.key ^= Zobrist::enpassant[file_of(state.epSquare)];
+    if (sideToMove == BLACK)
+        state.key ^= Zobrist::side;
+    state.key ^= Zobrist::castling[state.castlingRights];
+    state.materialKey = compute_material_key();
+}
+// Builds the material key from the piece counts alone: for every piece, one
+// Zobrist::psq entry per copy on the board is XOR-ed in, taken from index 8
+// upwards (index 8 for the first copy, 9 for the second, and so on).
+Key Position::compute_material_key() const {
+    Key materialKey = 0;
+    for (Piece pc : Pieces)
+    {
+        const int copies = pieceCount[pc];
+        for (int slot = 8; slot < 8 + copies; ++slot)
+            materialKey ^= Zobrist::psq[pc][slot];
+    }
+    return materialKey;
+}
+// Overload that sets up the position from an endgame code string such as "KBPKN".
+// The code is split at its second 'K' into a strong and a weak side, the side of
+// color 'c' is lower-cased, and a FEN placing each side on one rank is parsed.
+// It's mainly a helper to get the material key out of an endgame code.
+Position& Position::set(const string& code, Color c, StateInfo* si) {
+    assert(code[0] == 'K');
+    const size_t weakStart = code.find('K', 1);
+    const size_t strongEnd = std::min(code.find('v'), weakStart);
+    string       sides[]   = {code.substr(weakStart),      // Weak
+                              code.substr(0, strongEnd)};  // Strong
+    assert(sides[0].length() > 0 && sides[0].length() < 8);
+    assert(sides[1].length() > 0 && sides[1].length() < 8);
+    // Lower-case letters denote black pieces in FEN
+    for (char& letter : sides[c])
+        letter = char(tolower(letter));
+    // A rank is the piece letters followed by the count of remaining empty squares
+    const auto rankField = [](const string& letters) {
+        return letters + char(8 - letters.length() + '0');
+    };
+    const string fenStr =
+      "8/" + rankField(sides[0]) + "/8/8/8/8/" + rankField(sides[1]) + "/8 w - - 0 10";
+    return set(fenStr, false, si);
+}
+// Builds the FEN string of the position. For Chess960 the castling field uses
+// the Shredder-FEN notation (file letters of the castling rooks). This is mainly
+// a debugging function.
+string Position::fen() const {
+    std::ostringstream ss;
+    // Piece placement, rank 8 first, ranks separated by '/'
+    for (int rank = RANK_8; rank >= RANK_1; --rank)
+    {
+        int emptyRun = 0;
+        for (int file = FILE_A; file <= FILE_H; ++file)
+        {
+            const Square sq = make_square(File(file), Rank(rank));
+            if (empty(sq))
+            {
+                ++emptyRun;
+                continue;
+            }
+            // Flush the pending run of empty squares before the piece letter
+            if (emptyRun)
+            {
+                ss << emptyRun;
+                emptyRun = 0;
+            }
+            ss << PieceToChar[piece_on(sq)];
+        }
+        if (emptyRun)
+            ss << emptyRun;
+        if (rank != RANK_1)
+            ss << '/';
+    }
+    ss << (sideToMove == WHITE ? " w " : " b ");
+    // Castling field: standard letter, or the rook's file letter in Chess960
+    const auto castlingTag = [&](CastlingRights cr, char fileBase, char standard) {
+        if (can_castle(cr))
+            ss << (chess960 ? char(fileBase + file_of(castling_rook_square(cr))) : standard);
+    };
+    castlingTag(WHITE_OO, 'A', 'K');
+    castlingTag(WHITE_OOO, 'A', 'Q');
+    castlingTag(BLACK_OO, 'a', 'k');
+    castlingTag(BLACK_OOO, 'a', 'q');
+    if (!can_castle(ANY_CASTLING))
+        ss << '-';
+    const string epField =
+      ep_square() == SQ_NONE ? " - " : " " + UCIEngine::square(ep_square()) + " ";
+    // Halfmove clock, then the fullmove number rebuilt from gamePly
+    const int fullMove = 1 + (gamePly - (sideToMove == BLACK)) / 2;
+    ss << epField << st->rule50 << " " << fullMove;
+    return ss.str();
+}
+// Recomputes st->blockersForKing[c] and st->pinners[~c]: respectively the pieces
+// (of either color) that alone stand between the king of color c and an enemy
+// slider aimed at it, and the enemy sliders pinning a piece of color c.
+void Position::update_slider_blockers(Color c) const {
+    const Color  enemy  = ~c;
+    const Square kingSq = square<KING>(c);
+    Bitboard     blockerSet = 0;
+    Bitboard     pinnerSet  = 0;
+    // Snipers are enemy sliders that would attack the king on an empty board
+    const Bitboard alongLines = attacks_bb<ROOK>(kingSq) & pieces(QUEEN, ROOK);
+    const Bitboard alongDiags = attacks_bb<BISHOP>(kingSq) & pieces(QUEEN, BISHOP);
+    Bitboard       snipers    = (alongLines | alongDiags) & pieces(enemy);
+    // Other snipers do not count as blockers
+    const Bitboard others = pieces() ^ snipers;
+    while (snipers)
+    {
+        const Square   sniperSq  = pop_lsb(snipers);
+        const Bitboard inBetween = between_bb(kingSq, sniperSq) & others;
+        // Only a single piece in between makes it a blocker
+        if (!inBetween || more_than_one(inBetween))
+            continue;
+        blockerSet |= inBetween;
+        if (inBetween & pieces(c))
+            pinnerSet |= sniperSq;
+    }
+    st->blockersForKing[c] = blockerSet;
+    st->pinners[enemy]     = pinnerSet;
+}
+// Returns the bitboard of all pieces, of both colors, that attack square s.
+// Slider attacks are computed against the given 'occupied' bitboard.
+Bitboard Position::attackers_to(Square s, Bitboard occupied) const {
+    const Bitboard orthogonal = attacks_bb<ROOK>(s, occupied) & pieces(ROOK, QUEEN);
+    const Bitboard diagonal   = attacks_bb<BISHOP>(s, occupied) & pieces(BISHOP, QUEEN);
+    // A white pawn attacks s from where a black pawn on s would attack, and vice versa
+    const Bitboard whitePawns = attacks_bb<PAWN>(s, BLACK) & pieces(WHITE, PAWN);
+    const Bitboard blackPawns = attacks_bb<PAWN>(s, WHITE) & pieces(BLACK, PAWN);
+    const Bitboard knights    = attacks_bb<KNIGHT>(s) & pieces(KNIGHT);
+    const Bitboard kings      = attacks_bb<KING>(s) & pieces(KING);
+    return orthogonal | diagonal | whitePawns | blackPawns | knights | kings;
+}
+// Tells whether at least one piece of color c attacks square s, with slider
+// attacks computed against the given 'occupied' bitboard.
+bool Position::attackers_to_exist(Square s, Bitboard occupied, Color c) const {
+    if (attacks_bb<ROOK>(s, occupied) & pieces(c, ROOK, QUEEN))
+        return true;
+    if (attacks_bb<BISHOP>(s, occupied) & pieces(c, BISHOP, QUEEN))
+        return true;
+    if (attacks_bb<PAWN>(s, ~c) & pieces(c, PAWN))
+        return true;
+    if (attacks_bb<KNIGHT>(s) & pieces(c, KNIGHT))
+        return true;
+    return bool(attacks_bb<KING>(s) & pieces(c, KING));
+}
+// Decides whether a pseudo-legal move is legal, i.e. does not leave the own
+// king attacked (and, for castling, does not move the king through an attack).
+bool Position::legal(Move m) const {
+    assert(m.is_ok());
+    const Color  us   = sideToMove;
+    const Square from = m.from_sq();
+    const Square to   = m.to_sq();
+    const auto   kind = m.type_of();
+    assert(color_of(moved_piece(m)) == us);
+    assert(piece_on(square<KING>(us)) == make_piece(us, KING));
+    // En passant captures are a tricky special case. Being rather uncommon, they
+    // are handled simply by looking for slider attacks on the king with the
+    // occupancy as it is after the capture.
+    if (kind == EN_PASSANT)
+    {
+        const Square   ksq   = square<KING>(us);
+        const Square   capsq = to - pawn_push(us);
+        const Bitboard after = (pieces() ^ from ^ capsq) | to;
+        assert(to == ep_square());
+        assert(moved_piece(m) == make_piece(us, PAWN));
+        assert(piece_on(capsq) == make_piece(~us, PAWN));
+        assert(piece_on(to) == NO_PIECE);
+        const Bitboard lineAttackers = attacks_bb<ROOK>(ksq, after) & pieces(~us, QUEEN, ROOK);
+        const Bitboard diagAttackers = attacks_bb<BISHOP>(ksq, after) & pieces(~us, QUEEN, BISHOP);
+        return !lineAttackers && !diagAttackers;
+    }
+    // Castling move generation does not verify that the king's path is free of
+    // enemy attacks; that check happens here.
+    if (kind == CASTLING)
+    {
+        // The king ends on the same square as in standard chess, also in Chess960.
+        // 'to' is the rook square, since castling is encoded as king takes rook.
+        const Square    kingTo = relative_square(us, to > from ? SQ_G1 : SQ_C1);
+        const Direction step   = kingTo > from ? WEST : EAST;
+        // Walk from the king's destination back to (but excluding) its origin
+        for (Square s = kingTo; s != from; s += step)
+            if (attackers_to_exist(s, pieces(), ~us))
+                return false;
+        // In Chess960 the castling rook may be shielding the king from a check,
+        // e.g. an enemy queen on SQ_A1 with the castling rook on SQ_B1.
+        return !chess960 || !(blockers_for_king(us) & to);
+    }
+    // A king move is legal when the destination is not attacked once the king
+    // has left its origin square.
+    if (type_of(piece_on(from)) == KING)
+        return !attackers_to_exist(to, pieces() ^ from, ~us);
+    // Any other piece may move freely unless it is a blocker for its king...
+    if (!(blockers_for_king(us) & from))
+        return true;
+    // ...in which case it must stay on the line through origin, destination and king
+    return bool(line_bb(from, to) & pieces(us, KING));
+}
+// Tests whether an arbitrary move is pseudo-legal in this position. It is used
+// to validate moves coming from the TT, which can be corrupted by concurrent SMP
+// access or by hash key aliasing.
+bool Position::pseudo_legal(const Move m) const {
+    const Color  us    = sideToMove;
+    const Square from  = m.from_sq();
+    const Square to    = m.to_sq();
+    const Piece  mover = moved_piece(m);
+    // Uncommon move types take a slower but simpler route: look the move up in
+    // the generated list (still skipping the legality check of MoveList<LEGAL>).
+    if (m.type_of() != NORMAL)
+    {
+        if (checkers())
+            return MoveList<EVASIONS>(*this).contains(m);
+        return MoveList<NON_EVASIONS>(*this).contains(m);
+    }
+    // Not a promotion, so the promotion piece must be empty
+    assert(m.promotion_type() - KNIGHT == NO_PIECE_TYPE);
+    // The origin square must hold a piece of the side to move
+    if (mover == NO_PIECE || color_of(mover) != us)
+        return false;
+    // The destination square cannot hold a friendly piece
+    if (pieces(us) & to)
+        return false;
+    const PieceType pt = type_of(mover);
+    if (pt == PAWN)
+    {
+        // Promotions were handled above, so the destination cannot be on the 8th/1st rank
+        if ((Rank8BB | Rank1BB) & to)
+            return false;
+        // The move must be a capture, a single push or a double push
+        const Direction push     = pawn_push(us);
+        const bool      captures = bool(attacks_bb<PAWN>(from, us) & pieces(~us) & to);
+        const bool      oneStep  = (from + push == to) && empty(to);
+        const bool      twoSteps = (from + 2 * push == to) && (relative_rank(us, from) == RANK_2)
+                           && empty(to) && empty(to - push);
+        if (!captures && !oneStep && !twoSteps)
+            return false;
+    }
+    else if (!(attacks_bb(pt, from, pieces()) & to))
+        return false;
+    // The evasions generator already filters some kinds of illegal moves and
+    // legal() relies on that, so the same moves must be rejected here.
+    if (!checkers())
+        return true;
+    // King moves under check: remove the king from the occupancy so as to catch
+    // invalid moves like b1a1 when the opposite queen is on c1.
+    if (pt == KING)
+        return !attackers_to_exist(to, pieces() ^ from, ~us);
+    // Double check? Then only a king move would do
+    if (more_than_one(checkers()))
+        return false;
+    // Otherwise the move must interpose or capture the checking piece
+    return bool(between_bb(square<KING>(us), lsb(checkers())) & to);
+}
+// Decides whether a pseudo-legal move checks the king of the side not to move
+bool Position::gives_check(Move m) const {
+    assert(m.is_ok());
+    assert(color_of(moved_piece(m)) == sideToMove);
+    const Square from = m.from_sq();
+    const Square to   = m.to_sq();
+    const Color  them = ~sideToMove;
+    // Direct check: the piece lands on one of its checking squares
+    if (check_squares(type_of(piece_on(from))) & to)
+        return true;
+    // Discovered check: a blocker of the enemy king leaves the line to that
+    // king (castling is counted as giving check here whenever the king is a blocker)
+    if (blockers_for_king(them) & from)
+        return !(line_bb(from, to) & pieces(them, KING)) || m.type_of() == CASTLING;
+    const auto kind = m.type_of();
+    if (kind == NORMAL)
+        return false;
+    // The promoted piece attacks from 'to' with the pawn's origin square vacated
+    if (kind == PROMOTION)
+        return bool(attacks_bb(m.promotion_type(), to, pieces() ^ from) & pieces(them, KING));
+    // En passant: direct and ordinary discovered checks were handled above, so
+    // only a discovered check through the captured pawn remains to be tested.
+    if (kind == EN_PASSANT)
+    {
+        const Square   enemyKing = square<KING>(them);
+        const Square   capsq     = make_square(file_of(to), rank_of(from));
+        const Bitboard after     = (pieces() ^ from ^ capsq) | to;
+        const Bitboard onLines =
+          attacks_bb<ROOK>(enemyKing, after) & pieces(sideToMove, QUEEN, ROOK);
+        const Bitboard onDiags =
+          attacks_bb<BISHOP>(enemyKing, after) & pieces(sideToMove, QUEEN, BISHOP);
+        return bool(onLines | onDiags);
+    }
+    // Castling is encoded as 'king captures the rook': test the rook's final square
+    const Square rto = relative_square(sideToMove, to > from ? SQ_F1 : SQ_D1);
+    return bool(check_squares(ROOK) & rto);
+}
+// Makes move m on the board and records the resulting state in newSt, which becomes
+// the current state and is linked to the previous one. The move is assumed to be legal;
+// pseudo-legal moves should be filtered out before this function is called. dp and dts
+// are filled with the piece and threat changes. If tt is passed, the entry for the new
+// key is prefetched; if history is passed, its entries for the new position are too.
+void Position::do_move(Move                      m,
+                       StateInfo&                newSt,
+                       bool                      givesCheck,
+                       DirtyPiece&               dp,
+                       DirtyThreats&             dts,
+                       const TranspositionTable* tt      = nullptr,
+                       const SharedHistories*    history = nullptr) {
+    assert(m.is_ok());
+    assert(&newSt != st);
+    Key k = st->key ^ Zobrist::side;  // local working copy of the key, side component already toggled
+    // Copy some fields of the old state to our new StateInfo object except the
+    // ones which are going to be recalculated from scratch anyway and then switch
+    // our state pointer to point to the new (ready to be updated) state.
+    std::memcpy(&newSt, st, offsetof(StateInfo, key));  // copies only the members laid out before 'key'
+    newSt.previous = st;
+    st             = &newSt;
+    // Increment ply counters. In particular, rule50 will be reset to zero later on
+    // in case of a capture or a pawn move.
+    ++gamePly;
+    ++st->rule50;
+    ++st->pliesFromNull;
+    Color  us       = sideToMove;
+    Color  them     = ~us;
+    Square from     = m.from_sq();
+    Square to       = m.to_sq();
+    Piece  pc       = piece_on(from);
+    Piece  captured = m.type_of() == EN_PASSANT ? make_piece(them, PAWN) : piece_on(to);  // for castling this is our own rook
+    dp.pc             = pc;
+    dp.from           = from;
+    dp.to             = to;
+    dp.add_sq         = SQ_NONE;
+    dts.us            = us;
+    dts.prevKsq       = square<KING>(us);  // king square before the move is made
+    dts.threatenedSqs = dts.threateningSqs = 0;
+    assert(color_of(pc) == us);
+    assert(captured == NO_PIECE || color_of(captured) == (m.type_of() != CASTLING ? them : us));
+    assert(type_of(captured) != KING);
+    if (m.type_of() == CASTLING)
+    {
+        assert(pc == make_piece(us, KING));
+        assert(captured == make_piece(us, ROOK));
+        Square rfrom, rto;
+        do_castling<true>(us, from, to, rfrom, rto, &dts, &dp);
+        k ^= Zobrist::psq[captured][rfrom] ^ Zobrist::psq[captured][rto];  // account for the rook relocation
+        st->nonPawnKey[us] ^= Zobrist::psq[captured][rfrom] ^ Zobrist::psq[captured][rto];
+        captured = NO_PIECE;  // nothing is actually captured when castling
+    }
+    else if (captured)
+    {
+        Square capsq = to;
+        // If the captured piece is a pawn, update pawn hash key, otherwise
+        // update non-pawn material.
+        if (type_of(captured) == PAWN)
+        {
+            if (m.type_of() == EN_PASSANT)
+            {
+                capsq -= pawn_push(us);  // the captured pawn stands one push behind 'to'
+                assert(pc == make_piece(us, PAWN));
+                assert(to == st->epSquare);
+                assert(relative_rank(us, to) == RANK_6);
+                assert(piece_on(to) == NO_PIECE);
+                assert(piece_on(capsq) == make_piece(them, PAWN));
+                // Update board and piece lists in ep case, normal captures are updated later
+                remove_piece(capsq, &dts);
+            }
+            st->pawnKey ^= Zobrist::psq[captured][capsq];
+        }
+        else
+        {
+            st->nonPawnMaterial[them] -= PieceValue[captured];
+            st->nonPawnKey[them] ^= Zobrist::psq[captured][capsq];
+            if (type_of(captured) <= BISHOP)
+                st->minorPieceKey ^= Zobrist::psq[captured][capsq];
+        }
+        dp.remove_pc = captured;
+        dp.remove_sq = capsq;
+        k ^= Zobrist::psq[captured][capsq];
+        st->materialKey ^=
+          Zobrist::psq[captured][8 + pieceCount[captured] - (m.type_of() != EN_PASSANT)];
+        // Reset rule 50 counter
+        st->rule50 = 0;
+    }
+    else
+        dp.remove_sq = SQ_NONE;
+    // Update hash key
+    k ^= Zobrist::psq[pc][from] ^ Zobrist::psq[pc][to];
+    // Reset en passant square
+    if (st->epSquare != SQ_NONE)
+    {
+        k ^= Zobrist::enpassant[file_of(st->epSquare)];
+        st->epSquare = SQ_NONE;
+    }
+    // Update castling rights.
+    k ^= Zobrist::castling[st->castlingRights];  // remove the old rights from the key...
+    st->castlingRights &= ~(castlingRightsMask[from] | castlingRightsMask[to]);
+    k ^= Zobrist::castling[st->castlingRights];  // ...and add the updated ones
+    // Move the piece. The tricky Chess960 castling is handled earlier
+    if (m.type_of() != CASTLING)
+    {
+        if (captured && m.type_of() != EN_PASSANT)
+        {
+            remove_piece(from, &dts);
+            swap_piece(to, pc, &dts);
+        }
+        else
+            move_piece(from, to, &dts);
+    }
+    // If the moving piece is a pawn do some special extra work
+    if (type_of(pc) == PAWN)
+    {
+        // Check if the en passant square needs to be set. Accurate e.p. info is needed
+        // for correct zobrist key generation and 3-fold checking.
+        if ((int(to) ^ int(from)) == 16)  // square indices differ exactly in bit 4: presumably a double push
+        {
+            Square   epSquare = to - pawn_push(us);
+            Bitboard pawns    = attacks_bb<PAWN>(epSquare, us) & pieces(them, PAWN);
+            // If there are no pawns attacking the ep square, ep is not possible.
+            if (pawns)
+            {
+                Square   ksq         = square<KING>(them);
+                Bitboard notBlockers = ~st->previous->blockersForKing[them];  // blockers as computed before this move
+                bool     noDiscovery = (from & notBlockers) || file_of(from) == file_of(ksq);
+                // If the pawn gives discovered check, ep is never legal. Else, if at least one
+                // pawn was not a blocker for the enemy king or lies on the same line as the
+                // enemy king and en passant square, a legal capture exists.
+                if (noDiscovery && (pawns & (notBlockers | line_bb(epSquare, ksq))))
+                {
+                    st->epSquare = epSquare;
+                    k ^= Zobrist::enpassant[file_of(epSquare)];
+                }
+            }
+        }
+        else if (m.type_of() == PROMOTION)
+        {
+            Piece     promotion     = make_piece(us, m.promotion_type());
+            PieceType promotionType = type_of(promotion);
+            assert(relative_rank(us, to) == RANK_8);
+            assert(type_of(promotion) >= KNIGHT && type_of(promotion) <= QUEEN);
+            swap_piece(to, promotion, &dts);  // replace the pawn already moved to 'to'
+            dp.add_pc = promotion;
+            dp.add_sq = to;
+            dp.to     = SQ_NONE;
+            // Update hash keys
+            // Zobrist::psq[pc][to] is zero, so we don't need to clear it
+            k ^= Zobrist::psq[promotion][to];
+            st->materialKey ^= Zobrist::psq[promotion][8 + pieceCount[promotion] - 1]
+                             ^ Zobrist::psq[pc][8 + pieceCount[pc]];
+            st->nonPawnKey[us] ^= Zobrist::psq[promotion][to];
+            if (promotionType <= BISHOP)
+                st->minorPieceKey ^= Zobrist::psq[promotion][to];
+            // Update material
+            st->nonPawnMaterial[us] += PieceValue[promotion];
+        }
+        // Update pawn hash key
+        st->pawnKey ^= Zobrist::psq[pc][from] ^ Zobrist::psq[pc][to];
+        // Reset rule 50 draw counter
+        st->rule50 = 0;
+    }
+    else
+    {
+        st->nonPawnKey[us] ^= Zobrist::psq[pc][from] ^ Zobrist::psq[pc][to];
+        if (type_of(pc) <= BISHOP)
+            st->minorPieceKey ^= Zobrist::psq[pc][from] ^ Zobrist::psq[pc][to];
+    }
+    // Update the key with the final value
+    st->key = k;
+    if (tt)
+        prefetch(tt->first_entry(key()));
+    if (history)
+    {
+        prefetch(&history->pawn_entry(*this)[pc][to]);
+        prefetch(&history->pawn_correction_entry(*this));
+        prefetch(&history->minor_piece_correction_entry(*this));
+        prefetch(&history->nonpawn_correction_entry<WHITE>(*this));
+        prefetch(&history->nonpawn_correction_entry<BLACK>(*this));
+    }
+    // Set capture piece
+    st->capturedPiece = captured;
+    // Calculate checkers bitboard (if move gives check)
+    st->checkersBB = givesCheck ? attackers_to(square<KING>(them)) & pieces(us) : 0;
+    sideToMove = ~sideToMove;  // from here on the opponent is the side to move
+    // Update king attacks used for fast check detection
+    set_check_info();
+    // Calculate the repetition info. It is the ply distance from the previous
+    // occurrence of the same position, negative in the 3-fold case, or zero
+    // if the position was not repeated.
+    st->repetition = 0;
+    int end        = std::min(st->rule50, st->pliesFromNull);
+    if (end >= 4)
+    {
+        StateInfo* stp = st->previous->previous;
+        for (int i = 4; i <= end; i += 2)
+        {
+            stp = stp->previous->previous;  // step back two more plies
+            if (stp->key == st->key)
+            {
+                st->repetition = stp->repetition ? -i : i;  // negative if the earlier occurrence was itself a repetition
+                break;
+            }
+        }
+    }
+    dts.ksq = square<KING>(us);  // mover's king square after the move
+    assert(pos_is_ok());
+    assert(dp.pc != NO_PIECE);
+    assert(!(bool(captured) || m.type_of() == CASTLING) ^ (dp.remove_sq != SQ_NONE));
+    assert(dp.from != SQ_NONE);
+    assert(!(dp.add_sq != SQ_NONE) ^ (m.type_of() == PROMOTION || m.type_of() == CASTLING));
+}
+// Unmakes move m, which must be the last move made. When it returns, the board, side to
+// move, game ply and state pointer should be exactly as they were before the move was made.
+void Position::undo_move(Move m) {
+    assert(m.is_ok());
+    sideToMove = ~sideToMove;  // back to the side that made the move
+    Color  us   = sideToMove;
+    Square from = m.from_sq();
+    Square to   = m.to_sq();
+    Piece  pc   = piece_on(to);
+    assert(empty(from) || m.type_of() == CASTLING);
+    assert(type_of(st->capturedPiece) != KING);
+    if (m.type_of() == PROMOTION)
+    {
+        assert(relative_rank(us, to) == RANK_8);
+        assert(type_of(pc) == m.promotion_type());
+        assert(type_of(pc) >= KNIGHT && type_of(pc) <= QUEEN);
+        remove_piece(to);  // take the promoted piece off...
+        pc = make_piece(us, PAWN);
+        put_piece(pc, to);  // ...and put the pawn back on 'to'; it is moved to 'from' below
+    }
+    if (m.type_of() == CASTLING)
+    {
+        Square rfrom, rto;
+        do_castling<false>(us, from, to, rfrom, rto);  // the <false> instantiation reverts the castling
+    }
+    else
+    {
+        move_piece(to, from);  // Put the piece back at the source square
+        if (st->capturedPiece)
+        {
+            Square capsq = to;
+            if (m.type_of() == EN_PASSANT)
+            {
+                capsq -= pawn_push(us);  // the captured pawn stood one push behind 'to'
+                assert(type_of(pc) == PAWN);
+                assert(to == st->previous->epSquare);
+                assert(relative_rank(us, to) == RANK_6);
+                assert(piece_on(capsq) == NO_PIECE);
+                assert(st->capturedPiece == make_piece(~us, PAWN));
+            }
+            put_piece(st->capturedPiece, capsq);  // Restore the captured piece
+        }
+    }
+    // Finally point our state pointer back to the previous state
+    st = st->previous;
+    --gamePly;
+    assert(pos_is_ok());
+}
+// Appends one threat record to dts->list: piece pc on square s threatening
+// piece 'threatened' on threatenedSq, tagged with PutPiece. When PutPiece is
+// true, both squares are also merged into the dts->threateningSqs and
+// dts->threatenedSqs bitboards.
+template<bool PutPiece>
+inline void add_dirty_threat(
+  DirtyThreats* const dts, Piece pc, Piece threatened, Square s, Square threatenedSq) {
+    if constexpr (PutPiece)
+    {
+        dts->threateningSqs |= s;
+        dts->threatenedSqs |= threatenedSq;
+    }
+    dts->list.push_back({pc, threatened, s, threatenedSq, PutPiece});
+}
+#ifdef USE_AVX512ICL
+// Given a DirtyThreat template and bit offsets to insert the piece type and square, write the threats
+// present at the given bitboard. One record is produced per set bit of mask: dt_template with that square shifted in at SqShift and the piece found on that square shifted in at PcShift. mask must have at most 16 bits set.
+template<int SqShift, int PcShift>
+void write_multiple_dirties(const Position& p,  // position whose board supplies the piece on each square
+                            Bitboard        mask,  // squares to emit a record for
+                            DirtyThreat     dt_template,  // fields common to every emitted record
+                            DirtyThreats*   dts) {  // records are appended to dts->list
+    static_assert(sizeof(DirtyThreat) == 4);  // each record must fill exactly one 32-bit lane
+    const __m512i board      = _mm512_loadu_si512(p.piece_array().data());  // 64 bytes of board data; assumes Piece is one byte wide - TODO confirm
+    const __m512i AllSquares = _mm512_set_epi8(  // byte i holds the value i (arguments are listed from byte 63 down to byte 0)
+      63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47, 46, 45, 44, 43, 42, 41,
+      40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18,
+      17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+    const int dt_count = popcount(mask);
+    assert(dt_count <= 16);  // only 16 32-bit lanes are available
+    const __m512i template_v = _mm512_set1_epi32(dt_template.raw());  // template replicated into every lane
+    auto*         write      = dts->list.make_space(dt_count);  // destination for dt_count new records
+    // Extract the list of squares and upconvert to 32 bits. There are never more than 16
+    // incoming threats so this is sufficient.
+    __m512i threat_squares = _mm512_maskz_compress_epi8(mask, AllSquares);  // indices of the set bits, packed into the low bytes
+    threat_squares         = _mm512_cvtepi8_epi32(_mm512_castsi512_si128(threat_squares));  // low 16 bytes widened to 16 32-bit lanes
+    __m512i threat_pieces =
+      _mm512_maskz_permutexvar_epi8(0x1111111111111111ULL, threat_squares, board);  // board lookup by square; the mask keeps only the lowest byte of each lane
+    // Shift the piece and square into place
+    threat_squares = _mm512_slli_epi32(threat_squares, SqShift);
+    threat_pieces  = _mm512_slli_epi32(threat_pieces, PcShift);
+    const __m512i dirties =
+      _mm512_ternarylogic_epi32(template_v, threat_squares, threat_pieces, 254 /* A | B | C */);
+    _mm512_storeu_si512(write, dirties);  // NOTE(review): always stores 16 records (64 bytes) even when dt_count < 16; presumably the list keeps enough slack - confirm
+}
+#endif
+template<bool PutPiece, bool ComputeRay>  // Records in dts the threats changed by piece pc appearing on (PutPiece) or leaving (!PutPiece) square s; ComputeRay also records slider threats that pass through s
+void Position::update_piece_threats(Piece                     pc,  // the piece being put or removed
+                                    Square                    s,  // the square it is put on / removed from
+                                    DirtyThreats* const       dts,  // receives the threat records
+                                    [[maybe_unused]] Bitboard noRaysContaining) const {  // slider rays that contain every square of this set are not reported as discovered threats
+    const Bitboard occupied     = pieces();
+    const Bitboard rookQueens   = pieces(ROOK, QUEEN);
+    const Bitboard bishopQueens = pieces(BISHOP, QUEEN);
+    const Bitboard rAttacks     = attacks_bb<ROOK>(s, occupied);
+    const Bitboard bAttacks     = attacks_bb<BISHOP>(s, occupied);
+    const Bitboard kings        = pieces(KING);
+    Bitboard       occupiedNoK  = occupied ^ kings;  // kings are never recorded as threatened pieces
+    Bitboard sliders         = (rookQueens & rAttacks) | (bishopQueens & bAttacks);  // rooks/bishops/queens whose line of attack reaches s
+    auto     process_sliders = [&](bool addDirectAttacks) {  // consumes 'sliders'; addDirectAttacks also records slider -> pc
+        while (sliders)
+        {
+            Square sliderSq = pop_lsb(sliders);
+            Piece  slider   = piece_on(sliderSq);
+            const Bitboard ray        = RayPassBB[sliderSq][s];  // presumably the squares beyond s on the line from sliderSq through s - confirm against RayPassBB's definition
+            const Bitboard discovered = ray & (rAttacks | bAttacks) & occupiedNoK;  // non-king piece on that ray which is also seen from s
+            assert(!more_than_one(discovered));
+            if (discovered && (RayPassBB[sliderSq][s] & noRaysContaining) != noRaysContaining)
+            {
+                const Square threatenedSq = lsb(discovered);
+                const Piece  threatenedPc = piece_on(threatenedSq);
+                add_dirty_threat<!PutPiece>(dts, slider, threatenedPc, sliderSq, threatenedSq);  // opposite sign: putting pc on s removes this threat, removing pc adds it
+            }
+            if (addDirectAttacks)
+                add_dirty_threat<PutPiece>(dts, slider, pc, sliderSq, s);
+        }
+    };
+    if (type_of(pc) == KING)  // for a king only the slider rays through s are processed; no direct threats are recorded
+    {
+        if constexpr (ComputeRay)
+            process_sliders(false);
+        return;
+    }
+    const Bitboard knights    = pieces(KNIGHT);
+    const Bitboard whitePawns = pieces(WHITE, PAWN);
+    const Bitboard blackPawns = pieces(BLACK, PAWN);
+    Bitboard threatened = attacks_bb(pc, s, occupied) & occupiedNoK;  // non-king pieces that pc attacks from s
+    Bitboard incoming_threats =  // knights, pawns and kings that attack s
+      (PseudoAttacks[KNIGHT][s] & knights) | (attacks_bb<PAWN>(s, WHITE) & blackPawns)
+      | (attacks_bb<PAWN>(s, BLACK) & whitePawns) | (PseudoAttacks[KING][s] & kings);
+#ifdef USE_AVX512ICL
+    if constexpr (PutPiece)
+    {
+        dts->threatenedSqs |= threatened;
+        // A bit may only be set if that square actually produces a threat, so we
+        // must guard setting the square accordingly
+        dts->threateningSqs |= Bitboard(bool(threatened)) << s;
+    }
+    DirtyThreat dt_template{pc, NO_PIECE, s, Square(0), PutPiece};  // attacker side fixed to pc on s; the threatened piece and square are filled in per record
+    write_multiple_dirties<DirtyThreat::ThreatenedSqOffset, DirtyThreat::ThreatenedPcOffset>(
+      *this, threatened, dt_template, dts);
+    Bitboard all_attackers = sliders | incoming_threats;
+    if constexpr (PutPiece)
+    {
+        dts->threatenedSqs |= Bitboard(bool(all_attackers)) << s;  // same as above
+        dts->threateningSqs |= all_attackers;
+    }
+    dt_template = {NO_PIECE, pc, Square(0), s, PutPiece};  // threatened side fixed to pc on s; the attacking piece and square are filled in per record
+    write_multiple_dirties<DirtyThreat::PcSqOffset, DirtyThreat::PcOffset>(*this, all_attackers,
+                                                                           dt_template, dts);
+#else
+    while (threatened)  // one record per piece that pc threatens
+    {
+        Square threatenedSq = pop_lsb(threatened);
+        Piece  threatenedPc = piece_on(threatenedSq);
+        assert(threatenedSq != s);
+        assert(threatenedPc);
+        add_dirty_threat<PutPiece>(dts, pc, threatenedPc, s, threatenedSq);
+    }
+#endif
+    if constexpr (ComputeRay)
+    {
+#ifndef USE_AVX512ICL
+        process_sliders(true);
+#else  // for ICL, direct threats were processed earlier (all_attackers)
+        process_sliders(false);
+#endif
+    }
+    else
+    {
+        incoming_threats |= sliders;  // without ray handling, sliders are treated as plain direct attackers of s
+    }
+#ifndef USE_AVX512ICL
+    while (incoming_threats)  // one record per piece that threatens pc on s
+    {
+        Square srcSq = pop_lsb(incoming_threats);
+        Piece  srcPc = piece_on(srcSq);
+        assert(srcSq != s);
+        assert(srcPc != NO_PIECE);
+        add_dirty_threat<PutPiece>(dts, srcPc, pc, srcSq, s);
+    }
+#endif
+}
+// Shared implementation for making (Do == true) and unmaking (Do == false) a
+// castling move. On entry 'to' holds the rook's origin square, because
+// castling is encoded as "king captures friendly rook". On return 'to' is the
+// king's destination, and rfrom / rto are the rook's origin and destination.
+// When Do is true, dp must be non-null and receives the rook's part of the
+// move. Both pieces are lifted before either is put down, since the squares
+// involved can overlap in Chess960.
+template<bool Do>
+void Position::do_castling(Color               us,
+                           Square              from,
+                           Square&             to,
+                           Square&             rfrom,
+                           Square&             rto,
+                           DirtyThreats* const dts,
+                           DirtyPiece* const   dp) {
+    const bool isKingSide = to > from;
+    rfrom = to;
+    rto   = relative_square(us, isKingSide ? SQ_F1 : SQ_D1);
+    to    = relative_square(us, isKingSide ? SQ_G1 : SQ_C1);
+    assert(!Do || dp);
+    if constexpr (Do)
+    {
+        const Piece rook = make_piece(us, ROOK);
+        dp->to        = to;
+        dp->add_pc    = rook;
+        dp->remove_pc = rook;
+        dp->remove_sq = rfrom;
+        dp->add_sq    = rto;
+    }
+    // Where each piece currently stands and where it has to end up
+    const Square kingSrc = Do ? from : to;
+    const Square kingDst = Do ? to : from;
+    const Square rookSrc = Do ? rfrom : rto;
+    const Square rookDst = Do ? rto : rfrom;
+    remove_piece(kingSrc, dts);
+    remove_piece(rookSrc, dts);
+    put_piece(make_piece(us, KING), kingDst, dts);
+    put_piece(make_piece(us, ROOK), rookDst, dts);
+}
+// Plays a "null move": the side to move passes and nothing moves on the
+// board. newSt becomes the current state, chained to the previous one; it
+// must not be the state already in use. Must not be called while in check.
+void Position::do_null_move(StateInfo& newSt) {
+    assert(!checkers());
+    assert(&newSt != st);
+    // Start from a full copy of the current state and link it to its parent
+    std::memcpy(&newSt, st, sizeof(StateInfo));
+    newSt.previous = st;
+    // A pending en passant square does not survive the pass: take it out of
+    // the hash key and clear it.
+    if (newSt.epSquare != SQ_NONE)
+    {
+        newSt.key ^= Zobrist::enpassant[file_of(newSt.epSquare)];
+        newSt.epSquare = SQ_NONE;
+    }
+    newSt.key ^= Zobrist::side;
+    newSt.pliesFromNull = 0;
+    st         = &newSt;
+    sideToMove = ~sideToMove;
+    set_check_info();
+    st->repetition = 0;
+    assert(pos_is_ok());
+}
+// Reverts the most recent do_null_move(): gives the move back to the other
+// side and pops the state that the null move pushed.
+void Position::undo_null_move() {
+    assert(!checkers());
+    sideToMove = ~sideToMove;
+    st         = st->previous;
+}
+// Tests if the SEE (Static Exchange Evaluation)
+// value of move is greater or equal to the given threshold. We'll use an
+// algorithm similar to alpha-beta pruning with a null window.
+bool Position::see_ge(Move m, int threshold) const {  // returns true iff the exchange started by m on its target square is judged worth at least threshold
+    assert(m.is_ok());
+    // Only deal with normal moves, assume others pass a simple SEE
+    if (m.type_of() != NORMAL)
+        return VALUE_ZERO >= threshold;  // non-normal moves are treated as an exchange worth exactly zero
+    Square from = m.from_sq(), to = m.to_sq();
+    assert(piece_on(from) != NO_PIECE);
+    int swap = PieceValue[piece_on(to)] - threshold;  // margin if the piece on 'to' is won and nothing is lost
+    if (swap < 0)
+        return false;  // even winning the target for free does not reach the threshold
+    swap = PieceValue[piece_on(from)] - swap;  // what is still missing if the moving piece is then recaptured
+    if (swap <= 0)
+        return true;  // the threshold is met even if the moving piece is lost
+    assert(color_of(piece_on(from)) == sideToMove);
+    Bitboard occupied  = pieces() ^ from ^ to;  // xoring to is important for pinned piece logic
+    Color    stm       = sideToMove;
+    Bitboard attackers = attackers_to(to, occupied);
+    Bitboard stmAttackers, bb;
+    int      res = 1;  // running verdict, flipped each time the side to move finds a capture
+    while (true)
+    {
+        stm = ~stm;  // the other side now gets to recapture
+        attackers &= occupied;  // drop attackers that have already been used up
+        // If stm has no more attackers then give up: stm loses
+        if (!(stmAttackers = attackers & pieces(stm)))
+            break;
+        // Don't allow pinned pieces to attack as long as there are
+        // pinners on their original square.
+        if (pinners(~stm) & occupied)
+        {
+            stmAttackers &= ~blockers_for_king(stm);
+            if (!stmAttackers)
+                break;
+        }
+        res ^= 1;
+        // Locate and remove the next least valuable attacker, and add to
+        // the bitboard 'attackers' any X-ray attackers behind it.
+        if ((bb = stmAttackers & pieces(PAWN)))
+        {
+            if ((swap = PawnValue - swap) < res)
+                break;
+            occupied ^= least_significant_square_bb(bb);
+            attackers |= attacks_bb<BISHOP>(to, occupied) & pieces(BISHOP, QUEEN);  // a pawn capture can uncover a diagonal slider
+        }
+        else if ((bb = stmAttackers & pieces(KNIGHT)))
+        {
+            if ((swap = KnightValue - swap) < res)
+                break;
+            occupied ^= least_significant_square_bb(bb);  // no x-ray update is done after a knight capture
+        }
+        else if ((bb = stmAttackers & pieces(BISHOP)))
+        {
+            if ((swap = BishopValue - swap) < res)
+                break;
+            occupied ^= least_significant_square_bb(bb);
+            attackers |= attacks_bb<BISHOP>(to, occupied) & pieces(BISHOP, QUEEN);
+        }
+        else if ((bb = stmAttackers & pieces(ROOK)))
+        {
+            if ((swap = RookValue - swap) < res)
+                break;
+            occupied ^= least_significant_square_bb(bb);
+            attackers |= attacks_bb<ROOK>(to, occupied) & pieces(ROOK, QUEEN);
+        }
+        else if ((bb = stmAttackers & pieces(QUEEN)))
+        {
+            swap = QueenValue - swap;  // unlike the cases above there is no early break here, only the assert below
+            //  implies that the previous recapture was done by a higher rated piece than a Queen (King is excluded)
+            assert(swap >= res);
+            occupied ^= least_significant_square_bb(bb);
+            attackers |= (attacks_bb<BISHOP>(to, occupied) & pieces(BISHOP, QUEEN))
+                       | (attacks_bb<ROOK>(to, occupied) & pieces(ROOK, QUEEN));
+        }
+        else  // KING
+              // If we "capture" with the king but the opponent still has attackers,
+              // reverse the result.
+            return (attackers & ~pieces(stm)) ? res ^ 1 : res;
+    }
+    return bool(res);
+}
+// Returns true if the position is a draw by the 50-move rule or by
+// repetition (see is_repetition). Stalemate is not detected here.
+bool Position::is_draw(int ply) const {
+    // With the 50-move counter past 99, the position is a draw unless the
+    // side to move is in check and has no legal move.
+    const bool fiftyMoveDraw =
+      st->rule50 > 99 && (!checkers() || MoveList<LEGAL>(*this).size());
+    return fiftyMoveDraw || is_repetition(ply);
+}
+// Returns true if the position repeats once earlier but strictly after the
+// root, or repeats twice before or at the root. st->repetition holds the ply
+// distance to the previous occurrence (negative when that occurrence was
+// itself a repetition, zero when there is none), so the test is simply
+// "non-zero and smaller than ply".
+bool Position::is_repetition(int ply) const {
+    const int distance = st->repetition;
+    return distance != 0 && distance < ply;
+}
+// Returns true if any state reachable by walking back from the current one
+// is marked as a repetition. The walk is bounded by the number of plies since
+// the last capture or pawn move and since the last null move, and stops once
+// fewer than four plies remain.
+bool Position::has_repeated() const {
+    const StateInfo* cur = st;
+    for (int remaining = std::min(st->rule50, st->pliesFromNull); remaining >= 4; --remaining)
+    {
+        if (cur->repetition)
+            return true;
+        cur = cur->previous;
+    }
+    return false;
+}
+// Tests if the position has a move which draws by repetition.
+// This function accurately matches the outcome of is_draw() over all legal moves.
+bool Position::upcoming_repetition(int ply) const {  // ply is compared with the distance i to the earlier position to tell whether it lies after the root
+    int j;  // cuckoo table slot that matched
+    int end = std::min(st->rule50, st->pliesFromNull);  // how many plies back a repetition can lie
+    if (end < 3)
+        return false;  // too little history for a single move to reach an earlier position
+    Key        originalKey = st->key;
+    StateInfo* stp         = st->previous;
+    Key        other       = originalKey ^ stp->key ^ Zobrist::side;  // running XOR of per-ply key differences with the side-to-move key cancelled out
+    for (int i = 3; i <= end; i += 2)  // i is the distance in plies back to the candidate position
+    {
+        stp = stp->previous;
+        other ^= stp->key ^ stp->previous->key ^ Zobrist::side;
+        stp = stp->previous;  // stp now points at the state i plies back
+        if (other != 0)
+            continue;
+        Key moveKey = originalKey ^ stp->key;  // key difference between the current position and the one i plies back
+        if ((j = H1(moveKey), cuckoo[j] == moveKey) || (j = H2(moveKey), cuckoo[j] == moveKey))  // NOTE(review): cuckoo/cuckooMove are defined outside this chunk; presumably they map the key difference of a reversible move to that move - confirm
+        {
+            Move   move = cuckooMove[j];
+            Square s1   = move.from_sq();
+            Square s2   = move.to_sq();
+            if (!((between_bb(s1, s2) ^ s2) & pieces()))  // no piece on the squares selected by between_bb(s1, s2) ^ s2
+            {
+                if (ply > i)
+                    return true;  // the earlier position lies after the root
+                // For nodes before or at the root, check that the move is a
+                // repetition rather than a move to the current position.
+                if (stp->repetition)
+                    return true;
+            }
+        }
+    }
+    return false;
+}
+// Mirrors the position so that White and Black swap roles: the ranks are
+// reversed, piece letters and castling rights change case, the side to move
+// is toggled and an en passant square moves between ranks 3 and 6. The
+// position is then rebuilt from the resulting FEN. Only useful for debugging,
+// e.g. for finding evaluation symmetry bugs.
+void Position::flip() {
+    std::string       flipped, field;
+    std::stringstream fenStream(fen());
+    // Piece placement: read ranks 8..1 and prepend each one so the order reverses
+    Rank r = RANK_8;
+    while (true)
+    {
+        const char delimiter = r > RANK_1 ? '/' : ' ';
+        std::getline(fenStream, field, delimiter);
+        flipped.insert(0, field + (flipped.empty() ? " " : "/"));
+        if (r == RANK_1)
+            break;
+        --r;
+    }
+    // Active color, written in upper case because the case swap below turns
+    // it back into lower case
+    fenStream >> field;
+    flipped += (field == "w" ? "B " : "W ");
+    // Castling availability
+    fenStream >> field;
+    flipped += field + " ";
+    // Swap the case of everything collected so far
+    const auto swapCase = [](char c) { return char(islower(c) ? toupper(c) : tolower(c)); };
+    std::transform(flipped.begin(), flipped.end(), flipped.begin(), swapCase);
+    // En passant square: rank 3 <-> rank 6, "-" is kept as is
+    fenStream >> field;
+    if (field != "-")
+        field.replace(1, 1, field[1] == '3' ? "6" : "3");
+    flipped += field;
+    // Half and full move counters are appended unchanged
+    std::getline(fenStream, field);
+    flipped += field;
+    set(flipped, is_chess960(), st);
+    assert(pos_is_ok());
+}
+// Returns true if the material key stored in the current state equals the
+// one recomputed by compute_material_key().
+bool Position::material_key_is_ok() const {
+    const Key recomputed = compute_material_key();
+    return recomputed == st->materialKey;
+}
+// Performs some consistency checks for the position object
+// and raise an assert if something wrong is detected.
+// This is meant to be helpful when debugging.
+bool Position::pos_is_ok() const {  // every failure is reported through assert(0 && "..."); the function itself only ever returns true
+    constexpr bool Fast = true;  // Quick (default) or full check?
+    if ((sideToMove != WHITE && sideToMove != BLACK) || piece_on(square<KING>(WHITE)) != W_KING
+        || piece_on(square<KING>(BLACK)) != B_KING
+        || (ep_square() != SQ_NONE && relative_rank(sideToMove, ep_square()) != RANK_6))
+        assert(0 && "pos_is_ok: Default");
+    if (Fast)
+        return true;  // with Fast == true none of the checks below are run
+    if (pieceCount[W_KING] != 1 || pieceCount[B_KING] != 1
+        || attackers_to_exist(square<KING>(~sideToMove), pieces(), sideToMove))  // the side that just moved must not have its king attacked
+        assert(0 && "pos_is_ok: Kings");
+    if ((pieces(PAWN) & (Rank1BB | Rank8BB)) || pieceCount[W_PAWN] > 8 || pieceCount[B_PAWN] > 8)
+        assert(0 && "pos_is_ok: Pawns");
+    if (ep_square() != SQ_NONE)  // a stored en passant square must correspond to a capturable pawn
+    {
+        Square ksq = square<KING>(sideToMove);
+        Bitboard captured = (ep_square() + pawn_push(~sideToMove)) & pieces(~sideToMove, PAWN);  // the enemy pawn that would be taken
+        Bitboard pawns    = attacks_bb<PAWN>(ep_square(), ~sideToMove) & pieces(sideToMove, PAWN);  // own pawns able to capture onto the ep square
+        Bitboard potentialCheckers = pieces(~sideToMove) ^ captured;
+        if (!captured || !pawns  // also fails if, for both the lsb and the msb capturing pawn, the king would be attacked after the capture
+            || ((attackers_to(ksq, pieces() ^ captured ^ ep_square() ^ lsb(pawns))
+                 & potentialCheckers)
+                && (attackers_to(ksq, pieces() ^ captured ^ ep_square() ^ msb(pawns))
+                    & potentialCheckers)))
+            assert(0 && "pos_is_ok: En passant square");
+    }
+    if ((pieces(WHITE) & pieces(BLACK)) || (pieces(WHITE) | pieces(BLACK)) != pieces()
+        || popcount(pieces(WHITE)) > 16 || popcount(pieces(BLACK)) > 16)
+        assert(0 && "pos_is_ok: Bitboards");
+    for (PieceType p1 = PAWN; p1 <= KING; ++p1)  // no square may belong to two piece types
+        for (PieceType p2 = PAWN; p2 <= KING; ++p2)
+            if (p1 != p2 && (pieces(p1) & pieces(p2)))
+                assert(0 && "pos_is_ok: Bitboards");
+    for (Piece pc : Pieces)  // piece counts must agree with both the bitboards and the board array
+        if (pieceCount[pc] != popcount(pieces(color_of(pc), type_of(pc)))
+            || pieceCount[pc] != std::count(board.begin(), board.end(), pc))
+            assert(0 && "pos_is_ok: Pieces");
+    for (Color c : {WHITE, BLACK})
+        for (CastlingRights cr : {c & KING_SIDE, c & QUEEN_SIDE})
+        {
+            if (!can_castle(cr))
+                continue;
+            if (piece_on(castlingRookSquare[cr]) != make_piece(c, ROOK)  // a castling right requires its rook and consistent rights masks
+                || castlingRightsMask[castlingRookSquare[cr]] != cr
+                || (castlingRightsMask[square<KING>(c)] & cr) != cr)
+                assert(0 && "pos_is_ok: Castling");
+        }
+    assert(material_key_is_ok() && "pos_is_ok: materialKey");
+    return true;
+}
+}  // namespace Stockfish

src/position.h ADDED Viewed

	@@ -0,0 +1,414 @@

+/*
+  Stockfish, a UCI chess playing engine derived from Glaurung 2.1
+  Copyright (C) 2004-2026 The Stockfish developers (see AUTHORS file)
+  Stockfish is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+  Stockfish is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+  You should have received a copy of the GNU General Public License
+  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+#ifndef POSITION_H_INCLUDED
+#define POSITION_H_INCLUDED
+#include <array>
+#include <cassert>
+#include <deque>
+#include <iosfwd>
+#include <memory>
+#include <new>
+#include <string>
+#include "bitboard.h"
+#include "types.h"
+namespace Stockfish {
+class TranspositionTable;
+struct SharedHistories;
+// StateInfo struct stores information needed to restore a Position object to
+// its previous state when we retract a move. Whenever a move is made on the
+// board (by calling Position::do_move), a StateInfo object must be passed.
+struct StateInfo {  // NOTE(review): do_null_move copies this struct with std::memcpy, so the layout presumably has to stay trivially copyable - confirm before adding members
+    // Copied when making a move
+    Key    materialKey;  // checked against compute_material_key() by material_key_is_ok()
+    Key    pawnKey;  // returned by Position::pawn_key()
+    Key    minorPieceKey;  // returned by Position::minor_piece_key()
+    Key    nonPawnKey[COLOR_NB];  // per color, returned by Position::non_pawn_key(c)
+    Value  nonPawnMaterial[COLOR_NB];  // per color, returned by Position::non_pawn_material(c)
+    int    castlingRights;  // bit set tested by Position::can_castle(cr)
+    int    rule50;  // 50-move rule counter; is_draw() treats values above 99 as a draw
+    int    pliesFromNull;  // reset to 0 by do_null_move(); limits how far back repetitions are searched
+    Square epSquare;  // en passant square, SQ_NONE when there is none
+    // Not copied when making a move (will be recomputed anyhow)
+    Key        key;  // position hash; Position::key() returns it after adjust_key50()
+    Bitboard   checkersBB;  // returned by Position::checkers()
+    StateInfo* previous;  // state before this one; undo_move()/undo_null_move() step back through it
+    Bitboard   blockersForKing[COLOR_NB];  // returned by Position::blockers_for_king(c)
+    Bitboard   pinners[COLOR_NB];  // returned by Position::pinners(c)
+    Bitboard   checkSquares[PIECE_TYPE_NB];  // returned by Position::check_squares(pt)
+    Piece      capturedPiece;  // piece captured by the move leading here; put back by undo_move()
+    int        repetition;  // ply distance to the previous occurrence of this position, negative in the 3-fold case, 0 if not repeated
+};
+// A list to keep track of the position states along the setup moves (from the
+// start position to the position just before the search starts). Needed by
+// 'draw by repetition' detection. Use a std::deque because pointers to
+// elements are not invalidated when elements are added at either end (presumably what the StateInfo::previous links rely on).
+using StateListPtr = std::unique_ptr<std::deque<StateInfo>>;  // owning handle to the state list
+// Position class stores information regarding the board representation as
+// pieces, side to move, hash keys, castling info, etc. Important methods are
+// do_move() and undo_move(), used by the search to update node info when
+// traversing the search tree.
+class Position {
+   public:
+    static void init();
+    Position()                           = default;
+    Position(const Position&)            = delete;  // positions cannot be copied
+    Position& operator=(const Position&) = delete;
+    // FEN string input/output
+    Position&   set(const std::string& fenStr, bool isChess960, StateInfo* si);
+    Position&   set(const std::string& code, Color c, StateInfo* si);
+    std::string fen() const;
+    // Position representation
+    Bitboard pieces() const;  // All pieces
+    template<typename... PieceTypes>
+    Bitboard pieces(PieceTypes... pts) const;  // union of the given piece types, both colors
+    Bitboard pieces(Color c) const;  // all pieces of color c
+    template<typename... PieceTypes>
+    Bitboard                            pieces(Color c, PieceTypes... pts) const;  // pieces of color c with one of the given types
+    Piece                               piece_on(Square s) const;
+    const std::array<Piece, SQUARE_NB>& piece_array() const;  // the whole board array
+    Square                              ep_square() const;  // SQ_NONE when there is no en passant square
+    bool                                empty(Square s) const;
+    template<PieceType Pt>
+    int count(Color c) const;  // number of Pt pieces of color c
+    template<PieceType Pt>
+    int count() const;  // number of Pt pieces of both colors
+    template<PieceType Pt>
+    Square square(Color c) const;  // square of the only Pt piece of color c (asserts there is exactly one)
+    // Castling
+    bool   can_castle(CastlingRights cr) const;
+    bool   castling_impeded(CastlingRights cr) const;  // true if a piece stands on castlingPath[cr]
+    Square castling_rook_square(CastlingRights cr) const;
+    // Checking
+    Bitboard checkers() const;
+    Bitboard blockers_for_king(Color c) const;
+    Bitboard check_squares(PieceType pt) const;
+    Bitboard pinners(Color c) const;
+    // Attacks to/from a given square
+    Bitboard attackers_to(Square s) const;  // same as attackers_to(s, pieces())
+    Bitboard attackers_to(Square s, Bitboard occupied) const;
+    bool     attackers_to_exist(Square s, Bitboard occupied, Color c) const;
+    void     update_slider_blockers(Color c) const;
+    template<PieceType Pt>
+    Bitboard attacks_by(Color c) const;  // squares attacked by all Pt pieces of color c
+    // Properties of moves
+    bool  legal(Move m) const;
+    bool  pseudo_legal(const Move m) const;
+    bool  capture(Move m) const;  // true for en passant or a non-castling move onto an occupied square
+    bool  capture_stage(Move m) const;  // capture(m) or a queen promotion
+    bool  gives_check(Move m) const;
+    Piece moved_piece(Move m) const;  // piece standing on the move's from-square
+    Piece captured_piece() const;  // capturedPiece of the current state
+    // Doing and undoing moves
+    void do_move(Move m, StateInfo& newSt, const TranspositionTable* tt);  // convenience overload that computes gives_check(m) and uses the scratch dirty records
+    void do_move(Move                      m,
+                 StateInfo&                newSt,
+                 bool                      givesCheck,
+                 DirtyPiece&               dp,
+                 DirtyThreats&             dts,
+                 const TranspositionTable* tt,
+                 const SharedHistories*    worker);
+    void undo_move(Move m);
+    void do_null_move(StateInfo& newSt);
+    void undo_null_move();
+    // Static Exchange Evaluation
+    bool see_ge(Move m, int threshold = 0) const;
+    // Accessing hash keys
+    Key key() const;  // state key passed through adjust_key50()
+    Key material_key() const;
+    Key pawn_key() const;
+    Key minor_piece_key() const;
+    Key non_pawn_key(Color c) const;
+    // Other properties of the position
+    Color side_to_move() const;
+    int   game_ply() const;
+    bool  is_chess960() const;
+    bool  is_draw(int ply) const;  // 50-move rule or repetition; stalemate is not detected
+    bool  is_repetition(int ply) const;
+    bool  upcoming_repetition(int ply) const;
+    bool  has_repeated() const;
+    int   rule50_count() const;
+    Value non_pawn_material(Color c) const;
+    Value non_pawn_material() const;  // sum over both colors
+    // Position consistency check, for debugging
+    bool pos_is_ok() const;
+    bool material_key_is_ok() const;
+    void flip();
+    StateInfo* state() const;  // the current state pointer
+    void put_piece(Piece pc, Square s, DirtyThreats* const dts = nullptr);  // threat changes are recorded only when dts is non-null
+    void remove_piece(Square s, DirtyThreats* const dts = nullptr);
+    void swap_piece(Square s, Piece pc, DirtyThreats* const dts = nullptr);  // replaces the piece on s by pc
+   private:
+    // Initialization helpers (used while setting up a position)
+    void set_castling_right(Color c, Square rfrom);
+    Key  compute_material_key() const;
+    void set_state() const;
+    void set_check_info() const;
+    // Other helpers
+    template<bool PutPiece, bool ComputeRay = true>
+    void update_piece_threats(Piece               pc,
+                              Square              s,
+                              DirtyThreats* const dts,
+                              Bitboard            noRaysContaining = -1ULL) const;  // the default has all bits set
+    void move_piece(Square from, Square to, DirtyThreats* const dts = nullptr);
+    template<bool Do>
+    void do_castling(Color               us,
+                     Square              from,
+                     Square&             to,
+                     Square&             rfrom,
+                     Square&             rto,
+                     DirtyThreats* const dts = nullptr,
+                     DirtyPiece* const   dp  = nullptr);  // dp must be non-null when Do is true
+    Key  adjust_key50(Key k) const;
+    // Data members
+    std::array<Piece, SQUARE_NB>        board;  // piece on each square
+    std::array<Bitboard, PIECE_TYPE_NB> byTypeBB;  // per piece type; index ALL_PIECES holds the full occupancy
+    std::array<Bitboard, COLOR_NB>      byColorBB;  // per color
+    int          pieceCount[PIECE_NB];  // per piece; make_piece(c, ALL_PIECES) counts all pieces of color c
+    int          castlingRightsMask[SQUARE_NB];
+    Square       castlingRookSquare[CASTLING_RIGHT_NB];
+    Bitboard     castlingPath[CASTLING_RIGHT_NB];
+    StateInfo*   st;  // current state; not owned by the Position as far as this file shows
+    int          gamePly;
+    Color        sideToMove;
+    bool         chess960;
+    DirtyPiece   scratch_dp;  // throwaway outputs for the three-argument do_move overload
+    DirtyThreats scratch_dts;
+};
+// Stream output for a Position (declared here, defined elsewhere)
+std::ostream& operator<<(std::ostream& os, const Position& pos);
+// Color whose turn it is to move
+inline Color Position::side_to_move() const {
+    return sideToMove;
+}
+// Piece standing on square s (NO_PIECE if the square is empty)
+inline Piece Position::piece_on(Square s) const {
+    assert(is_ok(s));
+    return board[s];
+}
+// Read-only view of the whole board array
+inline const std::array<Piece, SQUARE_NB>& Position::piece_array() const {
+    return board;
+}
+// True if no piece stands on square s
+inline bool Position::empty(Square s) const {
+    const Piece occupant = piece_on(s);
+    return occupant == NO_PIECE;
+}
+// Piece standing on the from-square of move m
+inline Piece Position::moved_piece(Move m) const {
+    const Square origin = m.from_sq();
+    return piece_on(origin);
+}
+// Bitboard of all occupied squares
+inline Bitboard Position::pieces() const {
+    return byTypeBB[ALL_PIECES];
+}
+// Union of the bitboards of all given piece types, regardless of color
+template<typename... PieceTypes>
+inline Bitboard Position::pieces(PieceTypes... pts) const {
+    const Bitboard ofTypes = (byTypeBB[pts] | ...);
+    return ofTypes;
+}
+// All pieces of color c
+inline Bitboard Position::pieces(Color c) const {
+    return byColorBB[c];
+}
+// Pieces of color c whose type is one of the given piece types
+template<typename... PieceTypes>
+inline Bitboard Position::pieces(Color c, PieceTypes... pts) const {
+    const Bitboard ofColor = pieces(c);
+    const Bitboard ofTypes = pieces(pts...);
+    return ofColor & ofTypes;
+}
+// Number of pieces of type Pt that color c has on the board
+template<PieceType Pt>
+inline int Position::count(Color c) const {
+    const Piece pc = make_piece(c, Pt);
+    return pieceCount[pc];
+}
+// Number of pieces of type Pt on the board, both colors together
+template<PieceType Pt>
+inline int Position::count() const {
+    const int white = count<Pt>(WHITE);
+    const int black = count<Pt>(BLACK);
+    return white + black;
+}
+// Square of the single piece of type Pt that color c owns. Asserts that
+// there is exactly one such piece.
+template<PieceType Pt>
+inline Square Position::square(Color c) const {
+    assert(count<Pt>(c) == 1);
+    const Bitboard only = pieces(c, Pt);
+    return lsb(only);
+}
+// En passant square of the current state (SQ_NONE if there is none)
+inline Square Position::ep_square() const {
+    return st->epSquare;
+}
+// True if any of the castling rights in cr is still set in the current state
+inline bool Position::can_castle(CastlingRights cr) const {
+    return (st->castlingRights & cr) != 0;
+}
+// True if a piece stands on the castling path stored for cr. cr must be
+// exactly one of the four single castling rights.
+inline bool Position::castling_impeded(CastlingRights cr) const {
+    assert(cr == WHITE_OO || cr == WHITE_OOO || cr == BLACK_OO || cr == BLACK_OOO);
+    const Bitboard blockers = pieces() & castlingPath[cr];
+    return blockers != 0;
+}
+// Rook square stored for castling right cr. cr must be exactly one of the
+// four single castling rights.
+inline Square Position::castling_rook_square(CastlingRights cr) const {
+    assert(cr == WHITE_OO || cr == WHITE_OOO || cr == BLACK_OO || cr == BLACK_OOO);
+    return castlingRookSquare[cr];
+}
+// Attackers of square s, computed with the current occupancy
+inline Bitboard Position::attackers_to(Square s) const {
+    const Bitboard occupancy = pieces();
+    return attackers_to(s, occupancy);
+}
+// Union of the squares attacked by every piece of type Pt that color c owns,
+// computed with the current occupancy. Pawns are handled in one step through
+// pawn_attacks_bb; other piece types are accumulated piece by piece.
+template<PieceType Pt>
+inline Bitboard Position::attacks_by(Color c) const {
+    if constexpr (Pt == PAWN)
+    {
+        if (c == WHITE)
+            return pawn_attacks_bb<WHITE>(pieces(WHITE, PAWN));
+        return pawn_attacks_bb<BLACK>(pieces(BLACK, PAWN));
+    }
+    else
+    {
+        const Bitboard occupancy = pieces();
+        Bitboard       attacked  = 0;
+        for (Bitboard remaining = pieces(c, Pt); remaining;)
+            attacked |= attacks_bb<Pt>(pop_lsb(remaining), occupancy);
+        return attacked;
+    }
+}
+// The accessors below read fields of the current state (st) or of the position
+// Checkers bitboard of the current state
+inline Bitboard Position::checkers() const {
+    return st->checkersBB;
+}
+// blockersForKing entry of the current state for color c
+inline Bitboard Position::blockers_for_king(Color c) const {
+    return st->blockersForKing[c];
+}
+// pinners entry of the current state for color c
+inline Bitboard Position::pinners(Color c) const {
+    return st->pinners[c];
+}
+// checkSquares entry of the current state for piece type pt
+inline Bitboard Position::check_squares(PieceType pt) const {
+    return st->checkSquares[pt];
+}
+// Hash key of the position, adjusted for the 50-move counter
+inline Key Position::key() const {
+    return adjust_key50(st->key);
+}
+// Returns k unchanged while the 50-move counter is below 14; from 14 on, k
+// is mixed with a key derived from (rule50 - 14) / 8.
+inline Key Position::adjust_key50(Key k) const {
+    const int r50 = st->rule50;
+    if (r50 < 14)
+        return k;
+    return k ^ make_key((r50 - 14) / 8);
+}
+inline Key Position::pawn_key() const {
+    return st->pawnKey;
+}
+inline Key Position::material_key() const {
+    return st->materialKey;
+}
+inline Key Position::minor_piece_key() const {
+    return st->minorPieceKey;
+}
+inline Key Position::non_pawn_key(Color c) const {
+    return st->nonPawnKey[c];
+}
+// Non-pawn material of color c as stored in the current state
+inline Value Position::non_pawn_material(Color c) const {
+    return st->nonPawnMaterial[c];
+}
+// Non-pawn material of both colors added together
+inline Value Position::non_pawn_material() const {
+    const Value white = non_pawn_material(WHITE);
+    const Value black = non_pawn_material(BLACK);
+    return white + black;
+}
+inline int Position::game_ply() const {
+    return gamePly;
+}
+// Current value of the 50-move rule counter
+inline int Position::rule50_count() const {
+    return st->rule50;
+}
+inline bool Position::is_chess960() const {
+    return chess960;
+}
+// True if m is a capture: either an en passant move, or a non-castling move
+// whose destination square is occupied.
+inline bool Position::capture(Move m) const {
+    assert(m.is_ok());
+    const bool landsOnPiece = !empty(m.to_sq()) && m.type_of() != CASTLING;
+    return landsOnPiece || m.type_of() == EN_PASSANT;
+}
+// True if m belongs to the capture stage of move generation, which also
+// covers queen promotions. Must stay consistent with the capture stage move
+// generation to avoid generating duplicate moves.
+inline bool Position::capture_stage(Move m) const {
+    assert(m.is_ok());
+    if (capture(m))
+        return true;
+    return m.promotion_type() == QUEEN;
+}
+// Captured piece recorded in the current state
+inline Piece Position::captured_piece() const {
+    return st->capturedPiece;
+}
+// Places piece pc on square s: updates the board array, the type and color
+// bitboards and the piece counters. If dts is non-null, the resulting threat
+// changes are recorded afterwards, with the piece already on the board.
+inline void Position::put_piece(Piece pc, Square s, DirtyThreats* const dts) {
+    const Color owner = color_of(pc);
+    board[s] = pc;
+    byTypeBB[type_of(pc)] |= s;
+    byTypeBB[ALL_PIECES] |= byTypeBB[type_of(pc)];
+    byColorBB[owner] |= s;
+    ++pieceCount[pc];
+    ++pieceCount[make_piece(owner, ALL_PIECES)];
+    if (dts != nullptr)
+        update_piece_threats<true>(pc, s, dts);
+}
+// Takes the piece standing on square s off the board. If dts is non-null,
+// the threat changes are recorded first, while the piece is still in place;
+// then the bitboards, the board array and the piece counters are updated.
+inline void Position::remove_piece(Square s, DirtyThreats* const dts) {
+    const Piece pc = board[s];
+    if (dts != nullptr)
+        update_piece_threats<false>(pc, s, dts);
+    const Color owner = color_of(pc);
+    byTypeBB[type_of(pc)] ^= s;
+    byTypeBB[ALL_PIECES] ^= s;
+    byColorBB[owner] ^= s;
+    board[s] = NO_PIECE;
+    --pieceCount[pc];
+    --pieceCount[make_piece(owner, ALL_PIECES)];
+}
+// Moves the piece on 'from' to 'to' without touching the piece counters.
+// If dts is non-null, the threats of the piece are recorded as removed
+// before the board changes and as added afterwards; both calls pass the
+// from/to pair as noRaysContaining.
+inline void Position::move_piece(Square from, Square to, DirtyThreats* const dts) {
+    const Piece    mover  = board[from];
+    const Bitboard toggle = from | to;
+    if (dts != nullptr)
+        update_piece_threats<false>(mover, from, dts, toggle);
+    byTypeBB[ALL_PIECES] ^= toggle;
+    byTypeBB[type_of(mover)] ^= toggle;
+    byColorBB[color_of(mover)] ^= toggle;
+    board[from] = NO_PIECE;
+    board[to]   = mover;
+    if (dts != nullptr)
+        update_piece_threats<true>(mover, to, dts, toggle);
+}
+// Replaces whatever piece stands on square s with pc. The removal and the
+// placement are done through remove_piece/put_piece without passing dts on;
+// when dts is non-null the threat updates are issued here instead, using the
+// <false, false> / <true, false> instantiations of update_piece_threats.
+inline void Position::swap_piece(Square s, Piece pc, DirtyThreats* const dts) {
+    const Piece previous = board[s];
+    // Take the old piece off first, then report its removal.
+    remove_piece(s);
+    if (dts != nullptr)
+    {
+        update_piece_threats<false, false>(previous, s, dts);
+    }
+    // Put the new piece down, then report its arrival.
+    put_piece(pc, s);
+    if (dts != nullptr)
+    {
+        update_piece_threats<true, false>(pc, s, dts);
+    }
+}
+// Convenience overload of do_move: re-creates scratch_dts in place, computes
+// gives_check(m), and forwards to the full do_move overload using the
+// scratch_dp / scratch_dts members and a null last argument.
+inline void Position::do_move(Move m, StateInfo& newSt, const TranspositionTable* tt = nullptr) {
+    // Placement new default-initializes a fresh DirtyThreats in scratch_dts.
+    new (&scratch_dts) DirtyThreats;
+    const bool checking = gives_check(m);
+    do_move(m, newSt, checking, scratch_dp, scratch_dts, tt, nullptr);
+}
+// Returns the pointer to the current StateInfo (st).
+inline StateInfo* Position::state() const {
+    return st;
+}
+}  // namespace Stockfish
+#endif  // #ifndef POSITION_H_INCLUDED

src/score.cpp ADDED Viewed

	@@ -0,0 +1,48 @@

+/*
+  Stockfish, a UCI chess playing engine derived from Glaurung 2.1
+  Copyright (C) 2004-2026 The Stockfish developers (see AUTHORS file)
+  Stockfish is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+  Stockfish is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+  You should have received a copy of the GNU General Public License
+  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+#include "score.h"
+#include <cassert>
+#include <cmath>
+#include <cstdlib>
+#include "uci.h"
+namespace Stockfish {
+// Builds a Score from an internal value v, which must lie strictly between
+// -VALUE_INFINITE and VALUE_INFINITE.
+//  - Non-decisive values are stored as InternalUnits via UCIEngine::to_cp.
+//  - Decisive values with |v| <= VALUE_TB are stored as Tablebase, holding the
+//    distance VALUE_TB - |v| (negated for v <= 0) and a flag that is true for v > 0.
+//  - All remaining values are stored as Mate, holding the distance
+//    VALUE_MATE - |v| (negated for v <= 0).
+Score::Score(Value v, const Position& pos) {
+    assert(-VALUE_INFINITE < v && v < VALUE_INFINITE);
+    if (!is_decisive(v))
+    {
+        score = InternalUnits{UCIEngine::to_cp(v, pos)};
+        return;
+    }
+    const auto magnitude = std::abs(v);
+    const bool positive  = v > 0;
+    if (magnitude <= VALUE_TB)
+    {
+        const auto gap = VALUE_TB - magnitude;
+        if (positive)
+            score = Tablebase{gap, true};
+        else
+            score = Tablebase{-gap, false};
+    }
+    else
+    {
+        const auto gap = VALUE_MATE - magnitude;
+        if (positive)
+            score = Mate{gap};
+        else
+            score = Mate{-gap};
+    }
+}
+}