def __init__(self, name: str, lib: bytes):
    """Copy the machine code in ``lib`` into executable memory and expose it as ``self.fxn``.

    Args:
        name: Program name. Not used by this constructor.
        lib: Raw machine code bytes; copied verbatim into a freshly allocated
            read/write/execute region whose start becomes the entry point.

    Raises:
        OSError: On Windows, when ``VirtualAlloc`` cannot provide the region.
    """
    size = len(lib)
    if sys.platform == "win32":
        PAGE_EXECUTE_READWRITE = 0x40
        MEM_COMMIT = 0x1000
        MEM_RESERVE = 0x2000
        kernel32 = ctypes.windll.kernel32
        # ctypes assumes a C int return by default; a pointer needs an explicit restype.
        kernel32.VirtualAlloc.restype = ctypes.c_void_p
        self.mem = kernel32.VirtualAlloc(
            ctypes.c_void_p(0), ctypes.c_size_t(size), MEM_COMMIT | MEM_RESERVE, PAGE_EXECUTE_READWRITE
        )
        # VirtualAlloc returns NULL on failure, which ctypes hands back as None. Fail here with the
        # system error instead of letting memmove fault on a null destination.
        if not self.mem:
            raise ctypes.WinError()
        ctypes.memmove(self.mem, lib, size)
        kernel32.GetCurrentProcess.restype = ctypes.c_void_p
        proc = kernel32.GetCurrentProcess()
        # Make sure the CPU does not execute stale instructions for the freshly written range.
        kernel32.FlushInstructionCache(ctypes.c_void_p(proc), ctypes.c_void_p(self.mem), ctypes.c_size_t(size))
        self.fxn = ctypes.CFUNCTYPE(None)(self.mem)
    else:
        from mmap import mmap, PROT_READ, PROT_WRITE, PROT_EXEC, MAP_ANON, MAP_PRIVATE
        # On apple silicon with SPRR enabled (it always is in macos) RWX pages are unrepresentable:
        # https://blog.svenpeter.dev/posts/m1_sprr_gxf/
        # MAP_JIT allows us to easily flip pages from RW- to R-X and vice versa.
        # It is a noop on intel cpus. (man pthread_jit_write_protect_np)
        self.mem = mmap(-1, size, MAP_ANON | MAP_PRIVATE | (MAP_JIT if OSX else 0), PROT_READ | PROT_WRITE | PROT_EXEC)

        if OSX:
            CPUProgram.rt_lib.pthread_jit_write_protect_np(False)
        try:
            self.mem.write(lib)
        finally:
            # Re-enable write protection even if the copy raises, so the thread is never left
            # with JIT pages writable.
            if OSX:
                CPUProgram.rt_lib.pthread_jit_write_protect_np(True)

        # __clear_cache isn't a normal libc function, but a compiler support routine found in
        # libgcc_s for gcc and compiler-rt for clang.
        # libgcc_s comes as shared library but compiler-rt is only a bunch of static library
        # archives which we can't directly load, but fortunately it somehow found its way into
        # libSystem on macos (likely because it used __builtin_clear_cache) and libgcc_s is
        # ~always present on linux.
        # Using ["name"] instead of .name because otherwise name is getting mangled:
        # https://docs.python.org/3.12/reference/expressions.html#index-5
        base = mv_address(self.mem)  # assumes mv_address is side-effect free, so one call suffices
        CPUProgram.rt_lib["__clear_cache"](ctypes.c_void_p(base), ctypes.c_void_p(base + size))

        self.fxn = ctypes.CFUNCTYPE(None)(base)
def __call__(self, *bufs, vals=(), wait=False):
    """Invoke the loaded native function with ``bufs`` followed by ``vals``.

    Args:
        *bufs: Leading positional arguments forwarded to the native function.
        vals: Further arguments appended after ``bufs``.
        wait: Forwarded to ``cpu_time_execution`` as ``enable``.

    Returns:
        Whatever ``cpu_time_execution`` returns for the call (presumably the
        elapsed time when ``wait`` is true; confirm against that helper).
    """
    call_args = [*bufs, *vals]
    # NOTE: replace this by --target={host's triple}-elf in clang args once we only support
    # macos sequoia and later.
    # Apple relaxes abi requirement for stack arguments to always be at least 8 byte aligned on arm64
    # https://developer.apple.com/documentation/xcode/writing-arm64-code-for-apple-platforms
    # This hack is required because clang/llvm bug doesn't allow us to just use
    # {host's triple}+'-elf' (relocation failures)
    # The bug was fixed in
    # https://github.com/llvm/llvm-project/commit/454cc36630296262cdb6360b60f90a64a97f7f1a
    # but was only backported to xcode 16+
    if platform.machine() == "arm64" and OSX:
        # Only arguments past the eighth are touched: plain ints there are widened to 64 bits.
        call_args[8:] = [
            ctypes.c_int64(value) if isinstance(value, int) else value
            for value in call_args[8:]
        ]

    def _run():
        return self.fxn(*call_args)

    return cpu_time_execution(_run, enable=wait)
def compile_cached(self, src: str) -> bytes:
    """Return the compiled form of ``src``, going through the disk cache when a key is set.

    With ``self.cachekey`` set, a cached entry for ``src`` is returned directly.
    Otherwise ``src`` is compiled with ``self.compile`` and, if a cache key is
    set, the result is stored before being returned.

    Args:
        src: Source text to compile.

    Returns:
        The compiled bytes, either from the cache or freshly built.

    Raises:
        AssertionError: If a compile is required while ``ASSERT_COMPILE`` is set.
    """
    if self.cachekey is not None:
        cached = diskcache_get(self.cachekey, src)
        if cached is not None:
            return cached

    # Cache miss or caching disabled: a real compile is needed from here on.
    assert not getenv("ASSERT_COMPILE"), f"tried to compile with ASSERT_COMPILE set\n{src}"
    compiled = self.compile(src)
    if self.cachekey is not None:
        diskcache_put(self.cachekey, src, compiled)
    return compiled