Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
976bb79
MPI Replay: remove print_surrogate_stats() to compile cleanly
kevinabrown Jun 17, 2024
7f42f4a
director-b: started adding director LP for mpi-replay
kevinabrown Jul 5, 2024
5ae2e7c
director-b: complete initial director LP prototype for mpi-replay
kevinabrown Aug 16, 2024
138f46a
zmqml: update zmq server and requester to interface with director LP
kevinabrown Aug 16, 2024
39fbc4f
Fix zmq and ROSS compilation issues
sanjaychari May 21, 2026
0651b5e
Fix torch-jit compilation
sanjaychari May 21, 2026
01a2b16
Allow cpu-based PyTorch usage
sanjaychari May 21, 2026
51f691b
Add ML models
sanjaychari May 22, 2026
e42e75a
Move ML models to surrogate directory
sanjaychari May 27, 2026
4a50395
surrogate: support LP-aware Torch-JIT latency model
sanjaychari May 27, 2026
5534e2c
surrogate: add Torch-JIT debug diagnostics
sanjaychari May 27, 2026
3c3f78d
Improve debug print check mechanism
sanjaychari May 27, 2026
399ec0f
Add LP-type-aware Torch-JIT surrogate models
sanjaychari May 28, 2026
47d9d84
Make zeromq dependency handling robust
sanjaychari Jun 1, 2026
81deb94
Fix installation with swm and union
sanjaychari Jun 2, 2026
36f8486
Add Zeromq director example configs
sanjaychari Jun 2, 2026
d970c3e
Add requirements.txt file for surrogate
sanjaychari Jun 3, 2026
aea6585
Add iteration time ML model for zmq-based director
sanjaychari Jun 4, 2026
06f1fb9
Add optimistic RC for ZeroMQ Director
sanjaychari Jun 4, 2026
9bc7aa1
Add flag to shutdown zeromq server after a run
sanjaychari Jun 5, 2026
5d6fb19
Introduce pretraining for zmq hybrid simulations
sanjaychari Jun 8, 2026
76545e9
Zmq Director: Fix unmatched sends/recvs issue
sanjaychari Jun 9, 2026
1e04405
Add template conf file for milc
sanjaychari Jun 9, 2026
a2cd82b
Add Director retraining workflow
sanjaychari Jun 9, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,8 @@
/maint/codes.pc
/test-driver
.deps
src/surrogate/zmqml/demozmqmlrequester
src/surrogate/zmqml/libzmqmlrequester.so

# make generated artifacts
.dirstamp
Expand All @@ -44,3 +46,4 @@ install-mastiff/include/codes/model-net-method.h
/build*
.cache
compile_commands.json
__pycache__/
249 changes: 234 additions & 15 deletions CODES-compile-instructions.sh
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
#!/usr/bin bash -x
#!/usr/bin/env bash
set -euo pipefail
set -x

# Switches
swm_enable=1
union_enable=1
torch_enable=0
swm_enable=0
union_enable=0
torch_enable=1

# Uncomment below for MPICH
#export PATH=/usr/local/mpich-4.1.2/bin/:"$PATH"
Expand All @@ -19,30 +21,53 @@ fi
# What to compile
CUR_DIR="$PWD"



##### Downloading everything #####

git clone https://github.com/codes-org/codes --depth=100 --branch=v1.5.0
git clone https://github.com/ross-org/ross --depth=100 --branch=v8.1.0
# Clone a dependency unless a git checkout of it is already present.
#   $1    - checkout directory, relative to the current directory
#   $2... - arguments forwarded verbatim to `git clone`
_clone_unless_present() {
  local checkout="$1"
  shift
  if [ -d "${checkout}/.git" ]; then
    echo "Using existing ${checkout} checkout: $(realpath "${checkout}")"
    return 0
  fi
  git clone "$@"
}

# Always-required sources.
_clone_unless_present codes https://github.com/codes-org/codes --depth=100 --branch=v1.5.0
_clone_unless_present ross https://github.com/ross-org/ross --depth=100 --branch=v8.1.0

# Sources needed only when SWM workloads are enabled.
if [ "$swm_enable" = 1 ]; then
  _clone_unless_present argobots https://github.com/pmodels/argobots --depth=1
  _clone_unless_present swm-workloads https://github.com/codes-org/swm-workloads --branch=v1.2
fi

if [ $union_enable = 1 ]; then
# Downloading conceptual
curl -L https://sourceforge.net/projects/conceptual/files/conceptual/1.5.1b/conceptual-1.5.1b.tar.gz -o conceptual-1.5.1b.tar.gz
tar xvf conceptual-1.5.1b.tar.gz
# Downloading union
git clone https://github.com/SPEAR-UIC/Union
if [ ! -d Union/.git ]; then
git clone https://github.com/SPEAR-UIC/Union
else
echo "Using existing Union checkout: $(realpath Union)"
fi
pushd Union && git checkout 99b3df3 && popd
fi

##### COMPILING #####

mkdir ross/build
mkdir -p ross/build
pushd ross/build
cmake .. -DROSS_BUILD_MODELS=ON -DCMAKE_INSTALL_PREFIX="$(realpath ./bin)" \
cmake .. -DCMAKE_C_COMPILER=mpicc -DCMAKE_CXX_COMPILER=mpicxx -DROSS_BUILD_MODELS=ON -DCMAKE_INSTALL_PREFIX="$(realpath ./bin)" \
-DCMAKE_C_COMPILER=mpicc -DCMAKE_BUILD_TYPE=Debug -DCMAKE_C_FLAGS="-g -Wall"
#make VERBOSE=1
make install -j4
Expand All @@ -53,7 +78,7 @@ popd
if [ $swm_enable = 1 ]; then
pushd swm-workloads/swm
./prepare.sh
mkdir build
mkdir -p build
pushd build
../configure --disable-shared --prefix="$(realpath ./bin)" CC=mpicc CXX=mpicxx CFLAGS=-g CXXFLAGS=-g
#make V=1 && make install
Expand All @@ -64,7 +89,7 @@ if [ $swm_enable = 1 ]; then

pushd argobots
./autogen.sh
mkdir build
mkdir -p build
pushd build
#../configure --enable-debug=all --disable-fast --disable-shared --prefix="$(realpath ./bin)" CC=mpicc CXX=mpicxx CFLAGS=-g CXXFLAGS=-g
../configure --disable-shared --prefix="$(realpath ./bin)" CC=mpicc CXX=mpicxx CFLAGS=-g CXXFLAGS=-g
Expand Down Expand Up @@ -97,16 +122,186 @@ if [ $union_enable = 1 ]; then
fi


mkdir codes/build

# Make system pkg-config metadata visible even when Conda's pkg-config is active.
# This is needed for libzmq.pc on systems where ZeroMQ is installed through the OS
# but the active Conda environment's pkg-config only searches Conda pkgconfig dirs.
#
# Fallback directories are appended (not prepended) so that:
#   - entries the user already placed in PKG_CONFIG_PATH keep priority, and
#   - the directories are searched in the order they are listed below.
# The ${VAR:+...} form avoids leaving an empty path element when
# PKG_CONFIG_PATH was previously unset or empty.
if ! pkg-config --exists libzmq 2>/dev/null; then
  for pcdir in \
    /usr/lib/x86_64-linux-gnu/pkgconfig \
    /usr/lib64/pkgconfig \
    /usr/lib/pkgconfig \
    /usr/local/lib/pkgconfig \
    /usr/local/lib64/pkgconfig \
    /opt/homebrew/lib/pkgconfig \
    /usr/share/pkgconfig
  do
    if [ -d "$pcdir" ]; then
      export PKG_CONFIG_PATH="${PKG_CONFIG_PATH:+${PKG_CONFIG_PATH}:}${pcdir}"
    fi
  done
fi

# Re-check after extending the search path; this is only a warning because the
# ZMQML build itself will fail with a concrete error if libzmq is truly missing.
if ! pkg-config --exists libzmq 2>/dev/null; then
  echo "WARNING: pkg-config still cannot find libzmq.pc." >&2
  echo " If ZMQML fails to build, install the ZeroMQ development package" >&2
  echo " or set PKG_CONFIG_PATH to the directory containing libzmq.pc." >&2
fi

# Build local ZMQML requester library required by director-client.C
pushd codes/src/surrogate/zmqml
make clean
make
# Verify the files the CODES build consumes are present, and say which one is
# missing instead of aborting silently on a bare `test -f` under `set -e`.
for zmqml_artifact in libzmqmlrequester.so zmqmlrequester.h; do
  if [ ! -f "${zmqml_artifact}" ]; then
    echo "ERROR: expected file missing after ZMQML build: ${PWD}/${zmqml_artifact}" >&2
    exit 1
  fi
done
popd

# Make imported zmqmlrequester target visible to doc/example and tests.
# The patch is idempotent: it rewrites the target declaration once, does
# nothing if it is already GLOBAL, and aborts if neither form is found.
python3 - <<'INNERPY'
from pathlib import Path

# Path is relative to the directory the script is run from.
cm = Path("codes/src/CMakeLists.txt")
text = cm.read_text()

# An IMPORTED target is only visible in the directory that creates it (and
# below) unless it is declared GLOBAL.
old = "add_library(zmqmlrequester SHARED IMPORTED )"
new = "add_library(zmqmlrequester SHARED IMPORTED GLOBAL)"

if old in text:
    cm.write_text(text.replace(old, new))
elif new not in text:
    raise SystemExit("Could not find zmqmlrequester imported target line in codes/src/CMakeLists.txt")
INNERPY

mkdir -p codes/build
pushd codes/build

# Convert an nvidia-smi compute capability string into a
# CMAKE_CUDA_ARCHITECTURES value (e.g. "8.6" -> "86").
#   $1 - raw compute capability text (may be empty)
# Prints the digits-only value, or nothing when the input is not numeric
# (for example "[N/A]"), so the caller can fall back to a default.
# Always returns 0 so it is safe inside $(...) under `set -e`.
_normalize_cuda_arch() {
  local raw="${1:-}"
  raw="${raw//[[:space:].]/}"
  if [[ "${raw}" =~ ^[0-9]+$ ]]; then
    printf '%s\n' "${raw}"
  fi
  return 0
}

# Torch discovery results; they stay empty when torch_enable != 1.
torch_cmake_prefix=""
torch_dir=""

if [ "$torch_enable" = 1 ]; then
  # Ask the active Python environment where PyTorch ships its CMake package.
  torch_cmake_prefix="$(python3 - <<'INNERPY'
import torch
print(torch.utils.cmake_prefix_path)
INNERPY
)"
  torch_dir="${torch_cmake_prefix}/Torch"

  if [ ! -f "${torch_dir}/TorchConfig.cmake" ]; then
    echo "ERROR: TorchConfig.cmake not found at: ${torch_dir}/TorchConfig.cmake" >&2
    echo " torch.utils.cmake_prefix_path returned: ${torch_cmake_prefix}" >&2
    exit 1
  fi

  echo "Using Torch CMake prefix: ${torch_cmake_prefix}"
  echo "Using Torch_DIR: ${torch_dir}"

  # CUDA is intentionally opt-in.
  # Default to CPU-only Torch-JIT compilation unless CUDA_HOME is explicitly set.
  #
  # To enable CUDA, run for example:
  # export CUDA_HOME=/usr/local/cuda-12.4
  # ./CODES-compile-instructions.sh
  torch_cuda_version="$(python3 - <<'INNERPY'
import torch
print(torch.version.cuda or "")
INNERPY
)"

  cuda_arch=""

  # A CUDA-enabled PyTorch without CUDA_HOME is rejected up front rather than
  # letting the CMake configure step fail later with a less obvious error.
  if [ -z "${CUDA_HOME:-}" ] && [ -n "${torch_cuda_version}" ]; then
    echo "ERROR: CUDA_HOME is not set, so this script is defaulting to CPU-only Torch-JIT compilation." >&2
    echo " However, the active Python environment has a CUDA-enabled PyTorch build:" >&2
    echo " torch.version.cuda=${torch_cuda_version}" >&2
    echo "" >&2
    echo " CMake cannot use a CUDA-enabled PyTorch package as a CPU-only LibTorch package." >&2
    echo " Choose one of the following:" >&2
    echo " 1. For CPU-only compilation, install a CPU-only PyTorch build in this environment." >&2
    echo " 2. For CUDA compilation, export CUDA_HOME to your CUDA toolkit root." >&2
    echo "" >&2
    echo " Example CUDA build:" >&2
    echo " export CUDA_HOME=/usr/local/cuda-12.4" >&2
    echo " bash CODES-compile-instructions.sh" >&2
    exit 1
  fi

  if [ -n "${CUDA_HOME:-}" ]; then
    # Validate that CUDA_HOME looks like a complete toolkit before using it.
    if [ ! -f "${CUDA_HOME}/include/cuda_runtime_api.h" ]; then
      echo "ERROR: CUDA_HOME is set, but missing CUDA header: ${CUDA_HOME}/include/cuda_runtime_api.h" >&2
      exit 1
    fi

    if [ ! -f "${CUDA_HOME}/lib64/libcudart.so" ] && [ ! -f "${CUDA_HOME}/lib/libcudart.so" ]; then
      echo "ERROR: CUDA_HOME is set, but missing CUDA runtime library under ${CUDA_HOME}/lib64 or ${CUDA_HOME}/lib" >&2
      exit 1
    fi

    if [ ! -x "${CUDA_HOME}/bin/nvcc" ]; then
      echo "ERROR: CUDA_HOME is set, but missing CUDA compiler: ${CUDA_HOME}/bin/nvcc" >&2
      exit 1
    fi

    if [ ! -d "${CUDA_HOME}/nvvm/libdevice" ]; then
      echo "ERROR: CUDA_HOME is set, but missing CUDA libdevice directory: ${CUDA_HOME}/nvvm/libdevice" >&2
      exit 1
    fi

    # Use the first GPU's compute capability. `|| true` keeps a failing
    # nvidia-smi from aborting the script under `set -e -o pipefail`.
    if command -v nvidia-smi >/dev/null 2>&1; then
      cuda_arch_raw="$(nvidia-smi --query-gpu=compute_cap --format=csv,noheader 2>/dev/null | head -n1 || true)"
      cuda_arch="$(_normalize_cuda_arch "${cuda_arch_raw}")"
    fi

    if [ -z "${cuda_arch}" ]; then
      echo "WARNING: Could not auto-detect GPU compute capability with nvidia-smi." >&2
      echo " Falling back to CMAKE_CUDA_ARCHITECTURES=80." >&2
      cuda_arch="80"
    fi

    export CUDA_HOME
    export CUDA_PATH="${CUDA_HOME}"
    export CUDA_ROOT="${CUDA_HOME}"
    export CUDA_TOOLKIT_ROOT_DIR="${CUDA_HOME}"
    export CUDAToolkit_ROOT="${CUDA_HOME}"
    export CUDACXX="${CUDA_HOME}/bin/nvcc"
    export PATH="${CUDA_HOME}/bin:${PATH}"
    # Only add the ':' separator when LD_LIBRARY_PATH already has content: an
    # empty element makes the dynamic loader search the current directory.
    export LD_LIBRARY_PATH="${CUDA_HOME}/lib64:${CUDA_HOME}/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}"

    echo "CUDA_HOME is set; enabling CUDA Torch-JIT compilation."
    echo "Using CUDA_HOME: ${CUDA_HOME}"
    echo "Using CUDACXX: ${CUDACXX}"
    echo "Using CMAKE_CUDA_ARCHITECTURES=${cuda_arch}"
  else
    echo "CUDA_HOME is not set; forcing CPU-only Torch-JIT compilation."

    # Prevent accidental CUDA discovery from /usr/local/cuda, nvcc on PATH,
    # inherited CMake cache variables, or CUDA-enabled PyTorch metadata.
    unset CUDA_HOME
    unset CUDA_PATH
    unset CUDA_ROOT
    unset CUDA_TOOLKIT_ROOT_DIR
    unset CUDAToolkit_ROOT
    unset CUDACXX
    unset CMAKE_CUDA_COMPILER
  fi
fi

cmake_prefix_path="$(realpath "$CUR_DIR/ross/build/bin")"
if [ "$torch_enable" = 1 ]; then
cmake_prefix_path="${cmake_prefix_path};${torch_cmake_prefix}"
fi

# Base list of -D definitions for configuring CODES, kept as a bash array so
# each definition stays a single word even when its value contains spaces.
# Presumably expanded into the CODES cmake invocation later in the script
# (not visible in this section).
make_args_codes=(
# NOTE(review): two -DCMAKE_PREFIX_PATH definitions follow. They look like the
# before/after lines of a diff; confirm only the second (which also carries
# the Torch prefix when torch_enable=1) is meant to remain.
-DCMAKE_PREFIX_PATH="$(realpath "$CUR_DIR/ross/build/bin")"
-DCMAKE_PREFIX_PATH="${cmake_prefix_path}"
-DCMAKE_CXX_COMPILER=mpicxx -DCMAKE_C_COMPILER=mpicc
-DCMAKE_C_FLAGS="-g -Wall"
-DCMAKE_CXX_FLAGS="-g -Wall"
# NOTE(review): the next five definitions look like CMake thread-detection
# result variables being pre-set to force -pthread; confirm why normal
# detection has to be bypassed here.
-DTHREADS_PREFER_PTHREAD_FLAG=ON
-DCMAKE_THREAD_LIBS_INIT="-pthread"
-DCMAKE_HAVE_THREADS_LIBRARY=1
-DCMAKE_USE_PTHREADS_INIT=1
-DCMAKE_USE_WIN32_THREADS_INIT=0
-DCMAKE_BUILD_TYPE=Debug -DBUILD_TESTING=ON
# Evaluated relative to the current directory at this point in the script.
-DCMAKE_INSTALL_PREFIX="$(realpath bin)"
# Directory where the ZMQML requester library is built by this script.
-DZMQML_BUILD_PATH="$(realpath "$CUR_DIR/codes/src/surrogate/zmqml")"
# NOTE(review): the ZeroMQ locations below are hard-coded to one layout
# (/usr/include and /usr/lib/x86_64-linux-gnu). They will not match other
# locations this script's pkg-config fallback considers, such as /usr/lib64,
# /usr/local or /opt/homebrew. Confirm whether they should be derived from
# pkg-config instead.
-DZeroMQ_INCLUDE_DIR=/usr/include
-DZeroMQ_LIBRARY=/usr/lib/x86_64-linux-gnu/libzmq.so
)
if [ $swm_enable = 1 ]; then
make_args_codes=(
Expand All @@ -121,8 +316,32 @@ if [ $union_enable = 1 ]; then
-DUNION_PKG_CONFIG_PATH="$(realpath "$CUR_DIR/Union/install/lib/pkgconfig")"
)
fi
if [ $torch_enable = 1 ]; then
make_args_codes=("${make_args_codes[@]}" -DUSE_TORCH=true)
# Print the path of libcudart.so under a CUDA toolkit root.
#   $1 - CUDA toolkit root (the value of CUDA_HOME)
# Checks lib64 first and then lib. When neither contains the library it
# prints the lib64 path as the default guess.
_cudart_library_path() {
  local cuda_root="$1"
  local libdir
  for libdir in lib64 lib; do
    if [ -f "${cuda_root}/${libdir}/libcudart.so" ]; then
      printf '%s\n' "${cuda_root}/${libdir}/libcudart.so"
      return 0
    fi
  done
  printf '%s\n' "${cuda_root}/lib64/libcudart.so"
}

# Append the Torch-related definitions to the CODES cmake argument list.
if [ "$torch_enable" = 1 ]; then
  make_args_codes+=(
    -DUSE_TORCH=true
    -DTorch_DIR="${torch_dir}"
  )

  if [ -n "${CUDA_HOME:-}" ]; then
    # CUDA build: pin every CUDA lookup to the toolkit under CUDA_HOME. The
    # runtime library path is resolved rather than assumed to be lib64,
    # because the toolkit may ship libcudart.so under lib instead.
    make_args_codes+=(
      -DCUDA_TOOLKIT_ROOT_DIR="${CUDA_HOME}"
      -DCUDAToolkit_ROOT="${CUDA_HOME}"
      -DCUDA_PATH="${CUDA_HOME}"
      -DCUDA_ROOT="${CUDA_HOME}"
      -DCMAKE_CUDA_COMPILER="${CUDA_HOME}/bin/nvcc"
      -DCMAKE_CUDA_ARCHITECTURES="${cuda_arch}"
      -DCUDA_INCLUDE_DIRS="${CUDA_HOME}/include"
      -DCUDA_CUDART_LIBRARY="$(_cudart_library_path "${CUDA_HOME}")"
    )
  else
    # CPU-only build: stop CMake from picking up a CUDA toolkit by itself.
    make_args_codes+=(
      -DCMAKE_DISABLE_FIND_PACKAGE_CUDA=ON
      -DCMAKE_DISABLE_FIND_PACKAGE_CUDAToolkit=ON
    )
  fi
else
  make_args_codes+=(-DUSE_TORCH=false)
fi
Expand Down
Loading