diff --git a/config/queue.csv b/config/queue.csv index 54a4d36..0a1125b 100644 --- a/config/queue.csv +++ b/config/queue.csv @@ -11,7 +11,9 @@ NQSV_AOBA_B,qsub,"-Z -v http_proxy,https_proxy,HTTP_PROXY,HTTPS_PROXY -q ${queue PJM_WISTERIA_O,pjsub,"-g jh260034o -L rscgrp=${queue_group},elapse=${elapse},node=${nodes} --mpi proc=${proc} --omp thread=${nthreads}" PJM_WISTERIA_A,pjsub,"-g jh260034a -L rscgrp=${queue_group},elapse=${elapse},node=${nodes} --mpi proc=${proc} --omp thread=${nthreads}" PBS_TSUKUBA,qsub,"-q ${queue_group} -l select=${nodes}:mpiprocs=${numproc_node}:ompthreads=${nthreads} -l walltime=${elapse}" -AGE_TSUBAME4,qsub,"-l ${queue_group}=${nodes} -l h_rt=${elapse}" +PBS_PEGASUS,qsub,"-q ${queue_group} -A CNTBENCH -l elapstim_req=${elapse} -v OMP_NUM_THREADS=${nthreads}" +PBS_SIRIUS,qsub,"-q ${queue_group} -A CNTBENCH -W group_list=CNTBENCH -l select=${nodes}:ncpus=24:mem=124gb:ngpus=1 -l walltime=${elapse}" +AGE_TSUBAME4,qsub,"-g jh260034 -l ${queue_group}=${nodes} -l h_rt=${elapse}" SLURM_CAMPHOR3,sbatch,"-p ${queue_group} -t ${elapse} --rsc p=${proc}:t=${nthreads}:c=${nthreads}:m=1G" NQSV_OSAKA_CPU,qsub,"-q ${queue_group} -b ${nodes} -l elapstim_req=${elapse},cpunum_job=${nthreads}" NQSV_OSAKA_GPU,qsub,"-q ${queue_group} -b ${nodes} -l elapstim_req=${elapse},cpunum_job=${nthreads},gpunum_job=${gpu_per_node}" diff --git a/config/system.csv b/config/system.csv index 5d3e0e0..42d759a 100644 --- a/config/system.csv +++ b/config/system.csv @@ -19,9 +19,9 @@ AOBA_B,cross,aoba_ab_login,aoba_ab_jacamar,NQSV_AOBA_B,lx AOBA_S,cross,aoba_s_login,aoba_s_jacamar,NQSV_AOBA_VE,sxs Odyssey,cross,wisteria_login,wisteria-o_jacamar,PJM_WISTERIA_O,short-o Aquarius,cross,wisteria_login,wisteria-a_jacamar,PJM_WISTERIA_A,short-a -Pegasus,cross,pegasus_login,pegasus_jacamar,PBS_TSUKUBA,regular -Sirius,cross,sirius_login,sirius_jacamar,PBS_TSUKUBA,regular -TSUBAME4,cross,tsubame4_login,tsubame4_jacamar,AGE_TSUBAME4,node_f +Pegasus,cross,pegasus_login,pegasus_jacamar,PBS_PEGASUS,gpu +Sirius,cross,sirius_login,sirius_jacamar,PBS_SIRIUS,mcrp +TSUBAME4,cross,tsubame4_login,tsubame4_jacamar,AGE_TSUBAME4,cpu_4 Camphor3,cross,camphor3_login,camphor3_jacamar,SLURM_CAMPHOR3,jha SQUID_CPU,cross,squid_login,squid_jacamar,NQSV_OSAKA_CPU,SQUID SQUID_GPU,cross,squid_login,squid_jacamar,NQSV_OSAKA_GPU,SQUID diff --git a/docs/guides/add-site.md b/docs/guides/add-site.md index db185ad..8340d07 100644 --- a/docs/guides/add-site.md +++ b/docs/guides/add-site.md @@ -109,6 +109,8 @@ ARM64 ログインノードでは `--arch arm64` を指定します。 - `--no-systemd` / `--no-start` - systemd user service を作らない、または作るだけで起動しない場合に使います +Jacamar-CI のビルドは、ログインノードのプロセス数・メモリ制限に当たりにくいよう、既定で `make -j1`、`GOMAXPROCS=1`、`GOFLAGS="-p=1 -gcflags=all=-dwarf=false"` を使います。余裕のある環境では `JACAMAR_BUILD_MAKE_JOBS`、`JACAMAR_BUILD_GOMAXPROCS`、`JACAMAR_BUILD_GOFLAGS` で上書きできます。 + このスクリプトは `config.toml` の `environment` に `PATH=$BASE_DIR/bin:...` を登録時点で入れるため、アーティファクト保存時に `gitlab-runner` が見つからない問題も避けられます。以下の手動手順は、スクリプトが失敗した場合の切り分けや、サイト固有に調整したい場合の参照として使ってください。 --- @@ -144,6 +146,7 @@ $BASE_DIR/ ├── custom-config.toml # Jacamar 設定ファイル ├── config.sh # カスタムランナー: config ├── prepare.sh # カスタムランナー: prepare +├── runner-env.sh # カスタムランナー: 共通環境初期化 ├── run.sh # カスタムランナー: run └── cleanup.sh # カスタムランナー: cleanup ``` @@ -202,8 +205,10 @@ cd jacamar-ci export CC=gcc export CXX=g++ export CGO_ENABLED=1 +export GOMAXPROCS=1 +export GOFLAGS="-p=1 -gcflags=all=-dwarf=false" -make build +make -j1 build make install PREFIX="$BASE_DIR" # 後片付け @@ -233,7 +238,9 @@ git clone https://gitlab.com/ecp-ci/jacamar-ci.git cp tools.go jacamar-ci/internal/executors/pbs/ cd jacamar-ci -make build +export GOMAXPROCS=1 +export GOFLAGS="-p=1 -gcflags=all=-dwarf=false" +make -j1 build make install PREFIX="$BASE_DIR" ``` @@ -290,7 +297,9 @@ export CPATH="${SEC_PREFIX}/include:${CPATH:-}" git clone https://gitlab.com/ecp-ci/jacamar-ci.git cd jacamar-ci export CC=gcc CXX=g++ CGO_ENABLED=1 -make build +export GOMAXPROCS=1 +export GOFLAGS="-p=1 -gcflags=all=-dwarf=false" +make -j1 build make install PREFIX="${WORKDIR}" # --- 5. 後片付け --- @@ -347,10 +356,19 @@ set -euo pipefail exit 0 ``` +### `runner-env.sh` - 共通環境初期化 + +`run.sh` から source される共通環境初期化ファイルです。非対話 shell でも site の module catalog やユーザの基本環境が見えるように、`/etc/profile`、`/etc/bashrc`、module 初期化ファイル、`~/.bashrc` を順に読みます。アプリごとの `build.sh` / `run.sh` は、原則として site の shell 初期化そのものではなく、必要な `module load` と実行コマンドだけを持ちます。 + ### `run.sh` - ジョブ実行 ```bash #!/usr/bin/env bash -source ~/.bashrc +RUNNER_ENV="${CUSTOM_DIR:-/path/to/gitlab-runner_jacamar-ci_amd}/runner-env.sh" +if [[ -r "${RUNNER_ENV}" ]]; then + source "${RUNNER_ENV}" +elif [[ -r "${HOME}/.bashrc" ]]; then + source "${HOME}/.bashrc" +fi set -eo pipefail exec "$@" ``` diff --git a/programs/qws/build.sh b/programs/qws/build.sh index 73181a7..38d02be 100644 --- a/programs/qws/build.sh +++ b/programs/qws/build.sh @@ -89,8 +89,20 @@ case "$system" in source /work/opt/local/x86_64/cores/intel/2023.0.0/mpi/latest/env/vars.sh make compiler=intel arch=skylake rdma= -j8 ;; + Pegasus) + module load intel/2025.3.1 intmpi/2025.3.1 + make compiler=intel arch=skylake mpi=1 omp=1 rdma= + ;; + Sirius) + module load aocc/5.0.0 openmpi/5.0.10/aocc5.0.0 + make -j4 compiler=aocc arch=zen4 rdma= mpi=1 omp=1 profiler=timing \ + AMD_MARCH=-march=znver4 cppflags="-DARCH_AVX512" main + ;; TSUBAME4) - make -j 8 fugaku_benchmark= omp=1 compiler=openmpi-gnu arch=skylake rdma= mpi=1 powerapi= CC=mpicc CXX=mpic++ + module load openmpi/5.0.10-gcc aocc/4.1.0 + export OMPI_CC=clang OMPI_CXX=clang++ OMPI_FC=flang + make -j4 compiler=aocc arch=zen4 rdma= mpi=1 omp=1 profiler=timing \ + AMD_MARCH=-march=znver4 cppflags="-DARCH_AVX512" main ;; Camphor3) camphor3_modulepath="${MODULEPATH:-}" diff --git a/programs/qws/list.csv b/programs/qws/list.csv index beaddfd..3ddc131 100644 --- a/programs/qws/list.csv +++ b/programs/qws/list.csv @@ -20,6 +20,8 @@ AOBA_S,yes,1,1,8,0:10:00 AOBA_B,yes,1,1,128,0:10:00 Odyssey,yes,1,1,12,0:10:00 Aquarius,yes,1,1,8,0:10:00 -TSUBAME4,yes,1,1,192,0:10:00 +Pegasus,yes,1,1,96,00:10:00 +Sirius,yes,1,1,24,0:10:00 +TSUBAME4,yes,1,1,4,0:10:00 Camphor3,yes,1,1,112,0:10:00 FNCX,yes,1,1,1,0:10:00 diff --git a/programs/qws/run.sh b/programs/qws/run.sh index 329436c..e4d1ce0 100644 --- a/programs/qws/run.sh +++ b/programs/qws/run.sh @@ -171,8 +171,22 @@ case "$system" in mpiexec -n 1 ./main 32 6 4 3 1 1 1 1 -1 -1 6 50 > CASE0 print_results CASE0 CASE0 1 >> ../results/result ;; + Pegasus) + qws_numproc=$((nodes * numproc_node)) + module load intel/2025.3.1 intmpi/2025.3.1 + mpirun -n ${qws_numproc} ./main 32 6 4 3 1 1 1 1 -1 -1 6 50 > CASE0 + print_results CASE0 CASE0 ${numproc_node} >> ../results/result + ;; + Sirius) + qws_numproc=$((nodes * numproc_node)) + module load aocc/5.0.0 openmpi/5.0.10/aocc5.0.0 + mpirun -n ${qws_numproc} ./main 32 6 4 3 1 1 1 1 -1 -1 6 50 > CASE0 + print_results CASE0 CASE0 ${numproc_node} >> ../results/result + ;; TSUBAME4) qws_numproc=$((nodes * numproc_node)) + module load openmpi/5.0.10-gcc aocc/4.1.0 + export OMPI_CC=clang OMPI_CXX=clang++ OMPI_FC=flang mpirun -n ${qws_numproc} ./main 32 6 4 3 1 1 1 1 -1 -1 6 50 > CASE0 print_results CASE0 CASE0 ${numproc_node} >> ../results/result ;; diff --git a/scripts/setup_site_runner.sh b/scripts/setup_site_runner.sh index 1f9c99e..2458750 100755 --- a/scripts/setup_site_runner.sh +++ b/scripts/setup_site_runner.sh @@ -20,6 +20,9 @@ install_systemd=1 start_service=1 libseccomp_mode="auto" jacamar_pbs_tools="" +jacamar_make_jobs="${JACAMAR_BUILD_MAKE_JOBS:-1}" +jacamar_gomaxprocs="${JACAMAR_BUILD_GOMAXPROCS:-1}" +jacamar_goflags="${JACAMAR_BUILD_GOFLAGS:--p=1 -gcflags=all=-dwarf=false}" unrestricted_cmd_line=false runner_proxy="" runner_no_proxy="" @@ -65,6 +68,12 @@ Options: --no-start Create and enable service, but do not start it. -h, --help Show this help. +Environment overrides: + JACAMAR_BUILD_MAKE_JOBS Jacamar build make parallelism. Default: 1. + JACAMAR_BUILD_GOMAXPROCS Jacamar build Go scheduler threads. Default: 1. + JACAMAR_BUILD_GOFLAGS Jacamar build Go flags. + Default: -p=1 -gcflags=all=-dwarf=false. + Example: curl -fsSL https://raw.githubusercontent.com/RIKEN-RCCS/benchkit/main/scripts/setup_site_runner.sh \ | bash -s -- --arch amd64 --site genkai \ @@ -297,7 +306,10 @@ if [[ ! -x "$jacamar_bin" ]]; then ( cd "${work_dir}/jacamar-ci" export CC=gcc CXX=g++ CGO_ENABLED=1 - make build + export GOMAXPROCS="${GOMAXPROCS:-$jacamar_gomaxprocs}" + export GOFLAGS="${GOFLAGS:-$jacamar_goflags}" + info "Using Jacamar build limits: make -j${jacamar_make_jobs}, GOMAXPROCS=${GOMAXPROCS}, GOFLAGS=${GOFLAGS}" + make -j"$jacamar_make_jobs" build make install PREFIX="$base_dir" ) else @@ -349,11 +361,40 @@ set -euo pipefail exit 0 EOF -cat > "${base_dir}/run.sh" <<'EOF' +cat > "${base_dir}/runner-env.sh" <<'EOF' #!/usr/bin/env bash -source ~/.bashrc + +source_if_readable() { + local file="$1" + if [[ -r "$file" ]]; then + # shellcheck disable=SC1090 + source "$file" || true + fi +} + +source_if_readable /etc/profile +source_if_readable /etc/bashrc + +if ! type module >/dev/null 2>&1; then + source_if_readable /etc/profile.d/modules.sh + source_if_readable /etc/profile.d/z00_lmod.sh +fi + +source_if_readable "${HOME}/.bashrc" + +unset -f source_if_readable +EOF + +cat > "${base_dir}/run.sh" < "${base_dir}/cleanup.sh" <> "\$LOGFILE" EOF -chmod +x "${base_dir}/config.sh" "${base_dir}/prepare.sh" "${base_dir}/run.sh" "${base_dir}/cleanup.sh" +chmod +x "${base_dir}/config.sh" "${base_dir}/prepare.sh" "${base_dir}/runner-env.sh" "${base_dir}/run.sh" "${base_dir}/cleanup.sh" info "Writing Jacamar config" cat > "${base_dir}/custom-config.toml" <