You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
307 lines
12 KiB
307 lines
12 KiB
## Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. |
|
## |
|
## Redistribution and use in source and binary forms, with or without |
|
## modification, are permitted provided that the following conditions |
|
## are met: |
|
## * Redistributions of source code must retain the above copyright |
|
## notice, this list of conditions and the following disclaimer. |
|
## * Redistributions in binary form must reproduce the above copyright |
|
## notice, this list of conditions and the following disclaimer in the |
|
## documentation and/or other materials provided with the distribution. |
|
## * Neither the name of NVIDIA CORPORATION nor the names of its |
|
## contributors may be used to endorse or promote products derived |
|
## from this software without specific prior written permission. |
|
## |
|
## THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY |
|
## EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
|
## IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR |
|
## PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR |
|
## CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, |
|
## EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, |
|
## PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR |
|
## PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY |
|
## OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
|
## (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE |
|
## OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
|
## |
|
## For additional information on the license terms, see the CUDA EULA at |
|
## https://docs.nvidia.com/cuda/eula/index.html |
|
|
|
import os |
|
import sys |
|
import json |
|
import subprocess |
|
import argparse |
|
from pathlib import Path |
|
import concurrent.futures |
|
import threading |
|
|
|
# Lock held by safe_print() for the duration of each print() call.
print_lock = threading.Lock()


def safe_print(*args, **kwargs):
    """Call the built-in print() while holding ``print_lock``.

    All positional and keyword arguments are forwarded to print() unchanged,
    so callers can use ``sep``, ``end``, ``file`` and ``flush`` as usual.
    """
    print_lock.acquire()
    try:
        print(*args, **kwargs)
    finally:
        # Always release, even if print() raises (e.g. a closed stream).
        print_lock.release()
|
|
|
def normalize_exe_name(name):
    """Return the executable's base name with a trailing ``.exe`` removed.

    Only the Windows ``.exe`` extension (matched case-insensitively) is
    stripped. Other dotted names such as ``sample.v2`` are returned intact so
    that they are not confused with a differently named executable when the
    result is used as a configuration key.

    Args:
        name: File name or path of the executable (str or path-like).

    Returns:
        The final path component of ``name`` without a ``.exe`` extension.
    """
    file_name = Path(name).name
    extension = ".exe"
    # Require at least one character before the extension so a file literally
    # named ".exe" is not reduced to an empty string.
    if len(file_name) > len(extension) and file_name.lower().endswith(extension):
        return file_name[:-len(extension)]
    return file_name
|
|
|
def load_args_config(config_file):
    """Load the per-executable argument configuration from a JSON file.

    Args:
        config_file: Path to the JSON configuration file, or None/empty when
            no configuration was requested.

    Returns:
        The parsed top-level JSON object as a dict. An empty dict is returned
        when no file was requested, the file does not exist or cannot be
        read, the content is not valid JSON, or the top-level value is not an
        object. A warning is printed in every case except "no file requested".
    """
    if not config_file:
        return {}

    if not os.path.exists(config_file):
        # The path was given explicitly, so falling back silently would hide
        # a typo and run every executable without its configured arguments.
        print(f"Warning: Config file not found: {config_file}")
        return {}

    try:
        # "utf-8-sig" reads plain UTF-8 and also skips a leading byte-order
        # mark, which json.load() would otherwise reject.
        with open(config_file, 'r', encoding='utf-8-sig') as f:
            config = json.load(f)
    except json.JSONDecodeError:
        print("Warning: Failed to parse config file as JSON")
        return {}
    except Exception as e:
        # Deliberately broad: a bad config must never abort the test run.
        print(f"Warning: Error reading config file: {str(e)}")
        return {}

    # Validate the config format
    if not isinstance(config, dict):
        print("Warning: Config file must contain a dictionary/object")
        return {}

    return config
|
|
|
def _is_shared_library(path):
    """Return True when the file name looks like a shared library.

    Trailing numeric version components are ignored, so ``libfoo.so.1.2`` is
    recognised the same way as ``libfoo.so``.
    """
    suffixes = [s.lower() for s in path.suffixes]
    while suffixes and suffixes[-1][1:].isdigit():
        suffixes.pop()
    return bool(suffixes) and suffixes[-1] in ('.dll', '.so', '.dylib')


def find_executables(root_dir):
    """Find all executable, non-library files below ``root_dir``.

    Args:
        root_dir: Directory that is searched recursively.

    Returns:
        A sorted list of ``pathlib.Path`` objects, one per regular file that
        ``os.access(..., os.X_OK)`` reports as executable and that is not a
        shared library. Sorting makes the run order reproducible regardless
        of the order in which the filesystem lists entries.
    """
    # NOTE(review): on Windows os.access(X_OK) may succeed for any existing
    # file -- confirm whether an additional suffix filter is needed there.
    executables = [
        path
        for path in Path(root_dir).rglob('*')
        if path.is_file()
        and os.access(path, os.X_OK)
        and not _is_shared_library(path)
    ]
    executables.sort()
    return executables
|
|
|
def run_single_test_instance(executable, args, output_file, global_args, run_description):
    """Run one invocation of a test executable and capture its output.

    The executable is started as ``./<name>`` from inside its own directory,
    with stdout and stderr both written to ``output_file``. The run is
    aborted after 300 seconds.

    Args:
        executable: ``pathlib.Path`` of the executable to run.
        args: Arguments specific to this run (may be empty or None).
        output_file: Path of the file that receives the combined output.
        global_args: Extra arguments appended after ``args`` (may be None).
        run_description: Text appended to progress messages, e.g. "(run 1/2)".

    Returns:
        A dict with the keys ``name``, ``description``, ``return_code`` and
        ``status``. ``status`` is "Passed" or "Failed" for a completed run,
        "Timeout" when the time limit was hit, or "Error: ..." when the
        process could not be run; the last two use ``return_code`` -1.
    """
    exe_name = executable.name

    safe_print(f"Starting {exe_name} {run_description}")

    try:
        # Make the path absolute before taking its directory: for a bare
        # relative path such as Path("vectorAdd"), os.path.dirname() returns
        # "", which is not a usable working directory for the child process.
        exe_dir = os.path.dirname(os.path.abspath(str(executable)))

        cmd = [f"./{exe_name}"]
        # JSON configs may contain numbers; the command line needs strings.
        cmd.extend(str(arg) for arg in (args or []))
        if global_args:
            cmd.extend(str(arg) for arg in global_args)

        safe_print(f"    Command ({exe_name} {run_description}): {' '.join(cmd)}")

        # Run the executable in its own directory using cwd
        with open(output_file, 'w') as f:
            result = subprocess.run(
                cmd,
                stdout=f,
                stderr=subprocess.STDOUT,
                timeout=300,  # 5 minute timeout
                cwd=exe_dir  # Execute in the executable's directory
            )

        status = "Passed" if result.returncode == 0 else "Failed"
        safe_print(f"    Finished {exe_name} {run_description}: {status} (code {result.returncode})")
        return {"name": exe_name, "description": run_description, "return_code": result.returncode, "status": status}

    except subprocess.TimeoutExpired:
        safe_print(f"Error ({exe_name} {run_description}): Timed out after 5 minutes")
        return {"name": exe_name, "description": run_description, "return_code": -1, "status": "Timeout"}
    except Exception as e:
        # Deliberately broad: one failing launch must not stop the other runs.
        safe_print(f"Error running {exe_name} {run_description}: {str(e)}")
        return {"name": exe_name, "description": run_description, "return_code": -1, "status": f"Error: {str(e)}"}
|
|
|
def _count_visible_devices(visible):
    """Count the usable entries of a CUDA_VISIBLE_DEVICES value.

    An entry counts when it is a non-negative integer index or a
    ``GPU-``/``MIG-`` identifier. Counting stops at the first other entry
    (for example ``-1``, ``none`` or ``NoDevFiles``) because CUDA ignores
    every device listed after an invalid one. Empty entries are skipped.
    """
    count = 0
    for entry in visible.split(','):
        entry = entry.strip()
        if not entry:
            continue
        is_index = entry.isascii() and entry.isdigit()
        is_uuid = entry.upper().startswith(("GPU-", "MIG-"))
        if not (is_index or is_uuid):
            break
        count += 1
    return count


def get_gpu_count():
    """Return the number of NVIDIA GPUs visible on the system.

    ``nvidia-smi -L`` is tried first; every output line starting with
    "GPU " counts as one device. If the command is missing, fails, or lists
    no GPU, the CUDA_VISIBLE_DEVICES environment variable is inspected
    instead. The fallback is conservative: if the GPU count cannot be
    determined, 0 is returned.

    Returns:
        The detected GPU count as an int (0 when unknown).
    """
    # Try the nvidia-smi approach first
    try:
        smi = subprocess.run(
            ["nvidia-smi", "-L"],
            stdout=subprocess.PIPE,
            stderr=subprocess.DEVNULL,
            text=True,
            check=False,
        )
        if smi.returncode == 0:
            # Each GPU is reported on its own line that starts with "GPU 0:" etc.
            gpu_lines = [
                ln for ln in smi.stdout.strip().splitlines()
                if ln.strip().lower().startswith("gpu ")
            ]
            if gpu_lines:
                return len(gpu_lines)
    except Exception:
        # Covers a missing nvidia-smi binary (FileNotFoundError) as well as
        # any other unexpected failure; both lead to the fallback below.
        pass

    # Fallback: infer from CUDA_VISIBLE_DEVICES; yields 0 when unset or empty.
    return _count_visible_devices(os.environ.get("CUDA_VISIBLE_DEVICES", ""))
|
|
|
def _collect_run_configs(exe_name, base_name, config):
    """Turn one executable's config entry into a list of run configurations.

    ``config`` may define a single ``"args"`` list, or a ``"runs"`` list of
    per-run objects (each optionally with ``"args"`` and ``"skip"``). When
    both keys are present, ``"args"`` wins. If nothing usable is defined, a
    single run without arguments is returned.
    """
    run_configs = []
    if "args" in config:
        if isinstance(config["args"], list):
            run_configs.append({"args": config["args"]})  # Wrap in dict for consistency
        else:
            safe_print(f"Warning: Arguments for {base_name} must be a list")
    elif "runs" in config:
        for i, run_config in enumerate(config["runs"]):
            if run_config.get("skip", False):
                safe_print(f"Skipping run {i+1} for {exe_name} (marked as skip in config)")
                continue
            if isinstance(run_config.get("args", []), list):
                run_configs.append(run_config)
            else:
                safe_print(f"Warning: Arguments for {base_name} run {i+1} must be a list")

    # If no specific args defined, create one run with no args
    if not run_configs:
        run_configs.append({"args": []})
    return run_configs


def main():
    """Discover executables, run them (optionally in parallel) and summarize.

    Command-line options: ``--dir`` (search root), ``--config`` (JSON file
    with per-executable settings), ``--output`` (directory for the
    ``APM_<name>.txt`` result files), ``--parallel`` (number of worker
    threads, at least 1) and ``--args`` (arguments appended to every run).

    Returns:
        0 when every run passed. 1 when no GPU was detected or no executable
        was found. Otherwise the return code of the first failed run whose
        code is not -1, or 1 if all failures were timeouts or errors.
    """
    parser = argparse.ArgumentParser(description='Run all executables and capture output')
    parser.add_argument('--dir', default='.', help='Root directory to search for executables')
    parser.add_argument('--config', help='JSON configuration file for executable arguments')
    parser.add_argument('--output', default='.',  # Default to current directory
                        help='Output directory for test results')
    parser.add_argument('--parallel', type=int, default=1, help='Number of parallel tests to run')
    parser.add_argument('--args', nargs=argparse.REMAINDER,
                        help='Global arguments to pass to all executables')
    args = parser.parse_args()

    # ThreadPoolExecutor rejects max_workers <= 0 with a ValueError; report
    # the problem as a normal usage error before doing any work.
    if args.parallel < 1:
        parser.error("--parallel must be at least 1")

    # Create output directory if it doesn't exist
    if args.output:
        os.makedirs(args.output, exist_ok=True)

    # Load arguments configuration
    args_config = load_args_config(args.config)

    # Determine how many GPUs are available
    gpu_count = get_gpu_count()
    if gpu_count == 0:
        print("No NVIDIA GPU detected – cannot run CUDA samples. Exiting.")
        return 1
    print(f"Detected {gpu_count} GPU(s).")

    executables = find_executables(args.dir)
    if not executables:
        print("No executables found!")
        return 1

    print(f"Found {len(executables)} executables")
    print(f"Running tests with up to {args.parallel} parallel tasks")
    print("----------------------------------------" + "-" * len(str(args.parallel)) + "\n")

    tasks = []
    for exe in executables:
        exe_name = exe.name
        base_name = normalize_exe_name(exe_name)

        entry = args_config.get(base_name, {})
        if not isinstance(entry, dict):
            # A malformed entry should not abort the whole run; treat the
            # executable as if it had no configuration.
            safe_print(f"Warning: Config entry for {base_name} must be an object; ignoring it")
            entry = {}

        # Check if this executable should be skipped globally
        if entry.get("skip", False):
            safe_print(f"Skipping {exe_name} (marked as skip in config)")
            continue

        # Skip if the sample requires more GPUs than available
        required_gpus = entry.get("min_gpus", 1)
        if required_gpus > gpu_count:
            safe_print(
                f"Skipping {exe_name} (requires {required_gpus} GPU(s), only {gpu_count} available)"
            )
            continue

        arg_sets_configs = _collect_run_configs(exe_name, base_name, entry)

        # Create tasks for each run configuration
        num_runs = len(arg_sets_configs)
        for i, run_config in enumerate(arg_sets_configs):
            current_args = run_config.get("args", [])
            run_desc = f"(run {i+1}/{num_runs})" if num_runs > 1 else ""

            # Create output file name; multi-run executables get one file per run
            if num_runs > 1:
                output_file = os.path.abspath(f"{args.output}/APM_{exe_name}.run{i+1}.txt")
            else:
                output_file = os.path.abspath(f"{args.output}/APM_{exe_name}.txt")

            tasks.append({
                "executable": exe,
                "args": current_args,
                "output_file": output_file,
                "global_args": args.args,
                "description": run_desc
            })

    failed = []
    total_runs = len(tasks)
    completed_runs = 0

    with concurrent.futures.ThreadPoolExecutor(max_workers=args.parallel) as executor:
        future_to_task = {
            executor.submit(run_single_test_instance,
                            task["executable"],
                            task["args"],
                            task["output_file"],
                            task["global_args"],
                            task["description"]): task
            for task in tasks
        }

        for future in concurrent.futures.as_completed(future_to_task):
            task_info = future_to_task[future]
            completed_runs += 1
            safe_print(f"Progress: {completed_runs}/{total_runs} runs completed.")
            try:
                result = future.result()
                if result["return_code"] != 0:
                    failed.append(result)
            except Exception as exc:
                safe_print(f'Task {task_info["executable"].name} {task_info["description"]} generated an exception: {exc}')
                failed.append({
                    "name": task_info["executable"].name,
                    "description": task_info["description"],
                    "return_code": -1,
                    "status": f"Execution Exception: {exc}"
                })

    # Print summary
    print("\nTest Summary:")
    print(f"Ran {total_runs} test runs for {len(executables)} executables.")
    if failed:
        print(f"Failed runs ({len(failed)}):")
        for fail in failed:
            print(f"  {fail['name']} {fail['description']}: {fail['status']} (code {fail['return_code']})")
        # Return the return code of the first failure, or 1 if only exceptions occurred
        first_failure_code = next((f["return_code"] for f in failed if f["return_code"] != -1), 1)
        return first_failure_code

    print("All test runs passed!")
    return 0
|
|
|
# Script entry point: the value returned by main() becomes the process exit status.
if __name__ == '__main__':
    raise SystemExit(main())
|
|
|