#!/usr/bin/env bash
set -euo pipefail
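# Runtime capability report for this GPU/ML environment:
#   - GPU driver (nvidia-smi) and CUDA toolkit (nvcc)
#   - PyTorch CUDA/cuDNN backend, TF32 and SDP attention backends
#   - Optional acceleration stacks: Apex, FlashAttention, Triton, bitsandbytes
#   - Transformers / Diffusers / XFormers versions
#   - Distributed/NCCL-related environment variables and output directory permissions
# Optional components are probed inside try/except or command checks, so their
# absence is reported rather than aborting the run.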

echo "================= RUNTIME CAPABILITIES ================="
date
if command -v nvidia-smi >/dev/null 2>&1; then
  nvidia-smi
else
  echo "nvidia-smi: not available"
fi

echo
echo "CUDA_HOME: ${CUDA_HOME:-/usr/local/cuda}"
if command -v nvcc >/dev/null 2>&1; then
  nvcc --version || true
else
  echo "nvcc: not available"
fi

echo
echo "[PyTorch / CUDA backend]"
python - <<'PY'
import json, os, torch, inspect
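# Normalize a backend attribute to a bool for the JSON report: zero-argument
# callables (getters) are invoked, other callables only prove the API exists,
# non-callables are cast to bool, and anything unreadable becomes None.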
def to_bool(x):
    try:
        if callable(x):
            try:
                sig = inspect.signature(x)
                if len(sig.parameters)==0:
                    return bool(x())
            except Exception:
                pass
            return True
        return bool(x)
    except Exception:
        return None

info = {
  "torch": getattr(torch, "__version__", None),
  "cuda_available": torch.cuda.is_available(),
  "cuda_device_count": torch.cuda.device_count(),
  "cuda_runtime_version": getattr(torch.version, "cuda", None),
  "cudnn_version": torch.backends.cudnn.version() if torch.backends.cudnn.is_available() else None,
  "tf32": (torch.backends.cuda.matmul.allow_tf32 if torch.cuda.is_available() else None),
  "flash_sdp": (to_bool(getattr(torch.backends.cuda, "enable_flash_sdp", None)) if torch.cuda.is_available() else None),
  "mem_efficient_sdp": (to_bool(getattr(torch.backends.cuda, "enable_mem_efficient_sdp", None)) if torch.cuda.is_available() else None),
  "math_sdp": (to_bool(getattr(torch.backends.cuda, "enable_math_sdp", None)) if torch.cuda.is_available() else None),
}
print(json.dumps(info, indent=2))
for i in range(min(torch.cuda.device_count(), 8)):
  print(f"GPU {i}: {torch.cuda.get_device_name(i)}")
PY

echo
echo "[Apex]"
python - <<'PY'
try:
  from apex.normalization import FusedLayerNorm, FusedRMSNorm
  import importlib; importlib.import_module("fused_layer_norm_cuda")
  print("apex.normalization: OK")
except Exception as e:
  print("apex.normalization: FAIL ->", e)
PY

echo
echo "[FlashAttention]"
python - <<'PY'
import importlib
for m in ("flash_attn","flash_attn_2_cuda"):
  try:
    importlib.import_module(m); print(f"{m}: OK")
  except Exception as e:
    print(f"{m}: FAIL -> {e}")
PY

echo
echo "[FlashAttention LN test]"
python - <<'PY'
import os, warnings, importlib
warnings.filterwarnings("ignore", category=FutureWarning)
def ok_import(names):
    for n in names:
        try:
            importlib.import_module(n)
            print(f"  [+] import '{n}' OK")
            return True
        except Exception as e:
            print(f"  [-] import '{n}' fail: {e}")
    return False
fa_ver = None
try:
    import flash_attn
    fa_ver = getattr(flash_attn, "__version__", None)
except Exception:
    pass
try:
    import torch
    tv = torch.__version__
    cu = getattr(torch.version, "cuda", None)
except Exception:
    tv, cu = "unknown", "unknown"
print(f"  flash_attn version: {fa_ver}")
print(f"  torch: {tv} | cuda: {cu} | TORCH_CUDA_ARCH_LIST={os.getenv('TORCH_CUDA_ARCH_LIST')}")
names_to_try = [
    "flash_attn_2_cuda",
    "flash_attn.ops.layer_norm",
    "flash_attn.layers.layer_norm",
]
ok = ok_import(names_to_try)
if not ok:
    print("  Hint: faltam kernels LN/RMSNorm do FlashAttention (performance reduzida).")
    print("  Use builder.sh para compilar flash_attn e reutilizar a wheel.")
PY

echo
echo "[Triton]"
python - <<'PY'
try:
  import triton
  print("triton:", triton.__version__)
  try:
    import triton.ops as _; print("triton.ops: OK")
  except Exception:
    print("triton.ops: not present (ok on Triton>=3.x)")
except Exception as e:
  print("triton: FAIL ->", e)
PY

echo
echo "[BitsAndBytes (Q8/Q4)]"
python - <<'PY'
try:
  import bitsandbytes as bnb
  print("bitsandbytes:", bnb.__version__)
  try:
    from bitsandbytes.triton import _custom_ops as _; print("bnb.triton._custom_ops: OK")
  except Exception as e:
    print("bnb.triton: partial ->", e)
except Exception as e:
  print("bitsandbytes: FAIL ->", e)
PY

echo
echo "[Transformers / Diffusers / XFormers]"
python - <<'PY'
def _v(m):
  try:
    mod = __import__(m)
    print(f"{m}:", getattr(mod, "__version__", "unknown"))
  except Exception as e:
    print(f"{m}: FAIL -> {e}")
for m in ("transformers","diffusers","xformers"):
  _v(m)
PY

echo
echo "[Distribuído / NCCL Env]"
env | grep -E '^(CUDA_VISIBLE_DEVICES|NCCL_|TORCH_|ENABLE_.*SDP|HF_HUB_.*|CUDA_|NV_.*NCCL.*|PYTORCH_CUDA_ALLOC_CONF)=' | sort || true

echo
echo "[Caminhos e permissões de saída]"
OUT="/app/outputs"
echo "OUT dir: $OUT"
mkdir -p "$OUT"
ls -la "$OUT" || true

echo "================= END CAPABILITIES ================="