Add intelligent GPU selector with automatic fallback to available GPU or CPU
This commit is contained in:
@@ -51,6 +51,51 @@ def _download_hook(progress_bytes, total_bytes):
|
|||||||
_model_status["status_message"] = f"Downloading: {_model_status['download_percentage']}%"
|
_model_status["status_message"] = f"Downloading: {_model_status['download_percentage']}%"
|
||||||
|
|
||||||
|
|
||||||
|
def get_best_gpu():
    """
    Find the best available GPU with most free memory.

    Probes every visible CUDA device and picks the one with the largest
    amount of truly free memory as reported by the CUDA driver, provided
    it has at least 3 GB free (the rough footprint of Whisper large-v3).

    Returns GPU index or None if no GPU is available.
    """
    if not torch.cuda.is_available():
        return None

    num_gpus = torch.cuda.device_count()
    if num_gpus == 0:
        return None

    best_gpu = None
    max_free_memory = 0

    # Check if this GPU has enough free memory (need at least 3GB for large-v3)
    min_required_memory = 3 * 1024 * 1024 * 1024  # 3GB in bytes

    for gpu_id in range(num_gpus):
        try:
            # Get GPU properties (name for logging)
            props = torch.cuda.get_device_properties(gpu_id)

            # BUGFIX: the original used torch.cuda.memory_allocated(), which
            # only counts memory allocated by *this* process through PyTorch's
            # caching allocator — memory held by other processes was invisible,
            # so a fully-busy GPU could look empty and get selected.
            # mem_get_info() asks the CUDA driver for the real device-wide
            # free/total figures. This also removes the set_device() side
            # effect the original probe loop had (it left the current device
            # pointing at the last GPU probed).
            free_memory, total_memory = torch.cuda.mem_get_info(gpu_id)
            allocated_memory = total_memory - free_memory

            print(f" GPU {gpu_id}: {props.name}")
            print(f" Total: {total_memory / 1e9:.2f} GB")
            print(f" Used: {allocated_memory / 1e9:.2f} GB")
            print(f" Free: {free_memory / 1e9:.2f} GB")

            if free_memory > min_required_memory and free_memory > max_free_memory:
                max_free_memory = free_memory
                best_gpu = gpu_id

        except Exception as e:
            # A device can error out mid-probe (driver issues, device lost);
            # skip it and keep scanning the remaining GPUs.
            print(f" GPU {gpu_id}: Error checking - {e}")
            continue

    return best_gpu
|
||||||
|
|
||||||
|
|
||||||
def load_model(model_name: str = None):
|
def load_model(model_name: str = None):
|
||||||
"""Load Whisper model with automatic CPU fallback"""
|
"""Load Whisper model with automatic CPU fallback"""
|
||||||
global _model, _model_status, _current_model_name
|
global _model, _model_status, _current_model_name
|
||||||
@@ -79,31 +124,101 @@ def load_model(model_name: str = None):
|
|||||||
print(f"⚠️ GPU not available, falling back to CPU mode")
|
print(f"⚠️ GPU not available, falling back to CPU mode")
|
||||||
_model_status["status_message"] = "GPU not available, using CPU..."
|
_model_status["status_message"] = "GPU not available, using CPU..."
|
||||||
|
|
||||||
print(f"Loading Whisper model: {model_name} on {actual_device}")
|
print(f"Loading Whisper model: {model_name}")
|
||||||
|
print("Checking available GPUs...")
|
||||||
|
|
||||||
try:
|
try:
|
||||||
# Whisper doesn't have a direct progress callback, but we can monitor the models directory
|
# Try to load on the best available GPU first
|
||||||
|
if requested_device == "cuda":
|
||||||
|
best_gpu = get_best_gpu()
|
||||||
|
|
||||||
|
if best_gpu is not None:
|
||||||
|
print(f"🎯 Selected GPU {best_gpu} with most free memory")
|
||||||
|
|
||||||
|
try:
|
||||||
|
# Set the device before loading
|
||||||
|
torch.cuda.set_device(best_gpu)
|
||||||
|
|
||||||
|
# Load model on specific GPU
|
||||||
|
_model = whisper.load_model(
|
||||||
|
model_name,
|
||||||
|
device=f"cuda:{best_gpu}",
|
||||||
|
download_root=settings.models_path
|
||||||
|
)
|
||||||
|
|
||||||
|
_current_model_name = model_name
|
||||||
|
_model_status["is_downloading"] = False
|
||||||
|
_model_status["is_loaded"] = True
|
||||||
|
_model_status["download_percentage"] = 100
|
||||||
|
_model_status["status_message"] = f"Model loaded successfully on GPU {best_gpu}"
|
||||||
|
print(f"✅ Model {model_name} loaded on GPU {best_gpu}")
|
||||||
|
|
||||||
|
except RuntimeError as e:
|
||||||
|
if "out of memory" in str(e).lower() or "cuda" in str(e).lower():
|
||||||
|
print(f"⚠️ GPU {best_gpu} out of memory, trying other GPUs...")
|
||||||
|
torch.cuda.empty_cache()
|
||||||
|
|
||||||
|
# Try other GPUs
|
||||||
|
for gpu_id in range(torch.cuda.device_count()):
|
||||||
|
if gpu_id == best_gpu:
|
||||||
|
continue
|
||||||
|
try:
|
||||||
|
torch.cuda.set_device(gpu_id)
|
||||||
|
_model = whisper.load_model(
|
||||||
|
model_name,
|
||||||
|
device=f"cuda:{gpu_id}",
|
||||||
|
download_root=settings.models_path
|
||||||
|
)
|
||||||
|
_current_model_name = model_name
|
||||||
|
_model_status["is_downloading"] = False
|
||||||
|
_model_status["is_loaded"] = True
|
||||||
|
_model_status["download_percentage"] = 100
|
||||||
|
_model_status["status_message"] = f"Model loaded on GPU {gpu_id}"
|
||||||
|
print(f"✅ Model {model_name} loaded on GPU {gpu_id}")
|
||||||
|
break
|
||||||
|
except:
|
||||||
|
torch.cuda.empty_cache()
|
||||||
|
continue
|
||||||
|
else:
|
||||||
|
# All GPUs failed, fall back to CPU
|
||||||
|
print("⚠️ All GPUs full, falling back to CPU...")
|
||||||
|
raise Exception("All GPUs full")
|
||||||
|
else:
|
||||||
|
raise
|
||||||
|
else:
|
||||||
|
print("⚠️ No suitable GPU found, falling back to CPU...")
|
||||||
|
raise Exception("No suitable GPU")
|
||||||
|
|
||||||
|
# If we get here with no model, use CPU
|
||||||
|
if _model is None:
|
||||||
|
_model = whisper.load_model(
|
||||||
|
model_name,
|
||||||
|
device="cpu",
|
||||||
|
download_root=settings.models_path
|
||||||
|
)
|
||||||
|
_current_model_name = model_name
|
||||||
|
_model_status["is_downloading"] = False
|
||||||
|
_model_status["is_loaded"] = True
|
||||||
|
_model_status["download_percentage"] = 100
|
||||||
|
_model_status["status_message"] = "Model loaded on CPU"
|
||||||
|
settings.whisper_device = "cpu"
|
||||||
|
print(f"✅ Model {model_name} loaded on CPU")
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
print(f"⚠️ Error loading model: {e}")
|
||||||
|
# Final fallback to CPU
|
||||||
_model = whisper.load_model(
|
_model = whisper.load_model(
|
||||||
model_name,
|
model_name,
|
||||||
device=actual_device,
|
device="cpu",
|
||||||
download_root=settings.models_path
|
download_root=settings.models_path
|
||||||
)
|
)
|
||||||
_current_model_name = model_name
|
_current_model_name = model_name
|
||||||
_model_status["is_downloading"] = False
|
_model_status["is_downloading"] = False
|
||||||
_model_status["is_loaded"] = True
|
_model_status["is_loaded"] = True
|
||||||
_model_status["download_percentage"] = 100
|
_model_status["download_percentage"] = 100
|
||||||
_model_status["status_message"] = "Model loaded successfully"
|
_model_status["status_message"] = "Model loaded on CPU (error fallback)"
|
||||||
|
settings.whisper_device = "cpu"
|
||||||
# Update device in status to reflect actual device used
|
print(f"✅ Model {model_name} loaded on CPU (after error)")
|
||||||
if actual_device != requested_device:
|
|
||||||
settings.whisper_device = actual_device
|
|
||||||
print(f"✅ Model {model_name} loaded on CPU (GPU fallback)")
|
|
||||||
else:
|
|
||||||
print(f"✅ Model {model_name} loaded on {actual_device}")
|
|
||||||
|
|
||||||
except Exception as e:
|
|
||||||
_model_status["is_downloading"] = False
|
|
||||||
_model_status["status_message"] = f"Error: {str(e)}"
|
|
||||||
raise
|
|
||||||
|
|
||||||
return _model
|
return _model
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user