Add intelligent GPU selector with automatic fallback to available GPU or CPU
This commit is contained in:
@@ -51,6 +51,51 @@ def _download_hook(progress_bytes, total_bytes):
|
||||
_model_status["status_message"] = f"Downloading: {_model_status['download_percentage']}%"
|
||||
|
||||
|
||||
def get_best_gpu():
    """
    Find the best available GPU with the most free memory.

    Scans every visible CUDA device, estimates its free memory, and picks
    the device with the most free memory that also satisfies the minimum
    requirement for the large-v3 model.

    Returns:
        int | None: index of the best GPU, or None if CUDA is unavailable,
        no devices are visible, or none has enough free memory.
    """
    if not torch.cuda.is_available():
        return None

    num_gpus = torch.cuda.device_count()
    if num_gpus == 0:
        return None

    # Minimum free memory needed for large-v3 (3GB in bytes).
    # Hoisted out of the loop: it is a constant.
    min_required_memory = 3 * 1024 * 1024 * 1024

    best_gpu = None
    max_free_memory = 0

    for gpu_id in range(num_gpus):
        try:
            # Get GPU properties
            props = torch.cuda.get_device_properties(gpu_id)
            total_memory = props.total_memory

            # NOTE: memory_allocated() accepts a device index directly, so
            # there is no need to call torch.cuda.set_device() here. The
            # original code did, which mutated the process-wide current
            # device as a side effect of a read-only probe.
            # Also NOTE(review): memory_allocated() only counts tensors
            # allocated by *this* process through PyTorch; memory held by
            # other processes is invisible, so this is an optimistic
            # estimate of free memory.
            allocated_memory = torch.cuda.memory_allocated(gpu_id)
            free_memory = total_memory - allocated_memory

            print(f" GPU {gpu_id}: {props.name}")
            print(f" Total: {total_memory / 1e9:.2f} GB")
            print(f" Used: {allocated_memory / 1e9:.2f} GB")
            print(f" Free: {free_memory / 1e9:.2f} GB")

            # Check if this GPU has enough free memory (need at least 3GB for large-v3)
            if free_memory > min_required_memory and free_memory > max_free_memory:
                max_free_memory = free_memory
                best_gpu = gpu_id

        except Exception as e:
            # A probe failure on one device must not abort the scan of the
            # remaining devices.
            print(f" GPU {gpu_id}: Error checking - {e}")
            continue

    return best_gpu
|
||||
|
||||
|
||||
def load_model(model_name: str = None):
|
||||
"""Load Whisper model with automatic CPU fallback"""
|
||||
global _model, _model_status, _current_model_name
|
||||
@@ -79,31 +124,101 @@ def load_model(model_name: str = None):
|
||||
print(f"⚠️ GPU not available, falling back to CPU mode")
|
||||
_model_status["status_message"] = "GPU not available, using CPU..."
|
||||
|
||||
print(f"Loading Whisper model: {model_name} on {actual_device}")
|
||||
print(f"Loading Whisper model: {model_name}")
|
||||
print("Checking available GPUs...")
|
||||
|
||||
try:
|
||||
# Whisper doesn't have a direct progress callback, but we can monitor the models directory
|
||||
# Try to load on the best available GPU first
|
||||
if requested_device == "cuda":
|
||||
best_gpu = get_best_gpu()
|
||||
|
||||
if best_gpu is not None:
|
||||
print(f"🎯 Selected GPU {best_gpu} with most free memory")
|
||||
|
||||
try:
|
||||
# Set the device before loading
|
||||
torch.cuda.set_device(best_gpu)
|
||||
|
||||
# Load model on specific GPU
|
||||
_model = whisper.load_model(
|
||||
model_name,
|
||||
device=f"cuda:{best_gpu}",
|
||||
download_root=settings.models_path
|
||||
)
|
||||
|
||||
_current_model_name = model_name
|
||||
_model_status["is_downloading"] = False
|
||||
_model_status["is_loaded"] = True
|
||||
_model_status["download_percentage"] = 100
|
||||
_model_status["status_message"] = f"Model loaded successfully on GPU {best_gpu}"
|
||||
print(f"✅ Model {model_name} loaded on GPU {best_gpu}")
|
||||
|
||||
except RuntimeError as e:
|
||||
if "out of memory" in str(e).lower() or "cuda" in str(e).lower():
|
||||
print(f"⚠️ GPU {best_gpu} out of memory, trying other GPUs...")
|
||||
torch.cuda.empty_cache()
|
||||
|
||||
# Try other GPUs
|
||||
for gpu_id in range(torch.cuda.device_count()):
|
||||
if gpu_id == best_gpu:
|
||||
continue
|
||||
try:
|
||||
torch.cuda.set_device(gpu_id)
|
||||
_model = whisper.load_model(
|
||||
model_name,
|
||||
device=f"cuda:{gpu_id}",
|
||||
download_root=settings.models_path
|
||||
)
|
||||
_current_model_name = model_name
|
||||
_model_status["is_downloading"] = False
|
||||
_model_status["is_loaded"] = True
|
||||
_model_status["download_percentage"] = 100
|
||||
_model_status["status_message"] = f"Model loaded on GPU {gpu_id}"
|
||||
print(f"✅ Model {model_name} loaded on GPU {gpu_id}")
|
||||
break
|
||||
except:
|
||||
torch.cuda.empty_cache()
|
||||
continue
|
||||
else:
|
||||
# All GPUs failed, fall back to CPU
|
||||
print("⚠️ All GPUs full, falling back to CPU...")
|
||||
raise Exception("All GPUs full")
|
||||
else:
|
||||
raise
|
||||
else:
|
||||
print("⚠️ No suitable GPU found, falling back to CPU...")
|
||||
raise Exception("No suitable GPU")
|
||||
|
||||
# If we get here with no model, use CPU
|
||||
if _model is None:
|
||||
_model = whisper.load_model(
|
||||
model_name,
|
||||
device="cpu",
|
||||
download_root=settings.models_path
|
||||
)
|
||||
_current_model_name = model_name
|
||||
_model_status["is_downloading"] = False
|
||||
_model_status["is_loaded"] = True
|
||||
_model_status["download_percentage"] = 100
|
||||
_model_status["status_message"] = "Model loaded on CPU"
|
||||
settings.whisper_device = "cpu"
|
||||
print(f"✅ Model {model_name} loaded on CPU")
|
||||
|
||||
except Exception as e:
|
||||
print(f"⚠️ Error loading model: {e}")
|
||||
# Final fallback to CPU
|
||||
_model = whisper.load_model(
|
||||
model_name,
|
||||
device=actual_device,
|
||||
device="cpu",
|
||||
download_root=settings.models_path
|
||||
)
|
||||
_current_model_name = model_name
|
||||
_model_status["is_downloading"] = False
|
||||
_model_status["is_loaded"] = True
|
||||
_model_status["download_percentage"] = 100
|
||||
_model_status["status_message"] = "Model loaded successfully"
|
||||
|
||||
# Update device in status to reflect actual device used
|
||||
if actual_device != requested_device:
|
||||
settings.whisper_device = actual_device
|
||||
print(f"✅ Model {model_name} loaded on CPU (GPU fallback)")
|
||||
else:
|
||||
print(f"✅ Model {model_name} loaded on {actual_device}")
|
||||
|
||||
except Exception as e:
|
||||
_model_status["is_downloading"] = False
|
||||
_model_status["status_message"] = f"Error: {str(e)}"
|
||||
raise
|
||||
_model_status["status_message"] = "Model loaded on CPU (error fallback)"
|
||||
settings.whisper_device = "cpu"
|
||||
print(f"✅ Model {model_name} loaded on CPU (after error)")
|
||||
|
||||
return _model
|
||||
|
||||
|
||||
Reference in New Issue
Block a user