From 4a1f898b25868615a90f0b2decaf91d5a318d9be Mon Sep 17 00:00:00 2001
From: Dominic Ballenthin
Date: Mon, 2 Feb 2026 03:59:23 +0100
Subject: [PATCH] Add intelligent GPU selector with automatic fallback to
 available GPU or CPU

---
 src/services/whisper_service.py | 147 ++++++++++++++++++++++++++++----
 1 file changed, 131 insertions(+), 16 deletions(-)

diff --git a/src/services/whisper_service.py b/src/services/whisper_service.py
index 08ea3bb..1d6edba 100644
--- a/src/services/whisper_service.py
+++ b/src/services/whisper_service.py
@@ -51,6 +51,51 @@ def _download_hook(progress_bytes, total_bytes):
         _model_status["status_message"] = f"Downloading: {_model_status['download_percentage']}%"
 
 
+def get_best_gpu():
+    """
+    Find the best available GPU with most free memory.
+    Returns GPU index or None if no GPU is available.
+    """
+    if not torch.cuda.is_available():
+        return None
+
+    num_gpus = torch.cuda.device_count()
+    if num_gpus == 0:
+        return None
+
+    best_gpu = None
+    max_free_memory = 0
+
+    for gpu_id in range(num_gpus):
+        try:
+            # Get GPU properties
+            props = torch.cuda.get_device_properties(gpu_id)
+            total_memory = props.total_memory
+
+            # Query device-wide free memory: mem_get_info sees allocations made
+            # by other processes too, unlike torch.cuda.memory_allocated().
+            free_memory, _ = torch.cuda.mem_get_info(gpu_id)
+            allocated_memory = total_memory - free_memory
+
+            print(f" GPU {gpu_id}: {props.name}")
+            print(f" Total: {total_memory / 1e9:.2f} GB")
+            print(f" Used: {allocated_memory / 1e9:.2f} GB")
+            print(f" Free: {free_memory / 1e9:.2f} GB")
+
+            # Check if this GPU has enough free memory (need at least 3GB for large-v3)
+            min_required_memory = 3 * 1024 * 1024 * 1024  # 3GB in bytes
+
+            if free_memory > min_required_memory and free_memory > max_free_memory:
+                max_free_memory = free_memory
+                best_gpu = gpu_id
+
+        except Exception as e:
+            print(f" GPU {gpu_id}: Error checking - {e}")
+            continue
+
+    return best_gpu
+
+
 def load_model(model_name: str = None):
     """Load Whisper model with automatic CPU fallback"""
     global _model, _model_status, _current_model_name
@@ -79,31 +124,101 @@ def load_model(model_name: str = None):
         print(f"⚠️ GPU not available, falling back to CPU mode")
         _model_status["status_message"] = "GPU not available, using CPU..."
 
-    print(f"Loading Whisper model: {model_name} on {actual_device}")
+    print(f"Loading Whisper model: {model_name}")
+    print("Checking available GPUs...")
+
     try:
-        # Whisper doesn't have a direct progress callback, but we can monitor the models directory
+        # Try to load on the best available GPU first
+        if requested_device == "cuda":
+            best_gpu = get_best_gpu()
+
+            if best_gpu is not None:
+                print(f"🎯 Selected GPU {best_gpu} with most free memory")
+
+                try:
+                    # Set the device before loading
+                    torch.cuda.set_device(best_gpu)
+
+                    # Load model on specific GPU
+                    _model = whisper.load_model(
+                        model_name,
+                        device=f"cuda:{best_gpu}",
+                        download_root=settings.models_path
+                    )
+
+                    _current_model_name = model_name
+                    _model_status["is_downloading"] = False
+                    _model_status["is_loaded"] = True
+                    _model_status["download_percentage"] = 100
+                    _model_status["status_message"] = f"Model loaded successfully on GPU {best_gpu}"
+                    print(f"✅ Model {model_name} loaded on GPU {best_gpu}")
+
+                except RuntimeError as e:
+                    if "out of memory" in str(e).lower() or "cuda" in str(e).lower():
+                        print(f"⚠️ GPU {best_gpu} out of memory, trying other GPUs...")
+                        torch.cuda.empty_cache()
+
+                        # Try other GPUs
+                        for gpu_id in range(torch.cuda.device_count()):
+                            if gpu_id == best_gpu:
+                                continue
+                            try:
+                                torch.cuda.set_device(gpu_id)
+                                _model = whisper.load_model(
+                                    model_name,
+                                    device=f"cuda:{gpu_id}",
+                                    download_root=settings.models_path
+                                )
+                                _current_model_name = model_name
+                                _model_status["is_downloading"] = False
+                                _model_status["is_loaded"] = True
+                                _model_status["download_percentage"] = 100
+                                _model_status["status_message"] = f"Model loaded on GPU {gpu_id}"
+                                print(f"✅ Model {model_name} loaded on GPU {gpu_id}")
+                                break
+                            except Exception:
+                                torch.cuda.empty_cache()
+                                continue
+                        else:
+                            # All GPUs failed, fall back to CPU
+                            print("⚠️ All GPUs full, falling back to CPU...")
+                            raise RuntimeError("All GPUs full")
+                    else:
+                        raise
+            else:
+                print("⚠️ No suitable GPU found, falling back to CPU...")
+                raise RuntimeError("No suitable GPU")
+
+        # If we get here with no model, use CPU
+        if _model is None:
+            _model = whisper.load_model(
+                model_name,
+                device="cpu",
+                download_root=settings.models_path
+            )
+            _current_model_name = model_name
+            _model_status["is_downloading"] = False
+            _model_status["is_loaded"] = True
+            _model_status["download_percentage"] = 100
+            _model_status["status_message"] = "Model loaded on CPU"
+            settings.whisper_device = "cpu"
+            print(f"✅ Model {model_name} loaded on CPU")
+
+    except Exception as e:
+        print(f"⚠️ Error loading model: {e}")
+        # Final fallback to CPU
         _model = whisper.load_model(
             model_name,
-            device=actual_device,
+            device="cpu",
             download_root=settings.models_path
         )
 
         _current_model_name = model_name
         _model_status["is_downloading"] = False
         _model_status["is_loaded"] = True
         _model_status["download_percentage"] = 100
-        _model_status["status_message"] = "Model loaded successfully"
-
-        # Update device in status to reflect actual device used
-        if actual_device != requested_device:
-            settings.whisper_device = actual_device
-            print(f"✅ Model {model_name} loaded on CPU (GPU fallback)")
-        else:
-            print(f"✅ Model {model_name} loaded on {actual_device}")
-
-    except Exception as e:
-        _model_status["is_downloading"] = False
-        _model_status["status_message"] = f"Error: {str(e)}"
-        raise
+        _model_status["status_message"] = "Model loaded on CPU (error fallback)"
+        settings.whisper_device = "cpu"
+        print(f"✅ Model {model_name} loaded on CPU (after error)")
 
     return _model