Add intelligent GPU selector with automatic fallback to available GPU or CPU
This commit is contained in:
@@ -51,6 +51,51 @@ def _download_hook(progress_bytes, total_bytes):
|
|||||||
_model_status["status_message"] = f"Downloading: {_model_status['download_percentage']}%"
|
_model_status["status_message"] = f"Downloading: {_model_status['download_percentage']}%"
|
||||||
|
|
||||||
|
|
||||||
|
def get_best_gpu():
    """
    Find the best available GPU with most free memory.

    Probes every visible CUDA device and picks the one with the largest
    amount of truly free memory as reported by the CUDA driver, provided
    it has at least 3 GB free (the rough footprint of Whisper large-v3).

    Returns GPU index or None if no GPU is available.
    """
    if not torch.cuda.is_available():
        return None

    num_gpus = torch.cuda.device_count()
    if num_gpus == 0:
        return None

    best_gpu = None
    max_free_memory = 0

    # Check if this GPU has enough free memory (need at least 3GB for large-v3)
    min_required_memory = 3 * 1024 * 1024 * 1024  # 3GB in bytes

    for gpu_id in range(num_gpus):
        try:
            # Get GPU properties (name for logging)
            props = torch.cuda.get_device_properties(gpu_id)

            # BUGFIX: the original used torch.cuda.memory_allocated(), which
            # only counts memory allocated by *this* process through PyTorch's
            # caching allocator — memory held by other processes was invisible,
            # so a fully-busy GPU could look empty and get selected.
            # mem_get_info() asks the CUDA driver for the real device-wide
            # free/total figures. This also removes the set_device() side
            # effect the original probe loop had (it left the current device
            # pointing at the last GPU probed).
            free_memory, total_memory = torch.cuda.mem_get_info(gpu_id)
            allocated_memory = total_memory - free_memory

            print(f" GPU {gpu_id}: {props.name}")
            print(f" Total: {total_memory / 1e9:.2f} GB")
            print(f" Used: {allocated_memory / 1e9:.2f} GB")
            print(f" Free: {free_memory / 1e9:.2f} GB")

            if free_memory > min_required_memory and free_memory > max_free_memory:
                max_free_memory = free_memory
                best_gpu = gpu_id

        except Exception as e:
            # A device can error out mid-probe (driver issues, device lost);
            # skip it and keep scanning the remaining GPUs.
            print(f" GPU {gpu_id}: Error checking - {e}")
            continue

    return best_gpu
|
||||||
|
|
||||||
|
|
||||||
def load_model(model_name: str = None):
|
def load_model(model_name: str = None):
|
||||||
"""Load Whisper model with automatic CPU fallback"""
|
"""Load Whisper model with automatic CPU fallback"""
|
||||||
global _model, _model_status, _current_model_name
|
global _model, _model_status, _current_model_name
|
||||||
@@ -79,31 +124,101 @@ def load_model(model_name: str = None):
|
|||||||
print(f"⚠️ GPU not available, falling back to CPU mode")
|
print(f"⚠️ GPU not available, falling back to CPU mode")
|
||||||
_model_status["status_message"] = "GPU not available, using CPU..."
|
_model_status["status_message"] = "GPU not available, using CPU..."
|
||||||
|
|
||||||
print(f"Loading Whisper model: {model_name} on {actual_device}")
|
print(f"Loading Whisper model: {model_name}")
|
||||||
|
print("Checking available GPUs...")
|
||||||
|
|
||||||
try:
|
try:
|
||||||
# Whisper doesn't have a direct progress callback, but we can monitor the models directory
|
# Try to load on the best available GPU first
|
||||||
|
if requested_device == "cuda":
|
||||||
|
best_gpu = get_best_gpu()
|
||||||
|
|
||||||
|
if best_gpu is not None:
|
||||||
|
print(f"🎯 Selected GPU {best_gpu} with most free memory")
|
||||||
|
|
||||||
|
try:
|
||||||
|
# Set the device before loading
|
||||||
|
torch.cuda.set_device(best_gpu)
|
||||||
|
|
||||||
|
# Load model on specific GPU
|
||||||
|
_model = whisper.load_model(
|
||||||
|
model_name,
|
||||||
|
device=f"cuda:{best_gpu}",
|
||||||
|
download_root=settings.models_path
|
||||||
|
)
|
||||||
|
|
||||||
|
_current_model_name = model_name
|
||||||
|
_model_status["is_downloading"] = False
|
||||||
|
_model_status["is_loaded"] = True
|
||||||
|
_model_status["download_percentage"] = 100
|
||||||
|
_model_status["status_message"] = f"Model loaded successfully on GPU {best_gpu}"
|
||||||
|
print(f"✅ Model {model_name} loaded on GPU {best_gpu}")
|
||||||
|
|
||||||
|
except RuntimeError as e:
|
||||||
|
if "out of memory" in str(e).lower() or "cuda" in str(e).lower():
|
||||||
|
print(f"⚠️ GPU {best_gpu} out of memory, trying other GPUs...")
|
||||||
|
torch.cuda.empty_cache()
|
||||||
|
|
||||||
|
# Try other GPUs
|
||||||
|
for gpu_id in range(torch.cuda.device_count()):
|
||||||
|
if gpu_id == best_gpu:
|
||||||
|
continue
|
||||||
|
try:
|
||||||
|
torch.cuda.set_device(gpu_id)
|
||||||
|
_model = whisper.load_model(
|
||||||
|
model_name,
|
||||||
|
device=f"cuda:{gpu_id}",
|
||||||
|
download_root=settings.models_path
|
||||||
|
)
|
||||||
|
_current_model_name = model_name
|
||||||
|
_model_status["is_downloading"] = False
|
||||||
|
_model_status["is_loaded"] = True
|
||||||
|
_model_status["download_percentage"] = 100
|
||||||
|
_model_status["status_message"] = f"Model loaded on GPU {gpu_id}"
|
||||||
|
print(f"✅ Model {model_name} loaded on GPU {gpu_id}")
|
||||||
|
break
|
||||||
|
except:
|
||||||
|
torch.cuda.empty_cache()
|
||||||
|
continue
|
||||||
|
else:
|
||||||
|
# All GPUs failed, fall back to CPU
|
||||||
|
print("⚠️ All GPUs full, falling back to CPU...")
|
||||||
|
raise Exception("All GPUs full")
|
||||||
|
else:
|
||||||
|
raise
|
||||||
|
else:
|
||||||
|
print("⚠️ No suitable GPU found, falling back to CPU...")
|
||||||
|
raise Exception("No suitable GPU")
|
||||||
|
|
||||||
|
# If we get here with no model, use CPU
|
||||||
|
if _model is None:
|
||||||
|
_model = whisper.load_model(
|
||||||
|
model_name,
|
||||||
|
device="cpu",
|
||||||
|
download_root=settings.models_path
|
||||||
|
)
|
||||||
|
_current_model_name = model_name
|
||||||
|
_model_status["is_downloading"] = False
|
||||||
|
_model_status["is_loaded"] = True
|
||||||
|
_model_status["download_percentage"] = 100
|
||||||
|
_model_status["status_message"] = "Model loaded on CPU"
|
||||||
|
settings.whisper_device = "cpu"
|
||||||
|
print(f"✅ Model {model_name} loaded on CPU")
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
print(f"⚠️ Error loading model: {e}")
|
||||||
|
# Final fallback to CPU
|
||||||
_model = whisper.load_model(
|
_model = whisper.load_model(
|
||||||
model_name,
|
model_name,
|
||||||
device=actual_device,
|
device="cpu",
|
||||||
download_root=settings.models_path
|
download_root=settings.models_path
|
||||||
)
|
)
|
||||||
_current_model_name = model_name
|
_current_model_name = model_name
|
||||||
_model_status["is_downloading"] = False
|
_model_status["is_downloading"] = False
|
||||||
_model_status["is_loaded"] = True
|
_model_status["is_loaded"] = True
|
||||||
_model_status["download_percentage"] = 100
|
_model_status["download_percentage"] = 100
|
||||||
_model_status["status_message"] = "Model loaded successfully"
|
_model_status["status_message"] = "Model loaded on CPU (error fallback)"
|
||||||
|
settings.whisper_device = "cpu"
|
||||||
# Update device in status to reflect actual device used
|
print(f"✅ Model {model_name} loaded on CPU (after error)")
|
||||||
if actual_device != requested_device:
|
|
||||||
settings.whisper_device = actual_device
|
|
||||||
print(f"✅ Model {model_name} loaded on CPU (GPU fallback)")
|
|
||||||
else:
|
|
||||||
print(f"✅ Model {model_name} loaded on {actual_device}")
|
|
||||||
|
|
||||||
except Exception as e:
|
|
||||||
_model_status["is_downloading"] = False
|
|
||||||
_model_status["status_message"] = f"Error: {str(e)}"
|
|
||||||
raise
|
|
||||||
|
|
||||||
return _model
|
return _model
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user