From 4a1f898b25868615a90f0b2decaf91d5a318d9be Mon Sep 17 00:00:00 2001
From: Dominic Ballenthin
Date: Mon, 2 Feb 2026 03:59:23 +0100
Subject: [PATCH] Add intelligent GPU selector with automatic fallback to
 available GPU or CPU

---
 src/services/whisper_service.py | 147 ++++++++++++++++++++++++++++----
 1 file changed, 131 insertions(+), 16 deletions(-)

diff --git a/src/services/whisper_service.py b/src/services/whisper_service.py
index 08ea3bb..1d6edba 100644
--- a/src/services/whisper_service.py
+++ b/src/services/whisper_service.py
@@ -51,6 +51,51 @@ def _download_hook(progress_bytes, total_bytes):
         _model_status["status_message"] = f"Downloading: {_model_status['download_percentage']}%"
 
 
+def get_best_gpu():
+    """
+    Find the best available GPU with most free memory.
+    Returns GPU index or None if no GPU is available.
+    """
+    if not torch.cuda.is_available():
+        return None
+
+    num_gpus = torch.cuda.device_count()
+    if num_gpus == 0:
+        return None
+
+    best_gpu = None
+    max_free_memory = 0
+
+    for gpu_id in range(num_gpus):
+        try:
+            # Get GPU properties
+            props = torch.cuda.get_device_properties(gpu_id)
+            total_memory = props.total_memory
+
+            # Query device-wide free memory: mem_get_info sees allocations made
+            # by other processes too, unlike torch.cuda.memory_allocated().
+            free_memory, _ = torch.cuda.mem_get_info(gpu_id)
+            allocated_memory = total_memory - free_memory
+
+            print(f" GPU {gpu_id}: {props.name}")
+            print(f" Total: {total_memory / 1e9:.2f} GB")
+            print(f" Used: {allocated_memory / 1e9:.2f} GB")
+            print(f" Free: {free_memory / 1e9:.2f} GB")
+
+            # Check if this GPU has enough free memory (need at least 3GB for large-v3)
+            min_required_memory = 3 * 1024 * 1024 * 1024  # 3GB in bytes
+
+            if free_memory > min_required_memory and free_memory > max_free_memory:
+                max_free_memory = free_memory
+                best_gpu = gpu_id
+
+        except Exception as e:
+            print(f" GPU {gpu_id}: Error checking - {e}")
+            continue
+
+    return best_gpu
+
+
 def load_model(model_name: str = None):
     """Load Whisper model with automatic CPU fallback"""
     global _model, _model_status, _current_model_name
@@ -79,31 +124,101 @@ def load_model(model_name: str = None):
         print(f"⚠️ GPU not available, falling back to CPU mode")
         _model_status["status_message"] = "GPU not available, using CPU..."
 
-    print(f"Loading Whisper model: {model_name} on {actual_device}")
+    print(f"Loading Whisper model: {model_name}")
+    print("Checking available GPUs...")
+
     try:
-        # Whisper doesn't have a direct progress callback, but we can monitor the models directory
+        # Try to load on the best available GPU first
+        if requested_device == "cuda":
+            best_gpu = get_best_gpu()
+
+            if best_gpu is not None:
+                print(f"🎯 Selected GPU {best_gpu} with most free memory")
+
+                try:
+                    # Set the device before loading
+                    torch.cuda.set_device(best_gpu)
+
+                    # Load model on specific GPU
+                    _model = whisper.load_model(
+                        model_name,
+                        device=f"cuda:{best_gpu}",
+                        download_root=settings.models_path
+                    )
+
+                    _current_model_name = model_name
+                    _model_status["is_downloading"] = False
+                    _model_status["is_loaded"] = True
+                    _model_status["download_percentage"] = 100
+                    _model_status["status_message"] = f"Model loaded successfully on GPU {best_gpu}"
+                    print(f"✅ Model {model_name} loaded on GPU {best_gpu}")
+
+                except RuntimeError as e:
+                    if "out of memory" in str(e).lower() or "cuda" in str(e).lower():
+                        print(f"⚠️ GPU {best_gpu} out of memory, trying other GPUs...")
+                        torch.cuda.empty_cache()
+
+                        # Try other GPUs
+                        for gpu_id in range(torch.cuda.device_count()):
+                            if gpu_id == best_gpu:
+                                continue
+                            try:
+                                torch.cuda.set_device(gpu_id)
+                                _model = whisper.load_model(
+                                    model_name,
+                                    device=f"cuda:{gpu_id}",
+                                    download_root=settings.models_path
+                                )
+                                _current_model_name = model_name
+                                _model_status["is_downloading"] = False
+                                _model_status["is_loaded"] = True
+                                _model_status["download_percentage"] = 100
+                                _model_status["status_message"] = f"Model loaded on GPU {gpu_id}"
+                                print(f"✅ Model {model_name} loaded on GPU {gpu_id}")
+                                break
+                            except Exception:
+                                torch.cuda.empty_cache()
+                                continue
+                        else:
+                            # All GPUs failed, fall back to CPU
+                            print("⚠️ All GPUs full, falling back to CPU...")
+                            raise RuntimeError("All GPUs full")
+                    else:
+                        raise
+            else:
+                print("⚠️ No suitable GPU found, falling back to CPU...")
+                raise RuntimeError("No suitable GPU")
+
+        # If we get here with no model, use CPU
+        if _model is None:
+            _model = whisper.load_model(
+                model_name,
+                device="cpu",
+                download_root=settings.models_path
+            )
+            _current_model_name = model_name
+            _model_status["is_downloading"] = False
+            _model_status["is_loaded"] = True
+            _model_status["download_percentage"] = 100
+            _model_status["status_message"] = "Model loaded on CPU"
+            settings.whisper_device = "cpu"
+            print(f"✅ Model {model_name} loaded on CPU")
+
+    except Exception as e:
+        print(f"⚠️ Error loading model: {e}")
+        # Final fallback to CPU
         _model = whisper.load_model(
             model_name,
-            device=actual_device,
+            device="cpu",
             download_root=settings.models_path
         )
 
         _current_model_name = model_name
         _model_status["is_downloading"] = False
         _model_status["is_loaded"] = True
         _model_status["download_percentage"] = 100
-        _model_status["status_message"] = "Model loaded successfully"
-
-        # Update device in status to reflect actual device used
-        if actual_device != requested_device:
-            settings.whisper_device = actual_device
-            print(f"✅ Model {model_name} loaded on CPU (GPU fallback)")
-        else:
-            print(f"✅ Model {model_name} loaded on {actual_device}")
-
-    except Exception as e:
-        _model_status["is_downloading"] = False
-        _model_status["status_message"] = f"Error: {str(e)}"
-        raise
+        _model_status["status_message"] = "Model loaded on CPU (error fallback)"
+        settings.whisper_device = "cpu"
+        print(f"✅ Model {model_name} loaded on CPU (after error)")
 
     return _model