Version 3.1.4: Clean up ChatterBox crash prevention and rename padding parameter

diodiogod · diodiogod · commit 1d9528f0e0b8 · 2025-07-18T17:57:37.000-03:00
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -5,6 +5,11 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+## [3.1.4] - 2025-07-18
+
+### Added
+
+- Clean up ChatterBox crash prevention and rename padding parameter
+
 ## [3.1.3] - 2025-07-18
 
 ### Fixed
diff --git a/README.md b/README.md
@@ -6,7 +6,7 @@
 [![Forks][forks-shield]][forks-url]
 [![Dynamic TOML Badge][version-shield]][version-url]
 
-# ComfyUI ChatterBox SRT Voice (diogod) v3.1.3
+# ComfyUI ChatterBox SRT Voice (diogod) v3.1.4
 
 *This is a refactored node, originally created by [ShmuelRonen](https://github.com/ShmuelRonen/ComfyUI_ChatterBox_Voice).*
 
diff --git a/chatterbox_srt/__init__.py b/chatterbox_srt/__init__.py
@@ -4,7 +4,7 @@
 """
 
 # Version info
-__version__ = "3.1.3"
+__version__ = "3.1.4"
 __author__ = "Diogod"
 
 # Import the new SRT modules
diff --git a/core/__init__.py b/core/__init__.py
@@ -4,7 +4,7 @@
 """
 
 # Version info
-__version__ = "3.1.3"
+__version__ = "3.1.4"
 __author__ = "Diogod"
 
 # Make imports available at package level
diff --git a/core/chatterbox_subprocess.py b/core/chatterbox_subprocess.py
@@ -0,0 +1,114 @@
+#!/usr/bin/env python3
+"""
+ChatterBox TTS Subprocess Wrapper
+
+This script runs ChatterBox TTS generation in an isolated subprocess to prevent
+CUDA crashes from affecting the main ComfyUI process.
+
+Usage:
+    python chatterbox_subprocess.py --text "Hello world" --reference_audio "path/to/ref.wav" --output "output.wav"
+"""
+
+import sys
+import os
+import argparse
+import json
+import traceback
+import tempfile
+import torch
+import numpy as np
+from pathlib import Path
+
+# Put the parent of this file's directory at the front of sys.path so that
+# imports performed later are resolved from there first.
+# NOTE(review): presumably this is where the bundled `chatterbox` package
+# imported inside main() lives - confirm against the repository layout.
+project_root = Path(__file__).parent.parent
+sys.path.insert(0, str(project_root))
+
+def main():
+    """
+    Parse the command line, synthesize speech once and report the outcome.
+
+    Progress messages are printed to stdout, followed by a final line that is
+    a JSON object: on success it holds 'success', 'output_path', 'duration',
+    'sample_rate' and 'audio_shape'; on failure it holds 'success', 'error'
+    and 'traceback', and the process exits with status 1.
+
+    NOTE(review): generate() is called with a `reference_audio` tensor
+    keyword - confirm that the bundled ChatterboxTTS.generate accepts it.
+    """
+    parser = argparse.ArgumentParser(description='ChatterBox TTS Subprocess')
+    # (flag, add_argument options), listed in the order shown by --help.
+    cli_options = (
+        ('--text', {'required': True, 'help': 'Text to synthesize'}),
+        ('--reference_audio', {'required': True, 'help': 'Path to reference audio'}),
+        ('--output', {'required': True, 'help': 'Output audio file path'}),
+        ('--device', {'default': 'auto', 'help': 'Device to use (auto/cuda/cpu)'}),
+        ('--exaggeration', {'type': float, 'default': 0.5, 'help': 'Exaggeration factor'}),
+        ('--temperature', {'type': float, 'default': 0.8, 'help': 'Temperature'}),
+        ('--cfg_weight', {'type': float, 'default': 0.5, 'help': 'CFG weight'}),
+        ('--seed', {'type': int, 'default': 0, 'help': 'Random seed'}),
+    )
+    for flag, options in cli_options:
+        parser.add_argument(flag, **options)
+    args = parser.parse_args()
+
+    try:
+        # Imported lazily so that import failures are reported as JSON too.
+        from chatterbox.chatterbox import ChatterboxTTS
+        import torchaudio
+
+        print(f"🔄 Subprocess: Loading ChatterBox TTS on {args.device}...")
+
+        # 'auto' picks CUDA when available; any other value is used verbatim.
+        device = args.device
+        if device == 'auto':
+            device = 'cuda' if torch.cuda.is_available() else 'cpu'
+        tts_model = ChatterboxTTS.from_pretrained(device=device)
+
+        print(f"🔄 Subprocess: Loading reference audio from {args.reference_audio}...")
+
+        reference_wave, source_sr = torchaudio.load(args.reference_audio)
+
+        # Average the channels when the reference has more than one.
+        if reference_wave.shape[0] > 1:
+            reference_wave = reference_wave.mean(dim=0, keepdim=True)
+
+        # Bring the reference to the model's sample rate when they differ.
+        if source_sr != tts_model.sr:
+            reference_wave = torchaudio.transforms.Resample(source_sr, tts_model.sr)(reference_wave)
+
+        print(f"🔄 Subprocess: Generating speech for text: '{args.text[:50]}...'")
+
+        # A seed of 0 leaves the random generators untouched.
+        if args.seed:
+            torch.manual_seed(args.seed)
+            if torch.cuda.is_available():
+                torch.cuda.manual_seed(args.seed)
+
+        generated_audio = tts_model.generate(
+            text=args.text,
+            reference_audio=reference_wave,
+            exaggeration=args.exaggeration,
+            temperature=args.temperature,
+            cfg_weight=args.cfg_weight
+        )
+
+        print(f"🔄 Subprocess: Saving audio to {args.output}...")
+
+        torchaudio.save(args.output, generated_audio.cpu(), tts_model.sr)
+
+        # Length of the last dimension divided by the sample rate, in seconds.
+        duration = generated_audio.size(-1) / tts_model.sr
+
+        print(f"✅ Subprocess: Generation completed successfully ({duration:.2f}s)")
+        print(json.dumps({
+            'success': True,
+            'output_path': args.output,
+            'duration': duration,
+            'sample_rate': tts_model.sr,
+            'audio_shape': list(generated_audio.shape)
+        }))
+
+    except Exception as exc:
+        failure = {
+            'success': False,
+            'error': str(exc),
+            'traceback': traceback.format_exc()
+        }
+        print(f"❌ Subprocess: Generation failed: {exc}")
+        print(json.dumps(failure))
+        sys.exit(1)
+
+# Run the generation only when this file is executed as a script, not when it
+# is imported as a module.
+if __name__ == "__main__":
+    main()
diff --git a/nodes.py b/nodes.py
@@ -1,5 +1,5 @@
 # Version and constants
-VERSION = "3.1.3"
+VERSION = "3.1.4"
 IS_DEV = False  # Set to False for release builds
 VERSION_DISPLAY = f"v{VERSION}" + (" (dev)" if IS_DEV else "")
 SEPARATOR = "=" * 70
diff --git a/nodes/srt_tts_node.py b/nodes/srt_tts_node.py
@@ -8,6 +8,7 @@
 import tempfile
 import os
 import hashlib
+import gc
 from typing import Dict, Any, Optional, List, Tuple
 
 # Use direct file imports that work when loaded via importlib
@@ -153,6 +154,10 @@ def INPUT_TYPES(cls):
                     "step": 0.5,
                     "tooltip": "Maximum allowed deviation (in seconds) for timing adjustments in 'smart_natural' mode. Higher values allow more flexibility."
                 }),
+                "crash_protection_template": ("STRING", {
+                    "default": "hmm ,, {seg} hmm ,,",
+                    "tooltip": "Custom padding template for short text segments to prevent ChatterBox crashes. ChatterBox has a bug where text shorter than ~21 characters causes CUDA tensor errors in sequential generation. Use {seg} as placeholder for the original text. Examples: '...ummmmm {seg}' (default hesitation), '{seg}... yes... {seg}' (repetition), 'Well, {seg}' (natural prefix), or empty string to disable padding. This only affects ChatterBox nodes, not F5-TTS nodes."
+                }),
             }
         }
 
@@ -161,6 +166,52 @@ def INPUT_TYPES(cls):
     FUNCTION = "generate_srt_speech"
     CATEGORY = "ChatterBox Voice"
 
+    def _pad_short_text_for_chatterbox(self, text: str, padding_template: str = "...ummmmm {seg}", min_length: int = 21) -> str:
+        """
+        Pad short text with a user-supplied template to avoid ChatterBox crashes.
+
+        Short segments are reported to trigger CUDA tensor indexing errors in
+        ChatterBox during sequential generation; wrapping them in extra tokens
+        works around that while letting the user choose the filler.
+
+        Args:
+            text: Input text to check and pad if needed.
+            padding_template: Template in which "{seg}" is replaced by the
+                stripped text. An empty, whitespace-only or None template
+                disables padding. A template without "{seg}" is used as a
+                prefix so the original text is never discarded.
+            min_length: Stripped texts shorter than this many characters are
+                padded (default: 21).
+
+        Returns:
+            The original text unchanged when it is long enough or padding is
+            disabled, otherwise the padded text.
+        """
+        stripped_text = text.strip()
+        if len(stripped_text) >= min_length:
+            return text
+
+        # A missing, empty or whitespace-only template disables padding.
+        if not padding_template or not padding_template.strip():
+            return text
+
+        # Without a placeholder, str.replace would return the template alone
+        # and silently drop the segment text; use the template as a prefix.
+        if "{seg}" not in padding_template:
+            return f"{padding_template.rstrip()} {stripped_text}"
+
+        return padding_template.replace("{seg}", stripped_text)
+
+    def _safe_generate_tts_audio(self, text, audio_prompt, exaggeration, temperature, cfg_weight):
+        """
+        Call generate_tts_audio and turn CUDA-looking failures into an actionable error.
+
+        No recovery is attempted: the call is made once. If it raises and the
+        message contains one of the known CUDA markers, a diagnostic is printed
+        and a RuntimeError with guidance is raised, chained to the original
+        exception. Any other exception propagates unchanged.
+
+        Args:
+            text: Text to synthesize (already padded by the caller if needed).
+            audio_prompt: Reference audio forwarded to generate_tts_audio.
+            exaggeration: Forwarded to generate_tts_audio.
+            temperature: Forwarded to generate_tts_audio.
+            cfg_weight: Forwarded to generate_tts_audio.
+
+        Returns:
+            Whatever generate_tts_audio returns.
+
+        Raises:
+            RuntimeError: When the underlying error message looks CUDA-related.
+        """
+        try:
+            return self.generate_tts_audio(text, audio_prompt, exaggeration, temperature, cfg_weight)
+        except Exception as e:
+            error_msg = str(e)
+            # NOTE: the bare "CUDA" marker also matches unrelated CUDA errors
+            # (for example out-of-memory); they get the same guidance message.
+            cuda_markers = (
+                "srcIndex < srcSelectDimSize",
+                "CUDA",
+                "device-side assert",
+                "an illegal memory access",
+            )
+            if not any(marker in error_msg for marker in cuda_markers):
+                raise
+
+            print(f"🚨 ChatterBox CUDA crash detected: '{text[:50]}...'")
+            print("🛡️ This is a known ChatterBox bug with certain text patterns.")
+            # Chain explicitly so the original CUDA error stays in the traceback.
+            raise RuntimeError(f"ChatterBox CUDA crash occurred. Text: '{text[:50]}...' - Try using padding template or longer text, or restart ComfyUI.") from e
+
     def _generate_segment_cache_key(self, subtitle_text: str, exaggeration: float, temperature: float, 
                                    cfg_weight: float, seed: int, audio_prompt_component: str, 
                                    model_source: str, device: str) -> str:
@@ -199,7 +250,8 @@ def _detect_overlaps(self, subtitles: List) -> bool:
     def generate_srt_speech(self, srt_content, device, exaggeration, temperature, cfg_weight, seed,
                             timing_mode, reference_audio=None, audio_prompt_path="",
                             max_stretch_ratio=2.0, min_stretch_ratio=0.5, fade_for_StretchToFit=0.01, 
-                            enable_audio_cache=True, timing_tolerance=2.0):
+                            enable_audio_cache=True, timing_tolerance=2.0, 
+                            crash_protection_template="hmm ,, {seg} hmm ,,"):
         
         def _process():
             # Check if SRT support is available
@@ -320,9 +372,16 @@ def _process():
                                     print(f"📺 Generating SRT segment {i+1}/{len(subtitles)} (Seq {subtitle.sequence})...")
                                 else:
                                     print(f"🎭 Generating SRT segment {i+1}/{len(subtitles)} (Seq {subtitle.sequence}) using '{char}'")
-                                # Generate new audio for this character segment
-                                char_wav = self.generate_tts_audio(
-                                    segment_text, char_audio, exaggeration, temperature, cfg_weight
+                                # BUGFIX: Pad short text with custom template to prevent ChatterBox sequential generation crashes
+                                processed_segment_text = self._pad_short_text_for_chatterbox(segment_text, crash_protection_template)
+                                
+                                # DEBUG: Show actual text being sent to ChatterBox when padding might occur
+                                if len(segment_text.strip()) < 21:
+                                    print(f"🔍 DEBUG: Original text: '{segment_text}' → Processed: '{processed_segment_text}' (len: {len(processed_segment_text)})")
+                                
+                                # Generate new audio for this character segment; CUDA-related failures are re-raised with guidance (no recovery is attempted)
+                                char_wav = self._safe_generate_tts_audio(
+                                    processed_segment_text, char_audio, exaggeration, temperature, cfg_weight
                                 )
                                 
                                 if enable_audio_cache:
@@ -354,8 +413,16 @@ def _process():
                             # Generate new audio
                             print(f"📺 Generating SRT segment {i+1}/{len(subtitles)} (Seq {subtitle.sequence})...")
                             
-                            wav = self.generate_tts_audio(
-                                subtitle.text, audio_prompt, exaggeration, temperature, cfg_weight
+                            # BUGFIX: Pad short text with custom template to prevent ChatterBox sequential generation crashes
+                            processed_subtitle_text = self._pad_short_text_for_chatterbox(subtitle.text, crash_protection_template)
+                            
+                            # DEBUG: Show actual text being sent to ChatterBox when padding might occur
+                            if len(subtitle.text.strip()) < 21:
+                                print(f"🔍 DEBUG: Original text: '{subtitle.text}' → Processed: '{processed_subtitle_text}' (len: {len(processed_subtitle_text)})")
+                            
+                            # Generate new audio; CUDA-related failures are re-raised with guidance (no recovery is attempted)
+                            wav = self._safe_generate_tts_audio(
+                                processed_subtitle_text, audio_prompt, exaggeration, temperature, cfg_weight
                             )
                             natural_duration = self.AudioTimingUtils.get_audio_duration(wav, self.tts_model.sr)
                             
diff --git a/nodes/tts_node.py b/nodes/tts_node.py
diff --git a/pyproject.toml b/pyproject.toml