Merged
Commits (57)
ca6dc91
set up capoeira
finitearth Dec 27, 2025
1bc1c72
copy pasted optimizer from research repo
finitearth Dec 27, 2025
732d5c0
Simplify CAPO helper usage and adjust tests
finitearth Dec 27, 2025
6ee1860
Merge branch 'feature/welcome_capoeira' into codex/add-capoeira-optim…
finitearth Dec 27, 2025
3961739
debugging
finitearth Dec 27, 2025
b64b53d
delete redundant util file
finitearth Dec 27, 2025
01e5b6c
Align Capoeira with promptolution interface
finitearth Dec 27, 2025
f65deef
reduce code complexity
finitearth Dec 27, 2025
c532b48
clean up for pre commit
finitearth Dec 27, 2025
79395a0
Potential fix for pull request finding 'Wrong name for an argument in…
finitearth Dec 28, 2025
0bb5e7d
minor fixes
finitearth Dec 28, 2025
09a2328
Merge branch 'feature/welcome_capoeira' of https://github.com/automl/…
finitearth Dec 28, 2025
f152077
fix tests
finitearth Dec 28, 2025
cffa789
implemented evalresults class
finitearth Dec 29, 2025
6d461a9
Delete .vscode/settings.json
finitearth Dec 29, 2025
4063f24
Update coverage badge in README [skip ci]
finitearth Dec 29, 2025
8411933
refining capoeira
finitearth Jan 3, 2026
3bb14ef
further refinements
finitearth Jan 5, 2026
f0caca0
minor clean up
finitearth Jan 5, 2026
09cd805
refine testing
finitearth Jan 6, 2026
505702d
parse kwargs to reward functions"
finitearth Jan 6, 2026
d371a67
create scalarization fallback for multi objective
finitearth Jan 6, 2026
cb0b882
refine mocapo
finitearth Jan 7, 2026
2cd5ef2
remove task description from get optimizer to fix circular imports
finitearth Jan 8, 2026
cfc0622
use task type to judge task type
finitearth Jan 8, 2026
79d654d
change tokenizer handling to work with new hf interface
Jan 8, 2026
3719f84
change sampling params from vllm
Jan 8, 2026
9533232
change automatic batch size alignment of vllm
Jan 8, 2026
9e33e3a
remove abstractmethod
Jan 8, 2026
016b298
allow for tournament select
finitearth Jan 9, 2026
e2cd177
change init of f_old to inf
finitearth Jan 9, 2026
8beca49
implement comments
finitearth Jan 12, 2026
f85062b
incorporated comments
finitearth Jan 13, 2026
52d29f6
fix parent selection
finitearth Jan 13, 2026
1a25c51
fix token counting
finitearth Jan 13, 2026
98214cd
change tokenizer
finitearth Jan 13, 2026
21a612d
revert
finitearth Jan 13, 2026
c0d13cc
fix token counting
finitearth Jan 13, 2026
a3affe3
revert
finitearth Jan 13, 2026
a4e2476
bye capoeira
finitearth Jan 14, 2026
c0f02be
green test
finitearth Jan 14, 2026
e9cd844
change get evaluated blocks function to work also with one prompt
Jan 15, 2026
5e74532
allow for empty cache at key
Jan 15, 2026
a210183
some more cache issues
Jan 15, 2026
745f722
change compute cost function
Jan 15, 2026
05d5ebc
some fixes
Jan 16, 2026
70ae296
fix compute costs function
Jan 16, 2026
276e5fa
fix token count
Jan 16, 2026
83a6f9d
fix tracking of blocks
finitearth Jan 16, 2026
4bcda09
fix block idx subsampling
finitearth Jan 18, 2026
ace326e
allow for y_column in reward task
Jan 23, 2026
131340b
Merge branch 'feature/welcome_capoeira' of https://github.com/finitea…
Jan 23, 2026
36ce80e
formatting
Feb 15, 2026
6ecbfff
fixes in mo task and base task block idx handling
Feb 15, 2026
3368665
add import since it is required for test cases
Feb 15, 2026
a4136f9
Update coverage badge in README [skip ci]
mo374z Feb 15, 2026
0b75d1e
change wording
Feb 15, 2026
Files changed
Binary file modified .coverage
Binary file not shown.
4 changes: 2 additions & 2 deletions README.md
@@ -1,5 +1,5 @@

![Coverage](https://img.shields.io/badge/Coverage-91%25-brightgreen)
![Coverage](https://img.shields.io/badge/Coverage-90%25-brightgreen)
[![CI](https://github.com/automl/promptolution/actions/workflows/ci.yml/badge.svg?branch=main)](https://github.com/automl/promptolution/actions/workflows/ci.yml)
[![Docs](https://github.com/automl/promptolution/actions/workflows/docs.yml/badge.svg?branch=main)](https://github.com/automl/promptolution/actions/workflows/docs.yml)
![Code Style](https://img.shields.io/badge/Code%20Style-black-black)
@@ -92,7 +92,7 @@ We encourage every contributor to also write tests, that automatically check if

```
poetry run python -m coverage run -m pytest
poetry run python -m coverage report
poetry run python -m coverage report -i
```

Developed by **Timo Heiß**, **Moritz Schlager**, and **Tom Zehle** (LMU Munich, MCML, ELLIS, TUM, Uni Freiburg).
4 changes: 2 additions & 2 deletions docs/examples/getting_started.md
@@ -83,7 +83,7 @@ api_key = "YOUR_API_KEY" # Replace with your Promptolution API key
```

Here's an explanation of each configuration parameter in the ExperimentConfig:
- `optimizer`: The algorithm used for prompt optimization. Currently we support "capo", "evopromptga", "evopromptde", and "opro". For this example, we use "capo" as it is capable of leveraging few-shot examples.
- `optimizer`: The algorithm used for prompt optimization. Currently we support "capo", "capoeira", "evopromptga", "evopromptde", and "opro". For this example, we use "capo" as it is capable of leveraging few-shot examples.
- `task_description`: A string describing the task you're optimizing prompts for. This is used to provide the meta-llm with context about your task.
- `prompts`: A list of initial prompt strings that will be used as the starting point for optimization.
- `n_steps`: The number of optimization steps to run. Higher values allow more exploration and refinement but require more API calls and computational resources.
@@ -114,7 +114,7 @@ With everything configured, you're ready to optimize your prompts! The `run_expe
prompts = run_experiment(df, config)
```

📌 CAPO requires block evaluation strategy. Setting it to 'sequential_block'.
📌 CAPO-style optimizers require block evaluation strategy. Setting it to 'sequential_block'.
⚠️ The LLM does not have a tokenizer. Using simple token count.
🔥 Starting optimization...
📊 Starting evaluation...
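To make the configuration options above concrete, here is a minimal sketch of a getting-started run, assuming the `ExperimentConfig` fields and the `run_experiment` helper shown in the tutorial diff; the dataframe columns, model settings, and import path for `run_experiment` are illustrative assumptions, not confirmed API.

```python
# Minimal sketch, not a verbatim excerpt from the tutorial: column names,
# n_steps, and the run_experiment import path are illustrative assumptions.
import pandas as pd

from promptolution.helpers import run_experiment  # assumed location
from promptolution.utils import ExperimentConfig

df = pd.DataFrame(
    {
        "x": ["I loved this movie!", "Utterly disappointing."],
        "y": ["positive", "negative"],
    }
)

config = ExperimentConfig(
    optimizer="capo",  # "capoeira", "evopromptga", "evopromptde", and "opro" are also supported
    task_description="Classify movie reviews as positive or negative.",
    prompts=["Decide whether the following review is positive or negative."],
    n_steps=5,
    eval_strategy="sequential_block",  # block evaluation, as CAPO-style optimizers require
    api_key="YOUR_API_KEY",
)

prompts = run_experiment(df, config)  # optimized prompts, best first
```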
2 changes: 1 addition & 1 deletion docs/examples/reward_task_tutorial.md
@@ -102,7 +102,7 @@ api_key = "YOUR_API_KEY" # Replace with your Promptolution API key
```

Here's an explanation of each configuration parameter in the ExperimentConfig:
- `optimizer`: The algorithm used for prompt optimization. Currently we support "capo", "evopromptga", "evopromptde", and "opro". For this example, we use "capo" as it is capable of leveraging few-shot examples.
- `optimizer`: The algorithm used for prompt optimization. Currently we support "capo", "capoeira", "evopromptga", "evopromptde", and "opro". For this example, we use "capo" as it is capable of leveraging few-shot examples.
- `task_description`: A string describing the task you're optimizing prompts for. This is used to provide the meta-llm with context about your task.
- `prompts`: A list of initial prompt strings that will be used as the starting point for optimization.
- `n_steps`: The number of optimization steps to run. Higher values allow more exploration and refinement but require more API calls and computational resources.
8 changes: 4 additions & 4 deletions promptolution/exemplar_selectors/random_search_selector.py
@@ -28,12 +28,12 @@ def select_exemplars(self, prompt: Prompt, n_trials: int = 5) -> Prompt:
best_prompt = prompt

for _ in range(n_trials):
_, seq = self.task.evaluate(
prompt, self.predictor, eval_strategy="subsample", return_seq=True, return_agg_scores=False
)
result = self.task.evaluate(prompt, self.predictor, eval_strategy="subsample")
seq = result.sequences
prompt_with_examples = Prompt(prompt.instruction, [seq[0][0]])
# evaluate prompts as few shot prompt
score = self.task.evaluate(prompt_with_examples, self.predictor, eval_strategy="subsample")[0]
result = self.task.evaluate(prompt_with_examples, self.predictor, eval_strategy="subsample")
score = float(result.agg_scores[0])
if score > best_score:
best_score = score
best_prompt = prompt_with_examples
8 changes: 4 additions & 4 deletions promptolution/exemplar_selectors/random_selector.py
@@ -53,10 +53,10 @@ def select_exemplars(self, prompt: Prompt, n_examples: int = 5) -> Prompt:
"""
examples: List[str] = []
while len(examples) < n_examples:
scores, seqs = self.task.evaluate(
prompt, self.predictor, eval_strategy="subsample", return_seq=True, return_agg_scores=False
)
score = np.mean(scores)
result = self.task.evaluate(prompt, self.predictor, eval_strategy="subsample")
scores = result.scores
seqs = result.sequences
score = float(np.mean(scores))
seq = seqs[0][0]
if score == self.desired_score:
examples.append(seq)
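Both selectors now read fields off the result object returned by `task.evaluate` (the eval-results class introduced in this PR) instead of unpacking a tuple. The snippet below only sketches the shape implied by the call sites above: `scores`, `agg_scores`, and `sequences` appear in the diff, while the class name, field types, and construction here are assumptions.

```python
# Assumed shape of the evaluation result used above; the real class in
# promptolution may differ in name, fields, and how it is constructed.
from dataclasses import dataclass
from typing import List

import numpy as np


@dataclass
class EvalResultSketch:
    scores: np.ndarray          # per-sample scores, one row per prompt
    agg_scores: np.ndarray      # one aggregated score per prompt
    sequences: List[List[str]]  # generated sequences, indexed [prompt][sample]


# Mirrors how the selectors consume it:
result = EvalResultSketch(
    scores=np.array([[0.0, 1.0, 1.0]]),
    agg_scores=np.array([2 / 3]),
    sequences=[["Review: great acting ... Label: positive"]],
)
score = float(np.mean(result.scores))   # random_selector
seq = result.sequences[0][0]
best = float(result.agg_scores[0])      # random_search_selector
```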
41 changes: 23 additions & 18 deletions promptolution/helpers.py
@@ -13,10 +13,11 @@
from promptolution.optimizers.base_optimizer import BaseOptimizer
from promptolution.predictors.base_predictor import BasePredictor
from promptolution.tasks.base_task import BaseTask
from promptolution.utils.config import ExperimentConfig
from promptolution.tasks.base_task import TaskType
from promptolution.optimizers.base_optimizer import OptimizerType
from promptolution.predictors.base_predictor import PredictorType
from promptolution.utils import ExperimentConfig


import pandas as pd

@@ -26,6 +27,7 @@
from promptolution.llms.local_llm import LocalLLM
from promptolution.llms.vllm import VLLM
from promptolution.optimizers.capo import CAPO
from promptolution.optimizers.capoeira import Capoeira
from promptolution.optimizers.evoprompt_de import EvoPromptDE
from promptolution.optimizers.evoprompt_ga import EvoPromptGA
from promptolution.optimizers.opro import OPRO
@@ -79,10 +81,6 @@ def run_optimization(df: pd.DataFrame, config: "ExperimentConfig") -> List[Promp
)
config.prompts = [Prompt(p) for p in initial_prompts]

if config.optimizer == "capo" and (config.eval_strategy is None or "block" not in config.eval_strategy):
logger.warning("📌 CAPO requires block evaluation strategy. Setting it to 'sequential_block'.")
config.eval_strategy = "sequential_block"

task = get_task(df, config, judge_llm=llm)
optimizer = get_optimizer(
predictor=predictor,
@@ -121,18 +119,20 @@ def run_evaluation(
logger.warning("📊 Starting evaluation...")
if isinstance(prompts[0], str):
str_prompts = cast(List[str], prompts)
prompts = [Prompt(p) for p in str_prompts]
prompt_objs = [Prompt(p) for p in str_prompts]
else:
str_prompts = [p.construct_prompt() for p in cast(List[Prompt], prompts)]
scores = task.evaluate(prompts, predictor, eval_strategy="full")
prompt_objs = cast(List[Prompt], prompts)
results = task.evaluate(prompt_objs, predictor, eval_strategy="full")
scores = results.agg_scores.tolist()
df = pd.DataFrame(dict(prompt=str_prompts, score=scores))
df = df.sort_values("score", ascending=False, ignore_index=True)

return df


def get_llm(model_id: Optional[str] = None, config: Optional["ExperimentConfig"] = None) -> "BaseLLM":
"""Factory function to create and return a language model instance based on the provided model_id.
"""Create and return a language model instance based on the provided model_id.

This function supports three types of language models:
1. LocalLLM: For running models locally.
@@ -204,18 +204,16 @@ def get_optimizer(
meta_llm: "BaseLLM",
task: "BaseTask",
optimizer: Optional["OptimizerType"] = None,
task_description: Optional[str] = None,
config: Optional["ExperimentConfig"] = None,
) -> "BaseOptimizer":
"""Creates and returns an optimizer instance based on provided parameters.
"""Create and return an optimizer instance based on provided parameters.

Args:
predictor: The predictor used for prompt evaluation
meta_llm: The language model used for generating meta-prompts
task: The task object used for evaluating prompts
optimizer: String identifying which optimizer to use
meta_prompt: Meta prompt text for the optimizer
task_description: Description of the task for the optimizer
config: Configuration object with default parameters

Returns:
@@ -225,10 +223,6 @@
ValueError: If an unknown optimizer type is specified
"""
final_optimizer = optimizer or (config.optimizer if config else None)
if config is None:
config = ExperimentConfig()
if task_description is not None:
config.task_description = task_description

if final_optimizer == "capo":
return CAPO(
@@ -238,6 +232,14 @@
config=config,
)

if final_optimizer == "capoeira":
return Capoeira(
predictor=predictor,
meta_llm=meta_llm,
task=task,
config=config,
)

if final_optimizer == "evopromptde":
return EvoPromptDE(predictor=predictor, meta_llm=meta_llm, task=task, config=config)

@@ -253,7 +255,7 @@ def get_exemplar_selector(
def get_exemplar_selector(
name: Literal["random", "random_search"], task: "BaseTask", predictor: "BasePredictor"
) -> "BaseExemplarSelector":
"""Factory function to get an exemplar selector based on the given name.
"""Get an exemplar selector based on the given name.

Args:
name (str): The name of the exemplar selector to instantiate.
@@ -274,8 +276,10 @@ def get_exemplar_selector(
raise ValueError(f"Unknown exemplar selector: {name}")


def get_predictor(downstream_llm=None, type: "PredictorType" = "marker", *args, **kwargs) -> "BasePredictor":
"""Factory function to create and return a predictor instance.
def get_predictor(
downstream_llm: Optional["BaseLLM"] = None, type: "PredictorType" = "marker", *args, **kwargs
) -> "BasePredictor":
"""Create and return a predictor instance.

This function supports three types of predictors:
1. FirstOccurrencePredictor: A predictor that classifies based on first occurrence of the label.
@@ -292,6 +296,7 @@ def get_predictor(downstream_llm=None, type: "PredictorType" = "marker", *args,
Returns:
An instance of FirstOccurrencePredictor or MarkerBasedPredictor.
"""
assert downstream_llm is not None, "downstream_llm must be provided to create a predictor."
if type == "first_occurrence":
return FirstOccurrencePredictor(downstream_llm, *args, **kwargs)
elif type == "marker":
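Putting the updated factories together, here is a hedged sketch of how the new `"capoeira"` branch might be driven end to end. Only the factory signatures come from the helpers.py diff above; the model id, dataframe columns, import locations, and the final `optimize` call are placeholders rather than confirmed API.

```python
# Sketch under assumptions: model id, dataframe columns, and the optimize()
# entry point are illustrative; only the factory signatures follow the diff.
import pandas as pd

from promptolution.helpers import get_llm, get_optimizer, get_predictor, get_task
from promptolution.utils import ExperimentConfig

df = pd.DataFrame({"x": ["sample input"], "y": ["label"]})
config = ExperimentConfig(optimizer="capoeira", task_description="Classify the input.")

llm = get_llm("your-model-id", config=config)   # API, vLLM, or local backend
predictor = get_predictor(downstream_llm=llm)   # marker-based predictor by default
task = get_task(df, config, judge_llm=llm)
optimizer = get_optimizer(
    predictor=predictor,
    meta_llm=llm,
    task=task,
    optimizer="capoeira",
    config=config,
)
prompts = optimizer.optimize(n_steps=10)        # assumed BaseOptimizer entry point
```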
2 changes: 1 addition & 1 deletion promptolution/llms/api_llm.py
@@ -210,7 +210,7 @@ def _submit(self, coro):
return asyncio.run_coroutine_threadsafe(coro, self._loop)

def _get_response(self, prompts: List[str], system_prompts: List[str]) -> List[str]:
"""Synchronously obtain responses for a batch of prompts.
"""Obtain responses synchronously for a batch of prompts.

This is the main entrypoint used by external callers. It handles system
prompt broadcasting and delegates the actual work to the async batch
32 changes: 16 additions & 16 deletions promptolution/llms/vllm.py
@@ -13,8 +13,8 @@
logger = get_logger(__name__)

try:
from transformers import AutoTokenizer # type: ignore
from vllm import LLM, SamplingParams
from vllm import LLM
from vllm.sampling_params import SamplingParams

imports_successful = True
except ImportError:
@@ -113,23 +113,23 @@ def __init__(

self.llm = LLM(**llm_params)

# Initialize tokenizer separately for potential pre-processing
self.tokenizer = AutoTokenizer.from_pretrained(model_id)
self.tokenizer = self.llm.get_tokenizer()

if batch_size is None:
cache_config = self.llm.llm_engine.model_executor.cache_config
if (
cache_config.num_gpu_blocks is not None
and cache_config.block_size is not None
and self.max_model_len is not None
):
self.batch_size = int(
(cache_config.num_gpu_blocks * cache_config.block_size / self.max_model_len) * 0.95
)
logger.info(f"🚀 Batch size set to {self.batch_size} based on GPU memory.")
max_num_seqs = int(llm_kwargs.get("max_num_seqs", 1))
max_num_batched_tokens = llm_kwargs.get("max_num_batched_tokens", None)

# Heuristic: if vLLM is capped by batched tokens, don't feed more seqs than fit.
if max_num_batched_tokens is not None and self.max_model_len is not None:
token_limited = max(1, int(max_num_batched_tokens) // int(self.max_model_len))
self.batch_size = max(1, min(max_num_seqs, token_limited))
else:
self.batch_size = 1
logger.warning("⚠️ Could not determine batch size from GPU memory. Using batch size of 1.")
self.batch_size = max(1, max_num_seqs)

logger.info(
f"🚀 Batch size set to {self.batch_size} (max_num_seqs={max_num_seqs}, "
f"max_num_batched_tokens={max_num_batched_tokens}, max_model_len={self.max_model_len})."
)
else:
self.batch_size = batch_size

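The replacement batch-size fallback no longer probes the GPU cache config; it derives a cap from vLLM's own scheduler limits instead. A small worked example of that heuristic, with invented numbers:

```python
# Worked example of the heuristic above; all numbers are made up.
max_num_seqs = 64              # vLLM limit on concurrently scheduled sequences
max_num_batched_tokens = 8192  # optional vLLM token budget per scheduler step
max_model_len = 2048

# If a token budget is set, cap the batch so each sequence could use the
# full context window without exceeding that budget.
token_limited = max(1, max_num_batched_tokens // max_model_len)  # 8192 // 2048 = 4
batch_size = max(1, min(max_num_seqs, token_limited))            # -> 4

# Without a token budget, fall back to max_num_seqs.
batch_size_no_budget = max(1, max_num_seqs)                      # -> 64
```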
2 changes: 2 additions & 0 deletions promptolution/optimizers/__init__.py
@@ -1,12 +1,14 @@
"""Module for prompt optimizers."""

from promptolution.optimizers.capo import CAPO
from promptolution.optimizers.capoeira import Capoeira
from promptolution.optimizers.evoprompt_de import EvoPromptDE
from promptolution.optimizers.evoprompt_ga import EvoPromptGA
from promptolution.optimizers.opro import OPRO

__all__ = [
"CAPO",
"Capoeira",
"EvoPromptDE",
"EvoPromptGA",
"OPRO",
10 changes: 9 additions & 1 deletion promptolution/optimizers/base_optimizer.py
@@ -15,7 +15,7 @@

logger = get_logger(__name__)

OptimizerType = Literal["evopromptde", "evopromptga", "opro", "capo"]
OptimizerType = Literal["evopromptde", "evopromptga", "opro", "capo", "capoeira"]


class BaseOptimizer(ABC):
@@ -31,6 +31,8 @@ class BaseOptimizer(ABC):
predictor: The predictor used for prompt evaluation (if applicable).
"""

supports_multi_objective: bool = False

def __init__(
self,
predictor: "BasePredictor",
@@ -50,6 +52,12 @@ def __init__(
"""
# Set up optimizer state
self.prompts: List[Prompt] = [Prompt(p) for p in initial_prompts] if initial_prompts else []
if task.task_type == "multi" and not self.supports_multi_objective:
logger.warning(
f"{self.__class__.__name__} does not support multi-objective tasks; objectives will be averaged equally.",
)
task.activate_scalarized_objective()

self.task = task
self.callbacks: List["BaseCallback"] = callbacks or []
self.predictor = predictor
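With this change, an optimizer opts into multi-objective handling via the new `supports_multi_objective` class attribute; anything that leaves it at the default gets the scalarization fallback (objectives averaged) with a warning. A minimal sketch of that pattern, assuming subclassing works as the diff suggests; the concrete optimizers in this PR may declare it differently:

```python
# Sketch of the opt-in pattern implied by the base_optimizer.py diff above.
from promptolution.optimizers.base_optimizer import BaseOptimizer


class ScalarizedOptimizer(BaseOptimizer):
    """Default behaviour: multi-objective tasks are scalarized with a warning."""

    supports_multi_objective = False


class ParetoAwareOptimizer(BaseOptimizer):
    """Opting in: the task keeps its separate objectives."""

    supports_multi_objective = True
```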