Skip to content

Commit 1da363e

Browse files
authored
add share_gpt benchmarking results
2 parents 9da1033 + 76117cf commit 1da363e

File tree

8 files changed

+171
-35
lines changed

8 files changed

+171
-35
lines changed

README.md

Lines changed: 13 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -17,16 +17,10 @@ The speculative sampling is proposed by Google and Deepmind independently. So I
1717
You need prepare a pair of models using the same embedding and vocabulary. The approximation model should be smaller than the target model. Here are some
1818
tested model pairs.
1919

20-
<center>
21-
22-
| Approx Model | Target Model |
23-
|--------------|--------------|
24-
| [bloomz-7b1](https://huggingface.co/bigscience/bloomz-7b1/tree/main) | [bloom-560m](https://huggingface.co/bigscience/bloom-560m/tree/main) |
25-
| [TinyLlama-1.1B](https://huggingface.co/PY007/TinyLlama-1.1B-step-50K-105b) | llama-7b |
2620

2721
</center>
2822

29-
In the sample, I use [bloomz-7b1](https://huggingface.co/bigscience/bloomz-7b1/tree/main) as the target model, [bloom-560m](https://huggingface.co/bigscience/bloom-560m/tree/main) as the approximation model.
23+
In the sample, we demonstrate [bloomz-7b1](https://huggingface.co/bigscience/bloomz-7b1/tree/main) as the target model, [bloom-560m](https://huggingface.co/bigscience/bloom-560m/tree/main) as the approximation model.
3024

3125
```bash
3226
python main.py \
@@ -35,10 +29,21 @@ python main.py \
3529
--approx_model_name bigscience/bloom-560m
3630
```
3731

38-
You can also use `--v` args to see a token is generated by which model.
32+
You can also use the `-v` argument to see which model generated each token.
3933

4034
![example image](./imgs/sps.jpg "console output")
4135

36+
I recommend using llama2-7B and llama2-70B as the approximation and target models respectively. I did observe a speedup in this case, as shown in the following.
37+
Note the choice of approx model and target model are essential for the speedup. The speedup will not be observed in the following cases:
38+
If the models are both small ones, the speedup will not be observed since the speed differences are not significant.
39+
If the model size difference is too large, more rejections and resampling will occur.
40+
Also, the sampling logic is not efficient enough. I noticed substantial overhead in Softmax and Layernorm. I will try to optimize it in the future.
41+
Do not hesitate to open an issue with ideas on performance improvements.
42+
43+
| | llama2-7b | llama2-70b | Speculative |
44+
|--------------|:--------------:|:--------------:|:--------------:|
45+
| speed(tokens/sec) | 1084.86 | 329.83 | 427.02 |
46+
4247
### Serving
4348
Start an inference server.
4449
```bash

benchmark.py

Lines changed: 111 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,111 @@
1+
2+
import torch
3+
import argparse
4+
import contexttimer
5+
from colorama import Fore, Style
6+
from transformers import AutoTokenizer, AutoModelForCausalLM
7+
8+
from sampling import autoregressive_sampling, speculative_sampling, speculative_sampling_v2
9+
from globals import Decoder
10+
import json
11+
from tqdm import tqdm
12+
13+
# my local models
MODELZOO = {
    # llama-1
    # https://huggingface.co/PY007/TinyLlama-1.1B-step-50K-105b
    "llama1b": "/share_nfs/fangjiarui/root/code/hf_models/TinyLlama-1.1B-step-50K-105b",
    "llama7b": "/share_nfs/tianzhi/code/llama-7b",
    "llama30b": "/share_nfs/fangjiarui/root/code/hf_models/llama-30b-hf",
    "llama2-7b": "/share_nfs/fangjiarui/root/code/hf_models/llama-2-7b-hf",
    "llama2-70b": "/share_nfs/fangjiarui/root/code/hf_models/llama-2-70b-hf",
    "bloom-560m": "/share_nfs/fangjiarui/root/code/hf_models/bloom-560m",
    "bloom7b": "/share_nfs/fangjiarui/root/code/hf_models/bloomz-7b1",
    "baichuan-7b": "/share_nfs/duanqiyuan/models/source_models/hf/baichuan-7B",
    "baichuan-13b": "/share_nfs/duanqiyuan/models/source_models/hf/Baichuan-13B-Base",
}


def parse_arguments():
    """Build and parse the command-line arguments for this script.

    Returns the argparse.Namespace with input text, model names/paths,
    verbosity/benchmark/profiling flags, token budget, gamma and seed.
    """
    cli = argparse.ArgumentParser(description='args for main.py')

    # prompt and model pair (defaults point at local NFS checkouts)
    cli.add_argument('--input', type=str, default="Suggest at least five related search terms to \"Mạng neural nhân tạo\".")
    cli.add_argument('--approx_model_name', type=str, default=MODELZOO["llama2-7b"])
    cli.add_argument('--target_model_name', type=str, default=MODELZOO["llama2-70b"])
    # runtime switches
    cli.add_argument('--verbose', '-v', action='store_true', default=False, help='enable verbose mode')
    cli.add_argument('--seed', '-s', type=int, default=None, help='set a random seed, which can makes the result reproducible')
    cli.add_argument('--benchmark', '-b', action='store_true', default=False, help='show benchmark results.')
    cli.add_argument('--profiling', '-p', action='store_true', default=False, help='collect torch profiler results.')
    # generation parameters
    cli.add_argument('--max_tokens', '-M', type=int, default=20, help='max token number generated.')
    cli.add_argument('--gamma', '-g', type=int, default=4, help='guess time.')
    return cli.parse_args()
42+
43+
44+
def benchmark(fn, info, *args, **kwargs):
    """Benchmark a sampling function on prompts from the ShareGPT dataset.

    Args:
        fn: sampling callable, invoked as fn(input_ids, *args, **kwargs);
            expected to return the generated token ids with the prompt prefix
            included (matches autoregressive_sampling / speculative_sampling).
        info: label shown in the progress bar and the printed summary.

    Reads up to `test_sample_num` prompts from a hard-coded ShareGPT jsonl
    file and prints the measured generation throughput in tokens/sec.
    """
    test_sample_num = 5
    processed = 0
    with contexttimer.Timer() as t:
        total_tokens = 0
        with open('/share_nfs/fangjiarui/root/code/datasets/share_gpt.jsonl', 'r') as file:
            with tqdm(total=test_sample_num, desc=f"{info} benchmarking") as pbar:
                for line in file:
                    data = json.loads(line)
                    for obj in data:
                        content = obj["content"]
                        input_ids = Decoder().encode(content, return_tensors='pt').to('cuda')
                        # skip prompts that exceed the model context window
                        if len(input_ids[0]) > 2048:
                            continue
                        output_ids = fn(input_ids, *args, **kwargs)
                        # BUGFIX: count generated *tokens*, not characters.
                        # The original did len(generated_text) - len(input_ids),
                        # which subtracts the batch size (1) from the decoded
                        # string's character count.
                        total_tokens += len(output_ids[0]) - len(input_ids[0])
                        processed += 1
                        # BUGFIX: advance the bar per sample (was per file line)
                        pbar.update(1)
                        # BUGFIX: stop at exactly test_sample_num samples (the
                        # original ran one extra) and stop reading further lines
                        if processed >= test_sample_num:
                            break
                    if processed >= test_sample_num:
                        break
    print(f"\n [benchmark] {info} tokens/sec: {total_tokens / t.elapsed}, {t.elapsed} sec generates {total_tokens} tokens")
72+
73+
def generate(input_text, approx_model_name, target_model_name, num_tokens=100, gamma = 4,
             random_seed = None):
    """Load the approx/target model pair and benchmark three sampling strategies.

    Args:
        input_text: kept for interface compatibility; the benchmark draws its
            prompts from the ShareGPT dataset, not from this text.
        approx_model_name: HF name/path of the small (draft) model.
        target_model_name: HF name/path of the large (target) model.
        num_tokens: max number of tokens to generate per prompt.
        gamma: number of draft tokens guessed per speculative step.
        random_seed: seed forwarded to speculative_sampling for reproducibility.
    """
    # NOTE() approx_model_name and target_model_name should use the same tokenizer!
    tokenizer = AutoTokenizer.from_pretrained(approx_model_name, trust_remote_code=True)
    Decoder().set_tokenizer(tokenizer)

    print(f"begin loading models: \n {approx_model_name} \n {target_model_name}")
    small_model = AutoModelForCausalLM.from_pretrained(approx_model_name,
                                                       torch_dtype=torch.float16,
                                                       device_map="auto",
                                                       trust_remote_code=True)
    large_model = AutoModelForCausalLM.from_pretrained(target_model_name,
                                                       torch_dtype=torch.float16,
                                                       device_map="auto",
                                                       trust_remote_code=True)
    print("finish loading models")

    # NOTE: the original also encoded input_text here, but the result was never
    # used (benchmark() encodes its own dataset prompts), so that dead code and
    # the torch_device it relied on were removed.

    top_k = 20
    top_p = 0.9

    # Re-seed before each run so all three strategies draw from the same
    # random stream and are comparable.
    torch.manual_seed(123)
    benchmark(autoregressive_sampling, "AS_large", large_model, num_tokens, top_k = top_k, top_p=top_p)

    torch.manual_seed(123)
    benchmark(autoregressive_sampling, "AS_small", small_model, num_tokens, top_k = top_k, top_p=top_p)

    torch.manual_seed(123)
    benchmark(speculative_sampling, "SP", small_model, large_model, max_len = num_tokens, gamma = gamma, top_k = top_k, top_p=top_p, random_seed = random_seed)
107+
108+
if __name__ == "__main__":
    args = parse_arguments()
    # BUGFIX(consistency with main.py): --seed was parsed but never forwarded,
    # so -s had no effect; pass it through to make runs reproducible.
    generate(args.input, args.approx_model_name, args.target_model_name,
             num_tokens=args.max_tokens, gamma=args.gamma, random_seed=args.seed)

globals.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,5 +15,8 @@ def __init__(self):
1515
def set_tokenizer(self, tokenizer):
1616
self.tokenizer = tokenizer
1717

18+
def encode(self, s: str, return_tensors='pt') -> torch.Tensor:
19+
return self.tokenizer.encode(s, return_tensors=return_tensors)
20+
1821
def decode(self, t: torch.Tensor) -> str:
1922
return self.tokenizer.decode(t[0], skip_special_tokens=True)

main.py

Lines changed: 32 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -2,20 +2,24 @@
22
import torch
33
import argparse
44
import contexttimer
5-
5+
from colorama import Fore, Style
66
from transformers import AutoTokenizer, AutoModelForCausalLM
7-
from torch.profiler import ProfilerActivity
87

98
from sampling import autoregressive_sampling, speculative_sampling, speculative_sampling_v2
109
from globals import Decoder
1110

11+
12+
13+
1214
# my local models
1315
MODELZOO = {
16+
# llama-1
1417
# https://huggingface.co/PY007/TinyLlama-1.1B-step-50K-105b
1518
"llama1b": "/share_nfs/fangjiarui/root/code/hf_models/TinyLlama-1.1B-step-50K-105b",
1619
"llama7b": "/share_nfs/tianzhi/code/llama-7b",
17-
# https://huggingface.co/huggyllama/llama-13b
18-
"llama13b": None,
20+
"llama30b": "/share_nfs/fangjiarui/root/code/hf_models/llama-30b-hf",
21+
"llama2-7b" : "/share_nfs/fangjiarui/root/code/hf_models/llama-2-7b-hf",
22+
"llama2-70b" : "/share_nfs/fangjiarui/root/code/hf_models/llama-2-70b-hf",
1923
"bloom-560m": "/share_nfs/fangjiarui/root/code/hf_models/bloom-560m",
2024
"bloom7b": "/share_nfs/fangjiarui/root/code/hf_models/bloomz-7b1",
2125
"baichuan-7b": "/share_nfs/duanqiyuan/models/source_models/hf/baichuan-7B",
@@ -25,15 +29,22 @@
2529
def parse_arguments():
2630
parser = argparse.ArgumentParser(description='args for main.py')
2731

28-
parser.add_argument('--input', type=str, default="Suggest at least five related search terms to \"Mạng neural nhân tạo\".")
29-
parser.add_argument('--approx_model_name', type=str, default=MODELZOO["bloom-560m"])
30-
parser.add_argument('--target_model_name', type=str, default=MODELZOO["bloom7b"])
32+
parser.add_argument('--input', type=str, default="Any recommendations for my holidays in Abu Dhabi?")
33+
parser.add_argument('--approx_model_name', type=str, default=MODELZOO["llama2-7b"])
34+
parser.add_argument('--target_model_name', type=str, default=MODELZOO["llama2-70b"])
3135
parser.add_argument('--verbose', '-v', action='store_true', default=False, help='enable verbose mode')
32-
parser.add_argument('--seed', '-s', type=int, default=None, help='set a random seed')
36+
parser.add_argument('--seed', '-s', type=int, default=None, help='set a random seed, which can makes the result reproducible')
37+
parser.add_argument('--benchmark', '-b', action='store_true', default=False, help='show benchmark results.')
38+
parser.add_argument('--profiling', '-p', action='store_true', default=False, help='collect torch profiler results.')
39+
parser.add_argument('--max_tokens', '-M', type=int, default=20, help='max token number generated.')
40+
parser.add_argument('--gamma', '-g', type=int, default=4, help='guess time.')
3341
args = parser.parse_args()
3442
return args
3543

3644

45+
def color_print(text):
46+
print(Fore.RED + text + Style.RESET_ALL)
47+
3748
def benchmark(fn, print_prefix, use_profiler=True, *args, **kwargs):
3849
TEST_TIME = 10
3950
profile_filename = f"./profile_logs/{print_prefix}"
@@ -57,7 +68,8 @@ def benchmark(fn, print_prefix, use_profiler=True, *args, **kwargs):
5768

5869
print(f"\n [benchmark] {print_prefix}, tokens/sec: {len(output[0]) / t.elapsed / TEST_TIME}, {t.elapsed / TEST_TIME} sec generates {len(output[0])} tokens")
5970

60-
def generate(input_text, approx_model_name, target_model_name, num_tokens=40, random_seed = None, verbose = False, use_benchmark = False):
71+
def generate(input_text, approx_model_name, target_model_name, num_tokens=20, gamma = 4,
72+
random_seed = None, verbose = False, use_benchmark = False, use_profiling = False):
6173
# NOTE() approx_model_name and target_model_name should use the same tokenizer!
6274

6375
torch_device = 'cuda' if torch.cuda.is_available() else 'cpu'
@@ -85,37 +97,37 @@ def generate(input_text, approx_model_name, target_model_name, num_tokens=40, ra
8597
torch.manual_seed(123)
8698
output = autoregressive_sampling(input_ids, large_model, num_tokens, top_k = top_k, top_p=top_p)
8799
generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
88-
print(f"large (target) model autoregressive_sampling: {generated_text}")
100+
color_print(f"large (target) model autoregressive_sampling: {generated_text}")
89101

90-
TEST_TIME = 10
91102
if use_benchmark:
92-
benchmark(autoregressive_sampling, "AS_large", True,
103+
benchmark(autoregressive_sampling, "AS_large", use_profiling,
93104
input_ids, large_model, num_tokens, top_k = top_k, top_p=top_p)
94105

95106
torch.manual_seed(123)
96107
output = autoregressive_sampling(input_ids, small_model, num_tokens, top_k = top_k, top_p=top_p)
97108
generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
98-
print(f"small (approx) model autoregressive_sampling: {generated_text}")
109+
color_print(f"small (approx) model autoregressive_sampling: {generated_text}")
99110

100111
if use_benchmark:
101-
benchmark(autoregressive_sampling, "AS_small", True,
112+
benchmark(autoregressive_sampling, "AS_small", use_profiling,
102113
input_ids, small_model, num_tokens, top_k = top_k, top_p=top_p)
103114

104115
torch.manual_seed(123)
105116
output = speculative_sampling_v2(input_ids, small_model, large_model, num_tokens, top_k = top_k, top_p=top_p, random_seed = random_seed)
106117
generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
107-
print(f"deepmind's speculative_sampling: {generated_text}")
118+
color_print(f"deepmind's speculative_sampling: {generated_text}")
108119

109120
torch.manual_seed(123)
110-
output = speculative_sampling(input_ids, small_model, large_model, num_tokens, top_k = top_k, top_p=top_p, random_seed = random_seed, verbose = verbose)
121+
output = speculative_sampling(input_ids, small_model, large_model, num_tokens, gamma = gamma, top_k = top_k, top_p=top_p, random_seed = random_seed, verbose = verbose)
111122
generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
112-
print(f"google's speculative_sampling: {generated_text}")
123+
color_print(f"google's speculative_sampling: {generated_text}")
113124

114125
if use_benchmark:
115-
benchmark(speculative_sampling, "SP", True,
116-
input_ids, small_model, large_model, max_len = num_tokens, top_k = top_k, top_p=top_p, random_seed = random_seed)
126+
benchmark(speculative_sampling, "SP", use_profiling,
127+
input_ids, small_model, large_model, max_len = num_tokens, gamma = gamma, top_k = top_k, top_p=top_p, random_seed = random_seed)
117128

118129
if __name__ == "__main__":
119130
args = parse_arguments()
120131

121-
generate(args.input, args.approx_model_name, args.target_model_name, random_seed = args.seed, verbose=args.verbose)
132+
generate(args.input, args.approx_model_name, args.target_model_name, num_tokens=args.max_tokens, gamma=args.gamma,
133+
random_seed = args.seed, verbose=args.verbose, use_benchmark = args.benchmark)

requirements.txt

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,4 +2,5 @@ transformers==4.33.2
22
torch==2.0.1
33
contexttimer
44
flask
5-
transformers_stream_generator
5+
transformers_stream_generator
6+
colorama

sampling/kvcache_model.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,6 @@ def _forward_with_kvcache(self, input_ids : torch.Tensor, use_debug = True) -> t
3030
self._prob_history = outputs.logits
3131
for i in range(self._prob_history.shape[-2]):
3232
self._prob_history[:, i, :] = norm_logits(self._prob_history[:, i, :], self._temperature, self._top_k, self._top_p)
33-
# self._prob_history[:, -1, :] = norm_logits(self._prob_history[:, -1, :], self._temperature, self._top_k, self._top_p)
3433
self._past_key_values = outputs.past_key_values
3534
last_q = self._prob_history[:, -1, :]
3635
else:

sampling/speculative_sampling.py

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,10 @@ def speculative_sampling(prefix : torch.Tensor, approx_model : torch.nn.Module,
4141
approx_model_cache = KVCacheModel(approx_model, temperature, top_k, top_p)
4242
target_model_cache = KVCacheModel(target_model, temperature, top_k, top_p)
4343

44+
resample_count = 0
45+
target_sample_count = 0
46+
accepted_count = 0
47+
4448
while prefix.shape[1] < T:
4549
# q = M_q[prefix + x_0, x_1, .., x_(gamma-2)]
4650
prefix_len = prefix.shape[1]
@@ -64,6 +68,8 @@ def speculative_sampling(prefix : torch.Tensor, approx_model : torch.nn.Module,
6468

6569
if verbose:
6670
print(f"approx guess accepted {j[0]}: \033[31m{Decoder().decode(torch.tensor([j]))}\033[0m")
71+
72+
accepted_count += 1
6773

6874
# print(f"n : {n}, i : {i}, prefix_len + gamma - 1: {prefix_len + gamma - 1}")
6975
assert n >= prefix_len - 1, f"n {n}, prefix_len {prefix_len}"
@@ -78,20 +84,22 @@ def speculative_sampling(prefix : torch.Tensor, approx_model : torch.nn.Module,
7884
t = sample(max_fn(target_model_cache._prob_history[:, n, :] - approx_model_cache._prob_history[:, n, :]))
7985
if verbose:
8086
print(f"target resamples at position {n}: \033[34m{Decoder().decode(t)}\033[0m")
81-
87+
resample_count += 1
8288
target_model_cache.rollback(n+1)
8389
else:
8490
# all approx model decoding accepted
8591
assert n == target_model_cache._prob_history.shape[1] - 1
8692
t = sample(target_model_cache._prob_history[:, -1, :])
8793
if verbose:
8894
print(f"target samples {n}: \033[35m{Decoder().decode(t)}\033[0m")
95+
target_sample_count += 1
8996
target_model_cache.rollback(n+2)
9097

9198

9299
prefix = torch.cat((prefix, t), dim=1)
93100

94-
101+
if verbose:
102+
print(f"generated tokens numbers {prefix.shape[-1] - seq_len}, accepted_count {accepted_count}, target_sample_count {target_sample_count}, resample_count {resample_count}")
95103
return prefix
96104

97105

serving.py

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -55,9 +55,6 @@ def predict():
5555
return jsonify(result)
5656

5757
if __name__ == '__main__':
58-
# Load the model
59-
# load_model("/share_nfs/fangjiarui/root/code/hf_models/bloom-560m")
60-
6158
GLOBAL_SERVER = Server(approx_model_name="/share_nfs/fangjiarui/root/code/hf_models/bloom-560m",
6259
target_model_name="/share_nfs/fangjiarui/root/code/hf_models/bloomz-7b1")
6360
# Start the Flask service

0 commit comments

Comments
 (0)