| Name | Description |
| :--- | :--- |
| `decoupled` | Controls streaming. Decoupled mode must be set to `True` if the client uses the streaming option. |
| `max_beam_width` | The maximum beam width that any request may ask for when using beam search. |
| `gpt_model_type` | Set to `inflight_fused_batching` to enable in-flight batching support. Set to `V1` to disable in-flight batching. |
| `gpt_model_path` | Path to the TensorRT-LLM engines for deployment. In this example, the path should be set to `/tensorrtllm_backend/triton_model_repo/tensorrt_llm/1`, as the `tensorrtllm_backend` directory will be mounted to `/tensorrtllm_backend` within the container. |
| `max_tokens_in_paged_kv_cache` | The maximum size of the KV cache, in number of tokens. |
| `max_attention_window_size` | When using techniques like sliding window attention, the maximum number of tokens attended to when generating one token. Defaults to the maximum sequence length. |
| `batch_scheduler_policy` | Set to `max_utilization` to greedily pack as many requests as possible into each in-flight batching iteration. This maximizes throughput but may incur overhead from request pause/resume if KV cache limits are reached during execution. Set to `guaranteed_no_evict` to guarantee that a started request is never paused. |
| `kv_cache_free_gpu_mem_fraction` | A number between 0 and 1 indicating the maximum fraction of GPU memory (after loading the model) that may be used for the KV cache. |
| `max_num_sequences` | The maximum number of sequences that the in-flight batching scheme can maintain state for. Defaults to `max_batch_size` if `enable_trt_overlap` is `false` and to `2 * max_batch_size` if `enable_trt_overlap` is `true`, where `max_batch_size` is the TRT engine maximum batch size. |
| `enable_trt_overlap` | Set to `true` to partition available requests into two 'microbatches' that can be run concurrently, hiding exposed CPU runtime. |
| `exclude_input_in_output` | Set to `true` to return only completion tokens in a response. Set to `false` to return the prompt tokens concatenated with the generated tokens. |
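
These settings correspond to entries in the `tensorrt_llm` model's `config.pbtxt`. Below is a minimal, hypothetical sketch of how a few of them might look once filled in: the parameter names come from the table above, while the specific values chosen here (`guaranteed_no_evict`, `0.9`, and so on) are illustrative assumptions, not the exact template shipped with the backend.

```
# Hypothetical excerpt from triton_model_repo/tensorrt_llm/config.pbtxt.
# Values mirror the example deployment described in the table above.

# `decoupled` must be true when clients use the streaming option.
model_transaction_policy {
  decoupled: true
}

# Backend-specific settings are passed as string-valued parameters.
parameters: {
  key: "gpt_model_type"
  value: { string_value: "inflight_fused_batching" }
}
parameters: {
  key: "gpt_model_path"
  value: { string_value: "/tensorrtllm_backend/triton_model_repo/tensorrt_llm/1" }
}
parameters: {
  key: "batch_scheduler_policy"
  value: { string_value: "guaranteed_no_evict" }  # never pause a started request
}
parameters: {
  key: "kv_cache_free_gpu_mem_fraction"
  value: { string_value: "0.9" }  # up to 90% of free GPU memory for KV cache
}
parameters: {
  key: "exclude_input_in_output"
  value: { string_value: "true" }  # return only the generated tokens
}
```

In practice these values are usually substituted into the template programmatically (the repository ships a `fill_template.py` tool for this purpose) rather than edited by hand.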