@@ -66,7 +66,7 @@ def init_device(self):
6666 device = self .device ,
6767 rank = self .rank ,
6868 local_rank = self .local_rank )
69-
69+
7070 def graph_optimize_and_warm_up_model (self ) -> None :
7171 """
7272 Optimizes the inference graph using the specified optimization options.
@@ -86,9 +86,10 @@ def determine_available_memory(self) -> int:
8686 You may limit the usage of GPU memory
8787 by adjusting the `gpu_memory_utilization` parameter.
8888 """
89- from fastdeploy .model_executor .ops .xpu import \
90- xpu_get_free_global_memory , xpu_get_total_global_memory , xpu_get_used_global_memory
91-
89+ from fastdeploy .model_executor .ops .xpu import (
90+ xpu_get_free_global_memory , xpu_get_total_global_memory ,
91+ xpu_get_used_global_memory )
92+
9293 total_memory = xpu_get_total_global_memory (self .local_rank )
9394 used_memory = xpu_get_used_global_memory (self .local_rank )
9495 free_memory = xpu_get_free_global_memory (self .local_rank )
@@ -98,20 +99,20 @@ def determine_available_memory(self) -> int:
9899
99100 self .model_runner .prepare_profile ()
100101 self .model_runner .profile_run ()
101-
102+
102103 total_available_memory = int (total_memory * self .parallel_config .gpu_memory_utilization )
103104 used_memory = xpu_get_used_global_memory (self .local_rank )
104105 available_kv_cache_memory = total_available_memory - used_memory
105106 model_block_memory_used = self .cal_theortical_kvcache ()
106- available_kv_cache_memory += model_block_memory_used * self .parallel_config .max_block_num
107+ available_kv_cache_memory += model_block_memory_used * self .parallel_config .total_block_num
107108
108109 self .model_runner .clear_block_table ()
109110
110111 logger .info (f"After warm up, total_available_memory: { total_available_memory } , \
111112 used_memory: { used_memory } , available_kv_cache_memory: { available_kv_cache_memory } " )
112113 paddle .device .xpu .empty_cache ()
113114 return available_kv_cache_memory # approximate value
114-
115+
115116 def cal_theortical_kvcache (self ) -> int :
116117 """Return the result of ``self.model_runner.cal_theortical_kvcache()`` unchanged (presumably KV-cache bytes per block -- TODO confirm against the model runner)."""
117118 return self .model_runner .cal_theortical_kvcache ()
@@ -154,10 +155,6 @@ def check_health(self) -> bool:
154155 """ """
155156 return True
156157
157- def cal_theortical_kvcache (self ) -> int :
158- """ """
159- return self .model_runner .cal_theortical_kvcache ()
160-
161158 def reinitialize_kv_cache (self , num_gpu_blocks : int ) -> None :
162159 """ """
163160 self .model_runner .update_share_input_block_num (
0 commit comments