[BugFix] Fix Configs (PaddlePaddle#2849)

YuanRisheng · web-flow · commit 101ad333325e · 2025-07-15T19:50:36.000-07:00
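* fix config: rename the parallel config attribute `max_block_num` to `total_block_num` and update every worker / model-runner / spec-decode call site; add a `"moe_num_experts": None` entry to the defaults dict in fastdeploy/config.py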

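* fix config (cleanup): remove several imports from dcu_worker.py, drop the duplicated `cal_theortical_kvcache` definition in xpu_worker.py, re-wrap long imports, and strip trailing whitespace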
diff --git a/fastdeploy/config.py b/fastdeploy/config.py
@@ -58,6 +58,7 @@ class MoEPhase(Enum):
     "freq_allocation":20,
     "tie_word_embeddings":False,
     "rms_norm_eps":1e-5,
+    "moe_num_experts": None,
 }
 
 
@@ -143,7 +144,7 @@ def __init__(
         self.model_name_or_path: str = "./output"
         self.max_num_seqs: int = 34
         # Set default block num for profile run
-        self.max_block_num: int = 2000
+        self.total_block_num: int = 2000
         # block size
         self.block_size: int = 64
         # Engine worker queue port
diff --git a/fastdeploy/spec_decode/mtp.py b/fastdeploy/spec_decode/mtp.py
@@ -93,7 +93,7 @@ def dummy_prefill_inputs(self, num_tokens: int, batch_size: int,
                              expected_decode_len: int):
         """Set dummy prefill inputs to model_inputs"""
         max_dec_len = expected_decode_len + 1
-        self.num_gpu_blocks = self.parallel_config.max_block_num
+        self.num_gpu_blocks = self.parallel_config.total_block_num
         self.initialize_kv_cache()
         full_length = min(num_tokens // batch_size,
                           self.parallel_config.max_model_len - max_dec_len)
@@ -327,8 +327,8 @@ def _init_model_inputs(self):
 
         self.free_list = list(
             range(
-                self.parallel_config.max_block_num - 1,
-                int(self.parallel_config.max_block_num *
+                self.parallel_config.total_block_num - 1,
+                int(self.parallel_config.total_block_num *
                     self.parallel_config.kv_cache_ratio) - 1,
                 -1,
             ))
diff --git a/fastdeploy/worker/dcu_worker.py b/fastdeploy/worker/dcu_worker.py
@@ -13,18 +13,12 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """
-import gc
 import time
-from typing import List, Optional
 
 import paddle
-import paddle.nn as nn
 
 from fastdeploy.config import FDConfig
-from fastdeploy.engine.request import Request
 from fastdeploy.utils import get_logger
-from fastdeploy.worker.gpu_model_runner import GPUModelRunner
-from fastdeploy.worker.output import ModelRunnerOutput
 from fastdeploy.worker.gpu_worker import GpuWorker
 
 logger = get_logger("dcu_worker", "dcu_worker.log")
@@ -97,7 +91,7 @@ def determine_available_memory(self) -> int:
         paddle_peak_increase = paddle_reserved_mem_after_run - paddle_allocated_mem_before_run
         available_kv_cache_memory = total_gpu_memory * \
             self.parallel_config.gpu_memory_utilization - after_used_gpu_memory - paddle_peak_increase
-        available_kv_cache_memory += model_block_memory_used * self.parallel_config.max_block_num
+        available_kv_cache_memory += model_block_memory_used * self.parallel_config.total_block_num
 
         end_time = time.perf_counter()
         logger.info(
diff --git a/fastdeploy/worker/gcu_model_runner.py b/fastdeploy/worker/gcu_model_runner.py
@@ -480,8 +480,8 @@ def _init_share_inputs(self, max_num_seqs: int):
         # Initialize free list
         free_list = list(
             range(
-                self.parallel_config.max_block_num - 1,
-                int(self.parallel_config.max_block_num *
+                self.parallel_config.total_block_num - 1,
+                int(self.parallel_config.total_block_num *
                     self.parallel_config.kv_cache_ratio) - 1, -1))
         self.free_list_len = len(free_list)
         self.share_inputs["free_list"] = paddle.to_tensor(free_list,
@@ -1114,7 +1114,7 @@ def profile_run(self) -> None:
         """Execute a forward pass with dummy inputs to profile the memory usage of the model."""
 
         # Initialize kv cache for profile run. After profile run kv cache will be reset.
-        self.num_gcu_blocks = self.parallel_config.max_block_num
+        self.num_gcu_blocks = self.parallel_config.total_block_num
         self.initialize_kv_cache()
 
         # 1. Profile with multimodal encoder & encoder cache
diff --git a/fastdeploy/worker/gpu_model_runner.py b/fastdeploy/worker/gpu_model_runner.py
@@ -591,8 +591,8 @@ def _init_share_inputs(self, max_num_seqs: int):
         # Initialize free list
         free_list = list(
             range(
-                self.parallel_config.max_block_num - 1,
-                int(self.parallel_config.max_block_num *
+                self.parallel_config.total_block_num - 1,
+                int(self.parallel_config.total_block_num *
                     self.parallel_config.kv_cache_ratio) - 1, -1))
         self.free_list_len = len(free_list)
         self.share_inputs["free_list"] = paddle.to_tensor(free_list,
@@ -1295,7 +1295,7 @@ def profile_run(self) -> None:
 
         # Initialize kv cache for profile run. After profile run kv cache will be reset.
         # TODO(gongshaotian): Optimize the management logic of kvcache
-        self.num_gpu_blocks = self.parallel_config.max_block_num
+        self.num_gpu_blocks = self.parallel_config.total_block_num
         self.initialize_kv_cache()
 
         # 1. Profile with multimodal encoder & encoder cache
diff --git a/fastdeploy/worker/gpu_worker.py b/fastdeploy/worker/gpu_worker.py
@@ -61,7 +61,8 @@ def init_device(self):
             gc.collect()
             paddle.device.cuda.empty_cache()
             if self.parallel_config.enable_custom_all_reduce:
-                from fastdeploy.distributed.communication_op import use_custom_allreduce
+                from fastdeploy.distributed.communication_op import \
+                    use_custom_allreduce
                 use_custom_allreduce()
         else:
             raise RuntimeError(
@@ -137,7 +138,7 @@ def determine_available_memory(self) -> int:
 
         available_kv_cache_memory = after_run_meminfo.total * \
             self.parallel_config.gpu_memory_utilization - after_run_meminfo.used - paddle_peak_increase
-        available_kv_cache_memory += model_block_memory_used * self.parallel_config.max_block_num
+        available_kv_cache_memory += model_block_memory_used * self.parallel_config.total_block_num
 
         end_time = time.perf_counter()
         logger.info((
diff --git a/fastdeploy/worker/iluvatar_model_runner.py b/fastdeploy/worker/iluvatar_model_runner.py
@@ -468,8 +468,8 @@ def _init_share_inputs(self, max_num_seqs: int):
         # Initialize free list
         free_list = list(
             range(
-                self.parallel_config.max_block_num - 1,
-                int(self.parallel_config.max_block_num *
+                self.parallel_config.total_block_num - 1,
+                int(self.parallel_config.total_block_num *
                     self.parallel_config.kv_cache_ratio) - 1, -1))
         self.free_list_len = len(free_list)
         self.share_inputs["free_list"] = paddle.to_tensor(free_list,
@@ -1069,7 +1069,7 @@ def profile_run(self) -> None:
 
         # Initialize kv cache for profile run. After profile run kv cache will be reset.
         # TODO(gongshaotian): Optimize the management logic of kvcache
-        self.num_gpu_blocks = self.parallel_config.max_block_num
+        self.num_gpu_blocks = self.parallel_config.total_block_num
         self.initialize_kv_cache()
 
         # 1. Profile with multimodal encoder & encoder cache
diff --git a/fastdeploy/worker/worker_process.py b/fastdeploy/worker/worker_process.py
@@ -372,7 +372,7 @@ def determine_num_available_blocks(self) -> None:
             self.get_profile_block_num_signal.value[
                 self.local_rank] = num_blocks_global
         else:
-            num_blocks_global = self.fd_config.parallel_config.max_block_num
+            num_blocks_global = self.fd_config.parallel_config.total_block_num
         # NOTE(liuzichang): Too big num_blocks_global will lead to error 700
         # 4. Updata share inputs
         self.worker.reinitialize_kv_cache(num_gpu_blocks=num_blocks_global)
diff --git a/fastdeploy/worker/xpu_model_runner.py b/fastdeploy/worker/xpu_model_runner.py
@@ -479,8 +479,8 @@ def _init_share_inputs(self, max_num_seqs: int):
         # Initialize free list
         free_list = list(
             range(
-                self.parallel_config.max_block_num - 1,
-                int(self.parallel_config.max_block_num *
+                self.parallel_config.total_block_num - 1,
+                int(self.parallel_config.total_block_num *
                     self.parallel_config.kv_cache_ratio) - 1, -1))
         self.free_list_len = len(free_list)
         self.share_inputs["free_list"] = paddle.to_tensor(free_list,
@@ -757,7 +757,7 @@ class at the server level, which is too granular for ModelRunner.
     def prepare_profile(self) -> None:
         """Prepare the profile run by setting the block number and initializing the KV cache."""
         paddle.device.xpu.empty_cache()
-        self.num_gpu_blocks = self.parallel_config.max_block_num
+        self.num_gpu_blocks = self.parallel_config.total_block_num
         self.initialize_kv_cache()
 
     def profile_run(self) -> None:
diff --git a/fastdeploy/worker/xpu_worker.py b/fastdeploy/worker/xpu_worker.py
@@ -66,7 +66,7 @@ def init_device(self):
             device=self.device,
             rank=self.rank,
             local_rank=self.local_rank)
-        
+
     def graph_optimize_and_warm_up_model(self) -> None:
         """
             Optimizes the inference graph using the specified optimization options.
@@ -86,9 +86,10 @@ def determine_available_memory(self) -> int:
             You may limit the usage of GPU memory
             by adjusting the `gpu_memory_utilization` parameter.
         """
-        from fastdeploy.model_executor.ops.xpu import \
-            xpu_get_free_global_memory, xpu_get_total_global_memory, xpu_get_used_global_memory
-        
+        from fastdeploy.model_executor.ops.xpu import (
+            xpu_get_free_global_memory, xpu_get_total_global_memory,
+            xpu_get_used_global_memory)
+
         total_memory = xpu_get_total_global_memory(self.local_rank)
         used_memory = xpu_get_used_global_memory(self.local_rank)
         free_memory = xpu_get_free_global_memory(self.local_rank)
@@ -98,20 +99,20 @@ def determine_available_memory(self) -> int:
 
         self.model_runner.prepare_profile()
         self.model_runner.profile_run()
-        
+
         total_available_memory = int(total_memory * self.parallel_config.gpu_memory_utilization)
         used_memory = xpu_get_used_global_memory(self.local_rank)
         available_kv_cache_memory = total_available_memory - used_memory
         model_block_memory_used = self.cal_theortical_kvcache()
-        available_kv_cache_memory += model_block_memory_used * self.parallel_config.max_block_num
+        available_kv_cache_memory += model_block_memory_used * self.parallel_config.total_block_num
 
         self.model_runner.clear_block_table()
 
         logger.info(f"After warm up, total_available_memory: {total_available_memory}, \
                     used_memory: {used_memory}, available_kv_cache_memory: {available_kv_cache_memory}")
         paddle.device.xpu.empty_cache()
         return available_kv_cache_memory  # approximate value
-    
+
     def cal_theortical_kvcache(self) -> int:
         """ """
         return self.model_runner.cal_theortical_kvcache()
@@ -154,10 +155,6 @@ def check_health(self) -> bool:
         """ """
         return True
 
-    def cal_theortical_kvcache(self) -> int:
-        """ """
-        return self.model_runner.cal_theortical_kvcache()
-
     def reinitialize_kv_cache(self, num_gpu_blocks: int) -> None:
         """ """
         self.model_runner.update_share_input_block_num(