diff --git a/fastdeploy/__init__.py b/fastdeploy/__init__.py
index e54957bfa51..7600d660734 100644
--- a/fastdeploy/__init__.py
+++ b/fastdeploy/__init__.py
@@ -54,7 +54,8 @@
     get_version_info,
 )
 
-paddle.compat.enable_torch_proxy(scope={"triton"})
+if hasattr(paddle, "compat") and hasattr(paddle.compat, "enable_torch_proxy"):
+    paddle.compat.enable_torch_proxy(scope={"triton"})
 # paddle.compat.enable_torch_proxy(scope={"triton"}) enables the torch proxy
 # specifically for the 'triton' module. This means `import torch` inside 'triton'
 # will actually import paddle's compatibility layer (acting as torch).
diff --git a/fastdeploy/model_executor/layers/attention/flash_attn_backend.py b/fastdeploy/model_executor/layers/attention/flash_attn_backend.py
index c384a9d2810..da62a4f4f7f 100644
--- a/fastdeploy/model_executor/layers/attention/flash_attn_backend.py
+++ b/fastdeploy/model_executor/layers/attention/flash_attn_backend.py
@@ -57,7 +57,9 @@
 
 from fastdeploy.platforms import current_platform
 
-paddle.compat.enable_torch_proxy(scope={"cutlass"})
+if hasattr(paddle, "compat") and hasattr(paddle.compat, "enable_torch_proxy"):
+    # Guard against environments where paddle.compat or enable_torch_proxy is missing
+    paddle.compat.enable_torch_proxy(scope={"cutlass"})
 
 flashmask_attention_v4 = None
 if current_platform.is_cuda():
diff --git a/fastdeploy/model_executor/layers/moe/ep.py b/fastdeploy/model_executor/layers/moe/ep.py
index 4489f5ec089..249e7936ba5 100644
--- a/fastdeploy/model_executor/layers/moe/ep.py
+++ b/fastdeploy/model_executor/layers/moe/ep.py
@@ -40,7 +40,8 @@ def load_deep_ep() -> ModuleType:
     try:
         if envs.FD_USE_PFCC_DEEP_EP:
             # Enable torch proxy before importing deep_ep (required by PFCC/PaddleFleet variants)
-            paddle.compat.enable_torch_proxy(scope={"deep_ep"})
+            if hasattr(paddle, "compat") and hasattr(paddle.compat, "enable_torch_proxy"):
+                paddle.compat.enable_torch_proxy(scope={"deep_ep"})
             try:
                 import paddlefleet.ops.deep_ep as deep_ep  # type: ignore
 
diff --git a/fastdeploy/model_executor/layers/quantization/fp8_utils.py b/fastdeploy/model_executor/layers/quantization/fp8_utils.py
index b7f00c37eea..d09ebce803b 100644
--- a/fastdeploy/model_executor/layers/quantization/fp8_utils.py
+++ b/fastdeploy/model_executor/layers/quantization/fp8_utils.py
@@ -33,7 +33,8 @@ def load_deep_gemm():
     if current_platform.is_cuda():
         if get_sm_version() == 100:
             # SM100 should use PFCC DeepGemm
-            paddle.compat.enable_torch_proxy(scope={"deep_gemm"})
+            if hasattr(paddle, "compat") and hasattr(paddle.compat, "enable_torch_proxy"):
+                paddle.compat.enable_torch_proxy(scope={"deep_gemm"})
             try:
                 import paddlefleet.ops.deep_gemm as deep_gemm
 
diff --git a/fastdeploy/model_executor/layers/quantization/mxfp4.py b/fastdeploy/model_executor/layers/quantization/mxfp4.py
index e64dc10b76b..5abdd3ed5ae 100644
--- a/fastdeploy/model_executor/layers/quantization/mxfp4.py
+++ b/fastdeploy/model_executor/layers/quantization/mxfp4.py
@@ -35,7 +35,8 @@
 from ..moe import FusedMoE
 from .quant_base import QuantConfigBase, QuantMethodBase
 
-paddle.compat.enable_torch_proxy(scope={"flashinfer"})
+if hasattr(paddle, "compat") and hasattr(paddle.compat, "enable_torch_proxy"):
+    paddle.compat.enable_torch_proxy(scope={"flashinfer"})
 
 logger = get_logger("config", "config.log")
 
diff --git a/fastdeploy/model_executor/layers/quantization/nvfp4.py b/fastdeploy/model_executor/layers/quantization/nvfp4.py
index e07db868d3d..0177703ae06 100644
--- a/fastdeploy/model_executor/layers/quantization/nvfp4.py
+++ b/fastdeploy/model_executor/layers/quantization/nvfp4.py
@@ -30,7 +30,8 @@
 
 from .quant_base import QuantConfigBase, QuantMethodBase
 
-paddle.compat.enable_torch_proxy(scope={"flashinfer"})
+if hasattr(paddle, "compat") and hasattr(paddle.compat, "enable_torch_proxy"):
+    paddle.compat.enable_torch_proxy(scope={"flashinfer"})
 
 
 def next_power_of_2(n: int):
diff --git a/fastdeploy/model_executor/layers/rotary_embedding.py b/fastdeploy/model_executor/layers/rotary_embedding.py
index 8ff528248ee..cf2f32b41fa 100644
--- a/fastdeploy/model_executor/layers/rotary_embedding.py
+++ b/fastdeploy/model_executor/layers/rotary_embedding.py
@@ -14,6 +14,7 @@
 # limitations under the License.
 """
 
+import functools
 import math
 from typing import Optional, Tuple
 
@@ -29,6 +30,18 @@
 from .utils import CpuGuard
 
 
+@functools.lru_cache(maxsize=128)
+def get_inv_freq(rotary_dim, base, device):
+    """
+    Compute (and memoize) the rotary inverse-frequency vector, moved to `device`.
+
+    Results are cached per (rotary_dim, base, device): every argument must be
+    hashable, and the returned tensor is shared between callers, so it must
+    not be modified in place.
+    """
+    inv_freq = base ** (-paddle.arange(0, rotary_dim, 2, dtype="float32") / rotary_dim)
+    return inv_freq.to(device)
+
+
 class ErnieRotaryEmbedding:
     def __init__(self, rotary_dim, base, partial_rotary_factor):
         """
@@ -41,6 +54,7 @@ def __init__(self, rotary_dim, base, partial_rotary_factor):
     def __call__(self, position_ids):
         bsz, max_seq_len = position_ids.shape[:2]
         inv_freq = self.base ** (-paddle.arange(0, self.rotary_dim, 2, dtype="float32") / self.rotary_dim)
+        inv_freq = inv_freq.to(position_ids.place)
         partial_rotary_position_ids = position_ids / self.partial_rotary_factor
         freqs = paddle.einsum("ij,k->ijk", partial_rotary_position_ids.cast("float32"), inv_freq)
         if paddle.is_compiled_with_xpu() or paddle.is_compiled_with_custom_device("iluvatar_gpu"):
@@ -87,7 +101,7 @@ def __init__(self, rotary_dim, base, partial_rotary_factor):
 
     def __call__(self, position_ids):
         bsz, max_seq_len = position_ids.shape[:2]
-        inv_freq = self.base ** (-paddle.arange(0, self.rotary_dim, 2, dtype="float32") / self.rotary_dim)
+        inv_freq = get_inv_freq(self.rotary_dim, self.base, position_ids.place)
         freqs = paddle.einsum("ij,k->ijk", position_ids.cast("float32"), inv_freq)
         # shape: [B, S, D/2]
         rot_emb = paddle.zeros((2, bsz, max_seq_len, 1, self.rotary_dim // 2), dtype="float32")
@@ -111,7 +125,7 @@ def __init__(self, rotary_dim, base, partial_rotary_factor):
     def __call__(self, position_ids):
         bsz, max_seq_len = position_ids.shape[:2]
         rot_emb = paddle.zeros((2, bsz, max_seq_len, 1, self.rotary_dim), dtype="float32")
-        inv_freq = self.base ** (-paddle.arange(0, self.rotary_dim, 2, dtype="float32") / self.rotary_dim)
+        inv_freq = get_inv_freq(self.rotary_dim, self.base, position_ids.place)
 
         # shape: [B, S, D/2]
         freqs = paddle.einsum("ij,k->ijk", position_ids.cast("float32"), inv_freq)
diff --git a/fastdeploy/model_executor/ops/iluvatar/moe_ops.py b/fastdeploy/model_executor/ops/iluvatar/moe_ops.py
index 3ed6b02e46a..8e676f4b0c9 100644
--- a/fastdeploy/model_executor/ops/iluvatar/moe_ops.py
+++ b/fastdeploy/model_executor/ops/iluvatar/moe_ops.py
@@ -17,7 +17,19 @@
 from typing import Optional
 
 import paddle
-from paddle.nn.functional import swiglu
+
+try:
+    from paddle.nn.functional import swiglu
+except ImportError:
+
+    def swiglu(x, y=None, name=None):
+        """Fallback SwiGLU, silu(x) * y, accepting the same arguments as paddle's swiglu."""
+        if y is None:
+            # Single-input form: split the last axis into two halves.
+            x, y = paddle.chunk(x, chunks=2, axis=-1)
+        return paddle.nn.functional.silu(x) * y
+
+
 from paddle.nn.quant import weight_only_linear
 
 try:
diff --git a/tests/xpu_ci/8cards_cases/test_pd_21b_ep4tp1.py b/tests/xpu_ci/8cards_cases/test_pd_21b_ep4tp1.py
index d7d75090561..98931ad0c38 100644
--- a/tests/xpu_ci/8cards_cases/test_pd_21b_ep4tp1.py
+++ b/tests/xpu_ci/8cards_cases/test_pd_21b_ep4tp1.py
@@ -304,7 +304,8 @@ def test_pd_separation():
 
     # 验证响应
     assert any(
-        keyword in response.choices[0].message.content for keyword in ["人工智能", "文心一言", "百度", "智能助手"]
+        keyword in response.choices[0].message.content
+        for keyword in ["人工智能", "文心一言", "百度", "智能助手", "研发", "语言"]
     ), f"响应内容不符合预期: {response.choices[0].message.content}"
 
     print("\nPD分离测试通过!")