diff --git a/fastdeploy/__init__.py b/fastdeploy/__init__.py
index e54957bfa51..7600d660734 100644
--- a/fastdeploy/__init__.py
+++ b/fastdeploy/__init__.py
@@ -54,7 +54,8 @@
     get_version_info,
 )
 
-paddle.compat.enable_torch_proxy(scope={"triton"})
+if hasattr(paddle, "compat") and hasattr(paddle.compat, "enable_torch_proxy"):
+    paddle.compat.enable_torch_proxy(scope={"triton"})
 # paddle.compat.enable_torch_proxy(scope={"triton"}) enables the torch proxy
 # specifically for the 'triton' module. This means `import torch` inside 'triton'
 # will actually import paddle's compatibility layer (acting as torch).
diff --git a/fastdeploy/model_executor/layers/attention/flash_attn_backend.py b/fastdeploy/model_executor/layers/attention/flash_attn_backend.py
index c384a9d2810..da62a4f4f7f 100644
--- a/fastdeploy/model_executor/layers/attention/flash_attn_backend.py
+++ b/fastdeploy/model_executor/layers/attention/flash_attn_backend.py
@@ -57,7 +57,9 @@
 
 from fastdeploy.platforms import current_platform
 
-paddle.compat.enable_torch_proxy(scope={"cutlass"})
+if hasattr(paddle, "compat") and hasattr(paddle.compat, "enable_torch_proxy"):
+    # Guard against environments where paddle.compat or enable_torch_proxy is missing
+    paddle.compat.enable_torch_proxy(scope={"cutlass"})
 flashmask_attention_v4 = None
 
 if current_platform.is_cuda():
diff --git a/fastdeploy/model_executor/layers/moe/ep.py b/fastdeploy/model_executor/layers/moe/ep.py
index 4489f5ec089..249e7936ba5 100644
--- a/fastdeploy/model_executor/layers/moe/ep.py
+++ b/fastdeploy/model_executor/layers/moe/ep.py
@@ -40,7 +40,8 @@ def load_deep_ep() -> ModuleType:
     try:
         if envs.FD_USE_PFCC_DEEP_EP:
             # Enable torch proxy before importing deep_ep (required by PFCC/PaddleFleet variants)
-            paddle.compat.enable_torch_proxy(scope={"deep_ep"})
+            if hasattr(paddle, "compat") and hasattr(paddle.compat, "enable_torch_proxy"):
+                paddle.compat.enable_torch_proxy(scope={"deep_ep"})
             try:
                 import paddlefleet.ops.deep_ep as deep_ep  # type: ignore
 
diff --git a/fastdeploy/model_executor/layers/quantization/fp8_utils.py b/fastdeploy/model_executor/layers/quantization/fp8_utils.py
index b7f00c37eea..d09ebce803b 100644
--- a/fastdeploy/model_executor/layers/quantization/fp8_utils.py
+++ b/fastdeploy/model_executor/layers/quantization/fp8_utils.py
@@ -33,7 +33,8 @@ def load_deep_gemm():
     if current_platform.is_cuda():
         if get_sm_version() == 100:
             # SM100 should use PFCC DeepGemm
-            paddle.compat.enable_torch_proxy(scope={"deep_gemm"})
+            if hasattr(paddle, "compat") and hasattr(paddle.compat, "enable_torch_proxy"):
+                paddle.compat.enable_torch_proxy(scope={"deep_gemm"})
             try:
                 import paddlefleet.ops.deep_gemm as deep_gemm
 
diff --git a/fastdeploy/model_executor/layers/quantization/mxfp4.py b/fastdeploy/model_executor/layers/quantization/mxfp4.py
index e64dc10b76b..5abdd3ed5ae 100644
--- a/fastdeploy/model_executor/layers/quantization/mxfp4.py
+++ b/fastdeploy/model_executor/layers/quantization/mxfp4.py
@@ -35,7 +35,8 @@
 from ..moe import FusedMoE
 from .quant_base import QuantConfigBase, QuantMethodBase
 
-paddle.compat.enable_torch_proxy(scope={"flashinfer"})
+if hasattr(paddle, "compat") and hasattr(paddle.compat, "enable_torch_proxy"):
+    paddle.compat.enable_torch_proxy(scope={"flashinfer"})
 
 logger = get_logger("config", "config.log")
 
diff --git a/fastdeploy/model_executor/layers/quantization/nvfp4.py b/fastdeploy/model_executor/layers/quantization/nvfp4.py
index e07db868d3d..0177703ae06 100644
--- a/fastdeploy/model_executor/layers/quantization/nvfp4.py
+++ b/fastdeploy/model_executor/layers/quantization/nvfp4.py
@@ -30,7 +30,8 @@
 
 from .quant_base import QuantConfigBase, QuantMethodBase
 
-paddle.compat.enable_torch_proxy(scope={"flashinfer"})
+if hasattr(paddle, "compat") and hasattr(paddle.compat, "enable_torch_proxy"):
+    paddle.compat.enable_torch_proxy(scope={"flashinfer"})
 
 
 def next_power_of_2(n: int):
diff --git a/fastdeploy/model_executor/layers/rotary_embedding.py b/fastdeploy/model_executor/layers/rotary_embedding.py
index 8ff528248ee..cf2f32b41fa 100644
--- a/fastdeploy/model_executor/layers/rotary_embedding.py
+++ b/fastdeploy/model_executor/layers/rotary_embedding.py
@@ -14,6 +14,7 @@
 # limitations under the License.
 """
 
+import functools
 import math
 from typing import Optional, Tuple
 
@@ -29,6 +30,13 @@
 from .utils import CpuGuard
 
 
+@functools.lru_cache(maxsize=128)
+def get_inv_freq(rotary_dim, base, device):
+    # Calculate inverse frequency for rotary embedding
+    inv_freq = base ** (-paddle.arange(0, rotary_dim, 2, dtype="float32") / rotary_dim)
+    return inv_freq.to(device)
+
+
 class ErnieRotaryEmbedding:
     def __init__(self, rotary_dim, base, partial_rotary_factor):
         """
@@ -41,6 +49,7 @@ def __init__(self, rotary_dim, base, partial_rotary_factor):
     def __call__(self, position_ids):
         bsz, max_seq_len = position_ids.shape[:2]
         inv_freq = self.base ** (-paddle.arange(0, self.rotary_dim, 2, dtype="float32") / self.rotary_dim)
+        inv_freq = inv_freq.to(position_ids.place)
         partial_rotary_position_ids = position_ids / self.partial_rotary_factor
         freqs = paddle.einsum("ij,k->ijk", partial_rotary_position_ids.cast("float32"), inv_freq)
         if paddle.is_compiled_with_xpu() or paddle.is_compiled_with_custom_device("iluvatar_gpu"):
@@ -87,7 +96,7 @@ def __init__(self, rotary_dim, base, partial_rotary_factor):
 
     def __call__(self, position_ids):
         bsz, max_seq_len = position_ids.shape[:2]
-        inv_freq = self.base ** (-paddle.arange(0, self.rotary_dim, 2, dtype="float32") / self.rotary_dim)
+        inv_freq = get_inv_freq(self.rotary_dim, self.base, position_ids.place)
         freqs = paddle.einsum("ij,k->ijk", position_ids.cast("float32"), inv_freq)
         # shape: [B, S, D/2]
         rot_emb = paddle.zeros((2, bsz, max_seq_len, 1, self.rotary_dim // 2), dtype="float32")
@@ -111,7 +120,7 @@ def __init__(self, rotary_dim, base, partial_rotary_factor):
     def __call__(self, position_ids):
         bsz, max_seq_len = position_ids.shape[:2]
         rot_emb = paddle.zeros((2, bsz, max_seq_len, 1, self.rotary_dim), dtype="float32")
-        inv_freq = self.base ** (-paddle.arange(0, self.rotary_dim, 2, dtype="float32") / self.rotary_dim)
+        inv_freq = get_inv_freq(self.rotary_dim, self.base, position_ids.place)
 
         # shape: [B, S, D/2]
         freqs = paddle.einsum("ij,k->ijk", position_ids.cast("float32"), inv_freq)
diff --git a/fastdeploy/model_executor/ops/iluvatar/moe_ops.py b/fastdeploy/model_executor/ops/iluvatar/moe_ops.py
index 3ed6b02e46a..8e676f4b0c9 100644
--- a/fastdeploy/model_executor/ops/iluvatar/moe_ops.py
+++ b/fastdeploy/model_executor/ops/iluvatar/moe_ops.py
@@ -17,7 +17,16 @@
 from typing import Optional
 
 import paddle
-from paddle.nn.functional import swiglu
+
+try:
+    from paddle.nn.functional import swiglu
+except ImportError:
+
+    def swiglu(x):
+        x, y = paddle.chunk(x, chunks=2, axis=-1)
+        return paddle.nn.functional.silu(x) * y
+
+
 from paddle.nn.quant import weight_only_linear
 
 try:
diff --git a/tests/xpu_ci/8cards_cases/test_pd_21b_ep4tp1.py b/tests/xpu_ci/8cards_cases/test_pd_21b_ep4tp1.py
index d7d75090561..98931ad0c38 100644
--- a/tests/xpu_ci/8cards_cases/test_pd_21b_ep4tp1.py
+++ b/tests/xpu_ci/8cards_cases/test_pd_21b_ep4tp1.py
@@ -304,7 +304,8 @@ def test_pd_separation():
 
         # 验证响应
         assert any(
-            keyword in response.choices[0].message.content for keyword in ["人工智能", "文心一言", "百度", "智能助手"]
+            keyword in response.choices[0].message.content
+            for keyword in ["人工智能", "文心一言", "百度", "智能助手", "研发", "语言"]
         ), f"响应内容不符合预期: {response.choices[0].message.content}"
 
         print("\nPD分离测试通过!")