Commit 947bc23

Authored by RuixiangMa, github-actions[bot], and dg845

[chore] Add diffusers-format example to LongCatAudioDiTPipeline (#13483)

* [chore] Add diffusers-format example and seed parameter to LongCatAudioDiTPipeline
* Apply style fixes
* Apply suggestions from code review
* upd
* Apply style fixes

Signed-off-by: Lancer <maruixiang6688@gmail.com>
Co-authored-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com>
Co-authored-by: dg845 <58458699+dg845@users.noreply.github.com>

1 parent 71a6fd9 · commit 947bc23

2 files changed: 39 additions & 16 deletions

docs/source/en/api/pipelines/longcat_audio_dit.md — 13 additions & 16 deletions

````diff
@@ -14,15 +14,10 @@ specific language governing permissions and limitations under the License.
 
 LongCat-AudioDiT is a text-to-audio diffusion model from Meituan LongCat. The diffusers integration exposes a standard [`DiffusionPipeline`] interface for text-conditioned audio generation.
 
-This pipeline supports loading the original flat LongCat checkpoint layout from either a local directory or a Hugging Face Hub repository containing:
-
-- `config.json`
-- `model.safetensors`
-
-The loader builds the text encoder, transformer, and VAE from `config.json`, restores component weights from `model.safetensors`, and ties the shared UMT5 embedding when needed.
-
 This pipeline was adapted from the LongCat-AudioDiT reference implementation: https://github.com/meituan-longcat/LongCat-AudioDiT
 
+This pipeline supports loading from a local directory or Hugging Face Hub repository in diffusers format (containing `text_encoder/`, `transformer/`, `vae/`, `tokenizer/`, and `scheduler/` subfolders).
+
 ## Usage
 
 ```py
@@ -31,27 +26,29 @@ import torch
 from diffusers import LongCatAudioDiTPipeline
 
 pipeline = LongCatAudioDiTPipeline.from_pretrained(
-    "meituan-longcat/LongCat-AudioDiT-1B",
+    "ruixiangma/LongCat-AudioDiT-1B-Diffusers",
     torch_dtype=torch.float16,
 )
 pipeline = pipeline.to("cuda")
 
+prompt = "A calm ocean wave ambience with soft wind in the background."
 audio = pipeline(
-    prompt="A calm ocean wave ambience with soft wind in the background.",
-    audio_end_in_s=5.0,
+    prompt,
+    audio_duration_s=5.0,
     num_inference_steps=16,
     guidance_scale=4.0,
-    output_type="pt",
-).audios
+    generator=torch.Generator("cuda").manual_seed(42),
+).audios[0, 0]
 
-output = audio[0, 0].float().cpu().numpy()
-sf.write("longcat.wav", output, pipeline.sample_rate)
+sf.write("longcat.wav", audio, pipeline.sample_rate)
 ```
 
 ## Tips
 
-- `audio_end_in_s` is the most direct way to control output duration.
-- `output_type="pt"` returns a PyTorch tensor shaped `(batch, channels, samples)`.
+- `audio_duration_s` is the most direct way to control output duration.
+- Use `generator=torch.Generator("cuda").manual_seed(42)` to make generation reproducible.
+- Output shape is `(batch, channels, samples)` - use `.audios[0, 0]` to get a single audio sample.
+- The pipeline outputs mono audio (1 channel). If you need stereo, you can duplicate the channel: `audio.unsqueeze(0).repeat(1, 2, 1)`.
 
 ## LongCatAudioDiTPipeline
````
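The updated tips describe a `(batch, channels, samples)` output shape and a mono-to-stereo channel-duplication trick. That tensor arithmetic can be sanity-checked without loading any model; the sketch below uses a random tensor as a stand-in for real pipeline output (the sample count 24000 is an arbitrary assumption):

```python
import torch

# Dummy stand-in for pipeline output: (batch, channels, samples), mono audio.
audios = torch.randn(1, 1, 24000)

# Select the single mono waveform, as in the docs: .audios[0, 0] -> (samples,)
audio = audios[0, 0]
assert audio.shape == (24000,)

# Duplicate the mono channel to get stereo, per the tip.
# repeat() with more sizes than dims prepends dims: (samples,) -> (1, 2, samples)
stereo = audio.unsqueeze(0).repeat(1, 2, 1)
assert stereo.shape == (1, 2, 24000)

# Both channels are identical copies of the mono signal.
assert torch.equal(stereo[0, 0], stereo[0, 1])
```

Note that `Tensor.repeat` accepts more repeat sizes than the tensor has dimensions, treating the missing leading dimensions as size 1, which is what makes the one-liner in the tip work.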

src/diffusers/pipelines/longcat_audio_dit/pipeline_longcat_audio_dit.py — 26 additions & 0 deletions

````diff
@@ -25,12 +25,35 @@
 from ...models import LongCatAudioDiTTransformer, LongCatAudioDiTVae
 from ...schedulers import FlowMatchEulerDiscreteScheduler
 from ...utils import logging
+from ...utils.doc_utils import replace_example_docstring
 from ...utils.torch_utils import randn_tensor
 from ..pipeline_utils import AudioPipelineOutput, DiffusionPipeline
 
 
 logger = logging.get_logger(__name__)
 
+EXAMPLE_DOC_STRING = """
+    Examples:
+        ```py
+        >>> import soundfile as sf
+        >>> import torch
+        >>> from diffusers import LongCatAudioDiTPipeline
+
+        >>> pipe = LongCatAudioDiTPipeline.from_pretrained("ruixiangma/LongCat-AudioDiT-1B-Diffusers")
+        >>> pipe.to("cuda")
+
+        >>> prompt = "A calm ocean wave ambience with soft wind in the background."
+        >>> audio = pipe(
+        ...     prompt,
+        ...     audio_duration_s=5.0,
+        ...     num_inference_steps=20,
+        ...     guidance_scale=4.0,
+        ...     generator=torch.Generator("cuda").manual_seed(42),
+        ... ).audios[0, 0]
+        >>> sf.write("output.wav", audio, pipe.sample_rate)
+        ```
+"""
+
 
 def _lens_to_mask(lengths: torch.Tensor, length: int | None = None) -> torch.BoolTensor:
     if length is None:
@@ -194,6 +217,7 @@ def check_inputs(
         )
 
     @torch.no_grad()
+    @replace_example_docstring(EXAMPLE_DOC_STRING)
     def __call__(
         self,
         prompt: str | list[str],
@@ -228,6 +252,8 @@ def __call__(
             inputs specified by `callback_on_step_end_tensor_inputs`.
             callback_on_step_end_tensor_inputs (`list`, defaults to `["latents"]`):
                 Tensor inputs passed to `callback_on_step_end`.
+
+        Examples:
         """
         if prompt is None:
             prompt = []
````
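The `@replace_example_docstring(EXAMPLE_DOC_STRING)` decorator splices the example block into `__call__`'s docstring at the `Examples:` placeholder, which is why the diff also adds a bare `Examples:` line to the docstring body. The following is a minimal sketch of how such a decorator can work, for illustration only (not the actual diffusers implementation):

```python
def replace_example_docstring(example_docstring):
    """Return a decorator that fills the 'Examples:' placeholder in a docstring."""
    def decorator(fn):
        if fn.__doc__ and "Examples:" in fn.__doc__:
            # Substitute the bare placeholder with the full example block.
            fn.__doc__ = fn.__doc__.replace("Examples:", example_docstring)
        return fn
    return decorator


EXAMPLE = "Examples:\n    >>> print('hi')"


@replace_example_docstring(EXAMPLE)
def call(prompt):
    """Generate audio.

    Examples:
    """
    return prompt


assert ">>> print('hi')" in call.__doc__  # example was spliced in
assert call("x") == "x"                   # the function itself is unchanged
```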
