From 50ceda00cffadbecc2792826c277cfd92d837aca Mon Sep 17 00:00:00 2001
From: "Wei Sun (Jack)" <weisun@google.com>
Date: Sat, 8 Nov 2025 13:21:06 -0800
Subject: [PATCH] docs(models): Updates `BaseLlm#generate_content_async`
 docstring with the expected behavior

NOTE:

- This is the expected behavior, so that SSE streaming and non-streaming behaviors are identical to each other in the persisted session.
- The underlying google_llm.py does not yet fully comply with this behavior; this will be addressed in future changes.

Co-authored-by: Wei Sun (Jack) <weisun@google.com>
PiperOrigin-RevId: 829878175
---
 src/google/adk/models/base_llm.py | 93 ++++++++++++++++++++++++++++---
 1 file changed, 86 insertions(+), 7 deletions(-)

diff --git a/src/google/adk/models/base_llm.py b/src/google/adk/models/base_llm.py
index e385fab7..0f419a9b 100644
--- a/src/google/adk/models/base_llm.py
+++ b/src/google/adk/models/base_llm.py
@@ -50,20 +50,99 @@ class BaseLlm(BaseModel):
   async def generate_content_async(
       self, llm_request: LlmRequest, stream: bool = False
   ) -> AsyncGenerator[LlmResponse, None]:
-    """Generates one content from the given contents and tools.
+    """Generates content for a single model turn.
+
+    This method handles Server-Sent Events (SSE) streaming for unidirectional
+    content generation. For bidirectional streaming (e.g., Gemini Live API),
+    use the `connect()` method instead.
 
     Args:
       llm_request: LlmRequest, the request to send to the LLM.
-      stream: bool = False, whether to do streaming call.
+      stream: bool = False, whether to enable SSE streaming mode.
 
     Yields:
-      a generator of types.Content.
+      LlmResponse objects representing the model's response for one turn.
 
-      For non-streaming call, it will only yield one Content.
+      **Non-streaming mode (stream=False):**
 
-      For streaming call, it may yield more than one content, but all yielded
-      contents should be treated as one content by merging the
-      parts list.
+        Yields exactly one LlmResponse containing the complete model output
+        (text, function calls, bytes, etc.). This response has `partial=False`.
+
+      **Streaming mode (stream=True):**
+
+        Yields multiple LlmResponse objects as chunks arrive:
+
+        - Intermediate chunks: `partial=True` (progressive updates)
+        - Final chunk: `partial=False` (aggregated content from entire turn,
+          identical to stream=False output)
+        - Text consolidation: Consecutive text parts of the same type
+          (thought/non-thought) SHOULD merge without separator, but client
+          code must not rely on this - unconsolidated parts are unusual but also
+          valid
+
+      **Common content in partial chunks:**
+
+        All intermediate chunks have `partial=True` regardless of content type.
+        Common examples include:
+
+        - Text: Streams incrementally as tokens arrive
+        - Function calls: May arrive in separate chunks
+        - Bytes (e.g., images): Typically arrive as a single chunk, interleaved
+          with text
+        - Thoughts: Stream incrementally when thinking_config is enabled
+
+      **Examples:**
+
+      1. Simple text streaming::
+
+           LlmResponse(partial=True,  parts=["The weather"])
+           LlmResponse(partial=True,  parts=[" in Tokyo is"])
+           LlmResponse(partial=True,  parts=[" sunny."])
+           LlmResponse(partial=False, parts=["The weather in Tokyo is sunny."])
+
+      2. Text + function call::
+
+           LlmResponse(partial=True,  parts=[Text("Let me check...")])
+           LlmResponse(partial=True,  parts=[FunctionCall("get_weather", ...)])
+           LlmResponse(partial=False, parts=[Text("Let me check..."),
+                                             FunctionCall("get_weather", ...)])
+
+      3. Parallel function calls across chunks::
+
+           LlmResponse(partial=True,  parts=[Text("Checking both cities...")])
+           LlmResponse(partial=True,  parts=[FunctionCall("get_weather", Tokyo)])
+           LlmResponse(partial=True,  parts=[FunctionCall("get_weather", NYC)])
+           LlmResponse(partial=False, parts=[Text("Checking both cities..."),
+                                             FunctionCall("get_weather", Tokyo),
+                                             FunctionCall("get_weather", NYC)])
+
+      4. Text + bytes (image generation with gemini-2.5-flash-image)::
+
+           LlmResponse(partial=True,  parts=[Text("Here's an image of a dog.")])
+           LlmResponse(partial=True,  parts=[Text("\n")])
+           LlmResponse(partial=True,  parts=[Blob(image/png, 1.6MB)])
+           LlmResponse(partial=True,  parts=[Text("It carries a bone")])
+           LlmResponse(partial=True,  parts=[Text(" and is running around.")])
+           LlmResponse(partial=False, parts=[Text("Here's an image of a dog.\n"),
+                                             Blob(image/png, 1.6MB),
+                                             Text("It carries a bone and is running around.")])
+
+         Note: Consecutive text parts before and after blob merge separately.
+
+      5. Text with thinking (gemini-2.5-flash with thinking_config)::
+
+           LlmResponse(partial=True,  parts=[Thought("Let me analyze...")])
+           LlmResponse(partial=True,  parts=[Thought("The user wants...")])
+           LlmResponse(partial=True,  parts=[Text("Based on my analysis,")])
+           LlmResponse(partial=True,  parts=[Text(" the answer is 42.")])
+           LlmResponse(partial=False, parts=[Thought("Let me analyze...The user wants..."),
+                                             Text("Based on my analysis, the answer is 42.")])
+
+         Note: Consecutive same-type parts merge (thoughts→thought, text→text).
+
+      **Important:** All yielded responses represent one logical model turn.
+      The final response with `partial=False` should be identical to the
+      response that would be received with `stream=False`.
     """
     raise NotImplementedError(
         f'Async generation is not supported for {self.model}.'