From 50ceda00cffadbecc2792826c277cfd92d837aca Mon Sep 17 00:00:00 2001 From: "Wei Sun (Jack)" Date: Sat, 8 Nov 2025 13:21:06 -0800 Subject: [PATCH] docs(models): Updates `BaseLlm#generate_content_async` docstring with the expected behavior NOTE: - This is the expected behavior, so that SSE streaming and non-streaming behaviors are identical to each other in the persisted session. - The underlying google_llm.py does not yet fully comply with this behavior, which will be addressed in future changes. Co-authored-by: Wei Sun (Jack) PiperOrigin-RevId: 829878175 --- src/google/adk/models/base_llm.py | 93 ++++++++++++++++++++++++++++--- 1 file changed, 86 insertions(+), 7 deletions(-) diff --git a/src/google/adk/models/base_llm.py b/src/google/adk/models/base_llm.py index e385fab7..0f419a9b 100644 --- a/src/google/adk/models/base_llm.py +++ b/src/google/adk/models/base_llm.py @@ -50,20 +50,99 @@ class BaseLlm(BaseModel): async def generate_content_async( self, llm_request: LlmRequest, stream: bool = False ) -> AsyncGenerator[LlmResponse, None]: - """Generates one content from the given contents and tools. + """Generates content for a single model turn. + + This method handles Server-Sent Events (SSE) streaming for unidirectional + content generation. For bidirectional streaming (e.g., Gemini Live API), + use the `connect()` method instead. Args: llm_request: LlmRequest, the request to send to the LLM. - stream: bool = False, whether to do streaming call. + stream: bool = False, whether to enable SSE streaming mode. Yields: - a generator of types.Content. + LlmResponse objects representing the model's response for one turn. - For non-streaming call, it will only yield one Content. + **Non-streaming mode (stream=False):** - For streaming call, it may yield more than one content, but all yielded - contents should be treated as one content by merging the - parts list. + Yields exactly one LlmResponse containing the complete model output + (text, function calls, bytes, etc.). 
This response has `partial=False`. + + **Streaming mode (stream=True):** + + Yields multiple LlmResponse objects as chunks arrive: + + - Intermediate chunks: `partial=True` (progressive updates) + - Final chunk: `partial=False` (aggregated content from entire turn, + identical to stream=False output) + - Text consolidation: Consecutive text parts of the same type + (thought/non-thought) SHOULD merge without a separator, but client + code must not rely on this - unconsolidated parts are unusual but also + valid + + **Common content in partial chunks:** + + All intermediate chunks have `partial=True` regardless of content type. + Common examples include: + + - Text: Streams incrementally as tokens arrive + - Function calls: May arrive in separate chunks + - Bytes (e.g., images): Typically arrive as a single chunk, interleaved + with text + - Thoughts: Stream incrementally when thinking_config is enabled + + **Examples:** + + 1. Simple text streaming:: + + LlmResponse(partial=True, parts=["The weather"]) + LlmResponse(partial=True, parts=[" in Tokyo is"]) + LlmResponse(partial=True, parts=[" sunny."]) + LlmResponse(partial=False, parts=["The weather in Tokyo is sunny."]) + + 2. Text + function call:: + + LlmResponse(partial=True, parts=[Text("Let me check...")]) + LlmResponse(partial=True, parts=[FunctionCall("get_weather", ...)]) + LlmResponse(partial=False, parts=[Text("Let me check..."), + FunctionCall("get_weather", ...)]) + + 3. Parallel function calls across chunks:: + + LlmResponse(partial=True, parts=[Text("Checking both cities...")]) + LlmResponse(partial=True, parts=[FunctionCall("get_weather", Tokyo)]) + LlmResponse(partial=True, parts=[FunctionCall("get_weather", NYC)]) + LlmResponse(partial=False, parts=[Text("Checking both cities..."), + FunctionCall("get_weather", Tokyo), + FunctionCall("get_weather", NYC)]) + + 4. 
Text + bytes (image generation with gemini-2.5-flash-image):: + + LlmResponse(partial=True, parts=[Text("Here's an image of a dog.")]) + LlmResponse(partial=True, parts=[Text("\n")]) + LlmResponse(partial=True, parts=[Blob(image/png, 1.6MB)]) + LlmResponse(partial=True, parts=[Text("It carries a bone")]) + LlmResponse(partial=True, parts=[Text(" and running around.")]) + LlmResponse(partial=False, parts=[Text("Here's an image of a dog.\n"), + Blob(image/png, 1.6MB), + Text("It carries a bone and running around.")]) + + Note: Consecutive text parts before and after the blob merge separately. + + 5. Text with thinking (gemini-2.5-flash with thinking_config):: + + LlmResponse(partial=True, parts=[Thought("Let me analyze...")]) + LlmResponse(partial=True, parts=[Thought("The user wants...")]) + LlmResponse(partial=True, parts=[Text("Based on my analysis,")]) + LlmResponse(partial=True, parts=[Text(" the answer is 42.")]) + LlmResponse(partial=False, parts=[Thought("Let me analyze...The user wants..."), + Text("Based on my analysis, the answer is 42.")]) + + Note: Consecutive parts of the same type merge (thoughts→thought, text→text). + + **Important:** All yielded responses represent one logical model turn. + The final response with `partial=False` should be identical to the + response that would be received with `stream=False`. """ raise NotImplementedError( f'Async generation is not supported for {self.model}.'