diff --git a/contributing/samples/adk_answering_agent/README.md b/contributing/samples/adk_answering_agent/README.md index e2af591d..158c825a 100644 --- a/contributing/samples/adk_answering_agent/README.md +++ b/contributing/samples/adk_answering_agent/README.md @@ -2,7 +2,11 @@ The ADK Answering Agent is a Python-based agent designed to help answer questions in GitHub discussions for the `google/adk-python` repository. It uses a large language model to analyze open discussions, retrieve information from document store, generate response, and post a comment in the github discussion. -This agent can be operated in three distinct modes: an interactive mode for local use, a batch script mode for oncall use, or as a fully automated GitHub Actions workflow (TBD). +This agent can be operated in three distinct modes: + +- An interactive mode for local use. +- A batch script mode for oncall use. +- A fully automated GitHub Actions workflow (TBD). --- @@ -50,6 +54,15 @@ The `main.py` is reserved for the Github Workflow. The detailed setup for the au --- +## Update the Knowledge Base + +The `upload_docs_to_vertex_ai_search.py` is a script to upload ADK related docs to Vertex AI Search datastore to update the knowledge base. It can be executed with the following command in your terminal: + +```bash +export PYTHONPATH=contributing/samples # If not already exported +python -m adk_answering_agent.upload_docs_to_vertex_ai_search +``` + ## Setup and Configuration Whether running in interactive or workflow mode, the agent requires the following setup. @@ -59,7 +72,7 @@ The agent requires the following Python libraries. ```bash pip install --upgrade pip -pip install google-adk requests +pip install google-adk ``` The agent also requires gcloud login: @@ -68,6 +81,12 @@ The agent also requires gcloud login: gcloud auth application-default login ``` +The upload script requires the following additional Python libraries. 
+ +```bash +pip install google-cloud-storage google-cloud-discoveryengine markdown +``` + ### Environment Variables The following environment variables are required for the agent to connect to the necessary services. @@ -75,9 +94,15 @@ The following environment variables are required for the agent to connect to the * `GOOGLE_GENAI_USE_VERTEXAI=TRUE`: **(Required)** Use Google Vertex AI for the authentication. * `GOOGLE_CLOUD_PROJECT=YOUR_PROJECT_ID`: **(Required)** The Google Cloud project ID. * `GOOGLE_CLOUD_LOCATION=LOCATION`: **(Required)** The Google Cloud region. -* `VERTEXAI_DATASTORE_ID=YOUR_DATASTORE_ID`: **(Required)** The Vertex AI datastore ID for the document store (i.e. knowledge base). +* `VERTEXAI_DATASTORE_ID=YOUR_DATASTORE_ID`: **(Required)** The full Vertex AI datastore ID for the document store (i.e. knowledge base), with the format of `projects/{project_number}/locations/{location}/collections/{collection}/dataStores/{datastore_id}`. * `OWNER`: The GitHub organization or username that owns the repository (e.g., `google`). Needed for both modes. * `REPO`: The name of the GitHub repository (e.g., `adk-python`). Needed for both modes. * `INTERACTIVE`: Controls the agent's interaction mode. For the automated workflow, this is set to `0`. For interactive mode, it should be set to `1` or left unset. +The following environment variables are required to upload the docs to update the knowledge base. + +* `GCS_BUCKET_NAME=YOUR_GCS_BUCKET_NAME`: **(Required)** The name of the GCS bucket to store the documents. +* `ADK_DOCS_ROOT_PATH=YOUR_ADK_DOCS_ROOT_PATH`: **(Required)** Path to the root of the downloaded adk-docs repo. +* `ADK_PYTHON_ROOT_PATH=YOUR_ADK_PYTHON_ROOT_PATH`: **(Required)** Path to the root of the downloaded adk-python repo. + For local execution in interactive mode, you can place these variables in a `.env` file in the project's root directory. For the GitHub workflow, they should be configured as repository secrets. 
\ No newline at end of file diff --git a/contributing/samples/adk_answering_agent/settings.py b/contributing/samples/adk_answering_agent/settings.py index 9f152678..c8bd146b 100644 --- a/contributing/samples/adk_answering_agent/settings.py +++ b/contributing/samples/adk_answering_agent/settings.py @@ -29,6 +29,11 @@ VERTEXAI_DATASTORE_ID = os.getenv("VERTEXAI_DATASTORE_ID") if not VERTEXAI_DATASTORE_ID: raise ValueError("VERTEXAI_DATASTORE_ID environment variable not set") +GOOGLE_CLOUD_PROJECT = os.getenv("GOOGLE_CLOUD_PROJECT") +GCS_BUCKET_NAME = os.getenv("GCS_BUCKET_NAME") +ADK_DOCS_ROOT_PATH = os.getenv("ADK_DOCS_ROOT_PATH") +ADK_PYTHON_ROOT_PATH = os.getenv("ADK_PYTHON_ROOT_PATH") + OWNER = os.getenv("OWNER", "google") REPO = os.getenv("REPO", "adk-python") BOT_RESPONSE_LABEL = os.getenv("BOT_RESPONSE_LABEL", "bot responded") diff --git a/contributing/samples/adk_answering_agent/upload_docs_to_vertex_ai_search.py b/contributing/samples/adk_answering_agent/upload_docs_to_vertex_ai_search.py new file mode 100644 index 00000000..9dd7ca6a --- /dev/null +++ b/contributing/samples/adk_answering_agent/upload_docs_to_vertex_ai_search.py @@ -0,0 +1,222 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 

import os
import sys

from adk_answering_agent.settings import ADK_DOCS_ROOT_PATH
from adk_answering_agent.settings import ADK_PYTHON_ROOT_PATH
from adk_answering_agent.settings import GCS_BUCKET_NAME
from adk_answering_agent.settings import GOOGLE_CLOUD_PROJECT
from adk_answering_agent.settings import VERTEXAI_DATASTORE_ID
from google.api_core.exceptions import GoogleAPICallError
from google.cloud import discoveryengine_v1beta as discoveryengine
from google.cloud import storage
import markdown

# Maps each GCS object-name prefix to the local directory uploaded under it.
GCS_PREFIX_TO_ROOT_PATH = {
    "adk-docs": ADK_DOCS_ROOT_PATH,
    "adk-python": ADK_PYTHON_ROOT_PATH,
}

# Only files whose names end with one of these suffixes (case-sensitive) are
# uploaded.
_UPLOADED_SUFFIXES = (".md", ".py")


def cleanup_gcs_prefix(project_id: str, bucket_name: str, prefix: str) -> bool:
  """Delete all the objects with the given prefix in the bucket.

  Args:
    project_id: Project passed to the storage client.
    bucket_name: Name of the bucket to clean.
    prefix: Object-name prefix; every object listed under it is deleted.

  Returns:
    True if the prefix is empty afterwards (including when it was already
    empty), False if a GoogleAPICallError was raised.
  """
  print(f"Start cleaning up GCS: gs://{bucket_name}/{prefix}...")
  try:
    storage_client = storage.Client(project=project_id)
    bucket = storage_client.bucket(bucket_name)
    # Materialize the listing so the objects can be counted and deleted.
    blobs = list(bucket.list_blobs(prefix=prefix))

    if not blobs:
      print("GCS target location is already empty, no need to clean up.")
      return True

    bucket.delete_blobs(blobs)
    print(f"Successfully deleted {len(blobs)} objects.")
    return True
  except GoogleAPICallError as e:
    print(f"[ERROR] Failed to clean up GCS: {e}", file=sys.stderr)
    return False


def _to_gcs_object_name(prefix: str, relative_path: str) -> str:
  """Join prefix and an OS-relative path using '/' on every platform."""
  # os.path.join would emit backslashes on Windows, which would end up as
  # literal characters in the object name instead of folder separators.
  return "/".join([prefix, *relative_path.split(os.sep)])


def upload_directory_to_gcs(
    source_directory: str, project_id: str, bucket_name: str, prefix: str
) -> bool:
  """Upload the .md and .py files of a directory tree into GCS.

  Hidden directories (names starting with ".") are not descended into.
  Markdown files are converted to HTML and stored with an ".html" suffix;
  Python files are uploaded unchanged.

  Args:
    source_directory: Local root directory to walk.
    project_id: Project passed to the storage client.
    bucket_name: Name of the destination bucket.
    prefix: Object-name prefix placed in front of each relative path.

  Returns:
    True if every selected file was uploaded (or skipped as empty), False if
    the source directory is missing or any file failed to be read/uploaded.
  """
  print(
      f"Start uploading directory {source_directory} to GCS:"
      f" gs://{bucket_name}/{prefix}..."
  )

  if not os.path.isdir(source_directory):
    print(
        f"[ERROR] {source_directory} is not a directory or does not exist.",
        file=sys.stderr,
    )
    return False

  storage_client = storage.Client(project=project_id)
  bucket = storage_client.bucket(bucket_name)
  file_count = 0
  for root, dirs, files in os.walk(source_directory):
    # Modify the 'dirs' list in-place to prevent os.walk from descending
    # into hidden directories.
    dirs[:] = [d for d in dirs if not d.startswith(".")]

    for filename in files:
      # Keep only .md and .py files.
      if not filename.endswith(_UPLOADED_SUFFIXES):
        continue

      local_path = os.path.join(root, filename)
      relative_path = os.path.relpath(local_path, source_directory)
      gcs_path = _to_gcs_object_name(prefix, relative_path)

      try:
        content_type = None
        if filename.endswith(".md"):
          # Vertex AI search doesn't recognize text/markdown,
          # convert it to html and use text/html instead
          content_type = "text/html"
          with open(local_path, "r", encoding="utf-8") as f:
            md_content = f.read()
          # NOTE(review): `encoding` looks like an option of
          # markdown.markdownFromFile rather than markdown.markdown; it is
          # kept as-is here -- confirm whether it has any effect.
          html_content = markdown.markdown(
              md_content, output_format="html5", encoding="utf-8"
          )
          if not html_content:
            print(" - Skipped empty file: " + local_path)
            continue
          gcs_path = gcs_path.removesuffix(".md") + ".html"
          bucket.blob(gcs_path).upload_from_string(
              html_content, content_type=content_type
          )
        else:  # Python files
          bucket.blob(gcs_path).upload_from_filename(
              local_path, content_type=content_type
          )
        type_msg = (
            f"(type {content_type})" if content_type else "(type auto-detect)"
        )
        print(
            f" - Uploaded {type_msg}: {local_path} ->"
            f" gs://{bucket_name}/{gcs_path}"
        )
        file_count += 1
      except (GoogleAPICallError, OSError, UnicodeDecodeError) as e:
        # Local read/decode failures are reported the same way as API
        # failures so the caller always gets a boolean result.
        print(
            f"[ERROR] Error uploading file {local_path}: {e}", file=sys.stderr
        )
        return False

  print(f"Successfully uploaded {file_count} files to GCS.")
  return True


def import_from_gcs_to_vertex_ai(
    full_datastore_id: str,
    gcs_bucket: str,
) -> bool:
  """Triggers a bulk import task from a GCS bucket to Vertex AI Search.

  The import request is only started here; this function does not wait for
  the returned operation to finish.

  Args:
    full_datastore_id: Datastore resource name, in the format of
      "projects/{project_number}/locations/{location}/collections/{collection}/dataStores/{datastore_id}".
    gcs_bucket: Name of the bucket whose objects ("gs://{bucket}/**") are
      imported.

  Returns:
    True if the import operation was started, False if a GoogleAPICallError
    was raised.
  """
  print(f"Triggering FULL SYNC import from gs://{gcs_bucket}/**...")

  try:
    client = discoveryengine.DocumentServiceClient()
    gcs_uri = f"gs://{gcs_bucket}/**"
    request = discoveryengine.ImportDocumentsRequest(
        # parent is the datastore resource name followed by
        # "/branches/default_branch".
        parent=full_datastore_id + "/branches/default_branch",
        # Specify the GCS source and use "content" for unstructured data.
        gcs_source=discoveryengine.GcsSource(
            input_uris=[gcs_uri], data_schema="content"
        ),
        reconciliation_mode=discoveryengine.ImportDocumentsRequest.ReconciliationMode.FULL,
    )
    operation = client.import_documents(request=request)
    print(
        "Successfully started full sync import operation."
        f" Operation Name: {operation.operation.name}"
    )
    return True

  except GoogleAPICallError as e:
    print(f"[ERROR] Error triggering import: {e}", file=sys.stderr)
    return False


def main() -> int:
  """Clean up, upload, and import the docs; return a process exit code.

  Returns:
    0 when the import has been initiated, 1 on a missing setting or on the
    first failing step.
  """
  # Check required environment variables, in a fixed order so the first
  # missing one is the one reported.
  required_settings = {
      "GOOGLE_CLOUD_PROJECT": GOOGLE_CLOUD_PROJECT,
      "GCS_BUCKET_NAME": GCS_BUCKET_NAME,
      "VERTEXAI_DATASTORE_ID": VERTEXAI_DATASTORE_ID,
      "ADK_DOCS_ROOT_PATH": ADK_DOCS_ROOT_PATH,
      "ADK_PYTHON_ROOT_PATH": ADK_PYTHON_ROOT_PATH,
  }
  for name, value in required_settings.items():
    if not value:
      print(
          f"[ERROR] {name} environment variable not set. Exiting...",
          file=sys.stderr,
      )
      return 1

  for gcs_prefix, root_path in GCS_PREFIX_TO_ROOT_PATH.items():
    # 1. Cleanup the GCS for a clean start.
    if not cleanup_gcs_prefix(
        GOOGLE_CLOUD_PROJECT, GCS_BUCKET_NAME, gcs_prefix
    ):
      print("[ERROR] Failed to clean up GCS. Exiting...", file=sys.stderr)
      return 1

    # 2. Upload the docs to GCS.
    if not upload_directory_to_gcs(
        root_path,
        GOOGLE_CLOUD_PROJECT,
        GCS_BUCKET_NAME,
        gcs_prefix,
    ):
      print("[ERROR] Failed to upload docs to GCS. Exiting...", file=sys.stderr)
      return 1

  # 3. Import the docs from GCS to Vertex AI Search.
  if not import_from_gcs_to_vertex_ai(VERTEXAI_DATASTORE_ID, GCS_BUCKET_NAME):
    print(
        "[ERROR] Failed to import docs from GCS to Vertex AI Search."
        " Exiting...",
        file=sys.stderr,
    )
    return 1

  print("--- Sync task has been successfully initiated ---")
  return 0


if __name__ == "__main__":
  sys.exit(main())