"""Tests for the `ada-eval` CLI (ada-eval/tests/test_cli.py)."""

import itertools
import sys
from pathlib import Path
from unittest.mock import Mock, patch

import pytest

from ada_eval.cli import main
from ada_eval.datasets import Eval
from ada_eval.paths import (
    COMPACTED_DATASETS_DIR,
    EVALUATED_DATASETS_DIR,
    EXPANDED_DATASETS_DIR,
    GENERATED_DATASETS_DIR,
)
from ada_eval.tools.factory import Tool


def test_no_args(capsys):
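    """Check that running with no subcommand exits with a usage error."""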
    with patch.object(sys, "argv", ["ada-eval"]), pytest.raises(SystemExit):
        main()
    output = capsys.readouterr()
    assert "error: the following arguments are required: {" in output.err
    assert output.out == ""


def test_generate(capsys):
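    """Check argument parsing and tool dispatch for the `generate` subcommand."""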
    # Helper function to patch `sys.argv`
    def patch_args(
        tool: str | None = None,
        tool_config_file: str | None = None,
        dataset: str | None = None,
        jobs: str | None = None,
    ):
        test_args = ["ada-eval", "generate"]
        if tool is not None:
            test_args += ["--tool", tool]
        if tool_config_file is not None:
            test_args += ["--tool-config-file", tool_config_file]
        if dataset is not None:
            test_args += ["--dataset", dataset]
        if jobs is not None:
            test_args += ["--jobs", jobs]
        return patch.object(sys, "argv", test_args)

    # Mock the tool factory
    mock_tool = Mock()
    mock_create_tool = Mock(return_value=mock_tool)

    with patch("ada_eval.cli.create_tool", mock_create_tool):
        # Test with no arguments (should complain about missing `--tool` and
        # `--tool-config-file`)
        with patch_args(), pytest.raises(SystemExit):
            main()
        output = capsys.readouterr()
        assert (
            "error: the following arguments are required: --tool, --tool-config-file"
            in output.err
        )
        assert output.out == ""
        mock_create_tool.assert_not_called()
        mock_tool.apply_to_directory.assert_not_called()

        # Test with an invalid tool name
        with patch_args("invalid_tool", "path/to/config"), pytest.raises(SystemExit):
            main()
        output = capsys.readouterr()
        assert "argument --tool: invalid Tool value: 'invalid_tool'" in output.err
        assert output.out == ""
        mock_create_tool.assert_not_called()
        mock_tool.apply_to_directory.assert_not_called()

        # Test with various valid argument combinations
        mock_cpu_count = Mock(return_value=8)
        cpu_count_patch = patch("ada_eval.cli.cpu_count", mock_cpu_count)
        for tool, dataset, jobs in itertools.product(
            ["shell_script", "SHELL_SCRIPT", "ShElL_ScRiPt"],
            [None, "path/to/dataset"],
            [None, "2", "4"],
        ):
            with patch_args(tool, "path/to/config", dataset, jobs), cpu_count_patch:
                main()
            output = capsys.readouterr()
            assert output.err == ""
            assert output.out == ""
            mock_create_tool.assert_called_once_with(
                Tool.SHELL_SCRIPT, Path("path/to/config")
            )
            dataset_path = COMPACTED_DATASETS_DIR if dataset is None else Path(dataset)
            mock_tool.apply_to_directory.assert_called_once_with(
                path=dataset_path,
                output_dir=GENERATED_DATASETS_DIR,
                jobs=8 if jobs is None else int(jobs),
            )
            mock_create_tool.reset_mock()


def test_evaluate(capsys):
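    """Check argument parsing and dispatch for the `evaluate` subcommand."""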
    # Helper function to patch `sys.argv`
    def patch_args(
        evals: list[str] | None = None,
        dataset: str | None = None,
        jobs: str | None = None,
        *,
        canonical: bool = False,
    ):
        test_args = ["ada-eval", "evaluate"]
        if canonical:
            test_args.append("--canonical")
        if evals is not None:
            test_args += ["--evals", *evals]
        if dataset is not None:
            test_args += ["--dataset", dataset]
        if jobs is not None:
            test_args += ["--jobs", jobs]
        return patch.object(sys, "argv", test_args)

    # Mock the `evaluate_directory()` and `cpu_count()` functions
    mock_evaluate_directory = Mock()
    mock_cpu_count = Mock(return_value=8)
    eval_dir_patch = patch("ada_eval.cli.evaluate_directory", mock_evaluate_directory)
    cpu_count_patch = patch("ada_eval.cli.cpu_count", mock_cpu_count)

    with eval_dir_patch, cpu_count_patch:
        # Test with an invalid eval name
        with patch_args(["invalid_eval"]), pytest.raises(SystemExit):
            main()
        output = capsys.readouterr()
        assert "argument --evals: invalid Eval value: 'invalid_eval'" in output.err
        assert output.out == ""
        mock_evaluate_directory.assert_not_called()

        # Test with various valid argument combinations
        for evals, dataset, jobs, canonical in itertools.product(
            [None, ["PrOvE", "build"], ["prove"]],
            [None, "path/to/dataset"],
            [None, "2", "4"],
            [False, True],
        ):
            with patch_args(evals, dataset, jobs, canonical=canonical):
                main()
            output = capsys.readouterr()
            assert output.err == ""
            assert output.out == ""
            expected_evals = (
                [Eval.BUILD, Eval.PROVE, Eval.TEST]
                if evals is None
                else ([Eval.PROVE, Eval.BUILD] if "build" in evals else [Eval.PROVE])
            )
            if canonical:
                expected_dataset_path = (
                    EXPANDED_DATASETS_DIR if dataset is None else Path(dataset)
                )
                expected_output_dir = expected_dataset_path
            else:
                expected_dataset_path = (
                    GENERATED_DATASETS_DIR if dataset is None else Path(dataset)
                )
                expected_output_dir = EVALUATED_DATASETS_DIR
            mock_evaluate_directory.assert_called_once_with(
                evals=expected_evals,
                path=expected_dataset_path,
                output_dir=expected_output_dir,
                jobs=8 if jobs is None else int(jobs),
                canonical_evaluation=canonical,
            )
            mock_evaluate_directory.reset_mock()


def test_check_datasets(capsys):
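    """Check argument parsing and dispatch for the `check-datasets` subcommand."""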
    # Helper function to patch `sys.argv`
    def patch_args(datasets: list[str] | None = None, jobs: str | None = None):
        test_args = ["ada-eval", "check-datasets"]
        if datasets is not None:
            test_args += ["--datasets", *datasets]
        if jobs is not None:
            test_args += ["--jobs", jobs]
        return patch.object(sys, "argv", test_args)

    # Mock the `check_base_datasets()` and `cpu_count()` functions
    mock_check_base_datasets = Mock()
    mock_cpu_count = Mock(return_value=8)
    check_base_datasets_patch = patch(
        "ada_eval.cli.check_base_datasets", mock_check_base_datasets
    )
    cpu_count_patch = patch("ada_eval.cli.cpu_count", mock_cpu_count)

    with check_base_datasets_patch, cpu_count_patch:
        # Test with empty `--datasets`
        with patch_args([]), pytest.raises(SystemExit):
            main()
        output = capsys.readouterr()
        assert "argument --datasets: expected at least one argument" in output.err
        assert output.out == ""
        mock_check_base_datasets.assert_not_called()

        # Test with various valid argument combinations
        for datasets, jobs in itertools.product(
            [None, ["path/to/dataset"], ["path/to/dataset1", "path/to/dataset2"]],
            [None, "2", "4"],
        ):
            with patch_args(datasets, jobs):
                main()
            output = capsys.readouterr()
            assert output.err == ""
            assert output.out == ""
            mock_check_base_datasets.assert_called_once_with(
                dataset_dirs=(
                    [EXPANDED_DATASETS_DIR, COMPACTED_DATASETS_DIR]
                    if datasets is None
                    else [Path(d) for d in datasets]
                ),
                jobs=8 if jobs is None else int(jobs),
            )
            mock_check_base_datasets.reset_mock()


def test_report(capsys: pytest.CaptureFixture[str]):
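    """Check argument parsing and dispatch for the `report` subcommand."""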
    # Helper function to patch `sys.argv`
    def patch_args(  # noqa: PLR0913
        dataset_dirs: list[str] | None = None,
        datasets: set[str] | None = None,
        dataset_kinds: set[str] | None = None,
        samples: set[str] | None = None,
        with_metric: list[list[str]] | None = None,
        *,
        list_samples: bool = False,
    ):
        test_args = ["ada-eval", "report"]
        if dataset_dirs is not None:
            test_args += ["--dataset-dirs", *dataset_dirs]
        if datasets is not None:
            test_args += ["--datasets", *datasets]
        if dataset_kinds is not None:
            test_args += ["--dataset-kinds", *dataset_kinds]
        if samples is not None:
            test_args += ["--samples", *samples]
        if with_metric is not None:
            for metric_path in with_metric:
                test_args += ["--with-metric", *metric_path]
        if list_samples:
            test_args.append("--list-samples")
        return patch.object(sys, "argv", test_args)

    # Mock the `report_evaluation_results()` function
    mock_report_evaluation_results = Mock()
    report_patch = patch(
        "ada_eval.cli.report_evaluation_results", mock_report_evaluation_results
    )

    with report_patch:
        # Test with an invalid dataset kind
        with patch_args(dataset_kinds={"invalid"}), pytest.raises(SystemExit):
            main()
        output = capsys.readouterr()
        assert (
            "argument --dataset-kinds: invalid SampleKind value: 'invalid'"
            in output.err
        )
        assert output.out == ""
        mock_report_evaluation_results.assert_not_called()

        # Test with various valid argument combinations
        for (
            dataset_dirs,
            datasets,
            dataset_kinds,
            samples,
            with_metric,
            list_samples,
        ) in itertools.product(
            [None, ["path/to/dataset_dir"], ["dir1", "dir2"]],
            [None, {"dataset1"}, {"dataset1", "dataset2"}],
            [None, {"ada"}, {"EXPLAIN", "sPaRk"}],
            [None, {"sample1"}, {"sample1", "sample2"}],
            [None, [["metric0"]], [["metric1", "submetric0"], ["metric2"]]],
            [False, True],
        ):
            with patch_args(
                dataset_dirs=dataset_dirs,
                datasets=datasets,
                dataset_kinds=dataset_kinds,
                samples=samples,
                with_metric=with_metric,
                list_samples=list_samples,
            ):
                main()
            output = capsys.readouterr()
            assert output.err == ""
            assert output.out == ""
            expected_dataset_dirs = (
                [EVALUATED_DATASETS_DIR]
                if dataset_dirs is None
                else [Path(d) for d in dataset_dirs]
            )
            expected_dataset_kinds = (
                None
                if dataset_kinds is None
                else {kind.lower() for kind in dataset_kinds}
            )
            mock_report_evaluation_results.assert_called_once_with(
                dataset_dirs=expected_dataset_dirs,
                datasets_filter=datasets,
                dataset_kinds_filter=expected_dataset_kinds,
                samples_filter=samples,
                metrics_filter=with_metric,
                list_samples=list_samples,
            )
            mock_report_evaluation_results.reset_mock()