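"""Tests for the `ada-eval` command-line interface (`ada_eval.cli.main`)."""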
import itertools
import sys
from pathlib import Path
from unittest.mock import Mock, patch

import pytest

from ada_eval.cli import main
from ada_eval.datasets import Eval
from ada_eval.paths import (
    COMPACTED_DATASETS_DIR,
    EVALUATED_DATASETS_DIR,
    EXPANDED_DATASETS_DIR,
    GENERATED_DATASETS_DIR,
)
from ada_eval.tools.factory import Tool


def test_no_args(capsys):
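    """`ada-eval` with no subcommand exits with an argparse usage error."""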
    with patch.object(sys, "argv", ["ada-eval"]), pytest.raises(SystemExit):
        main()
    output = capsys.readouterr()
    assert "error: the following arguments are required: {" in output.err
    assert output.out == ""


def test_generate(capsys):
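    """`generate` validates its arguments and drives the mocked tool factory."""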
    # Helper function to patch `sys.argv`
    def patch_args(
        tool: str | None = None,
        tool_config_file: str | None = None,
        dataset: str | None = None,
        jobs: str | None = None,
    ):
        test_args = ["ada-eval", "generate"]
        if tool is not None:
            test_args += ["--tool", tool]
        if tool_config_file is not None:
            test_args += ["--tool-config-file", tool_config_file]
        if dataset is not None:
            test_args += ["--dataset", dataset]
        if jobs is not None:
            test_args += ["--jobs", jobs]
        return patch.object(sys, "argv", test_args)

    # Mock the tool factory
    mock_tool = Mock()
    mock_create_tool = Mock(return_value=mock_tool)
    with patch("ada_eval.cli.create_tool", mock_create_tool):
        # Test with no arguments (should complain about missing `--tool` and
        # `--tool-config-file`)
        with patch_args(), pytest.raises(SystemExit):
            main()
        output = capsys.readouterr()
        assert (
            "error: the following arguments are required: --tool, --tool-config-file"
            in output.err
        )
        assert output.out == ""
        mock_create_tool.assert_not_called()
        mock_tool.apply_to_directory.assert_not_called()

        # Test with an invalid tool name
        with patch_args("invalid_tool", "path/to/config"), pytest.raises(SystemExit):
            main()
        output = capsys.readouterr()
        assert "argument --tool: invalid Tool value: 'invalid_tool'" in output.err
        assert output.out == ""
        mock_create_tool.assert_not_called()
        mock_tool.apply_to_directory.assert_not_called()

        # Test with various valid argument combinations
        mock_cpu_count = Mock(return_value=8)
        cpu_count_patch = patch("ada_eval.cli.cpu_count", mock_cpu_count)
        for tool, dataset, jobs in itertools.product(
            ["shell_script", "SHELL_SCRIPT", "ShElL_ScRiPt"],
            [None, "path/to/dataset"],
            [None, "2", "4"],
        ):
            with patch_args(tool, "path/to/config", dataset, jobs), cpu_count_patch:
                main()
            output = capsys.readouterr()
            assert output.err == ""
            assert output.out == ""
            mock_create_tool.assert_called_once_with(
                Tool.SHELL_SCRIPT, Path("path/to/config")
            )
            dataset_path = COMPACTED_DATASETS_DIR if dataset is None else Path(dataset)
            mock_tool.apply_to_directory.assert_called_once_with(
                path=dataset_path,
                output_dir=GENERATED_DATASETS_DIR,
                jobs=8 if jobs is None else int(jobs),
            )
            mock_create_tool.reset_mock()


def test_evaluate(capsys):
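    """`evaluate` validates `--evals` and calls `evaluate_directory()`."""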
    # Helper function to patch `sys.argv`
    def patch_args(
        evals: list[str] | None = None,
        dataset: str | None = None,
        jobs: str | None = None,
        *,
        canonical: bool = False,
    ):
        test_args = ["ada-eval", "evaluate"]
        if canonical:
            test_args.append("--canonical")
        if evals is not None:
            test_args += ["--evals", *evals]
        if dataset is not None:
            test_args += ["--dataset", dataset]
        if jobs is not None:
            test_args += ["--jobs", jobs]
        return patch.object(sys, "argv", test_args)

    # Mock the `evaluate_directory()` and `cpu_count()` functions
    mock_evaluate_directory = Mock()
    mock_cpu_count = Mock(return_value=8)
    eval_dir_patch = patch("ada_eval.cli.evaluate_directory", mock_evaluate_directory)
    cpu_count_patch = patch("ada_eval.cli.cpu_count", mock_cpu_count)
    with eval_dir_patch, cpu_count_patch:
        # Test with an invalid eval name
        with patch_args(["invalid_eval"]), pytest.raises(SystemExit):
            main()
        output = capsys.readouterr()
        assert "argument --evals: invalid Eval value: 'invalid_eval'" in output.err
        assert output.out == ""
        mock_evaluate_directory.assert_not_called()

        # Test with various valid argument combinations
        for evals, dataset, jobs, canonical in itertools.product(
            [None, ["PrOvE", "build"], ["prove"]],
            [None, "path/to/dataset"],
            [None, "2", "4"],
            [False, True],
        ):
            with patch_args(evals, dataset, jobs, canonical=canonical):
                main()
            output = capsys.readouterr()
            assert output.err == ""
            assert output.out == ""
            expected_evals = (
                [Eval.BUILD, Eval.PROVE, Eval.TEST]
                if evals is None
                else ([Eval.PROVE, Eval.BUILD] if "build" in evals else [Eval.PROVE])
            )
            if canonical:
                expected_dataset_path = (
                    EXPANDED_DATASETS_DIR if dataset is None else Path(dataset)
                )
                expected_output_dir = expected_dataset_path
            else:
                expected_dataset_path = (
                    GENERATED_DATASETS_DIR if dataset is None else Path(dataset)
                )
                expected_output_dir = EVALUATED_DATASETS_DIR
            mock_evaluate_directory.assert_called_once_with(
                evals=expected_evals,
                path=expected_dataset_path,
                output_dir=expected_output_dir,
                jobs=8 if jobs is None else int(jobs),
                canonical_evaluation=canonical,
            )
            mock_evaluate_directory.reset_mock()


def test_check_datasets(capsys):
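    """`check-datasets` forwards its arguments to `check_base_datasets()`."""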
    # Helper function to patch `sys.argv`
    def patch_args(datasets: list[str] | None = None, jobs: str | None = None):
        test_args = ["ada-eval", "check-datasets"]
        if datasets is not None:
            test_args += ["--datasets", *datasets]
        if jobs is not None:
            test_args += ["--jobs", jobs]
        return patch.object(sys, "argv", test_args)

    # Mock the `check_base_datasets()` and `cpu_count()` functions
    mock_check_base_datasets = Mock()
    mock_cpu_count = Mock(return_value=8)
    check_base_datasets_patch = patch(
        "ada_eval.cli.check_base_datasets", mock_check_base_datasets
    )
    cpu_count_patch = patch("ada_eval.cli.cpu_count", mock_cpu_count)
    with check_base_datasets_patch, cpu_count_patch:
        # Test with empty `--datasets`
        with patch_args([]), pytest.raises(SystemExit):
            main()
        output = capsys.readouterr()
        assert "argument --datasets: expected at least one argument" in output.err
        assert output.out == ""
        mock_check_base_datasets.assert_not_called()

        # Test with various valid argument combinations
        for datasets, jobs in itertools.product(
            [None, ["path/to/dataset"], ["path/to/dataset1", "path/to/dataset2"]],
            [None, "2", "4"],
        ):
            with patch_args(datasets, jobs):
                main()
            output = capsys.readouterr()
            assert output.err == ""
            assert output.out == ""
            mock_check_base_datasets.assert_called_once_with(
                dataset_dirs=(
                    [EXPANDED_DATASETS_DIR, COMPACTED_DATASETS_DIR]
                    if datasets is None
                    else [Path(d) for d in datasets]
                ),
                jobs=8 if jobs is None else int(jobs),
            )
            mock_check_base_datasets.reset_mock()


def test_report(capsys: pytest.CaptureFixture[str]):
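    """`report` validates its filters and calls `report_evaluation_results()`."""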
    # Helper function to patch `sys.argv`
    def patch_args(  # noqa: PLR0913
        dataset_dirs: list[str] | None = None,
        datasets: set[str] | None = None,
        dataset_kinds: set[str] | None = None,
        samples: set[str] | None = None,
        with_metric: list[list[str]] | None = None,
        *,
        list_samples: bool = False,
    ):
        test_args = ["ada-eval", "report"]
        if dataset_dirs is not None:
            test_args += ["--dataset-dirs", *dataset_dirs]
        if datasets is not None:
            test_args += ["--datasets", *datasets]
        if dataset_kinds is not None:
            test_args += ["--dataset-kinds", *dataset_kinds]
        if samples is not None:
            test_args += ["--samples", *samples]
        if with_metric is not None:
            for metric_path in with_metric:
                test_args += ["--with-metric", *metric_path]
        if list_samples:
            test_args.append("--list-samples")
        return patch.object(sys, "argv", test_args)

    # Mock the `report_evaluation_results()` function
    mock_report_evaluation_results = Mock()
    report_patch = patch(
        "ada_eval.cli.report_evaluation_results", mock_report_evaluation_results
    )
    with report_patch:
        # Test with an invalid dataset kind
        with patch_args(dataset_kinds={"invalid"}), pytest.raises(SystemExit):
            main()
        output = capsys.readouterr()
        assert (
            "argument --dataset-kinds: invalid SampleKind value: 'invalid'"
            in output.err
        )
        assert output.out == ""
        mock_report_evaluation_results.assert_not_called()

        # Test with various valid argument combinations
        for (
            dataset_dirs,
            datasets,
            dataset_kinds,
            samples,
            with_metric,
            list_samples,
        ) in itertools.product(
            [None, ["path/to/dataset_dir"], ["dir1", "dir2"]],
            [None, {"dataset1"}, {"dataset1", "dataset2"}],
            [None, {"ada"}, {"EXPLAIN", "sPaRk"}],
            [None, {"sample1"}, {"sample1", "sample2"}],
            [None, [["metric0"]], [["metric1", "submetric0"], ["metric2"]]],
            [False, True],
        ):
            with patch_args(
                dataset_dirs=dataset_dirs,
                datasets=datasets,
                dataset_kinds=dataset_kinds,
                samples=samples,
                with_metric=with_metric,
                list_samples=list_samples,
            ):
                main()
            output = capsys.readouterr()
            assert output.err == ""
            assert output.out == ""
            expected_dataset_dirs = (
                [EVALUATED_DATASETS_DIR]
                if dataset_dirs is None
                else [Path(d) for d in dataset_dirs]
            )
            expected_dataset_kinds = (
                None
                if dataset_kinds is None
                else {kind.lower() for kind in dataset_kinds}
            )
            mock_report_evaluation_results.assert_called_once_with(
                dataset_dirs=expected_dataset_dirs,
                datasets_filter=datasets,
                dataset_kinds_filter=expected_dataset_kinds,
                samples_filter=samples,
                metrics_filter=with_metric,
                list_samples=list_samples,
            )
            mock_report_evaluation_results.reset_mock()