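"""Tests for the `ada-eval` command-line interface (`ada_eval.cli.main`)."""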
import itertools
import sys
from pathlib import Path
from unittest.mock import Mock, patch

import pytest

from ada_eval.cli import main
from ada_eval.datasets import Eval
from ada_eval.paths import (
    COMPACTED_DATASETS_DIR,
    EVALUATED_DATASETS_DIR,
    EXPANDED_DATASETS_DIR,
    GENERATED_DATASETS_DIR,
)
from ada_eval.tools.factory import Tool


def test_no_args(capsys):
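    """`ada-eval` with no subcommand exits with an argparse usage error."""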
    with patch.object(sys, "argv", ["ada-eval"]), pytest.raises(SystemExit):
        main()
    output = capsys.readouterr()
    assert "error: the following arguments are required: {" in output.err
    assert output.out == ""


def test_generate(capsys):
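    """`generate` validates its arguments and drives the mocked tool factory."""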
    # Helper function to patch `sys.argv`
    def patch_args(
        tool: str | None = None,
        tool_config_file: str | None = None,
        dataset: str | None = None,
        jobs: str | None = None,
    ):
        test_args = ["ada-eval", "generate"]
        if tool is not None:
            test_args += ["--tool", tool]
        if tool_config_file is not None:
            test_args += ["--tool-config-file", tool_config_file]
        if dataset is not None:
            test_args += ["--dataset", dataset]
        if jobs is not None:
            test_args += ["--jobs", jobs]
        return patch.object(sys, "argv", test_args)

    # Mock the tool factory
    mock_tool = Mock()
    mock_create_tool = Mock(return_value=mock_tool)
    with patch("ada_eval.cli.create_tool", mock_create_tool):
        # Test with no arguments (should complain about missing `--tool` and
        # `--tool-config-file`)
        with patch_args(), pytest.raises(SystemExit):
            main()
        output = capsys.readouterr()
        assert (
            "error: the following arguments are required: --tool, --tool-config-file"
            in output.err
        )
        assert output.out == ""
        mock_create_tool.assert_not_called()
        mock_tool.apply_to_directory.assert_not_called()

        # Test with an invalid tool name
        with patch_args("invalid_tool", "path/to/config"), pytest.raises(SystemExit):
            main()
        output = capsys.readouterr()
        assert "argument --tool: invalid Tool value: 'invalid_tool'" in output.err
        assert output.out == ""
        mock_create_tool.assert_not_called()
        mock_tool.apply_to_directory.assert_not_called()

        # Test with various valid argument combinations
        mock_cpu_count = Mock(return_value=8)
        cpu_count_patch = patch("ada_eval.cli.cpu_count", mock_cpu_count)
        for tool, dataset, jobs in itertools.product(
            ["shell_script", "SHELL_SCRIPT", "ShElL_ScRiPt"],
            [None, "path/to/dataset"],
            [None, "2", "4"],
        ):
            with patch_args(tool, "path/to/config", dataset, jobs), cpu_count_patch:
                main()
            output = capsys.readouterr()
            assert output.err == ""
            assert output.out == ""
            mock_create_tool.assert_called_once_with(
                Tool.SHELL_SCRIPT, Path("path/to/config")
            )
            dataset_path = COMPACTED_DATASETS_DIR if dataset is None else Path(dataset)
            mock_tool.apply_to_directory.assert_called_once_with(
                path=dataset_path,
                output_dir=GENERATED_DATASETS_DIR,
                jobs=8 if jobs is None else int(jobs),
            )
            mock_create_tool.reset_mock()


def test_evaluate(capsys):
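    """`evaluate` validates `--evals` and calls `evaluate_directory()`."""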
    # Helper function to patch `sys.argv`
    def patch_args(
        evals: list[str] | None = None,
        dataset: str | None = None,
        jobs: str | None = None,
        *,
        canonical: bool = False,
    ):
        test_args = ["ada-eval", "evaluate"]
        if canonical:
            test_args.append("--canonical")
        if evals is not None:
            test_args += ["--evals", *evals]
        if dataset is not None:
            test_args += ["--dataset", dataset]
        if jobs is not None:
            test_args += ["--jobs", jobs]
        return patch.object(sys, "argv", test_args)

    # Mock the `evaluate_directory()` and `cpu_count()` functions
    mock_evaluate_directory = Mock()
    mock_cpu_count = Mock(return_value=8)
    eval_dir_patch = patch("ada_eval.cli.evaluate_directory", mock_evaluate_directory)
    cpu_count_patch = patch("ada_eval.cli.cpu_count", mock_cpu_count)
    with eval_dir_patch, cpu_count_patch:
        # Test with an invalid eval name
        with patch_args(["invalid_eval"]), pytest.raises(SystemExit):
            main()
        output = capsys.readouterr()
        assert "argument --evals: invalid Eval value: 'invalid_eval'" in output.err
        assert output.out == ""
        mock_evaluate_directory.assert_not_called()

        # Test with various valid argument combinations
        for evals, dataset, jobs, canonical in itertools.product(
            [None, ["PrOvE", "build"], ["prove"]],
            [None, "path/to/dataset"],
            [None, "2", "4"],
            [False, True],
        ):
            with patch_args(evals, dataset, jobs, canonical=canonical):
                main()
            output = capsys.readouterr()
            assert output.err == ""
            assert output.out == ""
            expected_evals = (
                [Eval.BUILD, Eval.PROVE, Eval.TEST]
                if evals is None
                else ([Eval.PROVE, Eval.BUILD] if "build" in evals else [Eval.PROVE])
            )
            if canonical:
                expected_dataset_path = (
                    EXPANDED_DATASETS_DIR if dataset is None else Path(dataset)
                )
                expected_output_dir = expected_dataset_path
            else:
                expected_dataset_path = (
                    GENERATED_DATASETS_DIR if dataset is None else Path(dataset)
                )
                expected_output_dir = EVALUATED_DATASETS_DIR
            mock_evaluate_directory.assert_called_once_with(
                evals=expected_evals,
                path=expected_dataset_path,
                output_dir=expected_output_dir,
                jobs=8 if jobs is None else int(jobs),
                canonical_evaluation=canonical,
            )
            mock_evaluate_directory.reset_mock()


def test_check_datasets(capsys):
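    """`check-datasets` forwards its arguments to `check_base_datasets()`."""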
    # Helper function to patch `sys.argv`
    def patch_args(datasets: list[str] | None = None, jobs: str | None = None):
        test_args = ["ada-eval", "check-datasets"]
        if datasets is not None:
            test_args += ["--datasets", *datasets]
        if jobs is not None:
            test_args += ["--jobs", jobs]
        return patch.object(sys, "argv", test_args)

    # Mock the `check_base_datasets()` and `cpu_count()` functions
    mock_check_base_datasets = Mock()
    mock_cpu_count = Mock(return_value=8)
    check_base_datasets_patch = patch(
        "ada_eval.cli.check_base_datasets", mock_check_base_datasets
    )
    cpu_count_patch = patch("ada_eval.cli.cpu_count", mock_cpu_count)
    with check_base_datasets_patch, cpu_count_patch:
        # Test with empty `--datasets`
        with patch_args([]), pytest.raises(SystemExit):
            main()
        output = capsys.readouterr()
        assert "argument --datasets: expected at least one argument" in output.err
        assert output.out == ""
        mock_check_base_datasets.assert_not_called()

        # Test with various valid argument combinations
        for datasets, jobs in itertools.product(
            [None, ["path/to/dataset"], ["path/to/dataset1", "path/to/dataset2"]],
            [None, "2", "4"],
        ):
            with patch_args(datasets, jobs):
                main()
            output = capsys.readouterr()
            assert output.err == ""
            assert output.out == ""
            mock_check_base_datasets.assert_called_once_with(
                dataset_dirs=(
                    [EXPANDED_DATASETS_DIR, COMPACTED_DATASETS_DIR]
                    if datasets is None
                    else [Path(d) for d in datasets]
                ),
                jobs=8 if jobs is None else int(jobs),
            )
            mock_check_base_datasets.reset_mock()


def test_report(capsys: pytest.CaptureFixture[str]):
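    """`report` validates its filters and calls `report_evaluation_results()`."""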
    # Helper function to patch `sys.argv`
    def patch_args(  # noqa: PLR0913
        dataset_dirs: list[str] | None = None,
        datasets: set[str] | None = None,
        dataset_kinds: set[str] | None = None,
        samples: set[str] | None = None,
        with_metric: list[list[str]] | None = None,
        *,
        list_samples: bool = False,
    ):
        test_args = ["ada-eval", "report"]
        if dataset_dirs is not None:
            test_args += ["--dataset-dirs", *dataset_dirs]
        if datasets is not None:
            test_args += ["--datasets", *datasets]
        if dataset_kinds is not None:
            test_args += ["--dataset-kinds", *dataset_kinds]
        if samples is not None:
            test_args += ["--samples", *samples]
        if with_metric is not None:
            for metric_path in with_metric:
                test_args += ["--with-metric", *metric_path]
        if list_samples:
            test_args.append("--list-samples")
        return patch.object(sys, "argv", test_args)

    # Mock the `report_evaluation_results()` function
    mock_report_evaluation_results = Mock()
    report_patch = patch(
        "ada_eval.cli.report_evaluation_results", mock_report_evaluation_results
    )
    with report_patch:
        # Test with an invalid dataset kind
        with patch_args(dataset_kinds={"invalid"}), pytest.raises(SystemExit):
            main()
        output = capsys.readouterr()
        assert (
            "argument --dataset-kinds: invalid SampleKind value: 'invalid'"
            in output.err
        )
        assert output.out == ""
        mock_report_evaluation_results.assert_not_called()

        # Test with various valid argument combinations
        for (
            dataset_dirs,
            datasets,
            dataset_kinds,
            samples,
            with_metric,
            list_samples,
        ) in itertools.product(
            [None, ["path/to/dataset_dir"], ["dir1", "dir2"]],
            [None, {"dataset1"}, {"dataset1", "dataset2"}],
            [None, {"ada"}, {"EXPLAIN", "sPaRk"}],
            [None, {"sample1"}, {"sample1", "sample2"}],
            [None, [["metric0"]], [["metric1", "submetric0"], ["metric2"]]],
            [False, True],
        ):
            with patch_args(
                dataset_dirs=dataset_dirs,
                datasets=datasets,
                dataset_kinds=dataset_kinds,
                samples=samples,
                with_metric=with_metric,
                list_samples=list_samples,
            ):
                main()
            output = capsys.readouterr()
            assert output.err == ""
            assert output.out == ""
            expected_dataset_dirs = (
                [EVALUATED_DATASETS_DIR]
                if dataset_dirs is None
                else [Path(d) for d in dataset_dirs]
            )
            expected_dataset_kinds = (
                None
                if dataset_kinds is None
                else {kind.lower() for kind in dataset_kinds}
            )
            mock_report_evaluation_results.assert_called_once_with(
                dataset_dirs=expected_dataset_dirs,
                datasets_filter=datasets,
                dataset_kinds_filter=expected_dataset_kinds,
                samples_filter=samples,
                metrics_filter=with_metric,
                list_samples=list_samples,
            )
            mock_report_evaluation_results.reset_mock()