diff --git a/README.md b/README.md index 4ef98c5..922165c 100644 --- a/README.md +++ b/README.md @@ -50,6 +50,7 @@ The configuration file is a yaml file that looks like this: # - local: A dependency that is defined in the config file. These dependencies do not have a remote field and must correspond to either a service defined in the 'services' section or a program defined in the 'x-programs' section. # - remote: A dependency that is defined in the devservices directory in a remote repository. These configs are automatically fetched from the remote repository and installed. Any dependency with a remote field will be treated as a remote dependency. Example: https://github.com/getsentry/snuba/blob/59a5258ccbb502827ebc1d3b1bf80c607a3301bf/devservices/config.yml#L8 # - modes: A list of modes for the service. Each mode includes a list of dependencies that are used in that mode. +# - healthcheck_timeout: Optional. The number of seconds to wait for all containers to become healthy before failing. Defaults to 180. x-sentry-service-config: version: 0.1 service_name: example-service diff --git a/devservices/commands/up.py b/devservices/commands/up.py index 90864e9..8143ea9 100644 --- a/devservices/commands/up.py +++ b/devservices/commands/up.py @@ -390,7 +390,9 @@ def bring_up_docker_compose_services( ) exit(1) try: - check_all_containers_healthy(status, containers_to_check) + check_all_containers_healthy( + status, containers_to_check, timeout=service.config.healthcheck_timeout + ) except ContainerHealthcheckFailedError as e: status.failure(str(e)) exit(1) diff --git a/devservices/configs/service_config.py b/devservices/configs/service_config.py index 5c69e45..80847cc 100644 --- a/devservices/configs/service_config.py +++ b/devservices/configs/service_config.py @@ -9,6 +9,7 @@ from devservices.constants import CONFIG_FILE_NAME from devservices.constants import DEVSERVICES_DIR_NAME +from devservices.constants import HEALTHCHECK_TIMEOUT from devservices.constants import DependencyType from devservices.exceptions import ConfigNotFoundError from devservices.exceptions import ConfigParseError @@ -40,6 +41,7 @@ class ServiceConfig: service_name: str dependencies: dict[str, Dependency] modes: dict[str, list[str]] + healthcheck_timeout: int = HEALTHCHECK_TIMEOUT def __post_init__(self) -> None: self._validate() @@ -59,6 +61,14 @@ def _validate(self) -> None: if "default" not in self.modes: raise ConfigValidationError("Default mode is required in service config") + if isinstance(self.healthcheck_timeout, bool) or ( + not isinstance(self.healthcheck_timeout, int) + or self.healthcheck_timeout <= 0 + ): + raise ConfigValidationError( + "healthcheck_timeout must be a positive integer" + ) + for mode, services in self.modes.items(): if not isinstance(services, list): raise ConfigValidationError(f"Services in mode '{mode}' must be a list") @@ -142,6 +152,9 @@ def load_service_config_from_file( service_name=service_config_data.get("service_name"), dependencies=dependencies, modes=service_config_data.get("modes", {}), + healthcheck_timeout=service_config_data.get("healthcheck_timeout") + if service_config_data.get("healthcheck_timeout") is not None + else HEALTHCHECK_TIMEOUT, ) return service_config diff --git a/devservices/constants.py b/devservices/constants.py index 5b893c4..b578329 100644 --- a/devservices/constants.py +++ b/devservices/constants.py @@ -65,7 +65,7 @@ class DependencyType(StrEnum): DEVSERVICES_CACHE_DIR, "latest_version.txt" ) DEVSERVICES_LATEST_VERSION_CACHE_TTL = timedelta(minutes=15) -# Healthcheck timeout set to 2 minutes to account for slow healthchecks -HEALTHCHECK_TIMEOUT = 120 +# Healthcheck timeout set to 3 minutes to account for slow healthchecks +HEALTHCHECK_TIMEOUT = 180 HEALTHCHECK_INTERVAL = 5 SUPERVISOR_TIMEOUT = 10 diff --git a/devservices/utils/docker.py b/devservices/utils/docker.py index 6992dfd..7eb08c4 100644 --- a/devservices/utils/docker.py +++ b/devservices/utils/docker.py @@ -43,25 +43,34 @@ def check_docker_daemon_running() -> None: def check_all_containers_healthy( - status: Status, containers: list[ContainerNames] + status: Status, + containers: list[ContainerNames], + timeout: int = HEALTHCHECK_TIMEOUT, ) -> None: """Ensures all containers are healthy.""" status.info("Waiting for all containers to be healthy") with concurrent.futures.ThreadPoolExecutor() as healthcheck_executor: futures = [ - healthcheck_executor.submit(wait_for_healthy, container, status) + healthcheck_executor.submit(wait_for_healthy, container, status, timeout) for container in containers ] for future in concurrent.futures.as_completed(futures): future.result() -def wait_for_healthy(container: ContainerNames, status: Status) -> None: +def wait_for_healthy( + container: ContainerNames, + status: Status, + timeout: int = HEALTHCHECK_TIMEOUT, +) -> None: """ Polls a Docker container's health status until it becomes healthy or a timeout is reached. """ + status.info( + f"Waiting for {container.short_name} to be healthy (timeout: {timeout}s)" + ) start = time.time() - while time.time() - start < HEALTHCHECK_TIMEOUT: + while time.time() - start < timeout: # Run docker inspect to get the container's health status try: # For containers with no healthchecks, the output will be "unknown" @@ -96,7 +105,7 @@ def wait_for_healthy(container: ContainerNames, status: Status) -> None: # If not healthy, wait and try again time.sleep(HEALTHCHECK_INTERVAL) - raise ContainerHealthcheckFailedError(container.short_name, HEALTHCHECK_TIMEOUT) + raise ContainerHealthcheckFailedError(container.short_name, timeout) @dataclass diff --git a/tests/commands/test_up.py b/tests/commands/test_up.py index 8c5f870..7c16bae 100644 --- a/tests/commands/test_up.py +++ b/tests/commands/test_up.py @@ -865,7 +865,7 @@ def test_up_docker_compose_container_healthcheck_failed( assert "Starting clickhouse" in captured.out.strip() assert "Starting redis" in captured.out.strip() assert ( - "Container container1 did not become healthy within 120 seconds." + "Container container1 did not become healthy within 180 seconds." in captured.out.strip() ) @@ -1376,6 +1376,7 @@ def test_up_multiple_modes_overlapping_running_service( mock_check_all_containers_healthy.assert_called_once_with( mock.ANY, ["container1", "container2"], + timeout=mock.ANY, ) captured = capsys.readouterr() diff --git a/tests/configs/test_service_config.py b/tests/configs/test_service_config.py index e25fc89..7823627 100644 --- a/tests/configs/test_service_config.py +++ b/tests/configs/test_service_config.py @@ -111,6 +111,7 @@ def test_load_service_config_from_file( for key, value in dependencies.items() }, "modes": modes, + "healthcheck_timeout": 180, } @@ -131,9 +132,49 @@ def test_load_service_config_from_file_no_dependencies(tmp_path: Path) -> None: "service_name": "example-service", "dependencies": {}, "modes": {"default": []}, + "healthcheck_timeout": 180, } +def test_load_service_config_from_file_null_healthcheck_timeout( + tmp_path: Path, +) -> None: + config = { + "x-sentry-service-config": { + "version": 0.1, + "service_name": "example-service", + "modes": {"default": []}, + "healthcheck_timeout": None, + }, + "services": {}, + } + create_config_file(tmp_path, config) + + service_config = load_service_config_from_file(str(tmp_path)) + assert service_config.healthcheck_timeout == 180 + + +@pytest.mark.parametrize("invalid_timeout", [-1, 0, "120", True, False]) +def test_load_service_config_from_file_invalid_healthcheck_timeout( + tmp_path: Path, + invalid_timeout: object, +) -> None: + config = { + "x-sentry-service-config": { + "version": 0.1, + "service_name": "example-service", + "modes": {"default": []}, + "healthcheck_timeout": invalid_timeout, + }, + "services": {}, + } + create_config_file(tmp_path, config) + + with pytest.raises(ConfigValidationError) as e: + load_service_config_from_file(str(tmp_path)) + assert str(e.value) == "healthcheck_timeout must be a positive integer" + + def test_load_service_config_from_file_missing_config(tmp_path: Path) -> None: with pytest.raises(ConfigNotFoundError) as e: load_service_config_from_file(str(tmp_path)) diff --git a/tests/utils/test_docker.py b/tests/utils/test_docker.py index 938a288..4c799cb 100644 --- a/tests/utils/test_docker.py +++ b/tests/utils/test_docker.py @@ -533,10 +533,12 @@ def test_check_all_containers_healthy_success( mock.call( ContainerNames(name="devservices-container1", short_name="container1"), mock_status, + 180, ), mock.call( ContainerNames(name="devservices-container2", short_name="container2"), mock_status, + 180, ), ] ) @@ -671,10 +673,12 @@ def test_check_all_containers_healthy_failure( mock.call( ContainerNames(name="devservices-container1", short_name="container1"), mock_status, + 180, ), mock.call( ContainerNames(name="devservices-container2", short_name="container2"), mock_status, + 180, ), ] )