From 06815fe177e5980ef049ec859c3fc8d26a28290d Mon Sep 17 00:00:00 2001 From: "J. Nick Koston" Date: Mon, 24 Nov 2025 10:41:24 -0600 Subject: [PATCH] [script][wait_until] Fix FIFO ordering and reentrancy bugs (#12049) Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- esphome/components/script/script.h | 20 +-- esphome/core/base_automation.h | 9 +- .../fixtures/script_delay_with_params.yaml | 131 ++++++++++++++++++ .../fixtures/wait_until_fifo_ordering.yaml | 82 +++++++++++ tests/integration/test_script_delay_params.py | 121 ++++++++++++++++ tests/integration/test_wait_until_ordering.py | 90 ++++++++++++ 6 files changed, 441 insertions(+), 12 deletions(-) create mode 100644 tests/integration/fixtures/script_delay_with_params.yaml create mode 100644 tests/integration/fixtures/wait_until_fifo_ordering.yaml create mode 100644 tests/integration/test_script_delay_params.py create mode 100644 tests/integration/test_wait_until_ordering.py diff --git a/esphome/components/script/script.h b/esphome/components/script/script.h index 51cece01e..d60ed657f 100644 --- a/esphome/components/script/script.h +++ b/esphome/components/script/script.h @@ -1,8 +1,8 @@ #pragma once +#include #include #include -#include #include "esphome/core/automation.h" #include "esphome/core/component.h" #include "esphome/core/helpers.h" @@ -290,10 +290,10 @@ template class ScriptWaitAction : public Action, } // Store parameters for later execution - this->param_queue_.emplace_front(x...); - // Enable loop now that we have work to do + this->param_queue_.emplace_back(x...); + // Enable loop now that we have work to do - don't call loop() synchronously! + // Let the event loop call it to avoid reentrancy issues this->enable_loop(); - this->loop(); } void loop() override { @@ -303,13 +303,17 @@ template class ScriptWaitAction : public Action, if (this->script_->is_running()) return; - while (!this->param_queue_.empty()) { + // Only process ONE queued item per loop iteration + // Processing all items in a while loop causes infinite loops because + // play_next_() can trigger more items to be queued + if (!this->param_queue_.empty()) { auto ¶ms = this->param_queue_.front(); this->play_next_tuple_(params, typename gens::type()); this->param_queue_.pop_front(); + } else { + // Queue is now empty - disable loop until next play_complex + this->disable_loop(); } - // Queue is now empty - disable loop until next play_complex - this->disable_loop(); } void play(const Ts &...x) override { /* ignore - see play_complex */ @@ -326,7 +330,7 @@ template class ScriptWaitAction : public Action, } C *script_; - std::forward_list> param_queue_; + std::list> param_queue_; }; } // namespace script diff --git a/esphome/core/base_automation.h b/esphome/core/base_automation.h index c2519da83..e8878ac25 100644 --- a/esphome/core/base_automation.h +++ b/esphome/core/base_automation.h @@ -9,8 +9,8 @@ #include "esphome/core/application.h" #include "esphome/core/helpers.h" +#include #include -#include namespace esphome { @@ -445,9 +445,10 @@ template class WaitUntilAction : public Action, public Co // Store for later processing auto now = millis(); auto timeout = this->timeout_value_.optional_value(x...); - this->var_queue_.emplace_front(now, timeout, std::make_tuple(x...)); + this->var_queue_.emplace_back(now, timeout, std::make_tuple(x...)); - // Do immediate check with fresh timestamp + // Do immediate check with fresh timestamp - don't call loop() synchronously! + // Let the event loop call it to avoid reentrancy issues if (this->process_queue_(now)) { // Only enable loop if we still have pending items this->enable_loop(); @@ -499,7 +500,7 @@ template class WaitUntilAction : public Action, public Co } Condition *condition_; - std::forward_list, std::tuple>> var_queue_{}; + std::list, std::tuple>> var_queue_{}; }; template class UpdateComponentAction : public Action { diff --git a/tests/integration/fixtures/script_delay_with_params.yaml b/tests/integration/fixtures/script_delay_with_params.yaml new file mode 100644 index 000000000..2a0f16d9f --- /dev/null +++ b/tests/integration/fixtures/script_delay_with_params.yaml @@ -0,0 +1,131 @@ +esphome: + name: test-script-delay-params + +host: + +api: + actions: + # Test case from issue #12044: parent script with repeat calling child with delay + - action: test_repeat_with_delay + then: + - logger.log: "=== TEST: Repeat loop calling script with delay and parameters ===" + - script.execute: father_script + + # Test case from issue #12043: script.wait with delayed child script + - action: test_script_wait + then: + - logger.log: "=== TEST: script.wait with delayed child script ===" + - script.execute: show_start_page + - script.wait: show_start_page + - logger.log: "After wait: script completed successfully" + + # Test: Delay with different parameter types + - action: test_delay_param_types + then: + - logger.log: "=== TEST: Delay with various parameter types ===" + - script.execute: + id: delay_with_int + val: 42 + - delay: 50ms + - script.execute: + id: delay_with_string + msg: "test message" + - delay: 50ms + - script.execute: + id: delay_with_float + num: 3.14 + +logger: + level: DEBUG + +script: + # Reproduces issue #12044: child script with conditional delay + - id: son_script + mode: single + parameters: + iteration: int + then: + - logger.log: + format: "Son script started with iteration %d" + args: ['iteration'] + - if: + condition: + lambda: 'return iteration >= 5;' + then: + - logger.log: + format: "Son script delaying for iteration %d" + args: ['iteration'] + - delay: 100ms + - logger.log: + format: "Son script finished with iteration %d" + args: ['iteration'] + + # Reproduces issue #12044: parent script with repeat loop + - id: father_script + mode: single + then: + - repeat: + count: 10 + then: + - logger.log: + format: "Father iteration %d: calling son" + args: ['iteration'] + - script.execute: + id: son_script + iteration: !lambda 'return iteration;' + - script.wait: son_script + - logger.log: + format: "Father iteration %d: son finished, wait returned" + args: ['iteration'] + + # Reproduces issue #12043: script.wait hangs + - id: show_start_page + mode: single + then: + - logger.log: "Start page: beginning" + - delay: 100ms + - logger.log: "Start page: after delay" + - delay: 100ms + - logger.log: "Start page: completed" + + # Test delay with int parameter + - id: delay_with_int + mode: single + parameters: + val: int + then: + - logger.log: + format: "Int test: before delay, val=%d" + args: ['val'] + - delay: 50ms + - logger.log: + format: "Int test: after delay, val=%d" + args: ['val'] + + # Test delay with string parameter + - id: delay_with_string + mode: single + parameters: + msg: string + then: + - logger.log: + format: "String test: before delay, msg=%s" + args: ['msg.c_str()'] + - delay: 50ms + - logger.log: + format: "String test: after delay, msg=%s" + args: ['msg.c_str()'] + + # Test delay with float parameter + - id: delay_with_float + mode: single + parameters: + num: float + then: + - logger.log: + format: "Float test: before delay, num=%.2f" + args: ['num'] + - delay: 50ms + - logger.log: + format: "Float test: after delay, num=%.2f" + args: ['num'] diff --git a/tests/integration/fixtures/wait_until_fifo_ordering.yaml b/tests/integration/fixtures/wait_until_fifo_ordering.yaml new file mode 100644 index 000000000..5dd60c875 --- /dev/null +++ b/tests/integration/fixtures/wait_until_fifo_ordering.yaml @@ -0,0 +1,82 @@ +esphome: + name: test-wait-until-ordering + +host: + +api: + actions: + - action: test_wait_until_fifo + then: + - logger.log: "=== TEST: wait_until should execute in FIFO order ===" + - globals.set: + id: gate_open + value: 'false' + - delay: 100ms + # Start multiple parallel executions of coordinator script + # Each will call the shared waiter script, queueing in same wait_until + - script.execute: coordinator_0 + - script.execute: coordinator_1 + - script.execute: coordinator_2 + - script.execute: coordinator_3 + - script.execute: coordinator_4 + # Give scripts time to reach wait_until and queue + - delay: 200ms + - logger.log: "Opening gate - all wait_until should complete now" + - globals.set: + id: gate_open + value: 'true' + - delay: 500ms + - logger.log: "Test complete" + +globals: + - id: gate_open + type: bool + initial_value: 'false' + +script: + # Shared waiter with single wait_until action (all coordinators call this) + - id: waiter + mode: parallel + parameters: + iter: int + then: + - lambda: 'ESP_LOGD("main", "Queueing iteration %d", iter);' + - wait_until: + condition: + lambda: 'return id(gate_open);' + timeout: 5s + - lambda: 'ESP_LOGD("main", "Completed iteration %d", iter);' + + # Coordinator scripts - each calls shared waiter with different iteration number + - id: coordinator_0 + then: + - script.execute: + id: waiter + iter: 0 + + - id: coordinator_1 + then: + - script.execute: + id: waiter + iter: 1 + + - id: coordinator_2 + then: + - script.execute: + id: waiter + iter: 2 + + - id: coordinator_3 + then: + - script.execute: + id: waiter + iter: 3 + + - id: coordinator_4 + then: + - script.execute: + id: waiter + iter: 4 + +logger: + level: DEBUG diff --git a/tests/integration/test_script_delay_params.py b/tests/integration/test_script_delay_params.py new file mode 100644 index 000000000..1b5d70863 --- /dev/null +++ b/tests/integration/test_script_delay_params.py @@ -0,0 +1,121 @@ +"""Integration test for script.wait FIFO ordering (issues #12043, #12044). + +This test verifies that ScriptWaitAction processes queued items in FIFO order. + +PR #7972 introduced bugs in ScriptWaitAction: +- Used emplace_front() causing LIFO ordering instead of FIFO +- Called loop() synchronously causing reentrancy issues +- Used while loop processing entire queue causing infinite loops + +These bugs manifested as: +- Scripts becoming "zombies" (stuck in running state) +- script.wait hanging forever +- Incorrect execution order +""" + +from __future__ import annotations + +import asyncio +import re + +import pytest + +from .types import APIClientConnectedFactory, RunCompiledFunction + + +@pytest.mark.asyncio +async def test_script_delay_with_params( + yaml_config: str, + run_compiled: RunCompiledFunction, + api_client_connected: APIClientConnectedFactory, +) -> None: + """Test that script.wait processes queued items in FIFO order. + + This reproduces issues #12043 and #12044 where scripts would hang or become + zombies due to LIFO ordering bugs in ScriptWaitAction from PR #7972. + """ + test_complete = asyncio.Event() + + # Patterns to match in logs + father_calling_pattern = re.compile(r"Father iteration (\d+): calling son") + son_started_pattern = re.compile(r"Son script started with iteration (\d+)") + son_delaying_pattern = re.compile(r"Son script delaying for iteration (\d+)") + son_finished_pattern = re.compile(r"Son script finished with iteration (\d+)") + father_wait_returned_pattern = re.compile( + r"Father iteration (\d+): son finished, wait returned" + ) + + # Track which iterations completed + father_calling = set() + son_started = set() + son_delaying = set() + son_finished = set() + wait_returned = set() + + def check_output(line: str) -> None: + """Check log output for expected messages.""" + if test_complete.is_set(): + return + + if mo := father_calling_pattern.search(line): + father_calling.add(int(mo.group(1))) + elif mo := son_started_pattern.search(line): + son_started.add(int(mo.group(1))) + elif mo := son_delaying_pattern.search(line): + son_delaying.add(int(mo.group(1))) + elif mo := son_finished_pattern.search(line): + son_finished.add(int(mo.group(1))) + elif mo := father_wait_returned_pattern.search(line): + iteration = int(mo.group(1)) + wait_returned.add(iteration) + # Test completes when iteration 9 finishes + if iteration == 9: + test_complete.set() + + # Run with log monitoring + async with ( + run_compiled(yaml_config, line_callback=check_output), + api_client_connected() as client, + ): + # Verify device info + device_info = await client.device_info() + assert device_info is not None + assert device_info.name == "test-script-delay-params" + + # Get services + _, services = await client.list_entities_services() + test_service = next( + (s for s in services if s.name == "test_repeat_with_delay"), None + ) + assert test_service is not None, "test_repeat_with_delay service not found" + + # Execute the test + client.execute_service(test_service, {}) + + # Wait for test to complete (10 iterations * ~100ms each + margin) + try: + await asyncio.wait_for(test_complete.wait(), timeout=5.0) + except TimeoutError: + pytest.fail( + f"Test timed out. Completed iterations: {sorted(wait_returned)}. " + f"This likely indicates the script became a zombie (issue #12044)." + ) + + # Verify all 10 iterations completed successfully + expected_iterations = set(range(10)) + assert father_calling == expected_iterations, "Not all iterations started" + assert son_started == expected_iterations, ( + "Son script not started for all iterations" + ) + assert son_finished == expected_iterations, ( + "Son script not finished for all iterations" + ) + assert wait_returned == expected_iterations, ( + "script.wait did not return for all iterations" + ) + + # Verify delays were triggered for iterations >= 5 + expected_delays = set(range(5, 10)) + assert son_delaying == expected_delays, ( + "Delays not triggered for iterations >= 5" + ) diff --git a/tests/integration/test_wait_until_ordering.py b/tests/integration/test_wait_until_ordering.py new file mode 100644 index 000000000..7c39913e5 --- /dev/null +++ b/tests/integration/test_wait_until_ordering.py @@ -0,0 +1,90 @@ +"""Integration test for wait_until FIFO ordering. + +This test verifies that when multiple wait_until actions are queued, +they execute in FIFO (First In First Out) order, not LIFO. + +PR #7972 introduced a bug where emplace_front() was used, causing +LIFO ordering which is incorrect. +""" + +from __future__ import annotations + +import asyncio +import re + +import pytest + +from .types import APIClientConnectedFactory, RunCompiledFunction + + +@pytest.mark.asyncio +async def test_wait_until_fifo_ordering( + yaml_config: str, + run_compiled: RunCompiledFunction, + api_client_connected: APIClientConnectedFactory, +) -> None: + """Test that wait_until executes queued items in FIFO order. + + With the bug (using emplace_front), the order would be 4,3,2,1,0 (LIFO). + With the fix (using emplace_back), the order should be 0,1,2,3,4 (FIFO). + """ + test_complete = asyncio.Event() + + # Track completion order + completed_order = [] + + # Patterns to match + queuing_pattern = re.compile(r"Queueing iteration (\d+)") + completed_pattern = re.compile(r"Completed iteration (\d+)") + + def check_output(line: str) -> None: + """Check log output for completion order.""" + if test_complete.is_set(): + return + + if mo := queuing_pattern.search(line): + iteration = int(mo.group(1)) + + elif mo := completed_pattern.search(line): + iteration = int(mo.group(1)) + completed_order.append(iteration) + + # Test completes when all 5 have completed + if len(completed_order) == 5: + test_complete.set() + + # Run with log monitoring + async with ( + run_compiled(yaml_config, line_callback=check_output), + api_client_connected() as client, + ): + # Verify device info + device_info = await client.device_info() + assert device_info is not None + assert device_info.name == "test-wait-until-ordering" + + # Get services + _, services = await client.list_entities_services() + test_service = next( + (s for s in services if s.name == "test_wait_until_fifo"), None + ) + assert test_service is not None, "test_wait_until_fifo service not found" + + # Execute the test + client.execute_service(test_service, {}) + + # Wait for test to complete + try: + await asyncio.wait_for(test_complete.wait(), timeout=5.0) + except TimeoutError: + pytest.fail( + f"Test timed out. Completed order: {completed_order}. " + f"Expected 5 completions but got {len(completed_order)}." + ) + + # Verify FIFO order + expected_order = [0, 1, 2, 3, 4] + assert completed_order == expected_order, ( + f"Unexpected order: {completed_order}. " + f"Expected FIFO order: {expected_order}" + )