[api] Limit Nagle batching for log messages to reduce LWIP buffer pressure (#13439)

This commit is contained in:
J. Nick Koston
2026-01-22 08:09:14 -10:00
committed by Jonathan Swoboda
parent 3c3d5c2fca
commit 95eebcd74f
2 changed files with 46 additions and 40 deletions

View File

@@ -1844,23 +1844,8 @@ bool APIConnection::send_buffer(ProtoWriteBuffer buffer, uint8_t message_type) {
return false; return false;
} }
// Toggle Nagle's algorithm based on message type to prevent log messages from // Set TCP_NODELAY based on message type - see set_nodelay_for_message() for details
// filling the TCP send buffer and crowding out important state updates. this->helper_->set_nodelay_for_message(is_log_message);
//
// This honors the `no_delay` proto option - SubscribeLogsResponse is the only
// message with `option (no_delay) = false;` in api.proto, indicating it should
// allow Nagle coalescing. This option existed since 2019 but was never implemented.
//
// - Log messages: Enable Nagle (NODELAY=false) so small log packets coalesce
// into fewer, larger packets. They flush naturally via TCP delayed ACK timer
// (~200ms), buffer filling, or when a state update triggers a flush.
//
// - All other messages (state updates, responses): Disable Nagle (NODELAY=true)
// for immediate delivery. These are time-sensitive and should not be delayed.
//
// This must be done proactively BEFORE the buffer fills up - checking buffer
// state here would be too late since we'd already be in a degraded state.
this->helper_->set_nodelay(!is_log_message);
APIError err = this->helper_->write_protobuf_packet(message_type, buffer); APIError err = this->helper_->write_protobuf_packet(message_type, buffer);
if (err == APIError::WOULD_BLOCK) if (err == APIError::WOULD_BLOCK)

View File

@@ -120,26 +120,39 @@ class APIFrameHelper {
} }
return APIError::OK; return APIError::OK;
} }
/// Toggle TCP_NODELAY socket option to control Nagle's algorithm. // Manage TCP_NODELAY (Nagle's algorithm) based on message type.
/// //
/// This is used to allow log messages to coalesce (Nagle enabled) while keeping // For non-log messages (sensor data, state updates): Always disable Nagle
/// state updates low-latency (NODELAY enabled). Without this, many small log // (NODELAY on) for immediate delivery - these are time-sensitive.
/// packets fill the TCP send buffer, crowding out important state updates. //
/// // For log messages: Use Nagle to coalesce multiple small log packets into
/// State is tracked to minimize setsockopt() overhead - on lwip_raw (ESP8266/RP2040) // fewer larger packets, reducing WiFi overhead. However, we limit batching
/// this is just a boolean assignment; on other platforms it's a lightweight syscall. // to 3 messages to avoid excessive LWIP buffer pressure on memory-constrained
/// // devices like ESP8266. LWIP's TCP_OVERSIZE option coalesces the data into
/// @param enable true to enable NODELAY (disable Nagle), false to enable Nagle // shared pbufs, but holding data too long waiting for Nagle's timer causes
/// @return true if successful or already in desired state // buffer exhaustion and dropped messages.
bool set_nodelay(bool enable) { //
if (this->nodelay_enabled_ == enable) // Flow: Log 1 (Nagle on) -> Log 2 (Nagle on) -> Log 3 (NODELAY, flush all)
return true; //
int val = enable ? 1 : 0; void set_nodelay_for_message(bool is_log_message) {
int err = this->socket_->setsockopt(IPPROTO_TCP, TCP_NODELAY, &val, sizeof(int)); if (!is_log_message) {
if (err == 0) { if (this->nodelay_state_ != NODELAY_ON) {
this->nodelay_enabled_ = enable; this->set_nodelay_raw_(true);
this->nodelay_state_ = NODELAY_ON;
}
return;
}
// Log messages 1-3: state transitions -1 -> 1 -> 2 -> -1 (flush on 3rd)
if (this->nodelay_state_ == NODELAY_ON) {
this->set_nodelay_raw_(false);
this->nodelay_state_ = 1;
} else if (this->nodelay_state_ >= LOG_NAGLE_COUNT) {
this->set_nodelay_raw_(true);
this->nodelay_state_ = NODELAY_ON;
} else {
this->nodelay_state_++;
} }
return err == 0;
} }
virtual APIError write_protobuf_packet(uint8_t type, ProtoWriteBuffer buffer) = 0; virtual APIError write_protobuf_packet(uint8_t type, ProtoWriteBuffer buffer) = 0;
// Write multiple protobuf messages in a single operation // Write multiple protobuf messages in a single operation
@@ -229,10 +242,18 @@ class APIFrameHelper {
uint8_t tx_buf_head_{0}; uint8_t tx_buf_head_{0};
uint8_t tx_buf_tail_{0}; uint8_t tx_buf_tail_{0};
uint8_t tx_buf_count_{0}; uint8_t tx_buf_count_{0};
// Tracks TCP_NODELAY state to minimize setsockopt() calls. Initialized to true // Nagle batching state for log messages. NODELAY_ON (-1) means NODELAY is enabled
// since init_common_() enables NODELAY. Used by set_nodelay() to allow log // (immediate send). Values 1-2 count log messages in the current Nagle batch.
// messages to coalesce while keeping state updates low-latency. // After LOG_NAGLE_COUNT logs, we switch to NODELAY to flush and reset.
bool nodelay_enabled_{true}; static constexpr int8_t NODELAY_ON = -1;
static constexpr int8_t LOG_NAGLE_COUNT = 2;
int8_t nodelay_state_{NODELAY_ON};
// Internal helper to set TCP_NODELAY socket option
void set_nodelay_raw_(bool enable) {
int val = enable ? 1 : 0;
this->socket_->setsockopt(IPPROTO_TCP, TCP_NODELAY, &val, sizeof(int));
}
// Common initialization for both plaintext and noise protocols // Common initialization for both plaintext and noise protocols
APIError init_common_(); APIError init_common_();