# esphome/esphome/analyze_memory/ram_strings.py

"""Analyzer for RAM-stored strings in ESP8266/ESP32 firmware ELF files.

This module identifies strings that are stored in RAM sections (.data, .bss, .rodata)
rather than in flash sections (.irom0.text, .irom.text), which is important for
memory-constrained platforms like ESP8266.
"""

from __future__ import annotations

from collections import defaultdict
from dataclasses import dataclass
import logging
from pathlib import Path
import re
import subprocess

from .demangle import batch_demangle
from .toolchain import find_tool

_LOGGER = logging.getLogger(__name__)

# Section-to-memory mapping differs per platform:
#   ESP8266: .rodata is in RAM (DRAM), not flash
#   ESP32:   .rodata is in flash, mapped to data bus
ESP8266_RAM_SECTIONS = frozenset({".data", ".rodata", ".bss"})
ESP8266_FLASH_SECTIONS = frozenset({".irom0.text", ".irom.text", ".text"})

# ESP32: .rodata is memory-mapped from flash
ESP32_RAM_SECTIONS = frozenset({".data", ".bss", ".dram0.data", ".dram0.bss"})
ESP32_FLASH_SECTIONS = frozenset({".text", ".rodata", ".flash.text", ".flash.rodata"})

# nm symbol types for data symbols (D=global data, d=local data, R=rodata, B=bss);
# each character of the string below is one accepted type letter
DATA_SYMBOL_TYPES = frozenset("DdRrBb")


@dataclass
class SectionInfo:
    """Information about an ELF section.

    Instances are created while parsing the section header listing
    printed by ``objdump -h``.
    """

    name: str  # Section name as listed by objdump, e.g. ".data"
    address: int  # Virtual memory address (VMA column) of the section
    size: int  # Section size, parsed from the hexadecimal Size column


@dataclass
class RamString:
    """A printable, null-terminated string located in a RAM section."""

    section: str  # Name of the section the string was extracted from
    address: int  # Address of the first character
    content: str  # Decoded text, without the terminating NUL

    @property
    def size(self) -> int:
        """Number of bytes the string occupies, counting its trailing NUL."""
        terminator_bytes = 1
        return terminator_bytes + len(self.content)


@dataclass
class RamSymbol:
    """A symbol found in RAM.

    Built from one line of ``nm -S --size-sort`` output whose address falls
    inside one of the analyzer's RAM sections.
    """

    name: str  # Symbol name exactly as printed by nm (possibly mangled)
    sym_type: str  # nm symbol type letter, one of DATA_SYMBOL_TYPES
    address: int  # Symbol address
    size: int  # Symbol size in bytes; 0 when nm printed "?" for the size
    section: str  # Name of the RAM section whose address range holds the symbol
    demangled: str = ""  # Demangled name, set after batch demangling


class RamStringsAnalyzer:
    """Analyzes ELF files to find strings stored in RAM."""

    def __init__(
        self,
        elf_path: str,
        objdump_path: str | None = None,
        min_length: int = 8,
        platform: str = "esp32",
    ) -> None:
        """Initialize the RAM strings analyzer.

        Args:
            elf_path: Path to the ELF file to analyze
            objdump_path: Path to objdump binary (used to find other tools)
            min_length: Minimum string length to report (default: 8)
            platform: Platform name ("esp8266", "esp32", etc.) for section mapping
        """
        self.elf_path = Path(elf_path)
        if not self.elf_path.exists():
            raise FileNotFoundError(f"ELF file not found: {elf_path}")

        self.objdump_path = objdump_path
        self.min_length = min_length
        self.platform = platform

        # Set RAM/flash sections based on platform
        if self.platform == "esp8266":
            self.ram_sections = ESP8266_RAM_SECTIONS
            self.flash_sections = ESP8266_FLASH_SECTIONS
        else:
            # ESP32 and other platforms
            self.ram_sections = ESP32_RAM_SECTIONS
            self.flash_sections = ESP32_FLASH_SECTIONS

        self.sections: dict[str, SectionInfo] = {}
        self.ram_strings: list[RamString] = []
        self.ram_symbols: list[RamSymbol] = []

    def _run_command(self, cmd: list[str]) -> str:
        """Run a command and return its output."""
        try:
            result = subprocess.run(cmd, capture_output=True, text=True, check=True)
            return result.stdout
        except subprocess.CalledProcessError as e:
            _LOGGER.debug("Command failed: %s - %s", " ".join(cmd), e.stderr)
            raise
        except FileNotFoundError:
            _LOGGER.warning("Command not found: %s", cmd[0])
            raise

    def analyze(self) -> None:
        """Perform the full RAM analysis."""
        self._parse_sections()
        self._extract_strings()
        self._analyze_symbols()
        self._demangle_symbols()

    def _parse_sections(self) -> None:
        """Parse section headers from ELF file."""
        objdump = find_tool("objdump", self.objdump_path)
        if not objdump:
            _LOGGER.error("Could not find objdump command")
            return

        try:
            output = self._run_command([objdump, "-h", str(self.elf_path)])
        except (subprocess.CalledProcessError, FileNotFoundError):
            return

        # Parse section headers
        # Format: Idx Name          Size      VMA       LMA       File off  Algn
        section_pattern = re.compile(
            r"^\s*\d+\s+(\S+)\s+([0-9a-fA-F]+)\s+([0-9a-fA-F]+)"
        )

        for line in output.split("\n"):
            if match := section_pattern.match(line):
                name = match.group(1)
                size = int(match.group(2), 16)
                vma = int(match.group(3), 16)
                self.sections[name] = SectionInfo(name, vma, size)

    def _extract_strings(self) -> None:
        """Extract strings from RAM sections."""
        objdump = find_tool("objdump", self.objdump_path)
        if not objdump:
            return

        for section_name in self.ram_sections:
            if section_name not in self.sections:
                continue

            try:
                output = self._run_command(
                    [objdump, "-s", "-j", section_name, str(self.elf_path)]
                )
            except subprocess.CalledProcessError:
                # Section may exist but have no content (e.g., .bss)
                continue
            except FileNotFoundError:
                continue

            strings = self._parse_hex_dump(output, section_name)
            self.ram_strings.extend(strings)

    def _parse_hex_dump(self, output: str, section_name: str) -> list[RamString]:
        """Parse hex dump output to extract strings.

        Args:
            output: Output from objdump -s
            section_name: Name of the section being parsed

        Returns:
            List of RamString objects
        """
        strings: list[RamString] = []
        current_string = bytearray()
        string_start_addr = 0

        for line in output.split("\n"):
            # Lines look like: " 3ffef8a0 00000000 00000000 00000000 00000000  ................"
            match = re.match(r"^\s+([0-9a-fA-F]+)\s+((?:[0-9a-fA-F]{2,8}\s*)+)", line)
            if not match:
                continue

            addr = int(match.group(1), 16)
            hex_data = match.group(2).strip()

            # Convert hex to bytes
            hex_bytes = hex_data.split()
            byte_offset = 0
            for hex_chunk in hex_bytes:
                # Handle both byte-by-byte and word formats
                for i in range(0, len(hex_chunk), 2):
                    byte_val = int(hex_chunk[i : i + 2], 16)
                    if 0x20 <= byte_val <= 0x7E:  # Printable ASCII
                        if not current_string:
                            string_start_addr = addr + byte_offset
                        current_string.append(byte_val)
                    else:
                        if byte_val == 0 and len(current_string) >= self.min_length:
                            # Found null terminator
                            strings.append(
                                RamString(
                                    section=section_name,
                                    address=string_start_addr,
                                    content=current_string.decode(
                                        "ascii", errors="ignore"
                                    ),
                                )
                            )
                        current_string = bytearray()
                    byte_offset += 1

        return strings

    def _analyze_symbols(self) -> None:
        """Analyze symbols in RAM sections."""
        nm = find_tool("nm", self.objdump_path)
        if not nm:
            return

        try:
            output = self._run_command([nm, "-S", "--size-sort", str(self.elf_path)])
        except (subprocess.CalledProcessError, FileNotFoundError):
            return

        for line in output.split("\n"):
            parts = line.split()
            if len(parts) < 4:
                continue

            try:
                addr = int(parts[0], 16)
                size = int(parts[1], 16) if parts[1] != "?" else 0
            except ValueError:
                continue

            sym_type = parts[2]
            name = " ".join(parts[3:])

            # Filter for data symbols
            if sym_type not in DATA_SYMBOL_TYPES:
                continue

            # Check if symbol is in a RAM section
            for section_name in self.ram_sections:
                if section_name not in self.sections:
                    continue

                section = self.sections[section_name]
                if section.address <= addr < section.address + section.size:
                    self.ram_symbols.append(
                        RamSymbol(
                            name=name,
                            sym_type=sym_type,
                            address=addr,
                            size=size,
                            section=section_name,
                        )
                    )
                    break

    def _demangle_symbols(self) -> None:
        """Batch demangle all RAM symbol names."""
        if not self.ram_symbols:
            return

        # Collect all symbol names and demangle them
        symbol_names = [s.name for s in self.ram_symbols]
        demangle_cache = batch_demangle(symbol_names, objdump_path=self.objdump_path)

        # Assign demangled names to symbols
        for symbol in self.ram_symbols:
            symbol.demangled = demangle_cache.get(symbol.name, symbol.name)

    def _get_sections_size(self, section_names: frozenset[str]) -> int:
        """Get total size of specified sections."""
        return sum(
            section.size
            for name, section in self.sections.items()
            if name in section_names
        )

    def get_total_ram_usage(self) -> int:
        """Get total RAM usage from RAM sections."""
        return self._get_sections_size(self.ram_sections)

    def get_total_flash_usage(self) -> int:
        """Get total flash usage from flash sections."""
        return self._get_sections_size(self.flash_sections)

    def get_total_string_bytes(self) -> int:
        """Get total bytes used by strings in RAM."""
        return sum(s.size for s in self.ram_strings)

    def get_repeated_strings(self) -> list[tuple[str, int]]:
        """Find strings that appear multiple times.

        Returns:
            List of (string, count) tuples sorted by potential savings
        """
        string_counts: dict[str, int] = defaultdict(int)
        for ram_string in self.ram_strings:
            string_counts[ram_string.content] += 1

        return sorted(
            [(s, c) for s, c in string_counts.items() if c > 1],
            key=lambda x: x[1] * (len(x[0]) + 1),
            reverse=True,
        )

    def get_long_strings(self, min_len: int = 20) -> list[RamString]:
        """Get strings longer than the specified length.

        Args:
            min_len: Minimum string length

        Returns:
            List of RamString objects sorted by length
        """
        return sorted(
            [s for s in self.ram_strings if len(s.content) >= min_len],
            key=lambda x: len(x.content),
            reverse=True,
        )

    def get_largest_symbols(self, min_size: int = 100) -> list[RamSymbol]:
        """Get RAM symbols larger than the specified size.

        Args:
            min_size: Minimum symbol size in bytes

        Returns:
            List of RamSymbol objects sorted by size
        """
        return sorted(
            [s for s in self.ram_symbols if s.size >= min_size],
            key=lambda x: x.size,
            reverse=True,
        )

    def generate_report(self, show_all_sections: bool = False) -> str:
        """Generate a formatted RAM strings analysis report.

        Args:
            show_all_sections: If True, show all sections, not just RAM

        Returns:
            Formatted report string
        """
        lines: list[str] = []
        table_width = 80

        lines.append("=" * table_width)
        lines.append(
            f"RAM Strings Analysis ({self.platform.upper()})".center(table_width)
        )
        lines.append("=" * table_width)
        lines.append("")

        # Section Analysis
        lines.append("SECTION ANALYSIS")
        lines.append("-" * table_width)
        lines.append(f"{'Section':<20} {'Address':<12} {'Size':<12} {'Location'}")
        lines.append("-" * table_width)

        total_ram_usage = 0
        total_flash_usage = 0

        for name, section in sorted(self.sections.items(), key=lambda x: x[1].address):
            if name in self.ram_sections:
                location = "RAM"
                total_ram_usage += section.size
            elif name in self.flash_sections:
                location = "FLASH"
                total_flash_usage += section.size
            else:
                location = "OTHER"

            if show_all_sections or name in self.ram_sections:
                lines.append(
                    f"{name:<20} 0x{section.address:08x}   {section.size:>8} B   {location}"
                )

        lines.append("-" * table_width)
        lines.append(f"Total RAM sections size: {total_ram_usage:,} bytes")
        lines.append(f"Total Flash sections size: {total_flash_usage:,} bytes")

        # Strings in RAM
        lines.append("")
        lines.append("=" * table_width)
        lines.append("STRINGS IN RAM SECTIONS")
        lines.append("=" * table_width)
        lines.append(
            "Note: .bss sections contain uninitialized data (no strings to extract)"
        )

        # Group strings by section
        strings_by_section: dict[str, list[RamString]] = defaultdict(list)
        for ram_string in self.ram_strings:
            strings_by_section[ram_string.section].append(ram_string)

        for section_name in sorted(strings_by_section.keys()):
            section_strings = strings_by_section[section_name]
            lines.append(f"\nSection: {section_name}")
            lines.append("-" * 40)
            for ram_string in sorted(section_strings, key=lambda x: x.address):
                clean_string = ram_string.content[:100] + (
                    "..." if len(ram_string.content) > 100 else ""
                )
                lines.append(
                    f'  0x{ram_string.address:08x}: "{clean_string}" (len={len(ram_string.content)})'
                )

        # Large RAM symbols
        lines.append("")
        lines.append("=" * table_width)
        lines.append("LARGE DATA SYMBOLS IN RAM (>= 50 bytes)")
        lines.append("=" * table_width)

        largest_symbols = self.get_largest_symbols(50)
        lines.append(f"\n{'Symbol':<50} {'Type':<6} {'Size':<10} {'Section'}")
        lines.append("-" * table_width)

        for symbol in largest_symbols:
            # Use demangled name if available, otherwise raw name
            display_name = symbol.demangled or symbol.name
            name_display = display_name[:49] if len(display_name) > 49 else display_name
            lines.append(
                f"{name_display:<50} {symbol.sym_type:<6} {symbol.size:>8} B  {symbol.section}"
            )

        # Summary
        lines.append("")
        lines.append("=" * table_width)
        lines.append("SUMMARY")
        lines.append("=" * table_width)
        lines.append(f"Total strings found in RAM: {len(self.ram_strings)}")
        total_string_bytes = self.get_total_string_bytes()
        lines.append(f"Total bytes used by strings: {total_string_bytes:,}")

        # Optimization targets
        lines.append("")
        lines.append("=" * table_width)
        lines.append("POTENTIAL OPTIMIZATION TARGETS")
        lines.append("=" * table_width)

        # Repeated strings
        repeated = self.get_repeated_strings()[:10]
        if repeated:
            lines.append("\nRepeated strings (could be deduplicated):")
            for string, count in repeated:
                savings = (count - 1) * (len(string) + 1)
                clean_string = string[:50] + ("..." if len(string) > 50 else "")
                lines.append(
                    f'  "{clean_string}" - appears {count} times (potential savings: {savings} bytes)'
                )

        # Long strings - platform-specific advice
        long_strings = self.get_long_strings(20)[:10]
        if long_strings:
            if self.platform == "esp8266":
                lines.append(
                    "\nLong strings that could be moved to PROGMEM (>= 20 chars):"
                )
            else:
                # ESP32: strings in DRAM are typically there for a reason
                # (interrupt handlers, pre-flash-init code, etc.)
                lines.append("\nLong strings in DRAM (>= 20 chars):")
                lines.append(
                    "Note: ESP32 DRAM strings may be required for interrupt/early-boot contexts"
                )
            for ram_string in long_strings:
                clean_string = ram_string.content[:60] + (
                    "..." if len(ram_string.content) > 60 else ""
                )
                lines.append(
                    f'  {ram_string.section} @ 0x{ram_string.address:08x}: "{clean_string}" ({len(ram_string.content)} bytes)'
                )

        lines.append("")
        return "\n".join(lines)