494 lines
17 KiB
Python
494 lines
17 KiB
Python
"""Analyzer for RAM-stored strings in ESP8266/ESP32 firmware ELF files.
|
|
|
|
This module identifies strings that are stored in RAM sections (.data, .bss, .rodata)
|
|
rather than in flash sections (.irom0.text, .irom.text), which is important for
|
|
memory-constrained platforms like ESP8266.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
from collections import defaultdict
|
|
from dataclasses import dataclass
|
|
import logging
|
|
from pathlib import Path
|
|
import re
|
|
import subprocess
|
|
|
|
from .demangle import batch_demangle
|
|
from .toolchain import find_tool
|
|
|
|
_LOGGER = logging.getLogger(__name__)
|
|
|
|
# ESP8266: .rodata is in RAM (DRAM), not flash
|
|
# ESP32: .rodata is in flash, mapped to data bus
|
|
ESP8266_RAM_SECTIONS = frozenset([".data", ".rodata", ".bss"])
|
|
ESP8266_FLASH_SECTIONS = frozenset([".irom0.text", ".irom.text", ".text"])
|
|
|
|
# ESP32: .rodata is memory-mapped from flash
|
|
ESP32_RAM_SECTIONS = frozenset([".data", ".bss", ".dram0.data", ".dram0.bss"])
|
|
ESP32_FLASH_SECTIONS = frozenset([".text", ".rodata", ".flash.text", ".flash.rodata"])
|
|
|
|
# nm symbol types for data symbols (D=global data, d=local data, R=rodata, B=bss)
|
|
DATA_SYMBOL_TYPES = frozenset(["D", "d", "R", "r", "B", "b"])
|
|
|
|
|
|
@dataclass
|
|
class SectionInfo:
|
|
"""Information about an ELF section."""
|
|
|
|
name: str
|
|
address: int
|
|
size: int
|
|
|
|
|
|
@dataclass
|
|
class RamString:
|
|
"""A string found in RAM."""
|
|
|
|
section: str
|
|
address: int
|
|
content: str
|
|
|
|
@property
|
|
def size(self) -> int:
|
|
"""Size in bytes including null terminator."""
|
|
return len(self.content) + 1
|
|
|
|
|
|
@dataclass
|
|
class RamSymbol:
|
|
"""A symbol found in RAM."""
|
|
|
|
name: str
|
|
sym_type: str
|
|
address: int
|
|
size: int
|
|
section: str
|
|
demangled: str = "" # Demangled name, set after batch demangling
|
|
|
|
|
|
class RamStringsAnalyzer:
|
|
"""Analyzes ELF files to find strings stored in RAM."""
|
|
|
|
def __init__(
|
|
self,
|
|
elf_path: str,
|
|
objdump_path: str | None = None,
|
|
min_length: int = 8,
|
|
platform: str = "esp32",
|
|
) -> None:
|
|
"""Initialize the RAM strings analyzer.
|
|
|
|
Args:
|
|
elf_path: Path to the ELF file to analyze
|
|
objdump_path: Path to objdump binary (used to find other tools)
|
|
min_length: Minimum string length to report (default: 8)
|
|
platform: Platform name ("esp8266", "esp32", etc.) for section mapping
|
|
"""
|
|
self.elf_path = Path(elf_path)
|
|
if not self.elf_path.exists():
|
|
raise FileNotFoundError(f"ELF file not found: {elf_path}")
|
|
|
|
self.objdump_path = objdump_path
|
|
self.min_length = min_length
|
|
self.platform = platform
|
|
|
|
# Set RAM/flash sections based on platform
|
|
if self.platform == "esp8266":
|
|
self.ram_sections = ESP8266_RAM_SECTIONS
|
|
self.flash_sections = ESP8266_FLASH_SECTIONS
|
|
else:
|
|
# ESP32 and other platforms
|
|
self.ram_sections = ESP32_RAM_SECTIONS
|
|
self.flash_sections = ESP32_FLASH_SECTIONS
|
|
|
|
self.sections: dict[str, SectionInfo] = {}
|
|
self.ram_strings: list[RamString] = []
|
|
self.ram_symbols: list[RamSymbol] = []
|
|
|
|
def _run_command(self, cmd: list[str]) -> str:
|
|
"""Run a command and return its output."""
|
|
try:
|
|
result = subprocess.run(cmd, capture_output=True, text=True, check=True)
|
|
return result.stdout
|
|
except subprocess.CalledProcessError as e:
|
|
_LOGGER.debug("Command failed: %s - %s", " ".join(cmd), e.stderr)
|
|
raise
|
|
except FileNotFoundError:
|
|
_LOGGER.warning("Command not found: %s", cmd[0])
|
|
raise
|
|
|
|
def analyze(self) -> None:
|
|
"""Perform the full RAM analysis."""
|
|
self._parse_sections()
|
|
self._extract_strings()
|
|
self._analyze_symbols()
|
|
self._demangle_symbols()
|
|
|
|
def _parse_sections(self) -> None:
|
|
"""Parse section headers from ELF file."""
|
|
objdump = find_tool("objdump", self.objdump_path)
|
|
if not objdump:
|
|
_LOGGER.error("Could not find objdump command")
|
|
return
|
|
|
|
try:
|
|
output = self._run_command([objdump, "-h", str(self.elf_path)])
|
|
except (subprocess.CalledProcessError, FileNotFoundError):
|
|
return
|
|
|
|
# Parse section headers
|
|
# Format: Idx Name Size VMA LMA File off Algn
|
|
section_pattern = re.compile(
|
|
r"^\s*\d+\s+(\S+)\s+([0-9a-fA-F]+)\s+([0-9a-fA-F]+)"
|
|
)
|
|
|
|
for line in output.split("\n"):
|
|
if match := section_pattern.match(line):
|
|
name = match.group(1)
|
|
size = int(match.group(2), 16)
|
|
vma = int(match.group(3), 16)
|
|
self.sections[name] = SectionInfo(name, vma, size)
|
|
|
|
def _extract_strings(self) -> None:
|
|
"""Extract strings from RAM sections."""
|
|
objdump = find_tool("objdump", self.objdump_path)
|
|
if not objdump:
|
|
return
|
|
|
|
for section_name in self.ram_sections:
|
|
if section_name not in self.sections:
|
|
continue
|
|
|
|
try:
|
|
output = self._run_command(
|
|
[objdump, "-s", "-j", section_name, str(self.elf_path)]
|
|
)
|
|
except subprocess.CalledProcessError:
|
|
# Section may exist but have no content (e.g., .bss)
|
|
continue
|
|
except FileNotFoundError:
|
|
continue
|
|
|
|
strings = self._parse_hex_dump(output, section_name)
|
|
self.ram_strings.extend(strings)
|
|
|
|
def _parse_hex_dump(self, output: str, section_name: str) -> list[RamString]:
|
|
"""Parse hex dump output to extract strings.
|
|
|
|
Args:
|
|
output: Output from objdump -s
|
|
section_name: Name of the section being parsed
|
|
|
|
Returns:
|
|
List of RamString objects
|
|
"""
|
|
strings: list[RamString] = []
|
|
current_string = bytearray()
|
|
string_start_addr = 0
|
|
|
|
for line in output.split("\n"):
|
|
# Lines look like: " 3ffef8a0 00000000 00000000 00000000 00000000 ................"
|
|
match = re.match(r"^\s+([0-9a-fA-F]+)\s+((?:[0-9a-fA-F]{2,8}\s*)+)", line)
|
|
if not match:
|
|
continue
|
|
|
|
addr = int(match.group(1), 16)
|
|
hex_data = match.group(2).strip()
|
|
|
|
# Convert hex to bytes
|
|
hex_bytes = hex_data.split()
|
|
byte_offset = 0
|
|
for hex_chunk in hex_bytes:
|
|
# Handle both byte-by-byte and word formats
|
|
for i in range(0, len(hex_chunk), 2):
|
|
byte_val = int(hex_chunk[i : i + 2], 16)
|
|
if 0x20 <= byte_val <= 0x7E: # Printable ASCII
|
|
if not current_string:
|
|
string_start_addr = addr + byte_offset
|
|
current_string.append(byte_val)
|
|
else:
|
|
if byte_val == 0 and len(current_string) >= self.min_length:
|
|
# Found null terminator
|
|
strings.append(
|
|
RamString(
|
|
section=section_name,
|
|
address=string_start_addr,
|
|
content=current_string.decode(
|
|
"ascii", errors="ignore"
|
|
),
|
|
)
|
|
)
|
|
current_string = bytearray()
|
|
byte_offset += 1
|
|
|
|
return strings
|
|
|
|
def _analyze_symbols(self) -> None:
|
|
"""Analyze symbols in RAM sections."""
|
|
nm = find_tool("nm", self.objdump_path)
|
|
if not nm:
|
|
return
|
|
|
|
try:
|
|
output = self._run_command([nm, "-S", "--size-sort", str(self.elf_path)])
|
|
except (subprocess.CalledProcessError, FileNotFoundError):
|
|
return
|
|
|
|
for line in output.split("\n"):
|
|
parts = line.split()
|
|
if len(parts) < 4:
|
|
continue
|
|
|
|
try:
|
|
addr = int(parts[0], 16)
|
|
size = int(parts[1], 16) if parts[1] != "?" else 0
|
|
except ValueError:
|
|
continue
|
|
|
|
sym_type = parts[2]
|
|
name = " ".join(parts[3:])
|
|
|
|
# Filter for data symbols
|
|
if sym_type not in DATA_SYMBOL_TYPES:
|
|
continue
|
|
|
|
# Check if symbol is in a RAM section
|
|
for section_name in self.ram_sections:
|
|
if section_name not in self.sections:
|
|
continue
|
|
|
|
section = self.sections[section_name]
|
|
if section.address <= addr < section.address + section.size:
|
|
self.ram_symbols.append(
|
|
RamSymbol(
|
|
name=name,
|
|
sym_type=sym_type,
|
|
address=addr,
|
|
size=size,
|
|
section=section_name,
|
|
)
|
|
)
|
|
break
|
|
|
|
def _demangle_symbols(self) -> None:
|
|
"""Batch demangle all RAM symbol names."""
|
|
if not self.ram_symbols:
|
|
return
|
|
|
|
# Collect all symbol names and demangle them
|
|
symbol_names = [s.name for s in self.ram_symbols]
|
|
demangle_cache = batch_demangle(symbol_names, objdump_path=self.objdump_path)
|
|
|
|
# Assign demangled names to symbols
|
|
for symbol in self.ram_symbols:
|
|
symbol.demangled = demangle_cache.get(symbol.name, symbol.name)
|
|
|
|
def _get_sections_size(self, section_names: frozenset[str]) -> int:
|
|
"""Get total size of specified sections."""
|
|
return sum(
|
|
section.size
|
|
for name, section in self.sections.items()
|
|
if name in section_names
|
|
)
|
|
|
|
def get_total_ram_usage(self) -> int:
|
|
"""Get total RAM usage from RAM sections."""
|
|
return self._get_sections_size(self.ram_sections)
|
|
|
|
def get_total_flash_usage(self) -> int:
|
|
"""Get total flash usage from flash sections."""
|
|
return self._get_sections_size(self.flash_sections)
|
|
|
|
def get_total_string_bytes(self) -> int:
|
|
"""Get total bytes used by strings in RAM."""
|
|
return sum(s.size for s in self.ram_strings)
|
|
|
|
def get_repeated_strings(self) -> list[tuple[str, int]]:
|
|
"""Find strings that appear multiple times.
|
|
|
|
Returns:
|
|
List of (string, count) tuples sorted by potential savings
|
|
"""
|
|
string_counts: dict[str, int] = defaultdict(int)
|
|
for ram_string in self.ram_strings:
|
|
string_counts[ram_string.content] += 1
|
|
|
|
return sorted(
|
|
[(s, c) for s, c in string_counts.items() if c > 1],
|
|
key=lambda x: x[1] * (len(x[0]) + 1),
|
|
reverse=True,
|
|
)
|
|
|
|
def get_long_strings(self, min_len: int = 20) -> list[RamString]:
|
|
"""Get strings longer than the specified length.
|
|
|
|
Args:
|
|
min_len: Minimum string length
|
|
|
|
Returns:
|
|
List of RamString objects sorted by length
|
|
"""
|
|
return sorted(
|
|
[s for s in self.ram_strings if len(s.content) >= min_len],
|
|
key=lambda x: len(x.content),
|
|
reverse=True,
|
|
)
|
|
|
|
def get_largest_symbols(self, min_size: int = 100) -> list[RamSymbol]:
|
|
"""Get RAM symbols larger than the specified size.
|
|
|
|
Args:
|
|
min_size: Minimum symbol size in bytes
|
|
|
|
Returns:
|
|
List of RamSymbol objects sorted by size
|
|
"""
|
|
return sorted(
|
|
[s for s in self.ram_symbols if s.size >= min_size],
|
|
key=lambda x: x.size,
|
|
reverse=True,
|
|
)
|
|
|
|
def generate_report(self, show_all_sections: bool = False) -> str:
|
|
"""Generate a formatted RAM strings analysis report.
|
|
|
|
Args:
|
|
show_all_sections: If True, show all sections, not just RAM
|
|
|
|
Returns:
|
|
Formatted report string
|
|
"""
|
|
lines: list[str] = []
|
|
table_width = 80
|
|
|
|
lines.append("=" * table_width)
|
|
lines.append(
|
|
f"RAM Strings Analysis ({self.platform.upper()})".center(table_width)
|
|
)
|
|
lines.append("=" * table_width)
|
|
lines.append("")
|
|
|
|
# Section Analysis
|
|
lines.append("SECTION ANALYSIS")
|
|
lines.append("-" * table_width)
|
|
lines.append(f"{'Section':<20} {'Address':<12} {'Size':<12} {'Location'}")
|
|
lines.append("-" * table_width)
|
|
|
|
total_ram_usage = 0
|
|
total_flash_usage = 0
|
|
|
|
for name, section in sorted(self.sections.items(), key=lambda x: x[1].address):
|
|
if name in self.ram_sections:
|
|
location = "RAM"
|
|
total_ram_usage += section.size
|
|
elif name in self.flash_sections:
|
|
location = "FLASH"
|
|
total_flash_usage += section.size
|
|
else:
|
|
location = "OTHER"
|
|
|
|
if show_all_sections or name in self.ram_sections:
|
|
lines.append(
|
|
f"{name:<20} 0x{section.address:08x} {section.size:>8} B {location}"
|
|
)
|
|
|
|
lines.append("-" * table_width)
|
|
lines.append(f"Total RAM sections size: {total_ram_usage:,} bytes")
|
|
lines.append(f"Total Flash sections size: {total_flash_usage:,} bytes")
|
|
|
|
# Strings in RAM
|
|
lines.append("")
|
|
lines.append("=" * table_width)
|
|
lines.append("STRINGS IN RAM SECTIONS")
|
|
lines.append("=" * table_width)
|
|
lines.append(
|
|
"Note: .bss sections contain uninitialized data (no strings to extract)"
|
|
)
|
|
|
|
# Group strings by section
|
|
strings_by_section: dict[str, list[RamString]] = defaultdict(list)
|
|
for ram_string in self.ram_strings:
|
|
strings_by_section[ram_string.section].append(ram_string)
|
|
|
|
for section_name in sorted(strings_by_section.keys()):
|
|
section_strings = strings_by_section[section_name]
|
|
lines.append(f"\nSection: {section_name}")
|
|
lines.append("-" * 40)
|
|
for ram_string in sorted(section_strings, key=lambda x: x.address):
|
|
clean_string = ram_string.content[:100] + (
|
|
"..." if len(ram_string.content) > 100 else ""
|
|
)
|
|
lines.append(
|
|
f' 0x{ram_string.address:08x}: "{clean_string}" (len={len(ram_string.content)})'
|
|
)
|
|
|
|
# Large RAM symbols
|
|
lines.append("")
|
|
lines.append("=" * table_width)
|
|
lines.append("LARGE DATA SYMBOLS IN RAM (>= 50 bytes)")
|
|
lines.append("=" * table_width)
|
|
|
|
largest_symbols = self.get_largest_symbols(50)
|
|
lines.append(f"\n{'Symbol':<50} {'Type':<6} {'Size':<10} {'Section'}")
|
|
lines.append("-" * table_width)
|
|
|
|
for symbol in largest_symbols:
|
|
# Use demangled name if available, otherwise raw name
|
|
display_name = symbol.demangled or symbol.name
|
|
name_display = display_name[:49] if len(display_name) > 49 else display_name
|
|
lines.append(
|
|
f"{name_display:<50} {symbol.sym_type:<6} {symbol.size:>8} B {symbol.section}"
|
|
)
|
|
|
|
# Summary
|
|
lines.append("")
|
|
lines.append("=" * table_width)
|
|
lines.append("SUMMARY")
|
|
lines.append("=" * table_width)
|
|
lines.append(f"Total strings found in RAM: {len(self.ram_strings)}")
|
|
total_string_bytes = self.get_total_string_bytes()
|
|
lines.append(f"Total bytes used by strings: {total_string_bytes:,}")
|
|
|
|
# Optimization targets
|
|
lines.append("")
|
|
lines.append("=" * table_width)
|
|
lines.append("POTENTIAL OPTIMIZATION TARGETS")
|
|
lines.append("=" * table_width)
|
|
|
|
# Repeated strings
|
|
repeated = self.get_repeated_strings()[:10]
|
|
if repeated:
|
|
lines.append("\nRepeated strings (could be deduplicated):")
|
|
for string, count in repeated:
|
|
savings = (count - 1) * (len(string) + 1)
|
|
clean_string = string[:50] + ("..." if len(string) > 50 else "")
|
|
lines.append(
|
|
f' "{clean_string}" - appears {count} times (potential savings: {savings} bytes)'
|
|
)
|
|
|
|
# Long strings - platform-specific advice
|
|
long_strings = self.get_long_strings(20)[:10]
|
|
if long_strings:
|
|
if self.platform == "esp8266":
|
|
lines.append(
|
|
"\nLong strings that could be moved to PROGMEM (>= 20 chars):"
|
|
)
|
|
else:
|
|
# ESP32: strings in DRAM are typically there for a reason
|
|
# (interrupt handlers, pre-flash-init code, etc.)
|
|
lines.append("\nLong strings in DRAM (>= 20 chars):")
|
|
lines.append(
|
|
"Note: ESP32 DRAM strings may be required for interrupt/early-boot contexts"
|
|
)
|
|
for ram_string in long_strings:
|
|
clean_string = ram_string.content[:60] + (
|
|
"..." if len(ram_string.content) > 60 else ""
|
|
)
|
|
lines.append(
|
|
f' {ram_string.section} @ 0x{ram_string.address:08x}: "{clean_string}" ({len(ram_string.content)} bytes)'
|
|
)
|
|
|
|
lines.append("")
|
|
return "\n".join(lines)
|