From efcc78a5586f09eea10f520438c50537e29d4b85 Mon Sep 17 00:00:00 2001
From: RichardG867 <richardg867@gmail.com>
Date: Fri, 15 Apr 2022 20:23:37 -0300
Subject: [PATCH] Improve debug logging, it can now be enabled through -d

---
 biostools/__main__.py   | 51 +++++++++++++++++++-------
 biostools/analyzers.py  | 79 ++++++++++++++++++++++++++++++-----------
 biostools/extractors.py | 20 +++++------
 docker-entrypoint.sh    |  2 +-
 4 files changed, 109 insertions(+), 43 deletions(-)

diff --git a/biostools/__main__.py b/biostools/__main__.py
index bc6c64e..023e86a 100644
--- a/biostools/__main__.py
+++ b/biostools/__main__.py
@@ -89,7 +89,7 @@ def extract_dir(file_extractors, dir_number_path, next_dir_number_path, scan_dir
 	# Remove this directory if it ends up empty.
 	util.rmdirs(scan_dir_path)
 
-def extract_process(queue, dir_number_path, next_dir_number_path):
+def extract_process(queue, dir_number_path, next_dir_number_path, debug):
 	"""Main loop for the extraction multiprocessing pool."""
 
 	# Set up extractors.
@@ -117,6 +117,12 @@ def extract_process(queue, dir_number_path, next_dir_number_path):
 		extractors.MBRUnsafeExtractor(),
 	]
 
+	# Disable debug mode on extractors.
+	if not debug:
+		dummy_func = lambda self, *args: None
+		for extractor in file_extractors:
+			extractor.debug_print = dummy_func
+
 	# Receive work from the queue.
 	while True:
 		item = queue.get()
@@ -161,7 +167,7 @@ def extract(dir_path, _, options):
 		# Start multiprocessing pool.
 		print('Starting extraction on directory {0}'.format(dir_number), end='', flush=True)
 		queue = multiprocessing.Queue(maxsize=MP_PROCESS_COUNT)
-		mp_pool = multiprocessing.Pool(MP_PROCESS_COUNT, initializer=extract_process, initargs=(queue, dir_number_path, next_dir_number_path))
+		mp_pool = multiprocessing.Pool(MP_PROCESS_COUNT, initializer=extract_process, initargs=(queue, dir_number_path, next_dir_number_path, options.get('debug')))
 
 		# Create next directory.
 		if not os.path.isdir(next_dir_number_path):
@@ -471,7 +477,7 @@ def analyze_dir(formatter, scan_base, file_analyzers, scan_dir_path, scan_file_n
 		# Output the results.
 		formatter.output_row(fields)
 
-def analyze_process(queue, formatter, scan_base):
+def analyze_process(queue, formatter, scan_base, debug):
 	"""Main loop for the analysis multiprocessing pool."""
 
 	# Set up analyzers.
@@ -518,6 +524,13 @@ def analyze_process(queue, formatter, scan_base):
 		analyzers.ZenithAnalyzer(),
 	]
 
+	# Disable debug mode on analyzers.
+	if not debug:
+		dummy_func = lambda self, *args: None
+		for analyzer in file_analyzers:
+			analyzer.debug_print = dummy_func
+			analyzer.debug = False
+
 	# Receive work from the queue.
 	while True:
 		item = queue.get()
@@ -556,7 +569,7 @@ def analyze(dir_path, formatter_args, options):
 
 	# Start multiprocessing pool.
 	queue = multiprocessing.Queue(maxsize=MP_PROCESS_COUNT)
-	mp_pool = multiprocessing.Pool(MP_PROCESS_COUNT, initializer=analyze_process, initargs=(queue, formatter, dir_path))
+	mp_pool = multiprocessing.Pool(MP_PROCESS_COUNT, initializer=analyze_process, initargs=(queue, formatter, dir_path, options.get('debug')))
 
 	if os.path.isdir(dir_path):
 		# Scan directory structure.
@@ -579,21 +592,26 @@ def analyze(dir_path, formatter_args, options):
 
 
 def main():
+	# Set default options.
 	mode = None
 	options = {
 		'array': False,
+		'debug': False,
 		'format': 'csv',
 		'headers': True,
 		'hyperlink': False,
 		'docker-usage': False,
 	}
 
-	args, remainder = getopt.gnu_getopt(sys.argv[1:], 'xaf:hnr', ['extract', 'analyze', 'format=', 'hyperlink', 'no-headers', 'array', 'docker-usage'])
+	# Parse arguments.
+	args, remainder = getopt.gnu_getopt(sys.argv[1:], 'xadf:hnr', ['extract', 'analyze', 'debug', 'format=', 'hyperlink', 'no-headers', 'array', 'docker-usage'])
 	for opt, arg in args:
 		if opt in ('-x', '--extract'):
 			mode = 'extract'
 		elif opt in ('-a', '--analyze'):
 			mode = 'analyze'
+		elif opt in ('-d', '--debug'):
+			options['debug'] = True
 		elif opt in ('-f', '--format'):
 			options['format'] = arg.lower()
 		elif opt in ('-h', '--hyperlink'):
@@ -606,33 +624,42 @@ def main():
 			options['docker-usage'] = True
 
 	if len(remainder) > 0:
+		# Disable multi-threading in debug mode.
+		if options.get('debug'):
+			global MP_PROCESS_COUNT
+			MP_PROCESS_COUNT = 1
+
+		# Run mode handler.
 		if mode == 'extract':
 			return extract(remainder[0], remainder[1:], options)
 		elif mode == 'analyze':
 			return analyze(remainder[0], remainder[1:], options)
 
+	# Print usage.
 	if options['docker-usage']:
 		usage = '''
-Usage: docker run -v directory:/bios biostools [-f output_format] [-h] [-n] [-r] [formatter_options]
+Usage: docker run -v directory:/bios biostools [-d] [-f output_format] [-h] [-n] [-r] [formatter_options]
 
        Archives and BIOS images in the directory mounted to /bios will be
        extracted and analyzed.
 '''
 	else:
 		usage = '''
-Usage: python3 -m biostools -x directory
-       python3 -m biostools [-f output_format] [-h] [-n] [-r] -a directory|single_file [formatter_options]
+Usage: python3 -m biostools [-d] -x directory
+       python3 -m biostools [-d] [-f output_format] [-h] [-n] [-r] -a directory|single_file [formatter_options]
 
        -x    Extract archives and BIOS images recursively in the given directory
+       -d    Enable debug output.
 
        -a    Analyze extracted BIOS images in the given directory, or a single
              extracted file (extracting with -x first is recommended)'''
 	usage += '''
+	   -d    Enable debug output.
        -f    Output format:
-                 csv		Comma-separated values with quotes (default)
-                 scsv		Semicolon-separated values with quotes
-                 json		JSON object array
-                 jsontable	JSON table
+                 csv        Comma-separated values with quotes (default)
+                 scsv       Semicolon-separated values with quotes
+                 json       JSON object array
+                 jsontable  JSON table
        -h    Generate download links for file paths representing HTTP URLs.
              csv/scsv: The Excel HYPERLINK formula is used; if you have
                        non-English Excel, you must provide your language's
diff --git a/biostools/analyzers.py b/biostools/analyzers.py
index 10f80d1..fac8790 100644
--- a/biostools/analyzers.py
+++ b/biostools/analyzers.py
@@ -102,9 +102,9 @@ class AbortAnalysisError(Exception):
 
 
 class Analyzer:
-	def __init__(self, vendor, debug=False):
+	def __init__(self, vendor):
 		self.vendor_id = self.vendor = vendor
-		self.debug = debug
+		self.debug = True # to speed up analyze_line
 
 		self._check_list = []
 
@@ -124,7 +124,8 @@ class Analyzer:
 			if checker_result:
 				callback_result = callback_func(line, checker_result)
 				if callback_result:
-					self.debug_print(callback_func.__name__, line)
+					if self.debug:
+						self.debug_print(callback_func.__name__, '=>', repr(line))
 					return callback_result
 
 	def can_analyze(self):
@@ -137,10 +138,9 @@ class Analyzer:
 		   None if no such file exists."""
 		return True
 
-	def debug_print(self, key, line=None):
-		"""Print a line containing analyzer state if debugging is enabled."""
-		if self.debug:
-			print(self._file_path, '=> found', self.vendor_id, key, '=', (line == None) and 'no line' or repr(line), '\n', end='', file=sys.stderr)
+	def debug_print(self, *args):
+		"""Print a log line if debug output is enabled."""
+		print(self.__class__.__name__ + ':', *args, file=sys.stderr)
 
 	def register_check_list(self, check_list):
 		"""Register the list of checks this analyzer will handle.
@@ -177,7 +177,7 @@ class Analyzer:
 		self.addons = []
 		self.oroms = []
 
-		self._file_path = '?'
+		self._file_path = '[?]'
 
 class NoInfoAnalyzer(Analyzer):
 	"""Special analyzer for BIOSes which can be identified,
@@ -367,6 +367,8 @@ class AMIAnalyzer(Analyzer):
 		# rely on the header version data to get the Intel version sign-on.
 		if header_data:
 			is_intel = AMIIntelAnalyzer.can_handle(self, file_data, header_data)
+			if is_intel:
+				self.debug_print('Intel data found')
 		else:
 			is_intel = False
 
@@ -375,12 +377,14 @@ class AMIAnalyzer(Analyzer):
 		if match:
 			# Determine location of the identification block.
 			id_block_index = match.start(0)
+			self.debug_print('ID block starts at', hex(id_block_index))
 
 			# Extract version.
 			version_6plus = match.group(1)
 			if version_6plus:
 				# AMIBIOS 6 onwards.
 				self.version = version_6plus.decode('cp437', 'ignore')
+				self.debug_print('Version (6+):', self.version)
 
 				# Pad 4-digit versions. (Biostar)
 				if self.version[-1] not in '0123456789':
@@ -390,56 +394,73 @@ class AMIAnalyzer(Analyzer):
 				version_winbios_maj = match.group(2)
 				version_winbios_min = match.group(3)
 				if version_winbios_maj and version_winbios_min:
-					self.version = (version_winbios_maj + version_winbios_min).decode('cp437', 'ignore') + '00'
+					self.version = (version_winbios_maj + version_winbios_min).decode('cp437', 'ignore')
+					self.debug_print('Version (4-5):', self.version)
+					self.version += '00'
 					self.addons.append('WinBIOS')
 				else:
-					# AMI Color date.
+					# AMI Color (or WinBIOS 12/15/93) date.
 					self.version = match.group(4).decode('cp437', 'ignore')
+					self.debug_print('Version (Color):', self.version)
 
 			# Extract string.
 			self.string = util.read_string(file_data[id_block_index + 0x78:id_block_index + 0xa0])
 			if 'Intel Corporation' in self.string or len(self.string) <= 6: # (later Intel AMI with no string)
 				self.string = ''
+				self.debug_print('Intel with no string')
 			else:
+				self.debug_print('Base string:', self.string)
+
 				# Add identification tag to the string if one is present.
 				id_tag = util.read_string(file_data[id_block_index + 0xec:id_block_index + 0x100])
+				self.debug_print('String tag:', id_tag)
 				if id_tag[:4] == '_TG_':
 					self.string = self.string.rstrip() + '-' + id_tag[4:].lstrip()
 
-			# Stop if this BIOS is actually Aptio UEFI CSM.
-			if self._uefi_csm_pattern.match(self.string):
-				return False
+				# Stop if this BIOS is actually Aptio UEFI CSM.
+				if self._uefi_csm_pattern.match(self.string):
+					self.debug_print('String matches UEFI CSM, aborting')
+					return False
 
 			# Ignore unwanted string terminator on sign-on. (TriGem Lisbon-II)
 			signon_terminator = b'\x00'
 			if file_data[id_block_index + 0x123:id_block_index + 0x12b] == b' Inc.,\x00 ':
+				self.debug_print('Applying sign-on terminator hack')
 				signon_terminator += b'\x00'
 
 			# Extract sign-on, while removing carriage returns.
 			self.signon = util.read_string(file_data[id_block_index + 0x100:id_block_index + 0x200], terminator=signon_terminator)
+			self.debug_print('Raw sign-on:', repr(self.signon))
 
 			# The actual sign-on starts on the second line.
 			self.signon = '\n'.join(x.rstrip('\r').strip() for x in self.signon.split('\n')[1:] if x != '\r').strip('\n')
 		elif len(file_data) < 1024:
 			# Ignore false positives from sannata readmes.
+			self.debug_print('False positive by size of', len(file_data), 'bytes')
 			return False
 		elif self._precolor_date_pattern.search(file_data):
+			self.debug_print('Found potential pre-Color')
+
 			# Check date, using a different pattern to differentiate core date from build date.
 			match = self._precolor_core_date_pattern.search(file_data)
 			if match:
 				date_start = match.start(0)
+				self.debug_print('Pre-Color data starts at', hex(date_start))
 			else:
 				match = self._date_pattern.search(file_data)
 				date_start = 0
+				self.debug_print('Pre-Color data start is unknown')
 			if match:
 				# Extract date as the version.
 				self.version = (match.group(1) or (match.group(2) + b'/' + match.group(3) + b'/' + match.group(4))).decode('cp437', 'ignore')
+				self.debug_print('Version (pre-Color):', self.version)
 
 				# Check pre-Color identification block.
 				match = self._precolor_block_pattern.search(file_data)
 				if match:
 					# Determine location of the identification block.
 					id_block_index = match.start(0)
+					self.debug_print('Pre-Color ID block starts at', hex(id_block_index))
 
 					# Locate the encoded string.
 					match = self._precolor_string_pattern.search(file_data[date_start:])
@@ -453,6 +474,7 @@ class AMIAnalyzer(Analyzer):
 							if c & 0x80: # MSB termination
 								break
 						self.string = bytes(buf).decode('cp437', 'ignore')
+						self.debug_print('Base string:', self.string)
 
 						# Remove "-K" KBC suffix.
 						# Note: K without preceding - is possible (Atari PC5)
@@ -470,6 +492,8 @@ class AMIAnalyzer(Analyzer):
 						# Add date.
 						self.string += '-' + util.read_string(file_data[id_block_index + 0x9c:id_block_index + 0xa4]).replace('/', '').strip()
 
+						self.debug_print('Reconstructed string:', self.string)
+
 						# Invalidate string if the identification block doesn't
 						# appear to be valid. (Intel AMI post-Color without string)
 						if self.string[:10] in ('????-0000-', '????-0166-'):
@@ -478,6 +502,7 @@ class AMIAnalyzer(Analyzer):
 				elif check_match.group(1): # 8088-BIOS header
 					# Extract version.
 					self.version = check_match.group(1).decode('cp437', 'ignore')
+					self.debug_print('Version (8088):', self.string)
 
 					# Locate the encoded string.
 					match = self._8088_string_pattern.search(file_data)
@@ -489,20 +514,27 @@ class AMIAnalyzer(Analyzer):
 							c = (c << 1) | (c >> 7)
 							buf.append(c & 0x7f)
 						self.string = bytes(buf).decode('cp437', 'ignore')
+
+						self.debug_print('Base string:', self.string)
 					else:
 						# Fallback if we can't find the encoded string.
 						self.string = '????-' + self.version.replace('/', '')
 
+						self.debug_print('Reconstructed string:', self.string)
+
 				# Extract additional information after the copyright as a sign-on.
 				# (Shuttle 386SX, CDTEK 286, Flying Triumph Access Methods)
 				match = self._precolor_signon_pattern.search(file_data)
 				if match:
 					self.signon = match.group(1).decode('cp437', 'ignore')
+					self.debug_print('Raw sign-on:', repr(self.signon))
 
 					# Split sign-on lines. (Video Technology Info-Tech 286-BIOS)
 					self.signon = '\n'.join(x.strip() for x in self.signon.split('\n') if x.strip()).strip('\n')
 			else:
 				# Assume this is not an AMI BIOS, unless we found Intel data above.
+				if is_intel:
+					self.debug_print('No AMI data found but Intel data found')
 				return is_intel
 
 		return True
@@ -617,7 +649,7 @@ class AMIDellAnalyzer(AMIAnalyzer):
 		if file_data[:9] == b'DELLBIOS\x00':
 			# DELLBIOS header contains the Dell version.
 			self.version = '11/11/92'
-			self.debug_print('DELLBIOS header')
+			self.debug_print('DELLBIOS header present')
 
 			# Extract the version as a sign-on.
 			terminator_index = file_data.find(b'\x00', 10)
@@ -640,7 +672,7 @@ class AMIDellAnalyzer(AMIAnalyzer):
 
 			# The Dell version will be in the BIOS body.
 			self.version = '11/11/92'
-			self.debug_print('DELLXBIOS string')
+			self.debug_print('DELLXBIOS string present')
 
 			return True
 
@@ -1776,13 +1808,13 @@ class PhoenixAnalyzer(Analyzer):
 			# Extract the build code as a string.
 			build_code = self.string = util.read_string(file_data[offset + 55:offset + 63].replace(b'\x00', b'\x20')).strip()
 			if build_code:
-				self.debug_print('BCPSYS build code', build_code)
+				self.debug_print('BCPSYS build code:', build_code)
 
 			# Append the build date and time to the string.
 			date_time = util.read_string(file_data[offset + 15:offset + 32].replace(b'\x00', b'\x20')).strip()
 			if self._bcpsys_datetime_pattern.match(date_time): # discard if this is an invalid date/time (PHLASH.EXE)
 				self._bcpsys_date_time = date_time
-				self.debug_print('BCPSYS build date/time', date_time)
+				self.debug_print('BCPSYS build date/time:', date_time)
 				if self.string:
 					self.string += '\n'
 				self.string += date_time
@@ -1798,7 +1830,7 @@ class PhoenixAnalyzer(Analyzer):
 			if dell_version[0:1] == 'A':
 				self.signon += 'BIOS Version: ' + dell_version
 
-			self.debug_print('Dell version', dell_version)
+			self.debug_print('Dell version:', dell_version)
 		else:
 			# Determine if this is some sort of HP Vectra BIOS.
 			match = self._hp_pattern.search(file_data)
@@ -1812,7 +1844,7 @@ class PhoenixAnalyzer(Analyzer):
 				match = self._hp_signon_pattern.search(file_data)
 				if match:
 					self.signon = match.group(0).decode('cp437', 'ignore')
-					self.debug_print('HP version', self.signon)
+					self.debug_print('HP version:', self.signon)
 			else:
 				# Extract sign-on from Core and some 4.0 Release 6.0 BIOSes.
 				match = self._core_signon_pattern.search(file_data)
@@ -1825,15 +1857,22 @@ class PhoenixAnalyzer(Analyzer):
 						self.signon = self.signon.replace(frm, to)
 
 					self.signon = self.signon.decode('cp437', 'ignore')
+					print('Raw sign-on (4R6+):', repr(self.signon))
 				else:
 					# Extract sign-on from Ax86 and older BIOSes.
 					match = self._rombios_signon_pattern.search(file_data)
-					if not match:
+					if match:
+						signon_log = '(old std):'
+					else:
 						match = self._rombios_signon_alt_pattern.search(file_data)
+						signon_log = '(old alt):'
 					if match:
 						end = match.end(0)
 						if file_data[end] != 0xfa: # (unknown 8088 PLUS 2.52)
 							self.signon = util.read_string(file_data[end:end + 256])
+							print('Raw sign-on', signon_log, repr(self.signon))
+						else:
+							self.debug_print('Ignored bogus sign-on, first bytes:', repr(file_data[end:end + 8]))
 
 				# Split sign-on lines.
 				if self.signon:
diff --git a/biostools/extractors.py b/biostools/extractors.py
index f382373..978b0d4 100644
--- a/biostools/extractors.py
+++ b/biostools/extractors.py
@@ -33,9 +33,9 @@ class Extractor:
 		   - a string with the produced output file/directory path"""
 		raise NotImplementedError()
 
-	def log_print(self, *args):
-		"""Print a log line."""
-		print('{0}:'.format(self.__class__.__name__), *args, file=sys.stderr)
+	def debug_print(self, *args):
+		"""Print a log line if debug output is enabled."""
+		print(self.__class__.__name__ + ':', *args, file=sys.stderr)
 
 
 class ApricotExtractor(Extractor):
@@ -1365,7 +1365,7 @@ class IntelExtractor(Extractor):
 		# Create destination file.
 		dest_file_path = os.path.join(dest_dir, 'intel.bin')
 		out_f = open(dest_file_path, 'wb')
-		self.log_print('Found', len(found_parts), 'parts, header size', header_size, 'bytes, largest part size', largest_part_size, 'bytes')
+		self.debug_print('Found', len(found_parts), 'parts, header size', header_size, 'bytes, largest part size', largest_part_size, 'bytes')
 
 		# Copy parts to the destination file.
 		bootblock_offset = None
@@ -1404,11 +1404,11 @@ class IntelExtractor(Extractor):
 						if bootblock_offset < 0:
 							bootblock_offset = 0
 					dest_offset += bootblock_offset
-					self.log_print('bbo', hex(bootblock_offset), 'do', hex(dest_offset))
+					self.debug_print('bbo', hex(bootblock_offset), 'do', hex(dest_offset))
 				out_f.seek(dest_offset)
 
 				# Copy data.
-				self.log_print(data_length, 'bytes @', hex(dest_offset), '=>', found_part_path)
+				self.debug_print(data_length, 'bytes @', hex(dest_offset), '=>', found_part_path)
 				remaining = max(data_length, largest_part_size)
 				part_data = b' '
 				while part_data and remaining > 0:
@@ -1421,17 +1421,17 @@ class IntelExtractor(Extractor):
 					if data_length <= 8192 and len(found_parts_boot) == 0:
 						# Workaround for JN440BX, which requires its final
 						# part (sized 8 KB) to be at the end of the image.
-						self.log_print('> Final part non-padded')
+						self.debug_print('> Final part non-padded')
 						remaining = 0
 					elif data_length == largest_part_size and ((dest_offset >> 16) & 1) == int(invert):
 						# Workaround for SE440BX-2 and SRMK2, which require a
 						# gap at the final 64 KB where the boot block goes.
-						self.log_print('> Final part gap')
+						self.debug_print('> Final part gap')
 						remaining += largest_part_size
 				elif logical_area == 0 and dest_offset == bootblock_offset:
 					# Don't pad a boot block insertion.
 					remaining = 0
-				self.log_print('> Adding', remaining, 'padding bytes')
+				self.debug_print('> Adding', remaining, 'padding bytes')
 				while remaining > 0:
 					out_f.write(b'\xFF' * min(remaining, 1048576))
 					remaining -= 1048576
@@ -1461,7 +1461,7 @@ class IntelExtractor(Extractor):
 					out_f = open(dest_file_path + '.padded', 'wb')
 
 					# Write padding.
-					self.log_print('Padding by', hex(padding_size))
+					self.debug_print('Padding by', hex(padding_size))
 					while padding_size > 0:
 						out_f.write(b'\xFF' * min(padding_size, 1048576))
 						padding_size -= 1048576
diff --git a/docker-entrypoint.sh b/docker-entrypoint.sh
index 5b17aaa..212a540 100644
--- a/docker-entrypoint.sh
+++ b/docker-entrypoint.sh
@@ -20,7 +20,7 @@
 if [ ! -d /bios/0 -o -d /bios/1 ]
 then
 	# Run extractor.
-	python3 -u -m biostools -x /bios >&2
+	python3 -u -m biostools -x /bios $* >&2
 
 	# Print usage if there's no 1 directory (nothing bound to /bios).
 	[ ! -d /bios/1 ] && exec python3 -u -m biostools --docker-usage >&2