import re
import sys
from collections import defaultdict

def parse_hashcat_benchmark(file_path):
    """
    Parse a single hashcat benchmark output file.

    The device is the first ``* Device #N: <name>, ...`` line that is not
    marked "skipped". Speeds are then read from the ``Speed.#N`` lines of
    that same device, so the reported name and the reported numbers always
    belong together. The device number is matched regardless of
    zero-padding (``Speed.#1`` and ``Speed.#01`` are both accepted).

    Args:
        file_path: Path to a UTF-8 text file holding hashcat benchmark output.

    Returns:
        A tuple ``(device_name, hash_modes)``. ``hash_modes`` is a list of
        dicts with string values under the keys 'mode', 'name' and 'speed',
        in file order. Hash modes that have no speed line for the selected
        device are omitted.

    Raises:
        ValueError: If the file contains no non-skipped device line.
        OSError: If the file cannot be opened (propagated from ``open``).
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        content = f.read()

    # Device line: "* Device #<number>: <name>, ..." -- the name is everything
    # up to the first comma.
    device_pattern = re.compile(r'^\* Device #(\d+): (.*?),')
    device_name = None
    device_number = None
    for line in content.splitlines():
        line = line.strip()
        match = device_pattern.match(line)
        # Devices hashcat did not benchmark are flagged "skipped" on this line.
        if match and "skipped" not in line:
            device_number = int(match.group(1))
            device_name = match.group(2).strip()
            break  # The first non-skipped device is the one reported.

    if not device_name:
        raise ValueError(f"No valid device found in {file_path}")

    # Hash-mode header: "* Hash-Mode <number> (<name>)". The greedy group
    # keeps names that themselves contain parentheses intact.
    header_pattern = re.compile(r'\* Hash-Mode\s+(\d+)\s+\((.*)\)')
    # Speed line of the selected device only. "0*" tolerates zero-padded
    # device numbers; the lazy ".*?" stops at the first "H/s" on the line.
    speed_pattern = re.compile(
        r'Speed\.#0*' + str(device_number) + r'\.+:\s+([\d.]+\s+.*?H/s)'
    )

    headers = list(header_pattern.finditer(content))
    # A header's section runs up to the next header, or to the end of the
    # file for the last one.
    section_ends = [nxt.start() for nxt in headers[1:]] + [len(content)]

    hash_modes = []
    for header, end_pos in zip(headers, section_ends):
        speed_match = speed_pattern.search(content, header.end(), end_pos)
        if speed_match:
            hash_modes.append({
                'mode': header.group(1),
                'name': header.group(2),
                'speed': speed_match.group(1),
            })

    return device_name, hash_modes


def _render_markdown_table(columns, speeds_by_mode):
    """
    Render the benchmark comparison as Markdown text.

    Args:
        columns: Device column labels, in display order.
        speeds_by_mode: Mapping ``(mode, name) -> {column label: speed}``.

    Returns:
        The Markdown document as a string ending with a newline. Cells for
        which a device has no speed contain "N/A".
    """
    header_cells = ["Hash-Mode (number)", "Hash-Mode (name)"]
    # A raw '|' inside a cell would be read as a column separator.
    header_cells.extend(label.replace('|', '\\|') for label in columns)

    separator_cells = ["--------------------", "------------------"]
    separator_cells.extend(["-------"] * len(columns))

    lines = [
        "# Hashcat Benchmark Comparison",
        "",
        "| " + " | ".join(header_cells) + " |",
        "|" + "|".join(separator_cells) + "|",
    ]

    # Mode numbers are strings; sort numerically so "20" precedes "1000".
    ordered = sorted(speeds_by_mode.items(), key=lambda item: int(item[0][0]))
    for (mode_num, mode_name), speed_by_column in ordered:
        cells = [mode_num, mode_name.replace('|', '\\|')]
        cells.extend(speed_by_column.get(label, "N/A") for label in columns)
        lines.append("| " + " | ".join(cells) + " |")

    return "\n".join(lines) + "\n"


def main():
    """
    Compare hashcat benchmark files and print a Markdown table to stdout.

    Every command-line argument is a path to one benchmark output file.
    Each successfully parsed file becomes one speed column, labelled with
    its device name; if several files report the same device name, later
    columns get a " (#2)", " (#3)", ... suffix so no data is overwritten.
    Rows are hash modes, sorted by mode number; a device that has no speed
    for a mode shows "N/A" in that row.

    Files that are missing or cannot be parsed are reported with a warning
    and skipped. Exits with status 1 when no arguments are given or when no
    file yields any hash-mode data.
    """
    if len(sys.argv) < 2:
        print("Usage: python parse_hashcat_multiple.py <file1> [file2] [file3] ...")
        print("Each file is a hashcat benchmark output from a different device.")
        sys.exit(1)

    columns = []  # column labels, one per parsed file, in argument order
    # (mode, name) -> {column label: speed}. Keying speeds by column keeps
    # each value under the device that produced it even when some devices
    # lack a mode that others have.
    speeds_by_mode = defaultdict(dict)

    for file_path in sys.argv[1:]:
        try:
            device_name, hash_modes = parse_hashcat_benchmark(file_path)
        except FileNotFoundError:
            print(f"Warning: File '{file_path}' not found. Skipping.")
            continue
        except ValueError as e:
            print(f"Warning: {e}. Skipping {file_path}.")
            continue
        except Exception as e:
            # Best-effort tool: one unreadable file must not abort the rest.
            print(f"Warning: Unexpected error parsing {file_path}: {e}. Skipping.")
            continue

        # Give every file its own column, even when device names repeat.
        label = device_name
        duplicate_index = 2
        while label in columns:
            label = f"{device_name} (#{duplicate_index})"
            duplicate_index += 1
        columns.append(label)

        for mode_info in hash_modes:
            key = (mode_info['mode'], mode_info['name'])
            speeds_by_mode[key][label] = mode_info['speed']

    if not speeds_by_mode:
        print("Error: No valid data found in any input files.")
        sys.exit(1)

    print(_render_markdown_table(columns, speeds_by_mode))


# Run the command-line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()