sdns-proxy/scripts/post_processing/merge_files.py

#!/usr/bin/env python3
"""
Merge all DNS test CSVs into a single unified CSV.
Extracts metadata from filenames and directory structure.
"""

import csv
from pathlib import Path
from dateutil import parser as date_parser
import argparse


def parse_config(filename: str) -> dict:
    """
    Parse protocol, dnssec_mode, and keep_alive from a result filename.

    The name, minus a trailing ``.csv`` extension, is split on ``-``. The
    first token is the protocol; the remaining tokens are optional flags:
    ``auth`` / ``trust`` select the DNSSEC mode and ``persist`` turns
    keep-alive on. Unrecognised tokens are ignored, and if several DNSSEC
    tokens are present the last one wins.

    Examples:
        doh3-auth.csv         → protocol=doh3, dnssec=auth, persist=0
        tls-trust-persist.csv → protocol=tls, dnssec=trust, persist=1
        https.csv             → protocol=https, dnssec=off, persist=0
        doudp-auth.csv        → protocol=doudp, dnssec=auth, persist=0
        dnscrypt-trust.csv    → protocol=dnscrypt, dnssec=trust, persist=0

    Args:
        filename: File name such as ``tls-trust.csv``.

    Returns:
        Dict with keys ``protocol`` (str), ``dnssec_mode`` (``'off'`` unless
        an ``auth``/``trust`` token is present) and ``keep_alive`` (0 or 1).
    """
    # Strip only the trailing extension. A blanket str.replace() would also
    # delete a '.csv' that appears in the middle of the name and thereby
    # corrupt the protocol token.
    if filename.lower().endswith('.csv'):
        base = filename[:-len('.csv')]
    else:
        base = filename

    # str.split always yields at least one element, so the unpack is safe.
    protocol, *flags = base.split('-')

    dnssec_mode = 'off'
    keep_alive = 0

    for flag in flags:
        if flag in ('auth', 'trust'):
            dnssec_mode = flag
        elif flag == 'persist':
            keep_alive = 1

    return {
        'protocol': protocol,
        'dnssec_mode': dnssec_mode,
        'keep_alive': keep_alive,
    }


def parse_timestamp_unix(ts_str: str) -> float:
    """
    Return the Unix epoch (seconds, as a float) for an ISO-8601 / RFC3339
    timestamp string.

    Parsing is best-effort: any failure (empty string, ``None``, malformed
    text, ...) yields ``0.0`` instead of raising.
    """
    try:
        return date_parser.isoparse(ts_str).timestamp()
    except Exception:
        # Deliberately broad: one bad timestamp must not abort the merge.
        return 0.0


def ns_to_ms(duration_ns: str) -> float:
    """
    Convert a nanosecond duration to milliseconds.

    The input is coerced with ``float()``; values that cannot be converted
    (empty string, non-numeric text, ``None``) produce ``0.0``.
    """
    try:
        nanoseconds = float(duration_ns)
    except (ValueError, TypeError):
        return 0.0
    return nanoseconds / 1_000_000


def find_csv_files(input_dir: Path) -> list:
    """
    Recursively collect the CSV files under ``input_dir``.

    Names are compared case-insensitively. A file is skipped when its name
    contains ``.bak`` or ends in ``.cpu.csv`` / ``.mem.csv``. The remaining
    paths are returned as a sorted list.
    """
    skipped_suffixes = ('.cpu.csv', '.mem.csv')

    def wanted(candidate):
        lowered = candidate.name.lower()
        return '.bak' not in lowered and not lowered.endswith(skipped_suffixes)

    return sorted(path for path in input_dir.rglob('*.csv') if wanted(path))


# Column order of the merged CSV.
_OUTPUT_COLUMNS = [
    'id',
    'provider',
    'protocol',
    'dnssec_mode',
    'domain',
    'query_type',
    'keep_alive',
    'dns_server',
    'timestamp',
    'timestamp_unix',
    'duration_ns',
    'duration_ms',
    'request_size_bytes',
    'response_size_bytes',
    'bytes_sent',
    'bytes_received',
    'packets_sent',
    'packets_received',
    'total_bytes',
    'response_code',
    'error',
]

# Columns copied unchanged from each input row ('' when the column is absent).
_PASSTHROUGH_COLUMNS = (
    'domain',
    'query_type',
    'dns_server',
    'timestamp',
    'duration_ns',
    'request_size_bytes',
    'response_size_bytes',
    'bytes_sent',
    'bytes_received',
    'packets_sent',
    'packets_received',
    'total_bytes',
    'response_code',
    'error',
)


def _build_output_row(row_id: int, provider: str, config: dict, row: dict) -> dict:
    """Combine file-level metadata with one input row into one output row."""
    out_row = {
        'id': row_id,
        'provider': provider,
        'protocol': config['protocol'],
        'dnssec_mode': config['dnssec_mode'],
        'keep_alive': config['keep_alive'],
        # Derived columns; both helpers fall back to 0.0 on bad input.
        'timestamp_unix': parse_timestamp_unix(row.get('timestamp', '')),
        'duration_ms': ns_to_ms(row.get('duration_ns', '')),
    }
    for column in _PASSTHROUGH_COLUMNS:
        out_row[column] = row.get(column, '')
    return out_row


def merge_all_csvs(input_dir: Path, output_path: Path):
    """
    Merge all CSVs found under ``input_dir`` into a single file.

    For every input file the provider is taken from the name of its parent
    directory (lower-cased) and protocol / DNSSEC mode / keep-alive from the
    file name. Each input row is written to ``output_path`` with those
    metadata columns, a running global ``id`` and the derived
    ``timestamp_unix`` and ``duration_ms`` columns. Progress is printed to
    stdout.

    Args:
        input_dir: Directory that is searched recursively for CSV files.
        output_path: Destination CSV; it is overwritten if it exists.

    Returns:
        None. When no input files are found a message is printed and no
        output file is created.
    """
    # The output may live inside input_dir (the CLI defaults do exactly that).
    # A previous run's output must not be picked up as an input: it would be
    # truncated by the open() below and then read while it is being written.
    output_resolved = output_path.resolve()
    csv_files = [
        path for path in find_csv_files(input_dir)
        if path.resolve() != output_resolved
    ]

    if not csv_files:
        print("No CSV files found")
        return

    print(f"Found {len(csv_files)} CSV files")

    global_id = 0
    total_rows = 0

    with open(output_path, 'w', newline='', encoding='utf-8') as outfile:
        writer = csv.DictWriter(outfile, fieldnames=_OUTPUT_COLUMNS)
        writer.writeheader()

        for csv_path in csv_files:
            # Extract provider from path
            provider = csv_path.parent.name.lower()

            # Parse config from filename
            config = parse_config(csv_path.name)

            print(f"  {provider}/{csv_path.name} ({config['protocol']}, {config['dnssec_mode']}, persist={config['keep_alive']})")

            file_rows = 0

            with open(csv_path, 'r', newline='', encoding='utf-8') as infile:
                for row in csv.DictReader(infile):
                    global_id += 1
                    file_rows += 1
                    writer.writerow(
                        _build_output_row(global_id, provider, config, row)
                    )

            total_rows += file_rows
            print(f"    → {file_rows:,} rows")

    print(f"\n{'='*60}")
    print(f"Output: {output_path}")
    print(f"Total rows: {total_rows:,}")
    print(f"{'='*60}")


def main() -> int:
    """
    Command-line entry point.

    Parses the optional ``input_dir`` positional argument (default ``.``)
    and the ``-o/--output`` option (default ``dns_results.csv``), prints a
    banner and runs the merge.

    Returns:
        Process exit status: 1 when ``input_dir`` is not an existing
        directory, 0 otherwise.
    """
    import sys  # local import: only needed for the error stream

    parser = argparse.ArgumentParser(
        description='Merge all DNS test CSVs into a single file'
    )
    parser.add_argument(
        'input_dir',
        nargs='?',
        default='.',
        help='Input directory containing provider folders (default: .)'
    )
    parser.add_argument(
        '-o', '--output',
        default='dns_results.csv',
        help='Output CSV path (default: dns_results.csv)'
    )

    args = parser.parse_args()

    input_dir = Path(args.input_dir)
    output_path = Path(args.output)

    # is_dir() rather than exists(): a regular file passed as input_dir would
    # otherwise slip through and end in a misleading "No CSV files found"
    # with a success exit status.
    if not input_dir.is_dir():
        print(f"Error: Input directory not found: {input_dir}", file=sys.stderr)
        return 1

    print("="*60)
    print("MERGE ALL DNS CSVs")
    print("="*60)
    print(f"Input:  {input_dir}")
    print(f"Output: {output_path}")
    print()

    merge_all_csvs(input_dir, output_path)

    return 0


if __name__ == '__main__':
    # Raise SystemExit directly: the bare exit() helper is installed by the
    # site module for interactive use and is unavailable under `python -S`.
    raise SystemExit(main())