# sdns-proxy/scripts/tools/add_extra_metrics_to_csv.py

#!/usr/bin/env python3
"""
Add network metrics from PCAP files to DNS CSV files.
Adds: raw_bytes_total, raw_packet_count, overhead_bytes, efficiency_percent
"""

import csv
import os
import argparse
import re
from pathlib import Path
from datetime import datetime, timezone
from scapy.all import rdpcap

def parse_timestamp(ts_str):
    """Parse an RFC 3339 timestamp with optional fractional seconds.

    Accepts either a numeric UTC offset (``+02:00``) or the ``Z`` designator,
    and a fractional-second part of any length or none at all. Producers of
    RFC3339Nano-style timestamps may drop trailing zeros from the fraction
    and emit ``Z`` for UTC, so both forms have to be understood here.

    Args:
        ts_str: Timestamp text, e.g. ``2024-01-01T12:00:00.123456789+02:00``.

    Returns:
        A ``(dt, nanos)`` tuple. ``dt`` is a timezone-aware ``datetime``
        truncated to microsecond precision; ``nanos`` is the complete
        fractional second expressed as integer nanoseconds
        (0..999_999_999).

    Raises:
        ValueError: If ``ts_str`` does not start with a recognised timestamp.
    """
    match = re.match(
        r'(\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2})'
        r'(?:\.(\d+))?'
        r'(Z|[\+\-]\d{2}:\d{2})',
        ts_str
    )

    if not match:
        raise ValueError(f"Invalid timestamp format: {ts_str}")

    base, fraction, tz = match.groups()
    fraction = fraction or ''
    if tz == 'Z':
        # Spell the offset out: older ``fromisoformat`` rejects a bare "Z".
        tz = '+00:00'

    # datetime only stores microseconds, so feed it the first six digits.
    micros = fraction[:6].ljust(6, '0')
    dt = datetime.fromisoformat(f"{base}.{micros}{tz}")

    # Keep at most nine digits so the value never exceeds one second.
    full_nanos = int(fraction[:9].ljust(9, '0'))

    return dt, full_nanos

def read_pcap(pcap_path):
    """Load a capture file as a list of ``(epoch_seconds, length)`` pairs.

    ``epoch_seconds`` is the packet's ``time`` attribute converted to float
    and ``length`` is ``len(packet)``. Any failure while reading or
    converting is reported on stdout and an empty list is returned instead.
    """
    try:
        captured = rdpcap(str(pcap_path))
        return [(float(frame.time), len(frame)) for frame in captured]
    except Exception as err:
        print(f"  ❌ Error reading PCAP: {err}")
        return []

def find_packets_in_window(packets, start_ts, start_nanos, duration_ns):
    """Sum the packets whose timestamps fall inside a query's time window.

    The window is ``[start, start + duration]`` with both ends inclusive.

    Args:
        packets: Iterable of ``(epoch_seconds, length)`` pairs.
        start_ts: ``datetime`` of the window start, carrying microsecond
            precision.
        start_nanos: Full fractional second of the start time in
            nanoseconds; only its sub-microsecond remainder is used here.
        duration_ns: Window length in nanoseconds.

    Returns:
        ``(total_bytes, packet_count)`` for the packets inside the window.
    """
    # start_ts.timestamp() already contains the microseconds, so only the
    # last three nanosecond digits are still missing. Adding anything larger
    # would count the microseconds twice and push the window late.
    start_epoch = start_ts.timestamp() + (start_nanos % 1_000) / 1_000_000_000
    end_epoch = start_epoch + (duration_ns / 1_000_000_000)

    total_bytes = 0
    packet_count = 0

    for pkt_ts, pkt_len in packets:
        if start_epoch <= pkt_ts <= end_epoch:
            total_bytes += pkt_len
            packet_count += 1

    return total_bytes, packet_count

def enhance_csv(csv_path, pcap_path, output_path, debug=False):
    """Write a copy of a DNS CSV with per-query PCAP metrics appended.

    For every row, the packets captured during
    ``[timestamp, timestamp + duration_ns]`` are summed and four columns are
    added: ``raw_bytes_total``, ``raw_packet_count``, ``overhead_bytes``
    (raw bytes minus request + response sizes; negative when nothing
    matched) and ``efficiency_percent`` (useful / raw * 100, or 0 when no
    packets matched). Rows that cannot be parsed get zeroed metrics.

    Args:
        csv_path: Input CSV; rows are read via the ``timestamp``,
            ``duration_ns``, ``request_size_bytes`` and
            ``response_size_bytes`` columns (``domain`` is printed in debug
            output when present).
        pcap_path: Capture file to take packet sizes from.
        output_path: Destination CSV; parent directories are created.
        debug: Print timing diagnostics and details of the first queries.

    Returns:
        ``True`` when the output file was written, ``False`` when the PCAP
        is missing or yields no packets.
    """
    metric_columns = [
        'raw_bytes_total',
        'raw_packet_count',
        'overhead_bytes',
        'efficiency_percent'
    ]
    # A short row yields None values (TypeError in int()/re.match), so treat
    # that like any other malformed row.
    row_errors = (ValueError, KeyError, TypeError)

    if not os.path.exists(pcap_path):
        print(f"⚠️  PCAP not found: {pcap_path}")
        return False

    print(f"Processing: {os.path.basename(csv_path)}")

    # Read PCAP
    packets = read_pcap(pcap_path)
    print(f"  Loaded {len(packets)} packets")

    if not packets:
        print("  ❌ No packets found")
        return False

    if debug:
        first_pcap = packets[0][0]
        last_pcap = packets[-1][0]
        print(f"  First PCAP packet: {first_pcap:.6f}")
        print(f"  Last PCAP packet:  {last_pcap:.6f}")
        print(f"  PCAP duration: {(last_pcap - first_pcap):.3f}s")

    # Read CSV
    with open(csv_path, 'r', newline='') as f:
        reader = csv.DictReader(f)
        # fieldnames is None for an empty file; also avoid duplicating the
        # metric columns when the input has already been enriched.
        fieldnames = list(reader.fieldnames or [])
        fieldnames += [c for c in metric_columns if c not in fieldnames]
        rows = list(reader)

    if rows and debug:
        try:
            first_ts, _ = parse_timestamp(rows[0]['timestamp'])
            last_ts, _ = parse_timestamp(rows[-1]['timestamp'])
            print(f"  First CSV query:  {first_ts.timestamp():.6f}")
            print(f"  Last CSV query:   {last_ts.timestamp():.6f}")
            offset = packets[0][0] - first_ts.timestamp()
            print(f"  Time offset (PCAP - CSV): {offset:.3f}s")
        except row_errors as e:
            # Diagnostics only: report and carry on with the enrichment.
            print(f"  Could not compare CSV/PCAP timestamps: {e}")

    # Enhance rows
    enhanced = []
    matched = 0

    for i, row in enumerate(rows):
        try:
            timestamp, nanos = parse_timestamp(row['timestamp'])
            duration_ns = int(row['duration_ns'])
            useful_bytes = (
                int(row['request_size_bytes']) +
                int(row['response_size_bytes'])
            )
        except row_errors as e:
            if debug:
                print(f"  Error processing row {i}: {e}")
            row['raw_bytes_total'] = 0
            row['raw_packet_count'] = 0
            row['overhead_bytes'] = 0
            row['efficiency_percent'] = "0.00"
            enhanced.append(row)
            continue

        raw_bytes, packet_count = find_packets_in_window(
            packets, timestamp, nanos, duration_ns
        )
        overhead = raw_bytes - useful_bytes
        efficiency = (
            (useful_bytes / raw_bytes * 100)
            if raw_bytes > 0 else 0
        )

        row['raw_bytes_total'] = raw_bytes
        row['raw_packet_count'] = packet_count
        row['overhead_bytes'] = overhead
        row['efficiency_percent'] = f"{efficiency:.2f}"

        if raw_bytes > 0:
            matched += 1

        # Debug first few queries
        if debug and i < 3:
            # .get(): a missing 'domain' column must not discard the
            # metrics that were just computed for this row.
            print(f"  Query {i}: {row.get('domain', '')}")
            print(f"    Duration: {duration_ns / 1e6:.3f}ms")
            print(f"    Matched packets: {packet_count}")
            print(f"    Raw bytes: {raw_bytes}")
            print(f"    Useful bytes: {useful_bytes}")
            print(f"    Efficiency: {efficiency:.2f}%")

        enhanced.append(row)

    print(f"  Matched: {matched}/{len(rows)} queries")

    if matched == 0:
        print("  ⚠️  WARNING: No queries matched any packets!")
        print("     This might indicate timestamp misalignment.")

    # Write output
    out_dir = os.path.dirname(output_path)
    if out_dir:
        # dirname is '' for a bare filename, which makedirs rejects.
        os.makedirs(out_dir, exist_ok=True)
    with open(output_path, 'w', newline='') as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames, extrasaction='ignore')
        writer.writeheader()
        writer.writerows(enhanced)

    print(f"  ✓ Saved: {output_path}")
    return True

def main(argv=None):
    """Command-line entry point: enrich every CSV under a directory.

    Each ``*.csv`` found recursively under the input directory is paired
    with the ``.pcap`` file of the same name and written, enriched, to the
    same relative path under the output directory.

    Args:
        argv: Argument list to parse; ``None`` (the default) makes argparse
            read ``sys.argv[1:]``.

    Returns:
        Process exit status: ``0`` when every file was processed (or for a
        dry run), ``1`` when the input directory is missing, holds no CSV
        files, or at least one file failed.
    """
    parser = argparse.ArgumentParser(
        description='Add PCAP network metrics to DNS CSV files'
    )
    parser.add_argument('input_dir', help='Input directory (e.g., results)')
    parser.add_argument(
        '--output',
        default='./results_enriched',
        help='Output directory (default: ./results_enriched)'
    )
    parser.add_argument(
        '--dry-run',
        action='store_true',
        help='Preview files without processing'
    )
    parser.add_argument(
        '--debug',
        action='store_true',
        help='Show detailed timing information'
    )

    args = parser.parse_args(argv)

    print("=" * 60)
    print("ENHANCE DNS CSVs WITH PCAP METRICS")
    print("=" * 60)
    print(f"Input:  {args.input_dir}")
    print(f"Output: {args.output}")
    if args.debug:
        print("Debug:  ENABLED")
    print()

    input_dir = Path(args.input_dir)
    if not input_dir.is_dir():
        print(f"❌ Input directory not found: {args.input_dir}")
        return 1

    # Find CSV files; sorted so runs are reproducible across filesystems.
    csv_files = sorted(input_dir.rglob('*.csv'))

    if not csv_files:
        print("❌ No CSV files found")
        return 1

    print(f"Found {len(csv_files)} CSV files\n")

    if args.dry_run:
        print("DRY RUN - would process:")
        for csv_path in csv_files:
            pcap_path = csv_path.with_suffix('.pcap')
            print(f"  {csv_path.relative_to(args.input_dir)}")
            print(f"    PCAP: {'✓' if pcap_path.exists() else '✗'}")
        return 0

    # Process files
    success = 0
    failed = 0

    for csv_path in csv_files:
        pcap_path = csv_path.with_suffix('.pcap')
        rel_path = csv_path.relative_to(args.input_dir)
        output_path = Path(args.output) / rel_path

        if enhance_csv(str(csv_path), str(pcap_path), str(output_path),
                       args.debug):
            success += 1
        else:
            failed += 1
        print()

    # Summary
    print("=" * 60)
    print(f"✓ Success: {success}")
    print(f"✗ Failed:  {failed}")
    print(f"Total:     {len(csv_files)}")
    print(f"\nOutput: {args.output}")

    return 0 if failed == 0 else 1

if __name__ == "__main__":
    # The exit() builtin is injected by the site module for interactive use
    # and is absent under "python -S"; raising SystemExit always works.
    raise SystemExit(main())