sdns-proxy/scripts/tools/add_extra_metrics_to_csv.py

#!/usr/bin/env python3
"""
Add network metrics from PCAP files to DNS CSV files.
Adds: pcap_network_bytes_in, pcap_network_bytes_out, pcap_overhead_bytes
"""
import argparse
import csv
import os
import socket
import sys
from datetime import datetime, timezone
from pathlib import Path

import dpkt

# IP addresses of the test machine, used to classify packet direction:
# packets *to* these addresses count as inbound, packets *from* them as outbound.
TEST_IPS = {
    '10.0.0.50',
    '2001:818:e73e:ba00:5506:dfd4:ed8b:96e',
    'fe80::fe98:c62e:4463:9a2d',
}


def inet_to_str(inet):
    """Convert packed inet bytes to an IP address string (IPv4 or IPv6)."""
    try:
        return socket.inet_ntop(socket.AF_INET, inet)
    except ValueError:
        try:
            return socket.inet_ntop(socket.AF_INET6, inet)
        except ValueError:
            return None


def read_pcap(pcap_path):
    """Read a PCAP and return a list of (timestamp_ns, size, src_ip, dst_ip)."""
    packets = []
    with open(pcap_path, 'rb') as f:
        try:
            pcap = dpkt.pcap.Reader(f)
        except ValueError:
            # Not a classic pcap (bad magic); retry the file as pcapng.
            f.seek(0)
            pcap = dpkt.pcapng.Reader(f)
        for ts, buf in pcap:
            try:
                # Convert PCAP timestamp (float seconds) to nanoseconds
                timestamp_ns = int(ts * 1_000_000_000)
                size = len(buf)
                eth = dpkt.ethernet.Ethernet(buf)
                src_ip = dst_ip = None
                if isinstance(eth.data, (dpkt.ip.IP, dpkt.ip6.IP6)):
                    src_ip = inet_to_str(eth.data.src)
                    dst_ip = inet_to_str(eth.data.dst)
                if src_ip and dst_ip:
                    packets.append((timestamp_ns, size, src_ip, dst_ip))
            except (dpkt.dpkt.NeedData, dpkt.dpkt.UnpackError):
                continue
    return packets


def find_packets_in_window(packets, start_ns, duration_ns):
    """Find packets within an exact time window (nanosecond precision)."""
    end_ns = start_ns + duration_ns
    matching = []
    for timestamp_ns, size, src_ip, dst_ip in packets:
        if start_ns <= timestamp_ns <= end_ns:
            matching.append((size, src_ip, dst_ip))
    return matching
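

# Optional speed-up (a sketch, not wired into enhance_csv below): read_pcap()
# yields packets in capture order, so assuming non-decreasing timestamps the
# linear scan above can be replaced by a binary search. `timestamps` would be
# precomputed once per PCAP as [p[0] for p in packets].
def find_packets_in_window_fast(packets, timestamps, start_ns, duration_ns):
    """Bisect-based variant of find_packets_in_window."""
    import bisect
    # First index with ts >= start_ns, then first index past the window end.
    lo = bisect.bisect_left(timestamps, start_ns)
    hi = bisect.bisect_right(timestamps, start_ns + duration_ns)
    return [(size, src, dst) for _, size, src, dst in packets[lo:hi]]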


def calculate_metrics(packets):
    """Calculate directional byte counts from (size, src_ip, dst_ip) tuples."""
    bytes_in = 0
    bytes_out = 0
    for size, src_ip, dst_ip in packets:
        if dst_ip in TEST_IPS:
            bytes_in += size
        elif src_ip in TEST_IPS:
            bytes_out += size
    return {
        'pcap_network_bytes_in': bytes_in,
        'pcap_network_bytes_out': bytes_out,
        'pcap_overhead_bytes': bytes_in + bytes_out,
    }
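
# Worked example (IPs are illustrative; 8.8.8.8 stands in for a resolver):
#   calculate_metrics([(80, '10.0.0.50', '8.8.8.8'),      # query leaving us
#                      (120, '8.8.8.8', '10.0.0.50')])    # response arriving
#   -> {'pcap_network_bytes_in': 120, 'pcap_network_bytes_out': 80,
#       'pcap_overhead_bytes': 200}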


def parse_timestamp_to_ns(ts_str):
    """Parse an ISO-8601 timestamp to nanoseconds since the epoch."""
    try:
        dt = datetime.fromisoformat(ts_str.replace('Z', '+00:00'))
        if dt.tzinfo is not None:
            dt = dt.astimezone(timezone.utc)
        # Note: naive timestamps (no offset) are interpreted as local time by
        # .timestamp(); CSV timestamps are expected to carry an offset or 'Z'.
        return int(dt.timestamp() * 1_000_000_000)
    except ValueError:
        return None
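
# For example, parse_timestamp_to_ns('2024-01-01T00:00:00Z') (an illustrative
# timestamp) returns 1704067200000000000; malformed strings return None.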


def enhance_csv(csv_path, pcap_path, output_path, debug=False):
    """Add PCAP metrics to a CSV, writing the result to output_path."""
    if not os.path.exists(pcap_path):
        print(f"⚠️ PCAP not found: {pcap_path}")
        return False

    print(f"Processing: {os.path.basename(csv_path)}")

    # Read PCAP
    try:
        packets = read_pcap(pcap_path)
        print(f" Loaded {len(packets)} packets")
        if packets and debug:
            first_pcap_ns = packets[0][0]
            last_pcap_ns = packets[-1][0]
            print(f" First PCAP packet: {first_pcap_ns} ns")
            print(f" Last PCAP packet: {last_pcap_ns} ns")
            print(f" PCAP duration: {(last_pcap_ns - first_pcap_ns) / 1e9:.3f}s")
    except Exception as e:
        print(f" ❌ Error reading PCAP: {e}")
        return False

    if not packets:
        print(" ❌ No packets found")
        return False

    # Read CSV
    with open(csv_path, 'r', newline='') as f:
        reader = csv.DictReader(f)
        fieldnames = list(reader.fieldnames) + [
            'pcap_network_bytes_in',
            'pcap_network_bytes_out',
            'pcap_overhead_bytes',
        ]
        rows = list(reader)

    if rows and debug:
        first_csv_ns = parse_timestamp_to_ns(rows[0]['timestamp'])
        last_csv_ns = parse_timestamp_to_ns(rows[-1]['timestamp'])
        if first_csv_ns and last_csv_ns:
            print(f" First CSV query: {first_csv_ns} ns")
            print(f" Last CSV query: {last_csv_ns} ns")
            print(f" CSV duration: {(last_csv_ns - first_csv_ns) / 1e9:.3f}s")
            # Check alignment between capture and query timestamps
            offset_ns = packets[0][0] - first_csv_ns
            print(f" Time offset (PCAP - CSV): {offset_ns / 1e9:.3f}s")

    # Enhance rows
    enhanced = []
    matched = 0
    for i, row in enumerate(rows):
        ts_ns = parse_timestamp_to_ns(row['timestamp'])
        if ts_ns is None:
            # Unparseable timestamp: the row is dropped from the output.
            continue
        # Guard against a missing *or empty* duration_ns field.
        duration_ns = int(row.get('duration_ns') or 0)
        matching_packets = find_packets_in_window(packets, ts_ns, duration_ns)
        metrics = calculate_metrics(matching_packets)
        row.update(metrics)
        enhanced.append(row)
        if metrics['pcap_overhead_bytes'] > 0:
            matched += 1
        # Debug first few queries
        if debug and i < 3:
            print(f" Query {i}: {row['domain']}")
            print(f" Start: {ts_ns} ns")
            print(f" Duration: {duration_ns} ns ({duration_ns / 1e6:.3f}ms)")
            print(f" End: {ts_ns + duration_ns} ns")
            print(f" Matched packets: {len(matching_packets)}")
            print(f" Bytes: {metrics['pcap_overhead_bytes']}")

    print(f" Matched: {matched}/{len(rows)} queries")
    if matched == 0:
        print(" ⚠️ WARNING: No queries matched any packets!")
        print(" This might indicate timestamp misalignment.")

    # Write output
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    with open(output_path, 'w', newline='') as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames, extrasaction='ignore')
        writer.writeheader()
        writer.writerows(enhanced)
    print(f" ✓ Saved: {output_path}")
    return True


def main():
    parser = argparse.ArgumentParser(
        description='Add PCAP network metrics to DNS CSV files'
    )
    parser.add_argument('input_dir', help='Input directory (e.g., results_merged)')
    parser.add_argument(
        '--output',
        default='./results_enhanced',
        help='Output directory (default: ./results_enhanced)'
    )
    parser.add_argument(
        '--dry-run',
        action='store_true',
        help='Preview files without processing'
    )
    parser.add_argument(
        '--debug',
        action='store_true',
        help='Show detailed timing information'
    )
    args = parser.parse_args()

    print("=" * 60)
    print("ENHANCE DNS CSVs WITH PCAP METRICS")
    print("=" * 60)
    print(f"Input: {args.input_dir}")
    print(f"Output: {args.output}")
    if args.debug:
        print("Debug: ENABLED")
    print()

    # Find CSV files
    csv_files = list(Path(args.input_dir).rglob('*.csv'))
    if not csv_files:
        print("❌ No CSV files found")
        return 1
    print(f"Found {len(csv_files)} CSV files\n")

    if args.dry_run:
        print("DRY RUN - would process:")
        for csv_path in csv_files:
            pcap_path = csv_path.with_suffix('.pcap')
            print(f" {csv_path.relative_to(args.input_dir)}")
            print(f" PCAP: {'✓' if pcap_path.exists() else '✗'}")
        return 0

    # Process files
    success = 0
    failed = 0
    for csv_path in csv_files:
        pcap_path = csv_path.with_suffix('.pcap')
        rel_path = csv_path.relative_to(args.input_dir)
        output_path = Path(args.output) / rel_path
        if enhance_csv(str(csv_path), str(pcap_path), str(output_path),
                       args.debug):
            success += 1
        else:
            failed += 1
        print()

    # Summary
    print("=" * 60)
    print(f"✓ Success: {success}")
    print(f"✗ Failed: {failed}")
    print(f"Total: {len(csv_files)}")
    print(f"\nOutput: {args.output}")
    return 0 if failed == 0 else 1


if __name__ == "__main__":
    sys.exit(main())