feat(scripts): add scripts to process data

2025-10-11 23:12:31 +01:00
parent 319c9d0767
commit 4cec2fabd4
8 changed files with 932 additions and 8 deletions
--- a/scripts/tools/add_extra_metrics_to_csv.py
+++ b/scripts/tools/add_extra_metrics_to_csv.py
@@ -0,0 +1,284 @@
+#!/usr/bin/env python3
+"""
+Add network metrics from PCAP files to DNS CSV files.
+Adds: pcap_network_bytes_in, pcap_network_bytes_out, pcap_overhead_bytes
+"""
+
+import csv
+import os
+import argparse
+from pathlib import Path
+from datetime import datetime, timezone
+import dpkt
+import socket
+
+# Test machine IPs
+TEST_IPS = {
+    '10.0.0.50',
+    '2001:818:e73e:ba00:5506:dfd4:ed8b:96e',
+    'fe80::fe98:c62e:4463:9a2d'
+}
+
+
+def inet_to_str(inet):
+    """Convert inet bytes to IP string"""
+    try:
+        return socket.inet_ntop(socket.AF_INET, inet)
+    except ValueError:
+        try:
+            return socket.inet_ntop(socket.AF_INET6, inet)
+        except ValueError:
+            return None
+
+
+def read_pcap(pcap_path):
+    """Read PCAP and return list of (timestamp_ns, size, src_ip, dst_ip)"""
+    packets = []
+    
+    with open(pcap_path, 'rb') as f:
+        try:
+            pcap = dpkt.pcap.Reader(f)
+        except:
+            f.seek(0)
+            pcap = dpkt.pcapng.Reader(f)
+        
+        for ts, buf in pcap:
+            try:
+                # Convert PCAP timestamp (float seconds) to nanoseconds
+                timestamp_ns = int(ts * 1_000_000_000)
+                size = len(buf)
+                eth = dpkt.ethernet.Ethernet(buf)
+                
+                src_ip = dst_ip = None
+                
+                if isinstance(eth.data, dpkt.ip.IP):
+                    src_ip = inet_to_str(eth.data.src)
+                    dst_ip = inet_to_str(eth.data.dst)
+                elif isinstance(eth.data, dpkt.ip6.IP6):
+                    src_ip = inet_to_str(eth.data.src)
+                    dst_ip = inet_to_str(eth.data.dst)
+                
+                if src_ip and dst_ip:
+                    packets.append((timestamp_ns, size, src_ip, dst_ip))
+                    
+            except (dpkt.dpkt.NeedData, dpkt.dpkt.UnpackError):
+                continue
+    
+    return packets
+
+
+def find_packets_in_window(packets, start_ns, duration_ns):
+    """Find packets within exact time window (nanosecond precision)"""
+    end_ns = start_ns + duration_ns
+    
+    matching = []
+    for timestamp_ns, size, src_ip, dst_ip in packets:
+        if start_ns <= timestamp_ns <= end_ns:
+            matching.append((size, src_ip, dst_ip))
+    
+    return matching
+
+
+def calculate_metrics(packets):
+    """Calculate network metrics from packets"""
+    bytes_in = 0
+    bytes_out = 0
+    
+    for size, src_ip, dst_ip in packets:
+        if dst_ip in TEST_IPS:
+            bytes_in += size
+        elif src_ip in TEST_IPS:
+            bytes_out += size
+    
+    return {
+        'pcap_network_bytes_in': bytes_in,
+        'pcap_network_bytes_out': bytes_out,
+        'pcap_overhead_bytes': bytes_in + bytes_out
+    }
+
+
+def parse_timestamp_to_ns(ts_str):
+    """Parse ISO timestamp to nanoseconds since epoch"""
+    try:
+        dt = datetime.fromisoformat(ts_str.replace('Z', '+00:00'))
+        if dt.tzinfo is not None:
+            dt = dt.astimezone(timezone.utc)
+        # Convert to nanoseconds since epoch
+        return int(dt.timestamp() * 1_000_000_000)
+    except ValueError:
+        return None
+
+
+def enhance_csv(csv_path, pcap_path, output_path, debug=False):
+    """Add PCAP metrics to CSV"""
+    if not os.path.exists(pcap_path):
+        print(f"⚠️  PCAP not found: {pcap_path}")
+        return False
+    
+    print(f"Processing: {os.path.basename(csv_path)}")
+    
+    # Read PCAP
+    try:
+        packets = read_pcap(pcap_path)
+        print(f"  Loaded {len(packets)} packets")
+        
+        if packets and debug:
+            first_pcap_ns = packets[0][0]
+            last_pcap_ns = packets[-1][0]
+            print(f"  First PCAP packet: {first_pcap_ns} ns")
+            print(f"  Last PCAP packet:  {last_pcap_ns} ns")
+            print(f"  PCAP duration: {(last_pcap_ns - first_pcap_ns) / 1e9:.3f}s")
+            
+    except Exception as e:
+        print(f"  ❌ Error reading PCAP: {e}")
+        return False
+    
+    if not packets:
+        print("  ❌ No packets found")
+        return False
+    
+    # Read CSV
+    with open(csv_path, 'r', newline='') as f:
+        reader = csv.DictReader(f)
+        fieldnames = list(reader.fieldnames) + [
+            'pcap_network_bytes_in',
+            'pcap_network_bytes_out',
+            'pcap_overhead_bytes'
+        ]
+        rows = list(reader)
+    
+    if rows and debug:
+        first_csv_ns = parse_timestamp_to_ns(rows[0]['timestamp'])
+        last_csv_ns = parse_timestamp_to_ns(rows[-1]['timestamp'])
+        if first_csv_ns and last_csv_ns:
+            print(f"  First CSV query:  {first_csv_ns} ns")
+            print(f"  Last CSV query:   {last_csv_ns} ns")
+            print(f"  CSV duration: {(last_csv_ns - first_csv_ns) / 1e9:.3f}s")
+            
+            # Check alignment
+            offset_ns = packets[0][0] - first_csv_ns
+            print(f"  Time offset (PCAP - CSV): {offset_ns / 1e9:.3f}s")
+    
+    # Enhance rows
+    enhanced = []
+    matched = 0
+    
+    for i, row in enumerate(rows):
+        ts_ns = parse_timestamp_to_ns(row['timestamp'])
+        if not ts_ns:
+            continue
+        
+        duration_ns = int(row.get('duration_ns', 0))
+        
+        matching_packets = find_packets_in_window(packets, ts_ns, duration_ns)
+        
+        metrics = calculate_metrics(matching_packets)
+        row.update(metrics)
+        enhanced.append(row)
+        
+        if metrics['pcap_overhead_bytes'] > 0:
+            matched += 1
+        
+        # Debug first few queries
+        if debug and i < 3:
+            print(f"  Query {i}: {row['domain']}")
+            print(f"    Start: {ts_ns} ns")
+            print(f"    Duration: {duration_ns} ns ({duration_ns / 1e6:.3f}ms)")
+            print(f"    End: {ts_ns + duration_ns} ns")
+            print(f"    Matched packets: {len(matching_packets)}")
+            print(f"    Bytes: {metrics['pcap_overhead_bytes']}")
+    
+    print(f"  Matched: {matched}/{len(rows)} queries")
+    
+    if matched == 0:
+        print("  ⚠️  WARNING: No queries matched any packets!")
+        print("     This might indicate timestamp misalignment.")
+    
+    # Write output
+    os.makedirs(os.path.dirname(output_path), exist_ok=True)
+    with open(output_path, 'w', newline='') as f:
+        writer = csv.DictWriter(f, fieldnames=fieldnames, extrasaction='ignore')
+        writer.writeheader()
+        writer.writerows(enhanced)
+    
+    print(f"  ✓ Saved: {output_path}")
+    return True
+
+
+def main():
+    parser = argparse.ArgumentParser(
+        description='Add PCAP network metrics to DNS CSV files'
+    )
+    parser.add_argument('input_dir', help='Input directory (e.g., results_merged)')
+    parser.add_argument(
+        '--output',
+        default='./results_enhanced',
+        help='Output directory (default: ./results_enhanced)'
+    )
+    parser.add_argument(
+        '--dry-run',
+        action='store_true',
+        help='Preview files without processing'
+    )
+    parser.add_argument(
+        '--debug',
+        action='store_true',
+        help='Show detailed timing information'
+    )
+    
+    args = parser.parse_args()
+    
+    print("=" * 60)
+    print("ENHANCE DNS CSVs WITH PCAP METRICS")
+    print("=" * 60)
+    print(f"Input:  {args.input_dir}")
+    print(f"Output: {args.output}")
+    if args.debug:
+        print("Debug:  ENABLED")
+    print()
+    
+    # Find CSV files
+    csv_files = list(Path(args.input_dir).rglob('*.csv'))
+    
+    if not csv_files:
+        print("❌ No CSV files found")
+        return 1
+    
+    print(f"Found {len(csv_files)} CSV files\n")
+    
+    if args.dry_run:
+        print("DRY RUN - would process:")
+        for csv_path in csv_files:
+            pcap_path = csv_path.with_suffix('.pcap')
+            print(f"  {csv_path.relative_to(args.input_dir)}")
+            print(f"    PCAP: {'✓' if pcap_path.exists() else '✗'}")
+        return 0
+    
+    # Process files
+    success = 0
+    failed = 0
+    
+    for csv_path in csv_files:
+        pcap_path = csv_path.with_suffix('.pcap')
+        rel_path = csv_path.relative_to(args.input_dir)
+        output_path = Path(args.output) / rel_path
+        
+        if enhance_csv(str(csv_path), str(pcap_path), str(output_path), 
+                       args.debug):
+            success += 1
+        else:
+            failed += 1
+        print()
+    
+    # Summary
+    print("=" * 60)
+    print(f"✓ Success: {success}")
+    print(f"✗ Failed:  {failed}")
+    print(f"Total:     {len(csv_files)}")
+    print(f"\nOutput: {args.output}")
+    
+    return 0 if failed == 0 else 1
+
+
+if __name__ == "__main__":
+    exit(main())