feat(dns): add dnscrypt and dns over tcp

This commit is contained in:
2026-02-04 22:08:05 +00:00
parent 5d9b630d13
commit 92351a80a9
12 changed files with 2576 additions and 568 deletions

View File

@@ -1,250 +1,362 @@
#!/usr/bin/env python3
"""
Fast PCAP preprocessor for DNS QoS analysis.
Adds network metrics from PCAP files to DNS CSV files
(bytes/packets sent and received, total bytes per query).
Loads each PCAP into memory first, then uses binary search for matching.
Uses the LAN IP to determine direction (LAN = sent, non-LAN = received).
"""
import csv
import os
import argparse
import re
import shutil
from pathlib import Path
from datetime import datetime, timezone
from scapy.all import rdpcap
from typing import Dict, List, NamedTuple
import time
def parse_timestamp(ts_str):
    """Parse an RFC3339Nano timestamp string.

    Accepts fractional seconds of any precision and either a numeric
    UTC offset (e.g. ``+02:00``) or the ``Z`` suffix that RFC3339 allows.

    Args:
        ts_str: Timestamp such as ``2024-01-02T03:04:05.123456789+00:00``.

    Returns:
        Tuple of (timezone-aware ``datetime`` truncated to microsecond
        precision, full fractional part as integer nanoseconds).

    Raises:
        ValueError: If ``ts_str`` does not match the expected format.
    """
    match = re.match(
        r'(\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2})\.(\d+)([\+\-]\d{2}:\d{2}|Z)',
        ts_str
    )
    if not match:
        raise ValueError(f"Invalid timestamp format: {ts_str}")
    base, nanos, tz = match.groups()
    # fromisoformat() only understands microsecond precision, and older
    # Pythons reject 'Z', so truncate/pad the fraction and normalize the
    # offset before handing it over.
    if tz == 'Z':
        tz = '+00:00'
    micros = nanos[:6].ljust(6, '0')
    iso_str = f"{base}.{micros}{tz}"
    dt = datetime.fromisoformat(iso_str)
    # Keep full precision separately: pad the fraction to 9 digits (ns).
    full_nanos = int(nanos.ljust(9, '0'))
    return dt, full_nanos
import dpkt
from dateutil import parser as date_parser
def read_pcap(pcap_path):
"""Read PCAP and return list of (timestamp_epoch, size)."""
class Packet(NamedTuple):
    """Minimal per-packet record kept in memory for window matching.

    Only timestamp, size and direction are retained (rather than full
    decoded packets) so large captures stay cheap to hold and sort.
    """
    timestamp: float  # capture time as Unix epoch seconds
    size: int  # frame length in bytes
    is_outbound: bool  # True if from LAN, False if from internet
class QueryWindow:
    """Per-query time window with byte/packet counters.

    __slots__ keeps instances compact when many queries are loaded;
    traffic counters start at zero and are filled in during matching.
    """
    __slots__ = ['index', 'start', 'end', 'sent', 'received', 'pkts_sent', 'pkts_received']

    def __init__(self, index: int, start: float, end: float):
        # Identity and window bounds come from the caller.
        self.index = index
        self.start = start
        self.end = end
        # Every counter begins empty.
        self.sent = self.received = 0
        self.pkts_sent = self.pkts_received = 0
def parse_csv_timestamp(ts_str: str) -> float:
"""Convert RFC3339Nano timestamp to Unix epoch (seconds)."""
dt = date_parser.isoparse(ts_str)
return dt.timestamp()
def is_lan_ip(ip_bytes: bytes) -> bool:
    """Return True when a raw 4-byte IPv4 address is private or loopback.

    Recognizes 10.0.0.0/8, 172.16.0.0/12, 192.168.0.0/16 and 127.0.0.0/8.
    Anything that is not exactly four bytes long is treated as non-LAN.
    """
    if len(ip_bytes) != 4:
        return False
    first, second = ip_bytes[0], ip_bytes[1]
    return (
        first in (10, 127)                        # 10.0.0.0/8, 127.0.0.0/8
        or (first == 172 and 16 <= second <= 31)  # 172.16.0.0/12
        or (first == 192 and second == 168)       # 192.168.0.0/16
    )
def load_pcap_into_memory(pcap_path: Path) -> List[Packet]:
    """Load all packets from a PCAP/PCAPNG file into memory.

    Keeps only (timestamp, size, direction) per packet so large captures
    are cheap to hold; direction is derived from whether the source IPv4
    address is a LAN address (see is_lan_ip).

    Fixes over the previous revision:
      * removed the leftover scapy-based loop that also appended bare
        (ts, len) tuples — those have no .timestamp attribute and broke
        the final sort;
      * removed the duplicated error print;
      * the pcapng fallback now catches ValueError (what dpkt raises on
        a bad pcap magic) instead of a bare except.

    Args:
        pcap_path: Path to a .pcap or .pcapng capture file.

    Returns:
        Packets sorted by timestamp (required by the binary search in
        find_packets_in_window); empty list when the file cannot be read.
    """
    packets = []
    print(f" Loading PCAP into memory...")
    start_time = time.time()
    try:
        with open(pcap_path, 'rb') as f:
            try:
                pcap = dpkt.pcap.Reader(f)
            except ValueError:
                # Not classic pcap magic; retry as pcapng format.
                f.seek(0)
                pcap = dpkt.pcapng.Reader(f)
            for ts, buf in pcap:
                try:
                    packet_time = float(ts)
                    packet_size = len(buf)
                    # Decode the Ethernet frame to find the source IP;
                    # default to outbound when direction is unknown.
                    eth = dpkt.ethernet.Ethernet(buf)
                    is_outbound = True
                    if isinstance(eth.data, dpkt.ip.IP):
                        is_outbound = is_lan_ip(eth.data.src)
                    packets.append(Packet(
                        timestamp=packet_time,
                        size=packet_size,
                        is_outbound=is_outbound
                    ))
                except (dpkt.dpkt.NeedData, dpkt.dpkt.UnpackError, AttributeError):
                    # Skip truncated or undecodable frames.
                    continue
    except Exception as e:
        print(f" Error reading PCAP: {e}")
        return []
    elapsed = time.time() - start_time
    print(f" Loaded {len(packets):,} packets in {elapsed:.2f}s")
    # Sort by timestamp so callers can binary-search windows.
    packets.sort(key=lambda p: p.timestamp)
    return packets
def find_packets_in_window(packets, start_ts, start_nanos, duration_ns):
    """Sum packet bytes and counts inside an exact time window.

    Args:
        packets: Iterable of (epoch_seconds, length) tuples.
        start_ts: Timezone-aware datetime of the window start, carrying
            microsecond precision (as returned by parse_timestamp).
        start_nanos: Full fractional second of the start in nanoseconds.
        duration_ns: Window length in nanoseconds.

    Returns:
        Tuple of (total_bytes, packet_count) for packets whose timestamp
        falls inside [start, start + duration].
    """
    start_epoch = start_ts.timestamp()
    # start_ts already includes the microseconds, so only the
    # sub-microsecond remainder of the nanosecond fraction is added.
    # (The previous `start_nanos % 1_000_000` double-counted the
    # microseconds already present in start_ts.)
    start_epoch += (start_nanos % 1_000) / 1_000_000_000
    end_epoch = start_epoch + (duration_ns / 1_000_000_000)
    total_bytes = 0
    packet_count = 0
    for pkt_ts, pkt_len in packets:
        if start_epoch <= pkt_ts <= end_epoch:
            total_bytes += pkt_len
            packet_count += 1
    return total_bytes, packet_count
def find_packets_in_window(
    packets: "List[Packet]",
    start_time: float,
    end_time: float,
    left_hint: int = 0
) -> "tuple[List[Packet], int]":
    """Binary search for all packets within a time window.

    This region of the file was diff residue: the deleted CSV-enhancing
    routine was interleaved line-by-line with this function. Only the
    surviving binary-search implementation is kept here.

    Args:
        packets: Packets sorted ascending by ``timestamp``.
        start_time: Window start, Unix epoch seconds (inclusive).
        end_time: Window end, Unix epoch seconds (inclusive).
        left_hint: Lower bound for the search — callers processing
            queries sorted by start time pass the previous result's
            hint to avoid rescanning earlier packets.

    Returns:
        Tuple of (matching_packets, left_index_hint_for_next_search).
    """
    if not packets:
        return [], 0
    # Binary search for the first packet with timestamp >= start_time.
    left, right = left_hint, len(packets) - 1
    first_idx = len(packets)
    while left <= right:
        mid = (left + right) // 2
        if packets[mid].timestamp >= start_time:
            first_idx = mid
            right = mid - 1
        else:
            left = mid + 1
    # No packets fall inside the window.
    if first_idx >= len(packets) or packets[first_idx].timestamp > end_time:
        return [], first_idx
    # Collect the contiguous run of packets inside the window.
    matching = []
    idx = first_idx
    while idx < len(packets) and packets[idx].timestamp <= end_time:
        matching.append(packets[idx])
        idx += 1
    return matching, first_idx
def main():
parser = argparse.ArgumentParser(
description='Add PCAP network metrics to DNS CSV files'
)
parser.add_argument('input_dir', help='Input directory (e.g., results)')
parser.add_argument(
'--output',
default='./results_enriched',
help='Output directory (default: ./results_enriched)'
)
parser.add_argument(
'--dry-run',
action='store_true',
help='Preview files without processing'
)
parser.add_argument(
'--debug',
action='store_true',
help='Show detailed timing information'
)
def load_csv_queries(csv_path: Path) -> List[Dict]:
    """Read a DNS result CSV into per-query window dicts.

    Each entry keeps the raw CSV row under 'data' together with the
    query's absolute start/end times in epoch seconds. Rows that cannot
    be parsed are skipped with a warning.
    """
    windows = []
    with open(csv_path, 'r', encoding='utf-8') as handle:
        for record in csv.DictReader(handle):
            try:
                begin = parse_csv_timestamp(record['timestamp'])
                length_s = float(record['duration_ns']) / 1e9
            except Exception as e:
                print(f" Warning: Skipping row - {e}")
                continue
            windows.append({
                'data': record,
                'start_time': begin,
                'end_time': begin + length_s,
            })
    return windows
def match_packets_to_queries(
    packets: "List[Packet]",
    queries: "List[Dict]"
) -> "List[Dict]":
    """Match packets to query windows using binary search.

    This region of the file was diff residue: the middle of the deleted
    argparse-based main() was interleaved with this function; only the
    surviving implementation is kept. One robustness fix: the traffic
    counters are now initialized BEFORE the early return, so downstream
    writers always find the metric keys even when no packets were loaded.

    Args:
        packets: Packets sorted ascending by timestamp.
        queries: Query dicts with 'start_time'/'end_time' epoch seconds;
            mutated in place with bytes/packet counters.

    Returns:
        The same ``queries`` list, enriched with bytes_sent,
        bytes_received, packets_sent, packets_received and total_bytes.
    """
    # Initialize metrics so every query carries the columns regardless
    # of whether any packets match (or exist at all).
    for q in queries:
        q['bytes_sent'] = 0
        q['bytes_received'] = 0
        q['packets_sent'] = 0
        q['packets_received'] = 0
        q['total_bytes'] = 0
    if not queries or not packets:
        return queries
    print(f" Matching packets to queries...")
    start_time = time.time()
    # Sort queries by start time so the left_hint optimization holds:
    # each search can start where the previous one began.
    queries_sorted = sorted(enumerate(queries), key=lambda x: x[1]['start_time'])
    matched_packets = 0
    left_hint = 0
    for _, q in queries_sorted:
        matching, left_hint = find_packets_in_window(
            packets,
            q['start_time'],
            q['end_time'],
            left_hint
        )
        for pkt in matching:
            matched_packets += 1
            if pkt.is_outbound:
                q['bytes_sent'] += pkt.size
                q['packets_sent'] += 1
            else:
                q['bytes_received'] += pkt.size
                q['packets_received'] += 1
        q['total_bytes'] = q['bytes_sent'] + q['bytes_received']
    elapsed = time.time() - start_time
    print(f" Matched {matched_packets:,} packets in {elapsed:.2f}s")
    # Summary statistics for the operator.
    total_sent = sum(q['bytes_sent'] for q in queries)
    total_recv = sum(q['bytes_received'] for q in queries)
    queries_with_data = sum(1 for q in queries if q['total_bytes'] > 0)
    print(f" Total: {total_sent:,} bytes sent, {total_recv:,} bytes received")
    print(f" Queries with data: {queries_with_data}/{len(queries)}")
    return queries
def write_enriched_csv(
    csv_path: Path, queries: List[Dict], backup: bool = True
):
    """Write the enriched CSV with bandwidth columns.

    Overwrites ``csv_path`` in place with the original columns plus the
    metrics computed by match_packets_to_queries, optionally keeping a
    one-time ``.csv.bak`` copy of the original file.

    Args:
        csv_path: CSV file to rewrite.
        queries: Query dicts carrying 'data' (original row) plus the
            metric keys listed in ``new_fields`` below.
        backup: When True, copy the original to .csv.bak first (an
            existing backup is never overwritten).
    """
    if not queries:
        # Nothing to write; previously this raised IndexError on
        # queries[0] below.
        print(f" Skipped (no queries): {csv_path.name}")
        return
    if backup and csv_path.exists():
        backup_path = csv_path.with_suffix('.csv.bak')
        if not backup_path.exists():  # Don't overwrite existing backup
            shutil.copy2(csv_path, backup_path)
            print(f" Backup: {backup_path.name}")
    # Original columns first, then the computed bandwidth columns.
    original_fields = list(queries[0]['data'].keys())
    new_fields = [
        'bytes_sent',
        'bytes_received',
        'packets_sent',
        'packets_received',
        'total_bytes',
    ]
    fieldnames = original_fields + new_fields
    with open(csv_path, 'w', encoding='utf-8', newline='') as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        for q in queries:
            row = q['data'].copy()
            for field in new_fields:
                row[field] = q[field]
            writer.writerow(row)
    print(f" Written: {csv_path.name}")
def process_provider_directory(provider_path: Path):
    """Process all CSV/PCAP pairs in a provider directory.

    This region of the file was diff residue: the tail of the deleted
    argparse-based main() (which referenced an undefined ``args``) was
    interleaved with this function; only the surviving implementation
    is kept. For each CSV with a sibling .pcap it loads the capture,
    matches packets to query windows and rewrites the CSV in place.

    Args:
        provider_path: Directory holding <name>.csv / <name>.pcap pairs.
    """
    print(f"\n{'='*60}")
    print(f"Processing: {provider_path.name.upper()}")
    print(f"{'='*60}")
    csv_files = sorted(provider_path.glob('*.csv'))
    processed = 0
    total_time = 0
    for csv_path in csv_files:
        # Skip backup files left behind by earlier runs.
        if '.bak' in csv_path.name:
            continue
        pcap_path = csv_path.with_suffix('.pcap')
        if not pcap_path.exists():
            print(f"\n ⚠ Skipping {csv_path.name} - no matching PCAP")
            continue
        print(f"\n 📁 {csv_path.name}")
        file_start = time.time()
        # Load PCAP into memory first.
        packets = load_pcap_into_memory(pcap_path)
        if not packets:
            print(f" ⚠ No packets found in PCAP")
            continue
        # Load CSV queries.
        queries = load_csv_queries(csv_path)
        if not queries:
            print(f" ⚠ No valid queries found")
            continue
        print(f" Loaded {len(queries):,} queries")
        # Match packets to queries and rewrite the CSV.
        enriched_queries = match_packets_to_queries(packets, queries)
        write_enriched_csv(csv_path, enriched_queries)
        file_time = time.time() - file_start
        total_time += file_time
        processed += 1
        print(f" ✓ Completed in {file_time:.2f}s")
    print(f"\n {'='*58}")
    print(f" {provider_path.name}: {processed} files in {total_time:.2f}s")
    print(f" {'='*58}")
if __name__ == "__main__":
exit(main())
def main(results_dir='results', providers=None):
    """Main preprocessing pipeline.

    Walks the results directory and enriches each provider's CSV/PCAP
    pairs. Parameterized (with backward-compatible defaults) so the
    hard-coded layout can be overridden from tests or other callers.

    Args:
        results_dir: Directory containing one subdirectory per provider.
        providers: Provider subdirectory names to process; defaults to
            the four DNS providers used in the experiment.
    """
    overall_start = time.time()
    print("\n" + "="*60)
    print("DNS PCAP PREPROCESSOR - Memory-Optimized Edition")
    print("="*60)
    results_dir = Path(results_dir)
    if not results_dir.exists():
        print(f"\n❌ Error: '{results_dir}' directory not found")
        return
    if providers is None:
        providers = ['adguard', 'cloudflare', 'google', 'quad9']
    for provider in providers:
        provider_path = results_dir / provider
        if provider_path.exists():
            process_provider_directory(provider_path)
        else:
            print(f"\n⚠ Warning: Provider directory not found: {provider}")
    overall_time = time.time() - overall_start
    print("\n" + "="*60)
    print(f"✓ PREPROCESSING COMPLETE")
    print(f" Total time: {overall_time:.2f}s ({overall_time/60:.1f} minutes)")
    print("="*60 + "\n")
if __name__ == '__main__':
main()