feat(scripts): add scripts to process data

commit 4cec2fabd4
parent 319c9d0767
Date: 2025-10-11 23:12:31 +01:00

8 changed files with 932 additions and 8 deletions


@@ -0,0 +1,289 @@
import csv
import os
import statistics
from collections import defaultdict
from pathlib import Path
def map_server_to_resolver(server):
"""Map server address/domain to resolver name"""
server_lower = server.lower()
if '1.1.1.1' in server_lower or 'cloudflare' in server_lower:
return 'Cloudflare'
elif '8.8.8.8' in server_lower or 'google' in server_lower:
return 'Google'
elif '9.9.9.9' in server_lower or 'quad9' in server_lower:
return 'Quad9'
elif 'adguard' in server_lower:
return 'AdGuard'
else:
return server # Fallback to original server name
def extract_from_new_format(filename):
"""Parse new filename format: protocol[-flags]-timestamp.csv"""
base = filename.replace('.csv', '')
parts = base.split('-')
if len(parts) < 2:
return None, None, None, None
protocol = parts[0]
timestamp = parts[-1]
# Flags are everything between protocol and timestamp
flags_str = '-'.join(parts[1:-1])
# Determine DNSSEC status
if 'auth' in flags_str:
dnssec_status = 'auth' # Authoritative DNSSEC
elif 'trust' in flags_str:
dnssec_status = 'trust' # Trust-based DNSSEC
else:
dnssec_status = 'off'
keepalive_status = 'on' if 'persist' in flags_str else 'off'
return protocol, dnssec_status, keepalive_status, flags_str
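# Worked example (hypothetical filename, matching the format above):
#   extract_from_new_format("doq-auth-persist-20251011T2300.csv")
#   parts   -> ["doq", "auth", "persist", "20251011T2300"]
#   returns -> ("doq", "auth", "on", "auth-persist")
# Note this assumes the trailing timestamp itself contains no '-'.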
def extract_server_info_from_csv(row):
"""Extract DNSSEC info from CSV row data"""
dnssec = row.get('dnssec', 'false').lower() == 'true'
auth_dnssec = row.get('auth_dnssec', 'false').lower() == 'true'
keepalive = row.get('keep_alive', 'false').lower() == 'true'
if dnssec:
if auth_dnssec:
dnssec_status = 'auth'
else:
dnssec_status = 'trust'
else:
dnssec_status = 'off'
keepalive_status = 'on' if keepalive else 'off'
return dnssec_status, keepalive_status
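# Resulting status mapping from the two CSV booleans:
#   dnssec=false                     -> dnssec_status 'off'
#   dnssec=true,  auth_dnssec=false  -> dnssec_status 'trust'
#   dnssec=true,  auth_dnssec=true   -> dnssec_status 'auth'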
def extract_server_info(file_path, row):
"""Extract info using directory structure, filename, and CSV data"""
path = Path(file_path)
# First try to get DNSSEC info from CSV row (most accurate)
try:
csv_dnssec_status, csv_keepalive_status = extract_server_info_from_csv(row)
protocol = row.get('protocol', '').lower()
# Get server from directory structure
parts = path.parts
if len(parts) >= 4:
potential_date = parts[-2]
# Check if it's a date like YYYY-MM-DD
if len(potential_date) == 10 and potential_date[4] == '-' and potential_date[7] == '-' and potential_date.replace('-', '').isdigit():
server = parts[-3] # resolver folder (e.g., cloudflare)
return protocol, server, csv_dnssec_status, csv_keepalive_status
# Fallback to DNS server field
server = row.get('dns_server', '')
return protocol, server, csv_dnssec_status, csv_keepalive_status
except (KeyError, ValueError):
pass
# Fallback to filename parsing
filename = path.name
protocol, dnssec_status, keepalive_status, flags = extract_from_new_format(filename)
if protocol:
# Get server from directory structure
parts = path.parts
if len(parts) >= 4:
potential_date = parts[-2]
if len(potential_date) == 10 and potential_date[4] == '-' and potential_date[7] == '-' and potential_date.replace('-', '').isdigit():
server = parts[-3]
return protocol, server, dnssec_status, keepalive_status
# Fallback to DNS server field
server = row.get('dns_server', '')
return protocol, server, dnssec_status, keepalive_status
return None, None, None, None
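# Directory layout assumed by the checks above (folder names illustrative):
#   <root>/<resolver>/<YYYY-MM-DD>/<protocol>[-flags]-<timestamp>.csv
#   e.g. results/cloudflare/2025-10-11/doq-auth-persist-<timestamp>.csv
# parts[-3] is the resolver folder, parts[-2] the run date, parts[-1] the file.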
def get_dnssec_display_name(dnssec_status):
"""Convert DNSSEC status to display name"""
if dnssec_status == 'auth':
return 'DNSSEC (Authoritative)'
elif dnssec_status == 'trust':
return 'DNSSEC (Trust-based)'
else:
return 'No DNSSEC'
def analyze_dns_data(root_directory, output_file):
"""Analyze DNS data and generate metrics"""
# Dictionary to store measurements: {(resolver, protocol, dnssec, keepalive): [durations]}
measurements = defaultdict(list)
# Walk through all directories
for root, dirs, files in os.walk(root_directory):
for file in files:
if file.endswith('.csv'):
file_path = os.path.join(root, file)
print(f"Processing: {file_path}")
try:
with open(file_path, 'r', newline='') as csvfile:
reader = csv.DictReader(csvfile)
for row_num, row in enumerate(reader, 2): # Start at 2 since header is row 1
try:
protocol, server, dnssec_status, keepalive_status = extract_server_info(file_path, row)
if protocol and server:
resolver = map_server_to_resolver(server)
duration_ms = float(row.get('duration_ms', 0))
# Only include successful queries
if row.get('response_code', '') in ['NOERROR', '']:
key = (resolver, protocol, dnssec_status, keepalive_status)
measurements[key].append(duration_ms)
except (ValueError, TypeError) as e:
print(f"Data parse error in {file_path} row {row_num}: {e}")
continue
except Exception as e:
print(f"Error processing file {file_path}: {e}")
continue
# Calculate statistics grouped by resolver first, then by configuration
resolver_results = defaultdict(lambda: defaultdict(lambda: defaultdict(list)))
for (resolver, protocol, dnssec, keepalive), durations in measurements.items():
if durations:
stats = {
'protocol': protocol.upper(),
'dnssec': dnssec,
'keepalive': keepalive,
'total_queries': len(durations),
'avg_latency_ms': round(statistics.mean(durations), 3),
'median_latency_ms': round(statistics.median(durations), 3),
'min_latency_ms': round(min(durations), 3),
'max_latency_ms': round(max(durations), 3),
'std_dev_ms': round(statistics.stdev(durations) if len(durations) > 1 else 0, 3),
'p95_latency_ms': round(statistics.quantiles(durations, n=20)[18], 3) if len(durations) >= 20 else round(max(durations), 3),
'p99_latency_ms': round(statistics.quantiles(durations, n=100)[98], 3) if len(durations) >= 100 else round(max(durations), 3)
}
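# Percentile note: statistics.quantiles(data, n=20) returns 19 cut points, so
# index 18 is the 95th percentile; with n=100, index 98 is the 99th percentile.
# Small samples fall back to max() as a conservative stand-in.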
# Group by resolver -> dnssec -> keepalive -> protocol
resolver_results[resolver][dnssec][keepalive].append(stats)
# Sort each configuration's results by average latency
for resolver in resolver_results:
for dnssec in resolver_results[resolver]:
for keepalive in resolver_results[resolver][dnssec]:
resolver_results[resolver][dnssec][keepalive].sort(key=lambda x: x['avg_latency_ms'])
# Write to CSV with all data
all_results = []
for resolver in resolver_results:
for dnssec in resolver_results[resolver]:
for keepalive in resolver_results[resolver][dnssec]:
for result in resolver_results[resolver][dnssec][keepalive]:
result['resolver'] = resolver
all_results.append(result)
with open(output_file, 'w', newline='') as csvfile:
fieldnames = [
'resolver', 'protocol', 'dnssec', 'keepalive', 'total_queries',
'avg_latency_ms', 'median_latency_ms', 'min_latency_ms',
'max_latency_ms', 'std_dev_ms', 'p95_latency_ms', 'p99_latency_ms'
]
writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
writer.writeheader()
writer.writerows(all_results)
print(f"\nAnalysis complete! Full results written to {output_file}")
print(f"Total measurements: {sum(len(durations) for durations in measurements.values())}")
def print_configuration_table(resolver, dnssec_status, keepalive_status, results):
"""Print a formatted table for a specific configuration"""
ka_indicator = "PERSISTENT" if keepalive_status == 'on' else "NEW CONN"
dnssec_display = get_dnssec_display_name(dnssec_status)
print(f"\n {dnssec_display} - {ka_indicator}")
print(" " + "-" * 90)
print(f" {'Protocol':<12} {'Queries':<8} {'Avg(ms)':<10} {'Median(ms)':<12} {'Min(ms)':<10} {'Max(ms)':<10} {'P95(ms)':<10}")
print(" " + "-" * 90)
for result in results:
print(f" {result['protocol']:<12} {result['total_queries']:<8} "
f"{result['avg_latency_ms']:<10} {result['median_latency_ms']:<12} "
f"{result['min_latency_ms']:<10} {result['max_latency_ms']:<10} "
f"{result['p95_latency_ms']:<10}")
# Print results grouped by resolver first
print(f"\n{'=' * 100}")
print("DNS RESOLVER PERFORMANCE COMPARISON")
print(f"{'=' * 100}")
for resolver in sorted(resolver_results.keys()):
print(f"\n{resolver} DNS Resolver")
print("=" * 100)
# Order configurations logically
config_order = [
('off', 'off'), # No DNSSEC, New connections
('off', 'on'), # No DNSSEC, Persistent
('trust', 'off'), # Trust DNSSEC, New connections
('trust', 'on'), # Trust DNSSEC, Persistent
('auth', 'off'), # Auth DNSSEC, New connections
('auth', 'on'), # Auth DNSSEC, Persistent
]
for dnssec_status, keepalive_status in config_order:
if dnssec_status in resolver_results[resolver] and keepalive_status in resolver_results[resolver][dnssec_status]:
results = resolver_results[resolver][dnssec_status][keepalive_status]
if results: # Only print if there are results
print_configuration_table(resolver, dnssec_status, keepalive_status, results)
# Summary comparison across resolvers
print(f"\n{'=' * 100}")
print("CROSS-RESOLVER PROTOCOL COMPARISON")
print(f"{'=' * 100}")
# Group by protocol and configuration for cross-resolver comparison
protocol_comparison = defaultdict(lambda: defaultdict(list))
for resolver in resolver_results:
for dnssec in resolver_results[resolver]:
for keepalive in resolver_results[resolver][dnssec]:
for result in resolver_results[resolver][dnssec][keepalive]:
config_key = f"{get_dnssec_display_name(dnssec)} - {'PERSISTENT' if keepalive == 'on' else 'NEW CONN'}"
protocol_comparison[result['protocol']][config_key].append({
'resolver': resolver,
'avg_latency_ms': result['avg_latency_ms'],
'total_queries': result['total_queries']
})
for protocol in sorted(protocol_comparison.keys()):
print(f"\n{protocol} Protocol Comparison")
print("-" * 100)
for config in sorted(protocol_comparison[protocol].keys()):
resolvers_data = protocol_comparison[protocol][config]
if resolvers_data:
print(f"\n {config}")
print(" " + "-" * 60)
print(f" {'Resolver':<15} {'Avg Latency (ms)':<20} {'Queries':<10}")
print(" " + "-" * 60)
# Sort by average latency
resolvers_data.sort(key=lambda x: x['avg_latency_ms'])
for data in resolvers_data:
print(f" {data['resolver']:<15} {data['avg_latency_ms']:<20} {data['total_queries']:<10}")
if __name__ == "__main__":
root_dir = "."
output_file = "dns_metrics.csv"
analyze_dns_data(root_dir, output_file)
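A minimal sketch of consuming the metrics file this script writes. The working directory and the idea of picking a "fastest" configuration are assumptions; only the dns_metrics.csv name and its column names come from the code above.

import csv

# Load the summary produced by analyze_dns_data() and print the lowest-latency
# configuration per resolver.
with open("dns_metrics.csv", newline="") as f:
    rows = list(csv.DictReader(f))

fastest = {}
for row in rows:
    key = row["resolver"]
    if key not in fastest or float(row["avg_latency_ms"]) < float(fastest[key]["avg_latency_ms"]):
        fastest[key] = row

for resolver, row in sorted(fastest.items()):
    print(f"{resolver}: {row['protocol']} "
          f"(DNSSEC={row['dnssec']}, keepalive={row['keepalive']}) "
          f"avg {row['avg_latency_ms']} ms")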

View File

@@ -0,0 +1,284 @@
#!/usr/bin/env python3
"""
Add network metrics from PCAP files to DNS CSV files.
Adds: pcap_network_bytes_in, pcap_network_bytes_out, pcap_overhead_bytes
"""
import csv
import os
import argparse
from pathlib import Path
from datetime import datetime, timezone
import dpkt
import socket
# Test machine IPs
TEST_IPS = {
'10.0.0.50',
'2001:818:e73e:ba00:5506:dfd4:ed8b:96e',
'fe80::fe98:c62e:4463:9a2d'
}
def inet_to_str(inet):
"""Convert inet bytes to IP string"""
try:
return socket.inet_ntop(socket.AF_INET, inet)
except ValueError:
try:
return socket.inet_ntop(socket.AF_INET6, inet)
except ValueError:
return None
def read_pcap(pcap_path):
"""Read PCAP and return list of (timestamp_ns, size, src_ip, dst_ip)"""
packets = []
with open(pcap_path, 'rb') as f:
try:
pcap = dpkt.pcap.Reader(f)
except ValueError:  # not a classic libpcap header; fall back to pcapng
f.seek(0)
pcap = dpkt.pcapng.Reader(f)
for ts, buf in pcap:
try:
# Convert PCAP timestamp (float seconds) to nanoseconds
timestamp_ns = int(ts * 1_000_000_000)
size = len(buf)
eth = dpkt.ethernet.Ethernet(buf)
src_ip = dst_ip = None
if isinstance(eth.data, dpkt.ip.IP):
src_ip = inet_to_str(eth.data.src)
dst_ip = inet_to_str(eth.data.dst)
elif isinstance(eth.data, dpkt.ip6.IP6):
src_ip = inet_to_str(eth.data.src)
dst_ip = inet_to_str(eth.data.dst)
if src_ip and dst_ip:
packets.append((timestamp_ns, size, src_ip, dst_ip))
except (dpkt.dpkt.NeedData, dpkt.dpkt.UnpackError):
continue
return packets
def find_packets_in_window(packets, start_ns, duration_ns):
"""Find packets within exact time window (nanosecond precision)"""
end_ns = start_ns + duration_ns
matching = []
for timestamp_ns, size, src_ip, dst_ip in packets:
if start_ns <= timestamp_ns <= end_ns:
matching.append((size, src_ip, dst_ip))
return matching
def calculate_metrics(packets):
"""Calculate network metrics from packets"""
bytes_in = 0
bytes_out = 0
for size, src_ip, dst_ip in packets:
if dst_ip in TEST_IPS:
bytes_in += size
elif src_ip in TEST_IPS:
bytes_out += size
return {
'pcap_network_bytes_in': bytes_in,
'pcap_network_bytes_out': bytes_out,
'pcap_overhead_bytes': bytes_in + bytes_out
}
def parse_timestamp_to_ns(ts_str):
"""Parse ISO timestamp to nanoseconds since epoch"""
try:
dt = datetime.fromisoformat(ts_str.replace('Z', '+00:00'))
if dt.tzinfo is not None:
dt = dt.astimezone(timezone.utc)
# Convert to nanoseconds since epoch
return int(dt.timestamp() * 1_000_000_000)
except ValueError:
return None
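# Precision note: fromisoformat() carries at most microseconds and timestamp()
# goes through a float, so the value returned here is nanoseconds since the
# epoch with roughly microsecond accuracy -- comparable to the PCAP
# timestamps, which dpkt also yields as float seconds.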
def enhance_csv(csv_path, pcap_path, output_path, debug=False):
"""Add PCAP metrics to CSV"""
if not os.path.exists(pcap_path):
print(f"⚠️ PCAP not found: {pcap_path}")
return False
print(f"Processing: {os.path.basename(csv_path)}")
# Read PCAP
try:
packets = read_pcap(pcap_path)
print(f" Loaded {len(packets)} packets")
if packets and debug:
first_pcap_ns = packets[0][0]
last_pcap_ns = packets[-1][0]
print(f" First PCAP packet: {first_pcap_ns} ns")
print(f" Last PCAP packet: {last_pcap_ns} ns")
print(f" PCAP duration: {(last_pcap_ns - first_pcap_ns) / 1e9:.3f}s")
except Exception as e:
print(f" ❌ Error reading PCAP: {e}")
return False
if not packets:
print(" ❌ No packets found")
return False
# Read CSV
with open(csv_path, 'r', newline='') as f:
reader = csv.DictReader(f)
fieldnames = list(reader.fieldnames) + [
'pcap_network_bytes_in',
'pcap_network_bytes_out',
'pcap_overhead_bytes'
]
rows = list(reader)
if rows and debug:
first_csv_ns = parse_timestamp_to_ns(rows[0]['timestamp'])
last_csv_ns = parse_timestamp_to_ns(rows[-1]['timestamp'])
if first_csv_ns and last_csv_ns:
print(f" First CSV query: {first_csv_ns} ns")
print(f" Last CSV query: {last_csv_ns} ns")
print(f" CSV duration: {(last_csv_ns - first_csv_ns) / 1e9:.3f}s")
# Check alignment
offset_ns = packets[0][0] - first_csv_ns
print(f" Time offset (PCAP - CSV): {offset_ns / 1e9:.3f}s")
# Enhance rows
enhanced = []
matched = 0
for i, row in enumerate(rows):
ts_ns = parse_timestamp_to_ns(row['timestamp'])
if not ts_ns:
continue
duration_ns = int(row.get('duration_ns', 0))
matching_packets = find_packets_in_window(packets, ts_ns, duration_ns)
metrics = calculate_metrics(matching_packets)
row.update(metrics)
enhanced.append(row)
if metrics['pcap_overhead_bytes'] > 0:
matched += 1
# Debug first few queries
if debug and i < 3:
print(f" Query {i}: {row['domain']}")
print(f" Start: {ts_ns} ns")
print(f" Duration: {duration_ns} ns ({duration_ns / 1e6:.3f}ms)")
print(f" End: {ts_ns + duration_ns} ns")
print(f" Matched packets: {len(matching_packets)}")
print(f" Bytes: {metrics['pcap_overhead_bytes']}")
print(f" Matched: {matched}/{len(rows)} queries")
if matched == 0:
print(" ⚠️ WARNING: No queries matched any packets!")
print(" This might indicate timestamp misalignment.")
# Write output
os.makedirs(os.path.dirname(output_path), exist_ok=True)
with open(output_path, 'w', newline='') as f:
writer = csv.DictWriter(f, fieldnames=fieldnames, extrasaction='ignore')
writer.writeheader()
writer.writerows(enhanced)
print(f" ✓ Saved: {output_path}")
return True
def main():
parser = argparse.ArgumentParser(
description='Add PCAP network metrics to DNS CSV files'
)
parser.add_argument('input_dir', help='Input directory (e.g., results_merged)')
parser.add_argument(
'--output',
default='./results_enhanced',
help='Output directory (default: ./results_enhanced)'
)
parser.add_argument(
'--dry-run',
action='store_true',
help='Preview files without processing'
)
parser.add_argument(
'--debug',
action='store_true',
help='Show detailed timing information'
)
args = parser.parse_args()
print("=" * 60)
print("ENHANCE DNS CSVs WITH PCAP METRICS")
print("=" * 60)
print(f"Input: {args.input_dir}")
print(f"Output: {args.output}")
if args.debug:
print("Debug: ENABLED")
print()
# Find CSV files
csv_files = list(Path(args.input_dir).rglob('*.csv'))
if not csv_files:
print("❌ No CSV files found")
return 1
print(f"Found {len(csv_files)} CSV files\n")
if args.dry_run:
print("DRY RUN - would process:")
for csv_path in csv_files:
pcap_path = csv_path.with_suffix('.pcap')
print(f" {csv_path.relative_to(args.input_dir)}")
print(f" PCAP: {'' if pcap_path.exists() else ''}")
return 0
# Process files
success = 0
failed = 0
for csv_path in csv_files:
pcap_path = csv_path.with_suffix('.pcap')
rel_path = csv_path.relative_to(args.input_dir)
output_path = Path(args.output) / rel_path
if enhance_csv(str(csv_path), str(pcap_path), str(output_path),
args.debug):
success += 1
else:
failed += 1
print()
# Summary
print("=" * 60)
print(f"✓ Success: {success}")
print(f"✗ Failed: {failed}")
print(f"Total: {len(csv_files)}")
print(f"\nOutput: {args.output}")
return 0 if failed == 0 else 1
if __name__ == "__main__":
exit(main())
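A short post-run check sketch. The ./results_enhanced path is just the script's default output directory; the three pcap_* column names are the ones added by enhance_csv() above.

import csv
from pathlib import Path

# Sum the PCAP-derived traffic across every enhanced CSV.
total_in = total_out = 0
for csv_path in Path("./results_enhanced").rglob("*.csv"):
    with open(csv_path, newline="") as f:
        for row in csv.DictReader(f):
            total_in += int(row.get("pcap_network_bytes_in", 0) or 0)
            total_out += int(row.get("pcap_network_bytes_out", 0) or 0)

print(f"bytes in:  {total_in}")
print(f"bytes out: {total_out}")
print(f"overhead:  {total_in + total_out}")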


@@ -0,0 +1,367 @@
#!/usr/bin/env python3
"""
Advanced PCAP filter for DNS traffic (with IPv6 support).
Filters out:
- Local network traffic except test machine (IPv4: 10.0.0.50; IPv6: specific addresses)
- AdGuard DNS servers (for non-AdGuard captures)
- Non-DNS traffic based on protocol-specific ports
"""
import os
import subprocess
from pathlib import Path
import argparse
# Test machine IPs (IPv4 plus global and link-local IPv6 addresses of the capture host)
TEST_IPV4 = '10.0.0.50'
TEST_IPV6_GLOBAL = '2001:818:e73e:ba00:5506:dfd4:ed8b:96e'
TEST_IPV6_LINKLOCAL = 'fe80::fe98:c62e:4463:9a2d'
# Port mappings
PORT_MAP = {
'udp': [53], # DNS-over-UDP
'tls': [53, 853], # DNS-over-TLS
'https': [53, 443], # DNS-over-HTTPS (DoH)
'doq': [53, 784, 8853], # DNS-over-QUIC
'doh3': [53, 443] # DNS-over-HTTP/3
}
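# Note: RFC 9250 assigns UDP port 853 to DNS-over-QUIC; if the resolvers under
# test use the standard port rather than the legacy 784/8853, 853 may need to
# be added to the 'doq' entry above so that traffic is not filtered out.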
# AdGuard DNS IPs to filter out (for non-AdGuard captures)
ADGUARD_IPS = [
'94.140.14.14',
'94.140.15.15',
'2a10:50c0::ad1:ff',
'2a10:50c0::ad2:ff'
]
def parse_filename(filename):
"""Extract protocol from filename"""
base = filename.replace('.pcap', '').replace('.csv', '')
parts = base.split('-')
if len(parts) < 1: # Minimum: protocol
return None
protocol = parts[0].lower()
return protocol
def extract_resolver_from_path(pcap_path):
"""Extract resolver name from directory structure"""
parts = Path(pcap_path).parts
for part in parts:
if part.lower() in ['cloudflare', 'google', 'quad9', 'adguard']:
return part.lower()
return None
def build_filter_expression(protocol, resolver):
"""
Build tshark filter expression.
Strategy:
1. Only protocol-specific DNS ports
2. Keep only traffic involving the test machine (IPv4/IPv6)
3. Exclude AdGuard IPs for non-AdGuard captures
"""
# Get ports for this protocol
ports = PORT_MAP.get(protocol, [53, 443, 853, 784, 8853])
# Build port filter (UDP or TCP on these ports)
port_conditions = []
for port in ports:
port_conditions.append(f'(udp.port == {port} or tcp.port == {port})')
port_filter = ' or '.join(port_conditions)
# Build test machine filter (keep if src or dst is test machine IP)
machine_conditions = [f'(ip.addr == {TEST_IPV4})']
if TEST_IPV6_GLOBAL:
machine_conditions.append(f'(ipv6.addr == {TEST_IPV6_GLOBAL})')
if TEST_IPV6_LINKLOCAL:
machine_conditions.append(f'(ipv6.addr == {TEST_IPV6_LINKLOCAL})')
machine_filter = ' or '.join(machine_conditions)
# Build AdGuard exclusion filter
adguard_exclusions = []
if resolver != 'adguard':
for ip in ADGUARD_IPS:
if ':' in ip: # IPv6
adguard_exclusions.append(f'!(ipv6.addr == {ip})')
else: # IPv4
adguard_exclusions.append(f'!(ip.addr == {ip})')
# Combine all filters
filters = [f'({port_filter})', f'({machine_filter})']
if adguard_exclusions:
adguard_filter = ' and '.join(adguard_exclusions)
filters.append(f'({adguard_filter})')
final_filter = ' and '.join(filters)
return final_filter
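# Example of the expression this builds for protocol='tls', resolver='cloudflare'
# (wrapped and IPv6 addresses abbreviated here; the real string is one line):
#   ((udp.port == 53 or tcp.port == 53) or (udp.port == 853 or tcp.port == 853))
#   and ((ip.addr == 10.0.0.50) or (ipv6.addr == 2001:818:...) or (ipv6.addr == fe80::...))
#   and (!(ip.addr == 94.140.14.14) and !(ip.addr == 94.140.15.15)
#        and !(ipv6.addr == 2a10:50c0::ad1:ff) and !(ipv6.addr == 2a10:50c0::ad2:ff))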
def filter_pcap(input_path, output_path, filter_expr, verbose=False):
"""Apply filter to PCAP file using tshark"""
cmd = [
'tshark',
'-r', input_path,
'-Y', filter_expr,
'-w', output_path,
'-F', 'pcap'
]
try:
if verbose:
print(f" Filter: {filter_expr}")
result = subprocess.run(
cmd,
capture_output=True,
text=True,
timeout=300
)
if result.returncode != 0:
print(f" ✗ Error: {result.stderr.strip()}")
return False
if not os.path.exists(output_path):
print(f" ✗ Output file not created")
return False
output_size = os.path.getsize(output_path)
if output_size < 24:
print(f" ⚠ Warning: Output is empty")
return True
except subprocess.TimeoutExpired:
print(f" ✗ Timeout (>5 minutes)")
return False
except Exception as e:
print(f" ✗ Exception: {e}")
return False
def find_pcap_files(root_dir):
"""Recursively find all PCAP files"""
pcap_files = []
for root, dirs, files in os.walk(root_dir):
for file in files:
if file.endswith('.pcap'):
full_path = os.path.join(root, file)
pcap_files.append(full_path)
return sorted(pcap_files)
def format_bytes(bytes_val):
"""Format bytes as human readable"""
for unit in ['B', 'KB', 'MB', 'GB']:
if bytes_val < 1024.0:
return f"{bytes_val:.1f} {unit}"
bytes_val /= 1024.0
return f"{bytes_val:.1f} TB"
def main():
parser = argparse.ArgumentParser(
description='Advanced PCAP filter for DNS traffic (IPv4/IPv6)',
formatter_class=argparse.RawDescriptionHelpFormatter,
epilog='''
Filtering rules:
1. Only include traffic on protocol-specific DNS ports
2. Keep only packets involving the test machine (10.0.0.50 or its IPv6 addresses)
3. Exclude AdGuard IPs for non-AdGuard captures
Protocol-specific ports:
udp: 53
tls: 53, 853
https: 53, 443
doq: 53, 784, 8853
doh3: 53, 443
Examples:
# Dry run
%(prog)s ./results --dry-run
# Filter with verbose output
%(prog)s ./results --verbose
# Custom output directory
%(prog)s ./results --output ./cleaned
'''
)
parser.add_argument(
'input_dir',
help='Input directory containing PCAP files'
)
parser.add_argument(
'-o', '--output',
default='./results_filtered',
help='Output directory (default: ./results_filtered)'
)
parser.add_argument(
'--dry-run',
action='store_true',
help='Show what would be done without filtering'
)
parser.add_argument(
'--limit',
type=int,
help='Only process first N files (for testing)'
)
parser.add_argument(
'-v', '--verbose',
action='store_true',
help='Verbose output (show filter expressions)'
)
parser.add_argument(
'--overwrite',
action='store_true',
help='Overwrite existing filtered files'
)
args = parser.parse_args()
# Check for tshark
try:
result = subprocess.run(
['tshark', '-v'],
capture_output=True,
check=True
)
if args.verbose:
version = result.stdout.decode().split('\n')[0]
print(f"Using: {version}\n")
except (subprocess.CalledProcessError, FileNotFoundError):
print("Error: tshark not found. Install Wireshark/tshark:")
print(" Ubuntu/Debian: sudo apt-get install tshark")
print(" macOS: brew install wireshark")
return 1
print("=" * 80)
print("ADVANCED DNS PCAP FILTER (IPv4/IPv6)")
print("=" * 80)
print("Filters:")
print(" 1. Protocol-specific DNS ports only")
print(" 2. Keep only traffic involving test machine (10.0.0.50 / IPv6 addresses)")
print(" 3. Exclude AdGuard IPs (for non-AdGuard captures)")
print(f"\nInput: {args.input_dir}")
print(f"Output: {args.output}")
# Find PCAP files
print(f"\nScanning for PCAP files...")
pcap_files = find_pcap_files(args.input_dir)
if not pcap_files:
print(f"No PCAP files found in {args.input_dir}")
return 1
print(f"Found {len(pcap_files)} PCAP files")
total_input_size = sum(os.path.getsize(f) for f in pcap_files)
print(f"Total size: {format_bytes(total_input_size)}")
if args.limit:
pcap_files = pcap_files[:args.limit]
print(f"Limiting to first {args.limit} files")
if args.dry_run:
print("\n*** DRY RUN MODE ***\n")
else:
print()
# Process files
success_count = 0
skip_count = 0
fail_count = 0
total_output_size = 0
for i, input_path in enumerate(pcap_files, 1):
# Extract info from path
filename = Path(input_path).name
protocol = parse_filename(filename)
resolver = extract_resolver_from_path(input_path)
if not protocol:
print(f"[{i}/{len(pcap_files)}] {filename}")
print(f" ⚠ Could not parse protocol, skipping")
skip_count += 1
continue
# Create output path
rel_path = os.path.relpath(input_path, args.input_dir)
output_path = os.path.join(args.output, rel_path)
input_size = os.path.getsize(input_path)
print(f"[{i}/{len(pcap_files)}] {rel_path}")
print(f" Protocol: {protocol.upper()}")
print(f" Resolver: {resolver or 'unknown'}")
print(f" Size: {format_bytes(input_size)}")
# Check if already filtered
if os.path.exists(output_path) and not args.overwrite:
output_size = os.path.getsize(output_path)
reduction = ((input_size - output_size) / input_size * 100) if input_size > 0 else 0
print(f" ⊙ Already filtered: {format_bytes(output_size)} "
f"({reduction:.1f}% reduction)")
skip_count += 1
total_output_size += output_size
continue
# Build filter
filter_expr = build_filter_expression(protocol, resolver)
if args.dry_run:
print(f" → Would filter")
if args.verbose:
print(f" Filter: {filter_expr}")
continue
# Create output directory
os.makedirs(os.path.dirname(output_path), exist_ok=True)
# Filter
success = filter_pcap(input_path, output_path, filter_expr, args.verbose)
if success:
output_size = os.path.getsize(output_path)
reduction = ((input_size - output_size) / input_size * 100) if input_size > 0 else 0
print(f" ✓ Filtered: {format_bytes(output_size)} "
f"({reduction:.1f}% reduction)")
success_count += 1
total_output_size += output_size
else:
fail_count += 1
# Summary
print("\n" + "=" * 80)
print("SUMMARY")
print("=" * 80)
if args.dry_run:
print(f"Would process: {len(pcap_files)} files")
else:
print(f"Successful: {success_count}")
print(f"Skipped: {skip_count} (already filtered or unparseable)")
print(f"Failed: {fail_count}")
print(f"Total: {len(pcap_files)}")
if success_count > 0 or skip_count > 0:
print(f"\nInput size: {format_bytes(total_input_size)}")
print(f"Output size: {format_bytes(total_output_size)}")
if total_input_size > 0:
reduction = ((total_input_size - total_output_size) /
total_input_size * 100)
print(f"Reduction: {reduction:.1f}%")
print(f"\nOutput directory: {args.output}")
return 0 if fail_count == 0 else 1
if __name__ == "__main__":
exit(main())
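A small verification sketch for after a filtering run, assuming an unmodified ./results tree next to the script's default ./results_filtered output (both paths are assumptions based on the defaults above).

import os
from pathlib import Path

# Compare per-file sizes between the raw and filtered capture trees.
raw_root, filtered_root = Path("./results"), Path("./results_filtered")
for filtered in sorted(filtered_root.rglob("*.pcap")):
    raw = raw_root / filtered.relative_to(filtered_root)
    if raw.exists():
        before, after = os.path.getsize(raw), os.path.getsize(filtered)
        saved = (before - after) / before * 100 if before else 0.0
        print(f"{filtered.relative_to(filtered_root)}: "
              f"{before} -> {after} bytes ({saved:.1f}% smaller)")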


@@ -0,0 +1,274 @@
#!/usr/bin/env python3
"""
Merge DNS test files by configuration.
- Merges CSVs of same config (adds 'run_id' column for traceability)
- Optionally merges PCAPs using mergecap
- Flattens date structure
"""
import os
import csv
import subprocess
import shutil
from pathlib import Path
import argparse
from collections import defaultdict
def parse_filename(filename):
"""
Extract config key from filename.
Format: protocol[-flags]-timestamp.{csv,pcap}
Config key: protocol[-flags] (ignores timestamp)
"""
base = filename.replace('.csv', '').replace('.pcap', '')
parts = base.split('-')
if len(parts) < 2:
return None
# Config is everything except timestamp
config = '-'.join(parts[:-1])
timestamp = parts[-1]
return config, timestamp
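# Example (hypothetical filename in the documented format):
#   parse_filename("doq-auth-persist-20251011T2300.csv")
#   -> ("doq-auth-persist", "20251011T2300")
# Everything before the final '-' is the config key, so runs of the same
# config from different days merge into one output file.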
def extract_resolver_from_path(file_path):
"""Extract resolver name from path"""
parts = Path(file_path).parts
for part in parts:
if part.lower() in ['cloudflare', 'google', 'quad9', 'adguard']:
return part.lower()
return None
def find_files(root_dir, extension):
"""Find all files with given extension"""
files = []
for root, dirs, filenames in os.walk(root_dir):
for filename in filenames:
if filename.endswith(extension):
full_path = os.path.join(root, filename)
files.append(full_path)
return sorted(files)
def merge_csvs(csv_files, output_path, fieldnames):
"""Merge multiple CSVs into one, adding 'run_id' column"""
with open(output_path, 'w', newline='') as outfile:
writer = csv.DictWriter(outfile, fieldnames=fieldnames + ['run_id'])
writer.writeheader()
for csv_path in csv_files:
# Use timestamp as run_id
filename = Path(csv_path).name
_, timestamp = parse_filename(filename)
run_id = timestamp # Or add date if needed
with open(csv_path, 'r', newline='') as infile:
reader = csv.DictReader(infile)
for row in reader:
row['run_id'] = run_id
writer.writerow(row)
def merge_pcaps(pcap_files, output_path):
"""Merge PCAP files using mergecap"""
cmd = ['mergecap', '-w', output_path] + pcap_files
try:
subprocess.run(cmd, capture_output=True, check=True)
return True
except subprocess.CalledProcessError as e:
print(f" ✗ mergecap error: {e.stderr.decode()}")
return False
except FileNotFoundError:
print("Error: mergecap not found. Install Wireshark:")
print(" Ubuntu: sudo apt install wireshark-common")
print(" macOS: brew install wireshark")
return False
def format_bytes(bytes_val):
"""Format bytes as human readable"""
for unit in ['B', 'KB', 'MB', 'GB']:
if bytes_val < 1024.0:
return f"{bytes_val:.1f} {unit}"
bytes_val /= 1024.0
return f"{bytes_val:.1f} TB"
def main():
parser = argparse.ArgumentParser(
description='Merge DNS test files by configuration',
formatter_class=argparse.RawDescriptionHelpFormatter,
epilog='''
Merges files of same config across dates/timestamps.
Output: ./results_merged/[resolver]/[config].csv (merged)
./results_merged/[resolver]/[config].pcap (merged, if --merge-pcaps)
Examples:
# Dry run to preview
%(prog)s ./results --dry-run
# Merge CSVs only (recommended)
%(prog)s ./results
# Merge CSVs and PCAPs
%(prog)s ./results --merge-pcaps
# Custom output directory
%(prog)s ./results --output ./merged_data
'''
)
parser.add_argument(
'input_dir',
help='Input directory (e.g., ./results)'
)
parser.add_argument(
'--output',
default='./results_merged',
help='Output directory (default: ./results_merged)'
)
parser.add_argument(
'--merge-pcaps',
action='store_true',
help='Merge PCAP files (requires mergecap from Wireshark)'
)
parser.add_argument(
'--dry-run',
action='store_true',
help='Show what would be done without merging'
)
parser.add_argument(
'-y', '--yes',
action='store_true',
help='Skip confirmation prompt'
)
args = parser.parse_args()
if not os.path.isdir(args.input_dir):
print(f"Error: Input directory not found: {args.input_dir}")
return 1
# Find all files
print("=" * 80)
print("MERGE DNS TEST FILES")
print("=" * 80)
print(f"Input: {args.input_dir}")
print(f"Output: {args.output}")
print(f"Merge PCAPs: {'Yes' if args.merge_pcaps else 'No'}")
csv_files = find_files(args.input_dir, '.csv')
pcap_files = find_files(args.input_dir, '.pcap') if args.merge_pcaps else []
if not csv_files and not pcap_files:
print("\nNo CSV/PCAP files found")
return 1
print(f"\nFound {len(csv_files)} CSV files")
if args.merge_pcaps:
print(f"Found {len(pcap_files)} PCAP files")
# Group files by resolver and config
csv_groups = defaultdict(list)
pcap_groups = defaultdict(list)
for csv_path in csv_files:
config, _ = parse_filename(Path(csv_path).name)
resolver = extract_resolver_from_path(csv_path)
if config and resolver:
key = (resolver, config)
csv_groups[key].append(csv_path)
for pcap_path in pcap_files:
config, _ = parse_filename(Path(pcap_path).name)
resolver = extract_resolver_from_path(pcap_path)
if config and resolver:
key = (resolver, config)
pcap_groups[key].append(pcap_path)
# Summary
print("\nConfigs to merge:")
print("-" * 80)
for (resolver, config), files in sorted(csv_groups.items()):
print(f" {resolver}/{config}: {len(files)} runs")
total_runs = sum(len(files) for files in csv_groups.values())
print(f"\nTotal configs: {len(csv_groups)}")
print(f"Total runs: {total_runs}")
if args.dry_run:
print("\n*** DRY RUN MODE ***\n")
for (resolver, config) in sorted(csv_groups.keys()):
print(f"Would merge: {resolver}/{config} ({len(csv_groups[(resolver, config)])} CSVs)")
if args.merge_pcaps and (resolver, config) in pcap_groups:
print(f"Would merge: {resolver}/{config} ({len(pcap_groups[(resolver, config)])} PCAPs)")
return 0
# Confirmation
if not args.yes:
response = input(f"\nMerge all into {args.output}? [y/N] ")
if response.lower() not in ['y', 'yes']:
print("Cancelled")
return 0
# Merge
print("\n" + "=" * 80)
print("MERGING FILES")
print("=" * 80)
success_count = 0
fail_count = 0
total_queries = 0
total_size = 0
# Get standard CSV fieldnames (from first file)
first_csv = next(iter(csv_files))
with open(first_csv, 'r') as f:
reader = csv.DictReader(f)
fieldnames = reader.fieldnames
for (resolver, config), files in sorted(csv_groups.items()):
print(f"\n{resolver}/{config} ({len(files)} runs)")
# Merge CSVs
output_csv = os.path.join(args.output, resolver, f"{config}.csv")
os.makedirs(os.path.dirname(output_csv), exist_ok=True)
merge_csvs(files, output_csv, fieldnames)
# Count queries in merged file
with open(output_csv, 'r') as f:
query_count = sum(1 for _ in csv.reader(f)) - 1 # Minus header
print(f" ✓ Merged CSV: {query_count:,} queries")
total_queries += query_count
success_count += 1
# Merge PCAPs if requested
if args.merge_pcaps and (resolver, config) in pcap_groups:
output_pcap = os.path.join(args.output, resolver, f"{config}.pcap")
pcap_list = pcap_groups[(resolver, config)]
if merge_pcaps(pcap_list, output_pcap):
merged_size = os.path.getsize(output_pcap)
orig_size = sum(os.path.getsize(p) for p in pcap_list)
print(f" ✓ Merged PCAP: {format_bytes(merged_size)} "
f"(from {format_bytes(orig_size)})")
total_size += merged_size
else:
print(f" ✗ PCAP merge failed")
fail_count += 1
# Final summary
print("\n" + "=" * 80)
print("COMPLETE")
print("=" * 80)
print(f"Successful configs: {success_count}")
print(f"Failed: {fail_count}")
print(f"Total queries: {total_queries:,}")
if args.merge_pcaps:
print(f"Total PCAP size: {format_bytes(total_size)}")
print(f"\nMerged files in: {args.output}")
return 0 if fail_count == 0 else 1
if __name__ == "__main__":
exit(main())
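A quick sanity-check sketch for the merged output. The ./results_merged path is just the script's default; the run_id column is the one added by merge_csvs() above.

import csv
from pathlib import Path

# For each merged CSV, report how many distinct runs were folded into it.
for merged in sorted(Path("./results_merged").rglob("*.csv")):
    with open(merged, newline="") as f:
        run_ids = {row["run_id"] for row in csv.DictReader(f)}
    print(f"{merged.relative_to('./results_merged')}: {len(run_ids)} runs")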