feat(profiling): Add CPU and MEM profiling
This commit is contained in:
Generated
-27
@@ -1,27 +0,0 @@
|
|||||||
{
|
|
||||||
"nodes": {
|
|
||||||
"nixpkgs": {
|
|
||||||
"locked": {
|
|
||||||
"lastModified": 1760103332,
|
|
||||||
"narHash": "sha256-BMsGVfKl4Q80Pr9T1AkCRljO1bpwCmY8rTBVj8XGuhA=",
|
|
||||||
"owner": "NixOS",
|
|
||||||
"repo": "nixpkgs",
|
|
||||||
"rev": "870493f9a8cb0b074ae5b411b2f232015db19a65",
|
|
||||||
"type": "github"
|
|
||||||
},
|
|
||||||
"original": {
|
|
||||||
"owner": "NixOS",
|
|
||||||
"ref": "nixpkgs-unstable",
|
|
||||||
"repo": "nixpkgs",
|
|
||||||
"type": "github"
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"root": {
|
|
||||||
"inputs": {
|
|
||||||
"nixpkgs": "nixpkgs"
|
|
||||||
}
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"root": "root",
|
|
||||||
"version": 7
|
|
||||||
}
|
|
||||||
@@ -1,38 +0,0 @@
|
|||||||
{
|
|
||||||
description = "A Nix-flake-based Go 1.22 development environment";
|
|
||||||
|
|
||||||
inputs.nixpkgs.url = "github:NixOS/nixpkgs/nixpkgs-unstable";
|
|
||||||
|
|
||||||
outputs = inputs:
|
|
||||||
let
|
|
||||||
goVersion = 25; # Change this to update the whole stack
|
|
||||||
|
|
||||||
supportedSystems = [ "aarch64-darwin" ];
|
|
||||||
forEachSupportedSystem = f: inputs.nixpkgs.lib.genAttrs supportedSystems (system: f {
|
|
||||||
pkgs = import inputs.nixpkgs {
|
|
||||||
inherit system;
|
|
||||||
overlays = [ inputs.self.overlays.default ];
|
|
||||||
};
|
|
||||||
});
|
|
||||||
in
|
|
||||||
{
|
|
||||||
overlays.default = final: prev: {
|
|
||||||
go = final."go_1_${toString goVersion}";
|
|
||||||
};
|
|
||||||
|
|
||||||
devShells = forEachSupportedSystem ({ pkgs }: {
|
|
||||||
default = pkgs.mkShell {
|
|
||||||
packages = with pkgs; [
|
|
||||||
go
|
|
||||||
gotools
|
|
||||||
golangci-lint
|
|
||||||
(pkgs.python3.withPackages (python-pkgs: with python-pkgs; [
|
|
||||||
pandas
|
|
||||||
matplotlib
|
|
||||||
seaborn
|
|
||||||
]))
|
|
||||||
];
|
|
||||||
};
|
|
||||||
});
|
|
||||||
};
|
|
||||||
}
|
|
||||||
@@ -12,6 +12,7 @@ import (
|
|||||||
"github.com/afonsofrancof/sdns-proxy/client"
|
"github.com/afonsofrancof/sdns-proxy/client"
|
||||||
"github.com/afonsofrancof/sdns-proxy/internal/qol/capture"
|
"github.com/afonsofrancof/sdns-proxy/internal/qol/capture"
|
||||||
"github.com/afonsofrancof/sdns-proxy/internal/qol/results"
|
"github.com/afonsofrancof/sdns-proxy/internal/qol/results"
|
||||||
|
"github.com/afonsofrancof/sdns-proxy/internal/qol/stats"
|
||||||
"github.com/google/gopacket/pcap"
|
"github.com/google/gopacket/pcap"
|
||||||
"github.com/miekg/dns"
|
"github.com/miekg/dns"
|
||||||
)
|
)
|
||||||
@@ -86,13 +87,16 @@ func (r *MeasurementRunner) runPerUpstream(upstream string, domains []string, qT
|
|||||||
defer dnsClient.Close()
|
defer dnsClient.Close()
|
||||||
|
|
||||||
// Setup output files
|
// Setup output files
|
||||||
csvPath, pcapPath := GenerateOutputPaths(r.config.OutputDir, upstream, r.config.DNSSEC, r.config.AuthoritativeDNSSEC, r.config.KeepAlive)
|
csvPath, pcapPath, memPath := GenerateOutputPaths(r.config.OutputDir, upstream, r.config.DNSSEC, r.config.AuthoritativeDNSSEC, r.config.KeepAlive)
|
||||||
|
|
||||||
// Create directory if it doesn't exist
|
// Create directory if it doesn't exist
|
||||||
if err := os.MkdirAll(filepath.Dir(csvPath), 0755); err != nil {
|
if err := os.MkdirAll(filepath.Dir(csvPath), 0755); err != nil {
|
||||||
return fmt.Errorf("failed to create output directory: %w", err)
|
return fmt.Errorf("failed to create output directory: %w", err)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Initialize runtime collector with memPath
|
||||||
|
runtimeCollector := stats.NewRuntimeCollector(memPath)
|
||||||
|
|
||||||
keepAliveStr := ""
|
keepAliveStr := ""
|
||||||
if r.config.KeepAlive {
|
if r.config.KeepAlive {
|
||||||
keepAliveStr = " (keep-alive)"
|
keepAliveStr = " (keep-alive)"
|
||||||
@@ -118,7 +122,17 @@ func (r *MeasurementRunner) runPerUpstream(upstream string, domains []string, qT
|
|||||||
|
|
||||||
time.Sleep(time.Second)
|
time.Sleep(time.Second)
|
||||||
// Run measurements
|
// Run measurements
|
||||||
return r.runQueries(dnsClient, upstream, domains, qType, writer, packetCapture)
|
err = r.runQueries(dnsClient, upstream, domains, qType, writer, packetCapture)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
// Write summed mem stats for the entire run
|
||||||
|
if err := runtimeCollector.WriteStats(); err != nil {
|
||||||
|
fmt.Fprintf(os.Stderr, "Warning: failed to write mem stats to %s: %v\n", memPath, err)
|
||||||
|
}
|
||||||
|
|
||||||
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func (r *MeasurementRunner) runQueries(dnsClient client.DNSClient, upstream string,
|
func (r *MeasurementRunner) runQueries(dnsClient client.DNSClient, upstream string,
|
||||||
|
|||||||
@@ -0,0 +1,95 @@
|
|||||||
|
package stats
|
||||||
|
|
||||||
|
import (
|
||||||
|
"encoding/csv"
|
||||||
|
"fmt"
|
||||||
|
"os"
|
||||||
|
"runtime"
|
||||||
|
"time"
|
||||||
|
)
|
||||||
|
|
||||||
|
type RuntimeStats struct {
|
||||||
|
TotalAlloc uint64
|
||||||
|
Mallocs uint64
|
||||||
|
NumGC uint32
|
||||||
|
AllocDelta uint64
|
||||||
|
MallocsDelta uint64
|
||||||
|
GCDelta uint32
|
||||||
|
}
|
||||||
|
|
||||||
|
type RuntimeCollector struct {
|
||||||
|
startStats runtime.MemStats
|
||||||
|
memPath string
|
||||||
|
}
|
||||||
|
|
||||||
|
func NewRuntimeCollector(memPath string) *RuntimeCollector {
|
||||||
|
var stats runtime.MemStats
|
||||||
|
runtime.ReadMemStats(&stats)
|
||||||
|
|
||||||
|
return &RuntimeCollector{
|
||||||
|
startStats: stats,
|
||||||
|
memPath: memPath,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func (rc *RuntimeCollector) Collect() RuntimeStats {
|
||||||
|
var current runtime.MemStats
|
||||||
|
runtime.ReadMemStats(¤t)
|
||||||
|
|
||||||
|
return RuntimeStats{
|
||||||
|
TotalAlloc: current.TotalAlloc,
|
||||||
|
Mallocs: current.Mallocs,
|
||||||
|
NumGC: current.NumGC,
|
||||||
|
AllocDelta: current.TotalAlloc - rc.startStats.TotalAlloc,
|
||||||
|
MallocsDelta: current.Mallocs - rc.startStats.Mallocs,
|
||||||
|
GCDelta: current.NumGC - rc.startStats.NumGC,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func (rc *RuntimeCollector) WriteStats() error {
|
||||||
|
stats := rc.Collect()
|
||||||
|
timestamp := time.Now().Format(time.RFC3339Nano)
|
||||||
|
|
||||||
|
// Check if file exists
|
||||||
|
fileExists := false
|
||||||
|
if _, err := os.Stat(rc.memPath); err == nil {
|
||||||
|
fileExists = true
|
||||||
|
}
|
||||||
|
|
||||||
|
// Open in append mode
|
||||||
|
file, err := os.OpenFile(rc.memPath, os.O_APPEND|os.O_CREATE|os.O_WRONLY, 0644)
|
||||||
|
if err != nil {
|
||||||
|
return fmt.Errorf("failed to open mem.csv: %w", err)
|
||||||
|
}
|
||||||
|
defer file.Close()
|
||||||
|
|
||||||
|
writer := csv.NewWriter(file)
|
||||||
|
|
||||||
|
// Write header if new file
|
||||||
|
if !fileExists {
|
||||||
|
header := []string{
|
||||||
|
"timestamp", "total_alloc_bytes", "mallocs", "gc_cycles",
|
||||||
|
"alloc_delta", "mallocs_delta", "gc_delta",
|
||||||
|
}
|
||||||
|
if err := writer.Write(header); err != nil {
|
||||||
|
return fmt.Errorf("failed to write mem.csv header: %w", err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Write data row
|
||||||
|
row := []string{
|
||||||
|
timestamp,
|
||||||
|
fmt.Sprintf("%d", stats.TotalAlloc),
|
||||||
|
fmt.Sprintf("%d", stats.Mallocs),
|
||||||
|
fmt.Sprintf("%d", stats.NumGC),
|
||||||
|
fmt.Sprintf("%d", stats.AllocDelta),
|
||||||
|
fmt.Sprintf("%d", stats.MallocsDelta),
|
||||||
|
fmt.Sprintf("%d", stats.GCDelta),
|
||||||
|
}
|
||||||
|
if err := writer.Write(row); err != nil {
|
||||||
|
return fmt.Errorf("failed to write mem.csv row: %w", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
writer.Flush()
|
||||||
|
return writer.Error()
|
||||||
|
}
|
||||||
@@ -1,4 +1,3 @@
|
|||||||
// ./internal/qol/utils.go
|
|
||||||
package qol
|
package qol
|
||||||
|
|
||||||
import (
|
import (
|
||||||
@@ -8,7 +7,7 @@ import (
|
|||||||
"strings"
|
"strings"
|
||||||
)
|
)
|
||||||
|
|
||||||
func GenerateOutputPaths(outputDir, upstream string, dnssec, authDNSSEC, keepAlive bool) (csvPath, pcapPath string) {
|
func GenerateOutputPaths(outputDir, upstream string, dnssec, authDNSSEC, keepAlive bool) (csvPath, pcapPath, memPath string) {
|
||||||
proto := DetectProtocol(upstream)
|
proto := DetectProtocol(upstream)
|
||||||
cleanServer := cleanServerName(upstream)
|
cleanServer := cleanServerName(upstream)
|
||||||
|
|
||||||
@@ -32,8 +31,10 @@ func GenerateOutputPaths(outputDir, upstream string, dnssec, authDNSSEC, keepAli
|
|||||||
base = fmt.Sprintf("%s-%s", base, strings.Join(flags, "-"))
|
base = fmt.Sprintf("%s-%s", base, strings.Join(flags, "-"))
|
||||||
}
|
}
|
||||||
|
|
||||||
return filepath.Join(subDir, base+".csv"),
|
csvPath = filepath.Join(subDir, base+".csv")
|
||||||
filepath.Join(subDir, base+".pcap")
|
pcapPath = filepath.Join(subDir, base+".pcap")
|
||||||
|
memPath = filepath.Join(subDir, base+".mem.csv")
|
||||||
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
func cleanServerName(server string) string {
|
func cleanServerName(server string) string {
|
||||||
|
|||||||
@@ -1,187 +0,0 @@
|
|||||||
#!/bin/bash
|
|
||||||
|
|
||||||
# Exit on error
|
|
||||||
set -e
|
|
||||||
|
|
||||||
# Default values
|
|
||||||
TOOL_PATH="./qol"
|
|
||||||
DOMAINS_FILE="./domains.txt"
|
|
||||||
OUTPUT_DIR="./results"
|
|
||||||
INTERFACE="veth1"
|
|
||||||
TIMEOUT="5s"
|
|
||||||
SLEEP_TIME="1"
|
|
||||||
|
|
||||||
# Parse arguments
|
|
||||||
while [[ $# -gt 0 ]]; do
|
|
||||||
case $1 in
|
|
||||||
-t|--tool-path)
|
|
||||||
TOOL_PATH="$2"
|
|
||||||
shift 2
|
|
||||||
;;
|
|
||||||
-d|--domains-file)
|
|
||||||
DOMAINS_FILE="$2"
|
|
||||||
shift 2
|
|
||||||
;;
|
|
||||||
-o|--output-dir)
|
|
||||||
OUTPUT_DIR="$2"
|
|
||||||
shift 2
|
|
||||||
;;
|
|
||||||
-I|--interface)
|
|
||||||
INTERFACE="$2"
|
|
||||||
shift 2
|
|
||||||
;;
|
|
||||||
-T|--timeout)
|
|
||||||
TIMEOUT="$2"
|
|
||||||
shift 2
|
|
||||||
;;
|
|
||||||
-s|--sleep)
|
|
||||||
SLEEP_TIME="$2"
|
|
||||||
shift 2
|
|
||||||
;;
|
|
||||||
--help)
|
|
||||||
echo "Usage: $0 [OPTIONS]"
|
|
||||||
echo ""
|
|
||||||
echo "Options:"
|
|
||||||
echo " -t, --tool-path PATH Path to qol tool (default: ./qol)"
|
|
||||||
echo " -d, --domains-file PATH Path to domains file (default: ./domains.txt)"
|
|
||||||
echo " -o, --output-dir PATH Output directory (default: ./results)"
|
|
||||||
echo " -I, --interface NAME Network interface (default: veth1)"
|
|
||||||
echo " -T, --timeout DURATION Timeout duration (default: 5s)"
|
|
||||||
echo " -s, --sleep SECONDS Sleep between runs (default: 1)"
|
|
||||||
echo " --help Show this help"
|
|
||||||
exit 0
|
|
||||||
;;
|
|
||||||
*)
|
|
||||||
echo "Unknown option: $1"
|
|
||||||
echo "Use --help for usage information"
|
|
||||||
exit 1
|
|
||||||
;;
|
|
||||||
esac
|
|
||||||
done
|
|
||||||
|
|
||||||
echo "Configuration:"
|
|
||||||
echo " Tool path: $TOOL_PATH"
|
|
||||||
echo " Domains file: $DOMAINS_FILE"
|
|
||||||
echo " Output dir: $OUTPUT_DIR"
|
|
||||||
echo " Interface: $INTERFACE"
|
|
||||||
echo " Timeout: $TIMEOUT"
|
|
||||||
echo " Sleep time: ${SLEEP_TIME}s"
|
|
||||||
echo ""
|
|
||||||
|
|
||||||
# Connection-based protocols that benefit from keep-alive (TCP-based)
|
|
||||||
CONN_SERVERS=(
|
|
||||||
-s "dotcp://8.8.8.8:53"
|
|
||||||
-s "dotcp://1.1.1.1:53"
|
|
||||||
-s "dotcp://9.9.9.9:53"
|
|
||||||
-s "dotcp://dns.adguard-dns.com:53"
|
|
||||||
#-s "tls://8.8.8.8:853"
|
|
||||||
#-s "tls://1.1.1.1:853"
|
|
||||||
#-s "tls://9.9.9.9:853"
|
|
||||||
#-s "tls://dns.adguard-dns.com:853"
|
|
||||||
#-s "https://dns.google/dns-query"
|
|
||||||
#-s "https://cloudflare-dns.com/dns-query"
|
|
||||||
#-s "https://dns10.quad9.net/dns-query"
|
|
||||||
#-s "https://dns.adguard-dns.com/dns-query"
|
|
||||||
)
|
|
||||||
|
|
||||||
# QUIC-based protocols (have built-in 0-RTT, keep-alive doesn't add value)
|
|
||||||
QUIC_SERVERS=(
|
|
||||||
#-s "doh3://dns.google/dns-query"
|
|
||||||
#-s "doh3://cloudflare-dns.com/dns-query"
|
|
||||||
#-s "doh3://dns.adguard-dns.com/dns-query"
|
|
||||||
#-s "doq://dns.adguard-dns.com:853"
|
|
||||||
)
|
|
||||||
|
|
||||||
# Connectionless protocols (no keep-alive)
|
|
||||||
CONNLESS_SERVERS=(
|
|
||||||
# -s "udp://8.8.8.8:53"
|
|
||||||
# -s "udp://1.1.1.1:53"
|
|
||||||
# -s "udp://9.9.9.9:53"
|
|
||||||
# -s "udp://dns.adguard-dns.com:53"
|
|
||||||
-s "sdns://AQMAAAAAAAAAETk0LjE0MC4xNC4xNDo1NDQzINErR_JS3PLCu_iZEIbq95zkSV2LFsigxDIuUso_OQhzIjIuZG5zY3J5cHQuZGVmYXVsdC5uczEuYWRndWFyZC5jb20"
|
|
||||||
-s "sdns://AQMAAAAAAAAAFDE0OS4xMTIuMTEyLjExMjo4NDQzIGfIR7jIdYzRICRVQ751Z0bfNN8dhMALjEcDaN-CHYY-GTIuZG5zY3J5cHQtY2VydC5xdWFkOS5uZXQ"
|
|
||||||
)
|
|
||||||
|
|
||||||
# Common args
|
|
||||||
COMMON_ARGS=(
|
|
||||||
"$DOMAINS_FILE"
|
|
||||||
--interface "$INTERFACE"
|
|
||||||
--timeout "$TIMEOUT"
|
|
||||||
)
|
|
||||||
|
|
||||||
# Combinations for TCP-based connection protocols
|
|
||||||
CONN_COMBINATIONS=(
|
|
||||||
# DNSSEC off, Keep off
|
|
||||||
""
|
|
||||||
|
|
||||||
# DNSSEC off, Keep on
|
|
||||||
"--keep-alive"
|
|
||||||
|
|
||||||
# DNSSEC on (trust), Keep on
|
|
||||||
"--dnssec --keep-alive"
|
|
||||||
|
|
||||||
# DNSSEC on (auth), Keep on
|
|
||||||
"--dnssec --authoritative-dnssec --keep-alive"
|
|
||||||
)
|
|
||||||
|
|
||||||
# Combinations for QUIC and connectionless protocols (no keep-alive)
|
|
||||||
NO_KEEPALIVE_COMBINATIONS=(
|
|
||||||
# DNSSEC off
|
|
||||||
""
|
|
||||||
|
|
||||||
# DNSSEC on (trust)
|
|
||||||
"--dnssec"
|
|
||||||
|
|
||||||
# DNSSEC on (auth)
|
|
||||||
"--dnssec --authoritative-dnssec"
|
|
||||||
)
|
|
||||||
|
|
||||||
echo "=== Running TCP-based protocols (TLS/HTTPS) ==="
|
|
||||||
for FLAGS in "${CONN_COMBINATIONS[@]}"; do
|
|
||||||
echo "Running: $FLAGS"
|
|
||||||
|
|
||||||
FLAGS_ARRAY=($FLAGS)
|
|
||||||
|
|
||||||
sudo "$TOOL_PATH" run \
|
|
||||||
--output-dir "$OUTPUT_DIR" \
|
|
||||||
"${COMMON_ARGS[@]}" \
|
|
||||||
"${CONN_SERVERS[@]}" \
|
|
||||||
"${FLAGS_ARRAY[@]}" || true
|
|
||||||
|
|
||||||
sleep "$SLEEP_TIME"
|
|
||||||
done
|
|
||||||
|
|
||||||
echo ""
|
|
||||||
echo "=== Running QUIC-based protocols (DoH3/DoQ) ==="
|
|
||||||
for FLAGS in "${NO_KEEPALIVE_COMBINATIONS[@]}"; do
|
|
||||||
echo "Running: $FLAGS"
|
|
||||||
|
|
||||||
FLAGS_ARRAY=($FLAGS)
|
|
||||||
|
|
||||||
sudo "$TOOL_PATH" run \
|
|
||||||
--output-dir "$OUTPUT_DIR" \
|
|
||||||
"${COMMON_ARGS[@]}" \
|
|
||||||
"${QUIC_SERVERS[@]}" \
|
|
||||||
"${FLAGS_ARRAY[@]}" || true
|
|
||||||
|
|
||||||
sleep "$SLEEP_TIME"
|
|
||||||
done
|
|
||||||
|
|
||||||
echo ""
|
|
||||||
echo "=== Running connectionless protocols (UDP) ==="
|
|
||||||
for FLAGS in "${NO_KEEPALIVE_COMBINATIONS[@]}"; do
|
|
||||||
echo "Running: $FLAGS"
|
|
||||||
|
|
||||||
FLAGS_ARRAY=($FLAGS)
|
|
||||||
|
|
||||||
sudo "$TOOL_PATH" run \
|
|
||||||
--output-dir "$OUTPUT_DIR" \
|
|
||||||
"${COMMON_ARGS[@]}" \
|
|
||||||
"${CONNLESS_SERVERS[@]}" \
|
|
||||||
"${FLAGS_ARRAY[@]}" || true
|
|
||||||
|
|
||||||
sleep "$SLEEP_TIME"
|
|
||||||
done
|
|
||||||
|
|
||||||
echo ""
|
|
||||||
echo "All combinations completed!"
|
|
||||||
@@ -1,498 +0,0 @@
|
|||||||
import pandas as pd
|
|
||||||
import matplotlib.pyplot as plt
|
|
||||||
import seaborn as sns
|
|
||||||
import numpy as np
|
|
||||||
from pathlib import Path
|
|
||||||
from scipy import stats
|
|
||||||
import warnings
|
|
||||||
|
|
||||||
warnings.filterwarnings('ignore')
|
|
||||||
|
|
||||||
# Set style for publication-quality plots
|
|
||||||
sns.set_style("whitegrid")
|
|
||||||
plt.rcParams['figure.dpi'] = 300
|
|
||||||
plt.rcParams['savefig.dpi'] = 300
|
|
||||||
plt.rcParams['font.size'] = 10
|
|
||||||
plt.rcParams['figure.figsize'] = (12, 6)
|
|
||||||
|
|
||||||
class DNSAnalyzer:
|
|
||||||
def __init__(self, results_dir='results'):
|
|
||||||
self.results_dir = Path(results_dir)
|
|
||||||
self.df = None
|
|
||||||
|
|
||||||
def load_all_data(self):
|
|
||||||
"""Load all CSV files from the results directory"""
|
|
||||||
data_frames = []
|
|
||||||
|
|
||||||
providers = ['adguard', 'cloudflare', 'google', 'quad9']
|
|
||||||
|
|
||||||
for provider in providers:
|
|
||||||
provider_path = self.results_dir / provider
|
|
||||||
if not provider_path.exists():
|
|
||||||
continue
|
|
||||||
|
|
||||||
for csv_file in provider_path.glob('*.csv'):
|
|
||||||
try:
|
|
||||||
df = pd.read_csv(csv_file)
|
|
||||||
df['provider'] = provider
|
|
||||||
df['test_config'] = csv_file.stem
|
|
||||||
data_frames.append(df)
|
|
||||||
except Exception as e:
|
|
||||||
print(f"Error loading {csv_file}: {e}")
|
|
||||||
|
|
||||||
self.df = pd.concat(data_frames, ignore_index=True)
|
|
||||||
self._clean_and_enrich_data()
|
|
||||||
print(f"Loaded {len(self.df)} DNS queries across {len(data_frames)} test configurations")
|
|
||||||
|
|
||||||
def _clean_and_enrich_data(self):
|
|
||||||
"""Clean data and add useful columns"""
|
|
||||||
# Remove failed queries
|
|
||||||
self.df = self.df[self.df['error'].isna()]
|
|
||||||
|
|
||||||
# Extract protocol base (remove -auth, -trust suffixes)
|
|
||||||
self.df['protocol_base'] = self.df['protocol'].str.replace('-auth|-trust', '', regex=True)
|
|
||||||
|
|
||||||
# DNSSEC configuration
|
|
||||||
self.df['dnssec_mode'] = 'none'
|
|
||||||
self.df.loc[self.df['auth_dnssec'] == True, 'dnssec_mode'] = 'auth'
|
|
||||||
self.df.loc[(self.df['dnssec'] == True) & (self.df['auth_dnssec'] == False), 'dnssec_mode'] = 'trust'
|
|
||||||
|
|
||||||
# Protocol categories
|
|
||||||
self.df['protocol_category'] = self.df['protocol_base'].map({
|
|
||||||
'udp': 'Plain DNS',
|
|
||||||
'tls': 'DoT',
|
|
||||||
'https': 'DoH',
|
|
||||||
'doh3': 'DoH/3',
|
|
||||||
'doq': 'DoQ'
|
|
||||||
})
|
|
||||||
|
|
||||||
# Connection persistence
|
|
||||||
self.df['persistence'] = self.df['keep_alive'].fillna(False)
|
|
||||||
|
|
||||||
def generate_summary_statistics(self):
|
|
||||||
"""Generate comprehensive summary statistics"""
|
|
||||||
print("\n" + "="*80)
|
|
||||||
print("SUMMARY STATISTICS")
|
|
||||||
print("="*80)
|
|
||||||
|
|
||||||
# Overall statistics
|
|
||||||
print("\n--- Overall Performance ---")
|
|
||||||
print(f"Total queries: {len(self.df)}")
|
|
||||||
print(f"Mean latency: {self.df['duration_ms'].mean():.2f} ms")
|
|
||||||
print(f"Median latency: {self.df['duration_ms'].median():.2f} ms")
|
|
||||||
print(f"95th percentile: {self.df['duration_ms'].quantile(0.95):.2f} ms")
|
|
||||||
print(f"99th percentile: {self.df['duration_ms'].quantile(0.99):.2f} ms")
|
|
||||||
|
|
||||||
# By protocol
|
|
||||||
print("\n--- Performance by Protocol ---")
|
|
||||||
protocol_stats = self.df.groupby('protocol_category')['duration_ms'].agg([
|
|
||||||
('count', 'count'),
|
|
||||||
('mean', 'mean'),
|
|
||||||
('median', 'median'),
|
|
||||||
('std', 'std'),
|
|
||||||
('p95', lambda x: x.quantile(0.95)),
|
|
||||||
('p99', lambda x: x.quantile(0.99))
|
|
||||||
]).round(2)
|
|
||||||
print(protocol_stats)
|
|
||||||
|
|
||||||
# By provider
|
|
||||||
print("\n--- Performance by Provider ---")
|
|
||||||
provider_stats = self.df.groupby('provider')['duration_ms'].agg([
|
|
||||||
('count', 'count'),
|
|
||||||
('mean', 'mean'),
|
|
||||||
('median', 'median'),
|
|
||||||
('std', 'std'),
|
|
||||||
('p95', lambda x: x.quantile(0.95))
|
|
||||||
]).round(2)
|
|
||||||
print(provider_stats)
|
|
||||||
|
|
||||||
# DNSSEC impact
|
|
||||||
print("\n--- DNSSEC Validation Impact ---")
|
|
||||||
dnssec_stats = self.df.groupby('dnssec_mode')['duration_ms'].agg([
|
|
||||||
('count', 'count'),
|
|
||||||
('mean', 'mean'),
|
|
||||||
('median', 'median'),
|
|
||||||
('overhead_vs_none', lambda x: x.mean())
|
|
||||||
]).round(2)
|
|
||||||
|
|
||||||
# Calculate overhead percentage
|
|
||||||
baseline = dnssec_stats.loc['none', 'mean'] if 'none' in dnssec_stats.index else 0
|
|
||||||
if baseline > 0:
|
|
||||||
dnssec_stats['overhead_pct'] = ((dnssec_stats['overhead_vs_none'] - baseline) / baseline * 100).round(1)
|
|
||||||
print(dnssec_stats)
|
|
||||||
|
|
||||||
# Bandwidth analysis
|
|
||||||
print("\n--- Bandwidth Usage ---")
|
|
||||||
bandwidth_stats = self.df.groupby('protocol_category').agg({
|
|
||||||
'request_size_bytes': ['mean', 'median'],
|
|
||||||
'response_size_bytes': ['mean', 'median']
|
|
||||||
}).round(2)
|
|
||||||
print(bandwidth_stats)
|
|
||||||
|
|
||||||
# Persistence impact (where applicable)
|
|
||||||
print("\n--- Connection Persistence Impact ---")
|
|
||||||
persist_protocols = self.df[self.df['protocol_base'].isin(['tls', 'https'])]
|
|
||||||
if len(persist_protocols) > 0:
|
|
||||||
persist_stats = persist_protocols.groupby(['protocol_base', 'persistence'])['duration_ms'].agg([
|
|
||||||
('mean', 'mean'),
|
|
||||||
('median', 'median')
|
|
||||||
]).round(2)
|
|
||||||
print(persist_stats)
|
|
||||||
|
|
||||||
return {
|
|
||||||
'protocol': protocol_stats,
|
|
||||||
'provider': provider_stats,
|
|
||||||
'dnssec': dnssec_stats,
|
|
||||||
'bandwidth': bandwidth_stats
|
|
||||||
}
|
|
||||||
|
|
||||||
def plot_latency_by_protocol(self, output_dir='plots'):
|
|
||||||
"""Violin plot of latency distribution by protocol"""
|
|
||||||
Path(output_dir).mkdir(exist_ok=True)
|
|
||||||
|
|
||||||
plt.figure(figsize=(14, 7))
|
|
||||||
|
|
||||||
# Order protocols logically
|
|
||||||
protocol_order = ['Plain DNS', 'DoT', 'DoH', 'DoH/3', 'DoQ']
|
|
||||||
available_protocols = [p for p in protocol_order if p in self.df['protocol_category'].values]
|
|
||||||
|
|
||||||
sns.violinplot(data=self.df, x='protocol_category', y='duration_ms',
|
|
||||||
order=available_protocols, inner='box', cut=0)
|
|
||||||
|
|
||||||
plt.title('DNS Query Latency Distribution by Protocol', fontsize=14, fontweight='bold')
|
|
||||||
plt.xlabel('Protocol', fontsize=12)
|
|
||||||
plt.ylabel('Response Time (ms)', fontsize=12)
|
|
||||||
plt.xticks(rotation=0)
|
|
||||||
|
|
||||||
# Add mean values as annotations
|
|
||||||
for i, protocol in enumerate(available_protocols):
|
|
||||||
mean_val = self.df[self.df['protocol_category'] == protocol]['duration_ms'].mean()
|
|
||||||
plt.text(i, mean_val, f'{mean_val:.1f}', ha='center', va='bottom', fontweight='bold')
|
|
||||||
|
|
||||||
plt.tight_layout()
|
|
||||||
plt.savefig(f'{output_dir}/latency_by_protocol.png', bbox_inches='tight')
|
|
||||||
plt.close()
|
|
||||||
print(f"✓ Saved: latency_by_protocol.png")
|
|
||||||
|
|
||||||
def plot_provider_comparison(self, output_dir='plots'):
|
|
||||||
"""Box plot comparing providers across protocols"""
|
|
||||||
Path(output_dir).mkdir(exist_ok=True)
|
|
||||||
|
|
||||||
fig, axes = plt.subplots(2, 2, figsize=(16, 12))
|
|
||||||
fig.suptitle('Provider Performance Comparison by Protocol', fontsize=16, fontweight='bold')
|
|
||||||
|
|
||||||
protocols = self.df['protocol_category'].unique()
|
|
||||||
protocols = [p for p in ['Plain DNS', 'DoT', 'DoH', 'DoH/3'] if p in protocols]
|
|
||||||
|
|
||||||
for idx, protocol in enumerate(protocols[:4]):
|
|
||||||
ax = axes[idx // 2, idx % 2]
|
|
||||||
data = self.df[self.df['protocol_category'] == protocol]
|
|
||||||
|
|
||||||
if len(data) > 0:
|
|
||||||
sns.boxplot(data=data, x='provider', y='duration_ms', ax=ax)
|
|
||||||
ax.set_title(f'{protocol}', fontsize=12, fontweight='bold')
|
|
||||||
ax.set_xlabel('Provider', fontsize=10)
|
|
||||||
ax.set_ylabel('Response Time (ms)', fontsize=10)
|
|
||||||
ax.tick_params(axis='x', rotation=45)
|
|
||||||
|
|
||||||
plt.tight_layout()
|
|
||||||
plt.savefig(f'{output_dir}/provider_comparison.png', bbox_inches='tight')
|
|
||||||
plt.close()
|
|
||||||
print(f"✓ Saved: provider_comparison.png")
|
|
||||||
|
|
||||||
def plot_dnssec_impact(self, output_dir='plots'):
|
|
||||||
"""Compare DNSSEC validation methods (trust vs auth)"""
|
|
||||||
Path(output_dir).mkdir(exist_ok=True)
|
|
||||||
|
|
||||||
# Filter for protocols that have DNSSEC variations
|
|
||||||
dnssec_data = self.df[self.df['dnssec_mode'] != 'none'].copy()
|
|
||||||
|
|
||||||
if len(dnssec_data) == 0:
|
|
||||||
print("⚠ No DNSSEC data available")
|
|
||||||
return
|
|
||||||
|
|
||||||
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 6))
|
|
||||||
|
|
||||||
# Plot 1: Overall DNSSEC impact
|
|
||||||
protocol_order = ['Plain DNS', 'DoT', 'DoH', 'DoH/3', 'DoQ']
|
|
||||||
available = [p for p in protocol_order if p in self.df['protocol_category'].values]
|
|
||||||
|
|
||||||
sns.barplot(data=self.df, x='protocol_category', y='duration_ms',
|
|
||||||
hue='dnssec_mode', order=available, ax=ax1, ci=95)
|
|
||||||
ax1.set_title('DNSSEC Validation Overhead by Protocol', fontsize=12, fontweight='bold')
|
|
||||||
ax1.set_xlabel('Protocol', fontsize=10)
|
|
||||||
ax1.set_ylabel('Mean Response Time (ms)', fontsize=10)
|
|
||||||
ax1.legend(title='DNSSEC Mode', labels=['No DNSSEC', 'Auth (Full)', 'Trust (Resolver)'])
|
|
||||||
ax1.tick_params(axis='x', rotation=0)
|
|
||||||
|
|
||||||
# Plot 2: Trust vs Auth comparison
|
|
||||||
comparison_data = dnssec_data.groupby(['protocol_category', 'dnssec_mode'])['duration_ms'].mean().reset_index()
|
|
||||||
pivot_data = comparison_data.pivot(index='protocol_category', columns='dnssec_mode', values='duration_ms')
|
|
||||||
|
|
||||||
if 'auth' in pivot_data.columns and 'trust' in pivot_data.columns:
|
|
||||||
pivot_data['overhead_pct'] = ((pivot_data['auth'] - pivot_data['trust']) / pivot_data['trust'] * 100)
|
|
||||||
pivot_data['overhead_pct'].plot(kind='bar', ax=ax2, color='coral')
|
|
||||||
ax2.set_title('Auth vs Trust: Additional Overhead (%)', fontsize=12, fontweight='bold')
|
|
||||||
ax2.set_xlabel('Protocol', fontsize=10)
|
|
||||||
ax2.set_ylabel('Additional Overhead (%)', fontsize=10)
|
|
||||||
ax2.axhline(y=0, color='black', linestyle='--', linewidth=0.8)
|
|
||||||
ax2.tick_params(axis='x', rotation=45)
|
|
||||||
ax2.grid(axis='y', alpha=0.3)
|
|
||||||
|
|
||||||
plt.tight_layout()
|
|
||||||
plt.savefig(f'{output_dir}/dnssec_impact.png', bbox_inches='tight')
|
|
||||||
plt.close()
|
|
||||||
print(f"✓ Saved: dnssec_impact.png")
|
|
||||||
|
|
||||||
def plot_persistence_impact(self, output_dir='plots'):
|
|
||||||
"""Analyze impact of connection persistence"""
|
|
||||||
Path(output_dir).mkdir(exist_ok=True)
|
|
||||||
|
|
||||||
persist_data = self.df[self.df['protocol_base'].isin(['tls', 'https'])].copy()
|
|
||||||
|
|
||||||
if len(persist_data) == 0:
|
|
||||||
print("⚠ No persistence data available")
|
|
||||||
return
|
|
||||||
|
|
||||||
plt.figure(figsize=(12, 6))
|
|
||||||
|
|
||||||
sns.barplot(data=persist_data, x='protocol_base', y='duration_ms',
|
|
||||||
hue='persistence', ci=95)
|
|
||||||
|
|
||||||
plt.title('Impact of Connection Persistence on Latency', fontsize=14, fontweight='bold')
|
|
||||||
plt.xlabel('Protocol', fontsize=12)
|
|
||||||
plt.ylabel('Mean Response Time (ms)', fontsize=12)
|
|
||||||
plt.legend(title='Keep-Alive', labels=['Disabled', 'Enabled'])
|
|
||||||
|
|
||||||
# Calculate and annotate overhead reduction
|
|
||||||
for protocol in persist_data['protocol_base'].unique():
|
|
||||||
protocol_data = persist_data[persist_data['protocol_base'] == protocol]
|
|
||||||
|
|
||||||
no_persist = protocol_data[protocol_data['persistence'] == False]['duration_ms'].mean()
|
|
||||||
with_persist = protocol_data[protocol_data['persistence'] == True]['duration_ms'].mean()
|
|
||||||
|
|
||||||
if not np.isnan(no_persist) and not np.isnan(with_persist):
|
|
||||||
reduction = ((no_persist - with_persist) / no_persist * 100)
|
|
||||||
print(f"{protocol}: {reduction:.1f}% reduction with persistence")
|
|
||||||
|
|
||||||
plt.tight_layout()
|
|
||||||
plt.savefig(f'{output_dir}/persistence_impact.png', bbox_inches='tight')
|
|
||||||
plt.close()
|
|
||||||
print(f"✓ Saved: persistence_impact.png")
|
|
||||||
|
|
||||||
def plot_bandwidth_overhead(self, output_dir='plots'):
|
|
||||||
"""Visualize bandwidth usage by protocol"""
|
|
||||||
Path(output_dir).mkdir(exist_ok=True)
|
|
||||||
|
|
||||||
bandwidth_data = self.df.groupby('protocol_category').agg({
|
|
||||||
'request_size_bytes': 'mean',
|
|
||||||
'response_size_bytes': 'mean'
|
|
||||||
}).reset_index()
|
|
||||||
|
|
||||||
bandwidth_data['total_bytes'] = (bandwidth_data['request_size_bytes'] +
|
|
||||||
bandwidth_data['response_size_bytes'])
|
|
||||||
|
|
||||||
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 6))
|
|
||||||
|
|
||||||
# Plot 1: Request vs Response sizes
|
|
||||||
x = np.arange(len(bandwidth_data))
|
|
||||||
width = 0.35
|
|
||||||
|
|
||||||
ax1.bar(x - width/2, bandwidth_data['request_size_bytes'], width,
|
|
||||||
label='Request', alpha=0.8)
|
|
||||||
ax1.bar(x + width/2, bandwidth_data['response_size_bytes'], width,
|
|
||||||
label='Response', alpha=0.8)
|
|
||||||
|
|
||||||
ax1.set_xlabel('Protocol', fontsize=12)
|
|
||||||
ax1.set_ylabel('Bytes', fontsize=12)
|
|
||||||
ax1.set_title('Average Request/Response Sizes', fontsize=12, fontweight='bold')
|
|
||||||
ax1.set_xticks(x)
|
|
||||||
ax1.set_xticklabels(bandwidth_data['protocol_category'])
|
|
||||||
ax1.legend()
|
|
||||||
ax1.grid(axis='y', alpha=0.3)
|
|
||||||
|
|
||||||
# Plot 2: Total bandwidth overhead vs UDP baseline
|
|
||||||
udp_total = bandwidth_data[bandwidth_data['protocol_category'] == 'Plain DNS']['total_bytes'].values
|
|
||||||
if len(udp_total) > 0:
|
|
||||||
bandwidth_data['overhead_vs_udp'] = ((bandwidth_data['total_bytes'] - udp_total[0]) / udp_total[0] * 100)
|
|
||||||
|
|
||||||
colors = ['green' if x < 0 else 'red' for x in bandwidth_data['overhead_vs_udp']]
|
|
||||||
ax2.bar(bandwidth_data['protocol_category'], bandwidth_data['overhead_vs_udp'],
|
|
||||||
color=colors, alpha=0.7)
|
|
||||||
ax2.axhline(y=0, color='black', linestyle='--', linewidth=0.8)
|
|
||||||
ax2.set_xlabel('Protocol', fontsize=12)
|
|
||||||
ax2.set_ylabel('Overhead vs Plain DNS (%)', fontsize=12)
|
|
||||||
ax2.set_title('Bandwidth Overhead', fontsize=12, fontweight='bold')
|
|
||||||
ax2.grid(axis='y', alpha=0.3)
|
|
||||||
|
|
||||||
plt.tight_layout()
|
|
||||||
plt.savefig(f'{output_dir}/bandwidth_overhead.png', bbox_inches='tight')
|
|
||||||
plt.close()
|
|
||||||
print(f"✓ Saved: bandwidth_overhead.png")
|
|
||||||
|
|
||||||
def plot_heatmap(self, output_dir='plots'):
|
|
||||||
"""Heatmap of provider-protocol performance"""
|
|
||||||
Path(output_dir).mkdir(exist_ok=True)
|
|
||||||
|
|
||||||
# Create pivot table
|
|
||||||
heatmap_data = self.df.groupby(['provider', 'protocol_category'])['duration_ms'].median().unstack()
|
|
||||||
|
|
||||||
plt.figure(figsize=(12, 8))
|
|
||||||
sns.heatmap(heatmap_data, annot=True, fmt='.1f', cmap='RdYlGn_r',
|
|
||||||
cbar_kws={'label': 'Median Latency (ms)'})
|
|
||||||
|
|
||||||
plt.title('DNS Provider-Protocol Performance Matrix', fontsize=14, fontweight='bold')
|
|
||||||
plt.xlabel('Protocol', fontsize=12)
|
|
||||||
plt.ylabel('Provider', fontsize=12)
|
|
||||||
|
|
||||||
plt.tight_layout()
|
|
||||||
plt.savefig(f'{output_dir}/provider_protocol_heatmap.png', bbox_inches='tight')
|
|
||||||
plt.close()
|
|
||||||
print(f"✓ Saved: provider_protocol_heatmap.png")
|
|
||||||
|
|
||||||
def plot_percentile_comparison(self, output_dir='plots'):
    """Grouped bar chart of latency percentiles (P50..P99) per protocol."""
    Path(output_dir).mkdir(exist_ok=True)

    pcts = [50, 75, 90, 95, 99]
    preferred = ['Plain DNS', 'DoT', 'DoH', 'DoH/3', 'DoQ']
    # Keep the preferred display order, dropping protocols with no data.
    present = [name for name in preferred
               if name in self.df['protocol_category'].values]

    rows = []
    for name in present:
        samples = self.df[self.df['protocol_category'] == name]['duration_ms']
        rows.extend({'protocol': name,
                     'percentile': f'P{p}',
                     'latency': np.percentile(samples, p)}
                    for p in pcts)

    pct_df = pd.DataFrame(rows)

    plt.figure(figsize=(14, 7))
    sns.barplot(data=pct_df, x='protocol', y='latency', hue='percentile', order=present)

    plt.title('Latency Percentiles by Protocol', fontsize=14, fontweight='bold')
    plt.xlabel('Protocol', fontsize=12)
    plt.ylabel('Response Time (ms)', fontsize=12)
    plt.legend(title='Percentile', bbox_to_anchor=(1.05, 1), loc='upper left')

    plt.tight_layout()
    plt.savefig(f'{output_dir}/percentile_comparison.png', bbox_inches='tight')
    plt.close()
    print(f"✓ Saved: percentile_comparison.png")
|
|
||||||
|
|
||||||
def statistical_tests(self):
    """Run non-parametric significance tests on latency and print results."""
    print("\n" + "="*80)
    print("STATISTICAL TESTS")
    print("="*80)

    # Test 1: Kruskal-Wallis across all protocols (needs 3+ groups).
    protocols = self.df['protocol_category'].unique()
    if len(protocols) > 2:
        samples = [self.df[self.df['protocol_category'] == p]['duration_ms'].values
                   for p in protocols]
        h_stat, p_value = stats.kruskal(*samples)
        print(f"\n--- Kruskal-Wallis Test (Protocol Differences) ---")
        print(f"H-statistic: {h_stat:.4f}")
        print(f"p-value: {p_value:.4e}")
        print(f"Result: {'Significant' if p_value < 0.05 else 'Not significant'} differences between protocols")

    # Test 2: Mann-Whitney U, no-DNSSEC vs authoritative validation.
    if 'none' in self.df['dnssec_mode'].values and 'auth' in self.df['dnssec_mode'].values:
        none_data = self.df[self.df['dnssec_mode'] == 'none']['duration_ms']
        auth_data = self.df[self.df['dnssec_mode'] == 'auth']['duration_ms']

        u_stat, p_value = stats.mannwhitneyu(none_data, auth_data, alternative='two-sided')
        print(f"\n--- Mann-Whitney U Test (No DNSSEC vs Auth) ---")
        print(f"U-statistic: {u_stat:.4f}")
        print(f"p-value: {p_value:.4e}")
        print(f"Result: {'Significant' if p_value < 0.05 else 'Not significant'} difference")

    # Test 3: Mann-Whitney U, trust-anchor vs authoritative validation.
    if 'trust' in self.df['dnssec_mode'].values and 'auth' in self.df['dnssec_mode'].values:
        trust_data = self.df[self.df['dnssec_mode'] == 'trust']['duration_ms']
        auth_data = self.df[self.df['dnssec_mode'] == 'auth']['duration_ms']

        u_stat, p_value = stats.mannwhitneyu(trust_data, auth_data, alternative='two-sided')
        print(f"\n--- Mann-Whitney U Test (Trust vs Auth) ---")
        print(f"U-statistic: {u_stat:.4f}")
        print(f"p-value: {p_value:.4e}")
        print(f"Result: Auth is {'significantly' if p_value < 0.05 else 'not significantly'} slower than Trust")
|
|
||||||
|
|
||||||
def generate_latex_table(self, output_dir='plots'):
    """Write a per-protocol latency summary as a LaTeX table and preview it."""
    Path(output_dir).mkdir(exist_ok=True)

    # One row per protocol; columns are the headline latency statistics.
    aggregations = [
        ('Mean', 'mean'),
        ('Median', 'median'),
        ('Std Dev', 'std'),
        ('P95', lambda x: x.quantile(0.95)),
        ('P99', lambda x: x.quantile(0.99)),
    ]
    summary = (self.df
               .groupby('protocol_category')['duration_ms']
               .agg(aggregations)
               .round(2))

    latex_code = summary.to_latex(float_format="%.2f")

    with open(f'{output_dir}/summary_table.tex', 'w') as f:
        f.write(latex_code)

    print(f"✓ Saved: summary_table.tex")
    print("\nLaTeX Table Preview:")
    print(latex_code)
|
|
||||||
|
|
||||||
def run_full_analysis(self):
    """Execute the whole analysis pipeline end to end."""
    print("="*80)
    print("DNS QoS Analysis - Starting Full Analysis")
    print("="*80)

    # (progress label, step) pairs, executed strictly in order.
    pipeline = [
        ("\n[1/10] Loading data...", self.load_all_data),
        ("\n[2/10] Generating summary statistics...", self.generate_summary_statistics),
        ("\n[3/10] Running statistical tests...", self.statistical_tests),
        ("\n[4/10] Creating latency by protocol plot...", self.plot_latency_by_protocol),
        ("\n[5/10] Creating provider comparison plot...", self.plot_provider_comparison),
        ("\n[6/10] Creating DNSSEC impact plot...", self.plot_dnssec_impact),
        ("\n[7/10] Creating persistence impact plot...", self.plot_persistence_impact),
        ("\n[8/10] Creating bandwidth overhead plot...", self.plot_bandwidth_overhead),
        ("\n[9/10] Creating heatmap...", self.plot_heatmap),
        ("\n[10/10] Creating percentile comparison...", self.plot_percentile_comparison),
        ("\n[Bonus] Generating LaTeX table...", self.generate_latex_table),
    ]
    for label, step in pipeline:
        print(label)
        step()

    print("\n" + "="*80)
    print("✓ Analysis Complete! Check the 'plots' directory for all visualizations.")
    print("="*80)
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
    # Entry point: analyze everything under ./results.
    DNSAnalyzer(results_dir='results').run_full_analysis()
|
|
||||||
@@ -1,536 +0,0 @@
|
|||||||
import pandas as pd
|
|
||||||
import matplotlib.pyplot as plt
|
|
||||||
import seaborn as sns
|
|
||||||
import numpy as np
|
|
||||||
from pathlib import Path
|
|
||||||
import datetime
|
|
||||||
from dateutil import parser as date_parser
|
|
||||||
import dpkt
|
|
||||||
|
|
||||||
# Global plot styling: whitegrid theme plus print-quality DPI and a
# compact base font for every figure in this module.
sns.set_style("whitegrid")
plt.rcParams.update({
    'figure.dpi': 300,
    'savefig.dpi': 300,
    'font.size': 10,
})
|
|
||||||
|
|
||||||
class FastDNSAnalyzer:
    """Loads per-query DNS result CSVs, enriches them with bandwidth data
    extracted from matching PCAP captures, and renders plots and tables."""

    def __init__(self, results_dir='results'):
        # Root directory holding one sub-directory per DNS provider.
        self.results_dir = Path(results_dir)
        # One DataFrame per (provider, test) CSV; filled by load_data().
        self.all_data = []
|
|
||||||
|
|
||||||
def should_include_file(self, filename):
    """Return True when a result file belongs in this analysis.

    DNSSEC runs (stems containing 'auth' or 'trust') and the bare
    non-persistent 'tls'/'https' runs are excluded.
    """
    stem = filename.stem
    if 'auth' in stem or 'trust' in stem:
        return False
    return stem not in ('tls', 'https')
|
|
||||||
|
|
||||||
def parse_rfc3339_nano(self, timestamp_str):
    """Convert an RFC3339Nano timestamp string to a UTC Unix epoch float.

    Returns None (after logging the problem) when parsing fails.
    """
    try:
        parsed = date_parser.parse(timestamp_str)
        return parsed.astimezone(datetime.timezone.utc).timestamp()
    except Exception as e:
        print(f" Error parsing timestamp {timestamp_str}: {e}")
        return None
|
|
||||||
|
|
||||||
def extract_bandwidth_from_pcap_fast(self, pcap_file, csv_data):
    """Fast bandwidth extraction using dpkt.

    Attributes each captured packet to the query whose [start, end] time
    window contains it, and returns per-query byte/packet counts (or None
    on failure).
    """
    print(f" Analyzing pcap: {pcap_file.name}")

    try:
        with open(pcap_file, 'rb') as f:
            reader = dpkt.pcap.Reader(f)

            # One time window per query row: [start, start + duration].
            windows = []
            for idx, row in csv_data.iterrows():
                begin = self.parse_rfc3339_nano(row['timestamp'])
                if begin is None:
                    continue

                windows.append({
                    'index': idx,
                    'start': begin,
                    'end': begin + row['duration_ns'] / 1_000_000_000,
                    'bytes_sent': 0,
                    'bytes_received': 0,
                    'packets_sent': 0,
                    'packets_received': 0,
                })

            if not windows:
                print(" ✗ No valid query windows")
                return None

            # Sorting by start time lets the per-packet scan stop early.
            windows.sort(key=lambda w: w['start'])

            seen = 0
            matched = 0

            for ts, frame in reader:
                seen += 1
                size = len(frame)

                try:
                    eth = dpkt.ethernet.Ethernet(frame)

                    # Accept IPv4 or IPv6; skip anything else (ARP, ...).
                    if isinstance(eth.data, (dpkt.ip.IP, dpkt.ip6.IP6)):
                        ip = eth.data
                    else:
                        continue

                    # Only UDP and TCP carry the DNS traffic measured here.
                    if isinstance(ip.data, (dpkt.udp.UDP, dpkt.tcp.TCP)):
                        src_port = ip.data.sport
                        dst_port = ip.data.dport
                    else:
                        continue

                    # Direction heuristic: the client side normally uses the
                    # higher (ephemeral) port, so src > dst means outbound.
                    # NOTE(review): assumes the resolver listens on a
                    # well-known low port — confirm for nonstandard setups.
                    outbound = src_port > dst_port

                    # Linear scan over the start-sorted windows; the early
                    # break below keeps it cheap (not a true binary search).
                    for window in windows:
                        if window['start'] <= ts <= window['end']:
                            if outbound:
                                window['bytes_sent'] += size
                                window['packets_sent'] += 1
                            else:
                                window['bytes_received'] += size
                                window['packets_received'] += 1
                            matched += 1
                            break
                        elif ts < window['start']:
                            break  # Sorted: no later window can match.

                except Exception:
                    continue

            print(f" ✓ Processed {seen} packets, matched {matched}")

            result = pd.DataFrame(windows)
            return result[['index', 'bytes_sent', 'bytes_received',
                           'packets_sent', 'packets_received']]

    except Exception as e:
        print(f" ✗ Error reading pcap: {e}")
        return None
|
|
||||||
|
|
||||||
def load_data(self):
    """Load all relevant CSV files and extract bandwidth from pcaps.

    Appends one annotated DataFrame per accepted CSV to self.all_data.
    """
    print("Loading data and analyzing bandwidth...")

    for provider_dir in self.results_dir.iterdir():
        if not provider_dir.is_dir():
            continue

        provider = provider_dir.name

        for csv_file in provider_dir.glob('*.csv'):
            if not self.should_include_file(csv_file):
                continue

            try:
                frame = pd.read_csv(csv_file)
                frame['provider'] = provider
                frame['test_file'] = csv_file.stem
                frame['csv_path'] = str(csv_file)

                # A capture sharing the CSV's stem holds the raw packets.
                pcap_file = csv_file.with_suffix('.pcap')
                if pcap_file.exists():
                    print(f" Processing: {provider}/{csv_file.name}")
                    bandwidth = self.extract_bandwidth_from_pcap_fast(pcap_file, frame)

                    if bandwidth is not None and len(bandwidth) > 0:
                        # Start all counters at zero, then copy matched
                        # per-query counts back onto the CSV rows.
                        frame = frame.reset_index(drop=True)
                        for col in ['bytes_sent', 'bytes_received', 'packets_sent', 'packets_received']:
                            frame[col] = 0

                        for _, bw_row in bandwidth.iterrows():
                            pos = int(bw_row['index'])
                            if pos < len(frame):
                                frame.at[pos, 'bytes_sent'] = bw_row['bytes_sent']
                                frame.at[pos, 'bytes_received'] = bw_row['bytes_received']
                                frame.at[pos, 'packets_sent'] = bw_row['packets_sent']
                                frame.at[pos, 'packets_received'] = bw_row['packets_received']

                        frame['total_bytes'] = frame['bytes_sent'] + frame['bytes_received']

                        print(f" ✓ Extracted bandwidth for {len(frame)} queries")
                    else:
                        print(f" ⚠ Could not extract bandwidth data")
                else:
                    print(f" ⚠ No pcap found for {csv_file.name}")

                self.all_data.append(frame)

            except Exception as e:
                print(f" ✗ Error loading {csv_file}: {e}")
                import traceback
                traceback.print_exc()

    print(f"\nTotal files loaded: {len(self.all_data)}")
|
|
||||||
|
|
||||||
def create_line_graphs(self, output_dir='output/line_graphs'):
    """Create line graphs for latency and bandwidth.

    Writes one PNG per loaded (provider, test) DataFrame into output_dir,
    named '{provider}_{test}.png'.
    """
    Path(output_dir).mkdir(parents=True, exist_ok=True)

    print("\nGenerating line graphs...")

    for df in self.all_data:
        provider = df['provider'].iloc[0]
        test_name = df['test_file'].iloc[0]

        # 1-based query sequence for the x-axis.
        df['query_index'] = range(1, len(df) + 1)

        # Create figure with 2 subplots: latency on top, bandwidth below.
        fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(14, 10))

        # Plot 1: per-query latency with its mean as a reference line.
        ax1.plot(df['query_index'], df['duration_ms'], marker='o',
                 markersize=4, linewidth=1, alpha=0.7, color='steelblue')
        mean_latency = df['duration_ms'].mean()
        ax1.axhline(y=mean_latency, color='r', linestyle='--',
                    label=f'Mean: {mean_latency:.2f} ms', linewidth=2)
        ax1.set_xlabel('Query Number', fontsize=12)
        ax1.set_ylabel('Latency (ms)', fontsize=12)
        ax1.set_title('Latency Over Time', fontsize=12, fontweight='bold')
        ax1.legend()
        ax1.grid(True, alpha=0.3)

        # Plot 2: bytes sent/received per query, only when the pcap
        # enrichment produced data (column present and non-zero).
        if 'total_bytes' in df.columns and df['total_bytes'].sum() > 0:
            ax2.plot(df['query_index'], df['bytes_sent'], marker='s',
                     markersize=4, linewidth=1, alpha=0.7,
                     color='orange', label='Sent')
            ax2.plot(df['query_index'], df['bytes_received'], marker='^',
                     markersize=4, linewidth=1, alpha=0.7,
                     color='green', label='Received')

            mean_sent = df['bytes_sent'].mean()
            mean_received = df['bytes_received'].mean()
            ax2.axhline(y=mean_sent, color='orange', linestyle='--',
                        linewidth=1.5, alpha=0.5)
            ax2.axhline(y=mean_received, color='green', linestyle='--',
                        linewidth=1.5, alpha=0.5)

            ax2.set_xlabel('Query Number', fontsize=12)
            ax2.set_ylabel('Bytes', fontsize=12)
            ax2.set_title(f'Bandwidth Over Time (Mean: ↑{mean_sent:.0f}B ↓{mean_received:.0f}B)',
                          fontsize=12, fontweight='bold')
            ax2.legend()
            ax2.grid(True, alpha=0.3)

        fig.suptitle(f'{provider.upper()} - {test_name}',
                     fontsize=14, fontweight='bold')
        plt.tight_layout()

        # BUG FIX: the save path previously used a literal '(unknown)'
        # placeholder instead of the computed per-test filename, so every
        # figure overwrote the same file. Use the real filename.
        filename = f"{provider}_{test_name}.png"
        plt.savefig(f'{output_dir}/{filename}', bbox_inches='tight')
        plt.close()

        print(f" ✓ Created: {filename}")
|
|
||||||
|
|
||||||
def get_protocol_name(self, test_file):
    """Map a test-file stem (e.g. 'tls-persist') to a display name."""
    display_names = {
        'udp': 'Plain DNS (UDP)',
        'tls': 'DoT (DNS over TLS)',
        'https': 'DoH (DNS over HTTPS)',
        'doh3': 'DoH/3 (DNS over HTTP/3)',
        'doq': 'DoQ (DNS over QUIC)',
    }
    key = test_file.replace('-persist', '')
    # Unknown protocols fall back to their upper-cased stem.
    return display_names.get(key, key.upper())
|
|
||||||
|
|
||||||
def create_resolver_comparison_bars(self, output_dir='output/comparisons'):
    """Create bar graphs comparing resolvers for latency and bandwidth."""
    Path(output_dir).mkdir(parents=True, exist_ok=True)

    print("\nGenerating resolver comparison graphs...")

    combined = pd.concat(self.all_data, ignore_index=True)

    for protocol in combined['test_file'].unique():
        subset = combined[combined['test_file'] == protocol]
        display_name = self.get_protocol_name(protocol)

        # --- Latency: mean (with std error bars) and median per resolver ---
        latency = subset.groupby('provider')['duration_ms'].agg([
            ('mean', 'mean'),
            ('median', 'median'),
            ('std', 'std'),
        ]).reset_index()

        fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 6))
        fig.suptitle(f'{display_name} - Latency Comparison',
                     fontsize=16, fontweight='bold')

        mean_bars = ax1.bar(latency['provider'], latency['mean'],
                            color='steelblue', alpha=0.8, edgecolor='black')
        ax1.errorbar(latency['provider'], latency['mean'],
                     yerr=latency['std'], fmt='none', color='black',
                     capsize=5, alpha=0.6)

        # Annotate each bar with its value.
        for bar in mean_bars:
            top = bar.get_height()
            ax1.text(bar.get_x() + bar.get_width()/2., top,
                     f'{top:.2f}',
                     ha='center', va='bottom', fontweight='bold')

        ax1.set_xlabel('Resolver', fontsize=12)
        ax1.set_ylabel('Mean Latency (ms)', fontsize=12)
        ax1.set_title('Mean Latency', fontsize=12)
        ax1.grid(axis='y', alpha=0.3)

        median_bars = ax2.bar(latency['provider'], latency['median'],
                              color='coral', alpha=0.8, edgecolor='black')

        for bar in median_bars:
            top = bar.get_height()
            ax2.text(bar.get_x() + bar.get_width()/2., top,
                     f'{top:.2f}',
                     ha='center', va='bottom', fontweight='bold')

        ax2.set_xlabel('Resolver', fontsize=12)
        ax2.set_ylabel('Median Latency (ms)', fontsize=12)
        ax2.set_title('Median Latency', fontsize=12)
        ax2.grid(axis='y', alpha=0.3)

        plt.tight_layout()
        plt.savefig(f'{output_dir}/latency_{protocol}.png', bbox_inches='tight')
        plt.close()
        print(f" ✓ Created: latency_{protocol}.png")

        # --- Bandwidth: sent vs received, plus totals, per resolver ---
        if 'total_bytes' in subset.columns and subset['total_bytes'].sum() > 0:
            bw = subset.groupby('provider').agg({
                'bytes_sent': 'mean',
                'bytes_received': 'mean',
                'total_bytes': 'mean'
            }).reset_index()

            fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 6))
            fig.suptitle(f'{display_name} - Bandwidth Comparison',
                         fontsize=16, fontweight='bold')

            positions = np.arange(len(bw))
            width = 0.35

            ax1.bar(positions - width/2, bw['bytes_sent'], width,
                    label='Sent', color='orange', alpha=0.8, edgecolor='black')
            ax1.bar(positions + width/2, bw['bytes_received'], width,
                    label='Received', color='green', alpha=0.8, edgecolor='black')

            ax1.set_xlabel('Resolver', fontsize=12)
            ax1.set_ylabel('Bytes per Query', fontsize=12)
            ax1.set_title('Average Bandwidth per Query', fontsize=12)
            ax1.set_xticks(positions)
            ax1.set_xticklabels(bw['provider'])
            ax1.legend()
            ax1.grid(axis='y', alpha=0.3)

            total_bars = ax2.bar(bw['provider'], bw['total_bytes'],
                                 color='purple', alpha=0.8, edgecolor='black')

            for bar in total_bars:
                top = bar.get_height()
                ax2.text(bar.get_x() + bar.get_width()/2., top,
                         f'{top:.0f}',
                         ha='center', va='bottom', fontweight='bold')

            ax2.set_xlabel('Resolver', fontsize=12)
            ax2.set_ylabel('Total Bytes per Query', fontsize=12)
            ax2.set_title('Total Bandwidth per Query', fontsize=12)
            ax2.grid(axis='y', alpha=0.3)

            plt.tight_layout()
            plt.savefig(f'{output_dir}/bandwidth_{protocol}.png', bbox_inches='tight')
            plt.close()
            print(f" ✓ Created: bandwidth_{protocol}.png")
|
|
||||||
|
|
||||||
def generate_latex_tables(self, output_dir='output/tables'):
    """Generate LaTeX tables with latency and bandwidth statistics.

    Writes, under output_dir:
      * {provider}_latency.tex   - latency stats per protocol
      * {provider}_bandwidth.tex - bandwidth stats per protocol (if any)
      * protocol_efficiency.tex  - protocol overhead vs the UDP baseline
      * comparison_{metric}.tex  - resolver-vs-protocol pivot per metric
    """
    Path(output_dir).mkdir(parents=True, exist_ok=True)

    print("\nGenerating LaTeX tables...")

    combined_df = pd.concat(self.all_data, ignore_index=True)

    # --- Per-provider latency tables ---
    for provider in combined_df['provider'].unique():
        provider_data = combined_df[combined_df['provider'] == provider]

        stats = provider_data.groupby('test_file')['duration_ms'].agg([
            ('Mean', 'mean'),
            ('Median', 'median'),
            ('Std Dev', 'std'),
            ('P95', lambda x: x.quantile(0.95)),
            ('P99', lambda x: x.quantile(0.99))
        ]).round(2)

        stats.index = stats.index.map(self.get_protocol_name)
        stats.index.name = 'Protocol'

        latex_code = stats.to_latex(
            caption=f'{provider.upper()} - Latency Statistics (ms)',
            label=f'tab:{provider}_latency',
            float_format="%.2f"
        )

        with open(f'{output_dir}/{provider}_latency.tex', 'w') as f:
            f.write(latex_code)

        print(f" ✓ Created: {provider}_latency.tex")

    # --- Per-provider bandwidth tables (skipped without pcap data) ---
    for provider in combined_df['provider'].unique():
        provider_data = combined_df[combined_df['provider'] == provider]

        if 'total_bytes' not in provider_data.columns or provider_data['total_bytes'].sum() == 0:
            continue

        bandwidth_stats = provider_data.groupby('test_file').agg({
            'bytes_sent': 'mean',
            'bytes_received': 'mean',
            'total_bytes': 'mean'
        }).round(2)

        bandwidth_stats.columns = ['Avg Sent (B)', 'Avg Received (B)', 'Avg Total (B)']
        bandwidth_stats.index = bandwidth_stats.index.map(self.get_protocol_name)
        bandwidth_stats.index.name = 'Protocol'

        latex_code = bandwidth_stats.to_latex(
            caption=f'{provider.upper()} - Bandwidth Statistics',
            label=f'tab:{provider}_bandwidth',
            float_format="%.2f"
        )

        with open(f'{output_dir}/{provider}_bandwidth.tex', 'w') as f:
            f.write(latex_code)

        print(f" ✓ Created: {provider}_bandwidth.tex")

    # --- Protocol efficiency vs the UDP baseline ---
    print("\nGenerating protocol efficiency table...")

    if 'total_bytes' in combined_df.columns and combined_df['total_bytes'].sum() > 0:
        protocol_bandwidth = combined_df.groupby('test_file').agg({
            'bytes_sent': 'mean',
            'bytes_received': 'mean',
            'total_bytes': 'mean'
        }).round(2)

        # The first test file containing 'udp' serves as the baseline.
        udp_baseline = None
        for protocol in protocol_bandwidth.index:
            if 'udp' in protocol:
                udp_baseline = protocol_bandwidth.loc[protocol, 'total_bytes']
                break

        if udp_baseline and udp_baseline > 0:
            protocol_bandwidth['Overhead vs UDP (%)'] = (
                (protocol_bandwidth['total_bytes'] - udp_baseline) / udp_baseline * 100
            ).round(1)
            protocol_bandwidth['Efficiency (%)'] = (
                100 / (1 + protocol_bandwidth['Overhead vs UDP (%)'] / 100)
            ).round(1)

            protocol_bandwidth.columns = ['Avg Sent (B)', 'Avg Received (B)',
                                          'Avg Total (B)', 'Overhead (%)', 'Efficiency (%)']
            protocol_bandwidth.index = protocol_bandwidth.index.map(self.get_protocol_name)
            protocol_bandwidth.index.name = 'Protocol'

            latex_code = protocol_bandwidth.to_latex(
                caption='Protocol Bandwidth Efficiency Comparison',
                label='tab:protocol_efficiency',
                float_format="%.2f"
            )

            with open(f'{output_dir}/protocol_efficiency.tex', 'w') as f:
                f.write(latex_code)

            print(f" ✓ Created: protocol_efficiency.tex")
            print("\n--- Protocol Efficiency ---")
            print(protocol_bandwidth.to_string())

    # --- Resolver-vs-protocol pivots, one table per headline metric ---
    # PERF FIX: this aggregation is loop-invariant; the original recomputed
    # the identical groupby/agg inside each iteration of the metric loop.
    comparison_stats = combined_df.groupby(['provider', 'test_file'])['duration_ms'].agg([
        ('Mean', 'mean'),
        ('Median', 'median'),
        ('P95', lambda x: x.quantile(0.95))
    ]).round(2)

    for metric in ['Mean', 'Median', 'P95']:
        pivot_table = comparison_stats[metric].unstack(level=0)
        pivot_table.index = pivot_table.index.map(self.get_protocol_name)
        pivot_table.index.name = 'Protocol'

        latex_code = pivot_table.to_latex(
            caption=f'Resolver Latency Comparison - {metric} (ms)',
            label=f'tab:comparison_{metric.lower()}',
            float_format="%.2f"
        )

        with open(f'{output_dir}/comparison_{metric.lower()}.tex', 'w') as f:
            f.write(latex_code)

        print(f" ✓ Created: comparison_{metric.lower()}.tex")
|
|
||||||
|
|
||||||
def run_analysis(self):
    """Run the complete analysis"""
    banner = "=" * 80
    print(banner)
    print("Fast DNS QoS Analysis with Bandwidth")
    print(banner)

    self.load_data()

    # Nothing to analyze without at least one loaded DataFrame.
    if not self.all_data:
        print("\n⚠ No data loaded.")
        return

    # Each stage prints its own progress under a fresh separator.
    for stage in (self.create_line_graphs,
                  self.create_resolver_comparison_bars,
                  self.generate_latex_tables):
        print("\n" + banner)
        stage()

    print("\n" + banner)
    print("✓ Analysis Complete!")
    print(banner)
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
    # Entry point: run the fast analyzer over ./results.
    FastDNSAnalyzer(results_dir='results').run_analysis()
|
|
||||||
+55
-12
@@ -15,6 +15,15 @@ import dpkt
|
|||||||
from dateutil import parser as date_parser
|
from dateutil import parser as date_parser
|
||||||
|
|
||||||
|
|
||||||
|
# CSV columns added by the PCAP bandwidth-enrichment step, in the order
# they are appended to each output row.
BANDWIDTH_COLUMNS = [
    'bytes_sent',
    'bytes_received',
    'packets_sent',
    'packets_received',
    'total_bytes',
]
|
||||||
|
|
||||||
|
|
||||||
class Packet(NamedTuple):
|
class Packet(NamedTuple):
|
||||||
"""Lightweight packet representation."""
|
"""Lightweight packet representation."""
|
||||||
timestamp: float
|
timestamp: float
|
||||||
@@ -36,6 +45,36 @@ class QueryWindow:
|
|||||||
self.pkts_received = 0
|
self.pkts_received = 0
|
||||||
|
|
||||||
|
|
||||||
|
def is_already_processed(csv_path: Path) -> bool:
    """
    Check if CSV has already been processed.
    Returns True if bandwidth columns exist AND at least one row has non-zero data.
    """
    try:
        with open(csv_path, 'r', encoding='utf-8') as f:
            reader = csv.DictReader(f)

            header = reader.fieldnames
            if not header:
                return False

            # Missing any bandwidth column means enrichment never ran.
            if any(col not in header for col in BANDWIDTH_COLUMNS):
                return False

            # Enriched-but-empty files (all zero/blank) still need work.
            for row in reader:
                for col in BANDWIDTH_COLUMNS:
                    if row.get(col, '').strip() not in ('', '0'):
                        return True

            return False

    except Exception:
        # Unreadable/malformed files are treated as unprocessed.
        return False
|
||||||
|
|
||||||
|
|
||||||
def parse_csv_timestamp(ts_str: str) -> float:
|
def parse_csv_timestamp(ts_str: str) -> float:
|
||||||
"""Convert RFC3339Nano timestamp to Unix epoch (seconds)."""
|
"""Convert RFC3339Nano timestamp to Unix epoch (seconds)."""
|
||||||
dt = date_parser.isoparse(ts_str)
|
dt = date_parser.isoparse(ts_str)
|
||||||
@@ -249,24 +288,20 @@ def write_enriched_csv(
|
|||||||
shutil.copy2(csv_path, backup_path)
|
shutil.copy2(csv_path, backup_path)
|
||||||
print(f" Backup: {backup_path.name}")
|
print(f" Backup: {backup_path.name}")
|
||||||
|
|
||||||
# Get fieldnames
|
# Get fieldnames - filter out any existing bandwidth columns to avoid dupes
|
||||||
original_fields = list(queries[0]['data'].keys())
|
original_fields = [
|
||||||
new_fields = [
|
f for f in queries[0]['data'].keys()
|
||||||
'bytes_sent',
|
if f not in BANDWIDTH_COLUMNS
|
||||||
'bytes_received',
|
|
||||||
'packets_sent',
|
|
||||||
'packets_received',
|
|
||||||
'total_bytes',
|
|
||||||
]
|
]
|
||||||
fieldnames = original_fields + new_fields
|
fieldnames = original_fields + BANDWIDTH_COLUMNS
|
||||||
|
|
||||||
with open(csv_path, 'w', encoding='utf-8', newline='') as f:
|
with open(csv_path, 'w', encoding='utf-8', newline='') as f:
|
||||||
writer = csv.DictWriter(f, fieldnames=fieldnames)
|
writer = csv.DictWriter(f, fieldnames=fieldnames)
|
||||||
writer.writeheader()
|
writer.writeheader()
|
||||||
|
|
||||||
for q in queries:
|
for q in queries:
|
||||||
row = q['data'].copy()
|
row = {k: v for k, v in q['data'].items() if k not in BANDWIDTH_COLUMNS}
|
||||||
for field in new_fields:
|
for field in BANDWIDTH_COLUMNS:
|
||||||
row[field] = q[field]
|
row[field] = q[field]
|
||||||
writer.writerow(row)
|
writer.writerow(row)
|
||||||
|
|
||||||
@@ -281,6 +316,7 @@ def process_provider_directory(provider_path: Path):
|
|||||||
|
|
||||||
csv_files = sorted(provider_path.glob('*.csv'))
|
csv_files = sorted(provider_path.glob('*.csv'))
|
||||||
processed = 0
|
processed = 0
|
||||||
|
skipped = 0
|
||||||
total_time = 0
|
total_time = 0
|
||||||
|
|
||||||
for csv_path in csv_files:
|
for csv_path in csv_files:
|
||||||
@@ -294,6 +330,12 @@ def process_provider_directory(provider_path: Path):
|
|||||||
print(f"\n ⚠ Skipping {csv_path.name} - no matching PCAP")
|
print(f"\n ⚠ Skipping {csv_path.name} - no matching PCAP")
|
||||||
continue
|
continue
|
||||||
|
|
||||||
|
# Check if already processed
|
||||||
|
if is_already_processed(csv_path):
|
||||||
|
print(f"\n ⏭ Skipping {csv_path.name} - already processed")
|
||||||
|
skipped += 1
|
||||||
|
continue
|
||||||
|
|
||||||
print(f"\n 📁 {csv_path.name}")
|
print(f"\n 📁 {csv_path.name}")
|
||||||
file_start = time.time()
|
file_start = time.time()
|
||||||
|
|
||||||
@@ -323,7 +365,8 @@ def process_provider_directory(provider_path: Path):
|
|||||||
print(f" ✓ Completed in {file_time:.2f}s")
|
print(f" ✓ Completed in {file_time:.2f}s")
|
||||||
|
|
||||||
print(f"\n {'='*58}")
|
print(f"\n {'='*58}")
|
||||||
print(f" {provider_path.name}: {processed} files in {total_time:.2f}s")
|
print(f" {provider_path.name}: {processed} processed, {skipped} skipped")
|
||||||
|
print(f" Time: {total_time:.2f}s")
|
||||||
print(f" {'='*58}")
|
print(f" {'='*58}")
|
||||||
|
|
||||||
|
|
||||||
@@ -0,0 +1,207 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
Merge all DNS test CSVs into a single unified CSV.
|
||||||
|
Extracts metadata from filenames and directory structure.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import csv
|
||||||
|
import os
|
||||||
|
from pathlib import Path
|
||||||
|
from dateutil import parser as date_parser
|
||||||
|
import argparse
|
||||||
|
|
||||||
|
|
||||||
|
def parse_config(filename: str) -> dict:
|
||||||
|
"""
|
||||||
|
Parse protocol, dnssec_mode, and keep_alive from filename.
|
||||||
|
|
||||||
|
Examples:
|
||||||
|
doh3-auth.csv → protocol=doh3, dnssec=auth, persist=0
|
||||||
|
tls-trust-persist.csv → protocol=tls, dnssec=trust, persist=1
|
||||||
|
https.csv → protocol=https, dnssec=off, persist=0
|
||||||
|
doudp-auth.csv → protocol=doudp, dnssec=auth, persist=0
|
||||||
|
dnscrypt-trust.csv → protocol=dnscrypt, dnssec=trust, persist=0
|
||||||
|
"""
|
||||||
|
base = filename.replace('.csv', '')
|
||||||
|
parts = base.split('-')
|
||||||
|
|
||||||
|
protocol = parts[0]
|
||||||
|
dnssec_mode = 'off'
|
||||||
|
keep_alive = 0
|
||||||
|
|
||||||
|
for part in parts[1:]:
|
||||||
|
if part in ('auth', 'trust'):
|
||||||
|
dnssec_mode = part
|
||||||
|
elif part == 'persist':
|
||||||
|
keep_alive = 1
|
||||||
|
|
||||||
|
return {
|
||||||
|
'protocol': protocol,
|
||||||
|
'dnssec_mode': dnssec_mode,
|
||||||
|
'keep_alive': keep_alive,
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def parse_timestamp_unix(ts_str: str) -> float:
|
||||||
|
"""Convert RFC3339 timestamp to Unix epoch."""
|
||||||
|
try:
|
||||||
|
dt = date_parser.isoparse(ts_str)
|
||||||
|
return dt.timestamp()
|
||||||
|
except Exception:
|
||||||
|
return 0.0
|
||||||
|
|
||||||
|
|
||||||
|
def ns_to_ms(duration_ns: str) -> float:
|
||||||
|
"""Convert nanoseconds to milliseconds."""
|
||||||
|
try:
|
||||||
|
return float(duration_ns) / 1_000_000
|
||||||
|
except (ValueError, TypeError):
|
||||||
|
return 0.0
|
||||||
|
|
||||||
|
|
||||||
|
def find_csv_files(input_dir: Path) -> list:
|
||||||
|
"""Find all non-backup CSV files."""
|
||||||
|
files = []
|
||||||
|
for csv_path in input_dir.rglob('*.csv'):
|
||||||
|
if '.bak' in csv_path.name:
|
||||||
|
continue
|
||||||
|
files.append(csv_path)
|
||||||
|
return sorted(files)
|
||||||
|
|
||||||
|
|
||||||
|
def merge_all_csvs(input_dir: Path, output_path: Path):
|
||||||
|
"""Merge all CSVs into a single file."""
|
||||||
|
|
||||||
|
csv_files = find_csv_files(input_dir)
|
||||||
|
|
||||||
|
if not csv_files:
|
||||||
|
print("No CSV files found")
|
||||||
|
return
|
||||||
|
|
||||||
|
print(f"Found {len(csv_files)} CSV files")
|
||||||
|
|
||||||
|
# Output columns in desired order
|
||||||
|
output_columns = [
|
||||||
|
'id',
|
||||||
|
'provider',
|
||||||
|
'protocol',
|
||||||
|
'dnssec_mode',
|
||||||
|
'domain',
|
||||||
|
'query_type',
|
||||||
|
'keep_alive',
|
||||||
|
'dns_server',
|
||||||
|
'timestamp',
|
||||||
|
'timestamp_unix',
|
||||||
|
'duration_ns',
|
||||||
|
'duration_ms',
|
||||||
|
'request_size_bytes',
|
||||||
|
'response_size_bytes',
|
||||||
|
'bytes_sent',
|
||||||
|
'bytes_received',
|
||||||
|
'packets_sent',
|
||||||
|
'packets_received',
|
||||||
|
'total_bytes',
|
||||||
|
'response_code',
|
||||||
|
'error',
|
||||||
|
]
|
||||||
|
|
||||||
|
global_id = 0
|
||||||
|
total_rows = 0
|
||||||
|
|
||||||
|
with open(output_path, 'w', newline='', encoding='utf-8') as outfile:
|
||||||
|
writer = csv.DictWriter(outfile, fieldnames=output_columns)
|
||||||
|
writer.writeheader()
|
||||||
|
|
||||||
|
for csv_path in csv_files:
|
||||||
|
# Extract provider from path
|
||||||
|
provider = csv_path.parent.name.lower()
|
||||||
|
|
||||||
|
# Parse config from filename
|
||||||
|
config = parse_config(csv_path.name)
|
||||||
|
|
||||||
|
print(f" {provider}/{csv_path.name} ({config['protocol']}, {config['dnssec_mode']}, persist={config['keep_alive']})")
|
||||||
|
|
||||||
|
file_rows = 0
|
||||||
|
|
||||||
|
with open(csv_path, 'r', newline='', encoding='utf-8') as infile:
|
||||||
|
reader = csv.DictReader(infile)
|
||||||
|
|
||||||
|
for row in reader:
|
||||||
|
global_id += 1
|
||||||
|
file_rows += 1
|
||||||
|
|
||||||
|
# Build output row
|
||||||
|
out_row = {
|
||||||
|
'id': global_id,
|
||||||
|
'provider': provider,
|
||||||
|
'protocol': config['protocol'],
|
||||||
|
'dnssec_mode': config['dnssec_mode'],
|
||||||
|
'keep_alive': config['keep_alive'],
|
||||||
|
'domain': row.get('domain', ''),
|
||||||
|
'query_type': row.get('query_type', ''),
|
||||||
|
'dns_server': row.get('dns_server', ''),
|
||||||
|
'timestamp': row.get('timestamp', ''),
|
||||||
|
'timestamp_unix': parse_timestamp_unix(row.get('timestamp', '')),
|
||||||
|
'duration_ns': row.get('duration_ns', ''),
|
||||||
|
'duration_ms': ns_to_ms(row.get('duration_ns', '')),
|
||||||
|
'request_size_bytes': row.get('request_size_bytes', ''),
|
||||||
|
'response_size_bytes': row.get('response_size_bytes', ''),
|
||||||
|
'bytes_sent': row.get('bytes_sent', ''),
|
||||||
|
'bytes_received': row.get('bytes_received', ''),
|
||||||
|
'packets_sent': row.get('packets_sent', ''),
|
||||||
|
'packets_received': row.get('packets_received', ''),
|
||||||
|
'total_bytes': row.get('total_bytes', ''),
|
||||||
|
'response_code': row.get('response_code', ''),
|
||||||
|
'error': row.get('error', ''),
|
||||||
|
}
|
||||||
|
|
||||||
|
writer.writerow(out_row)
|
||||||
|
|
||||||
|
total_rows += file_rows
|
||||||
|
print(f" → {file_rows:,} rows")
|
||||||
|
|
||||||
|
print(f"\n{'='*60}")
|
||||||
|
print(f"Output: {output_path}")
|
||||||
|
print(f"Total rows: {total_rows:,}")
|
||||||
|
print(f"{'='*60}")
|
||||||
|
|
||||||
|
|
||||||
|
def main():
|
||||||
|
parser = argparse.ArgumentParser(
|
||||||
|
description='Merge all DNS test CSVs into a single file'
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
'input_dir',
|
||||||
|
nargs='?',
|
||||||
|
default='.',
|
||||||
|
help='Input directory containing provider folders (default: .)'
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
'-o', '--output',
|
||||||
|
default='dns_results.csv',
|
||||||
|
help='Output CSV path (default: dns_results.csv)'
|
||||||
|
)
|
||||||
|
|
||||||
|
args = parser.parse_args()
|
||||||
|
|
||||||
|
input_dir = Path(args.input_dir)
|
||||||
|
output_path = Path(args.output)
|
||||||
|
|
||||||
|
if not input_dir.exists():
|
||||||
|
print(f"Error: Input directory not found: {input_dir}")
|
||||||
|
return 1
|
||||||
|
|
||||||
|
print("="*60)
|
||||||
|
print("MERGE ALL DNS CSVs")
|
||||||
|
print("="*60)
|
||||||
|
print(f"Input: {input_dir}")
|
||||||
|
print(f"Output: {output_path}")
|
||||||
|
print()
|
||||||
|
|
||||||
|
merge_all_csvs(input_dir, output_path)
|
||||||
|
|
||||||
|
return 0
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
|
||||||
|
exit(main())
|
||||||
Executable
+253
@@ -0,0 +1,253 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
|
||||||
|
# Exit on error
|
||||||
|
set -e
|
||||||
|
|
||||||
|
# Default values
|
||||||
|
TOOL_PATH="./qol"
|
||||||
|
DOMAINS_FILE="./domains.txt"
|
||||||
|
OUTPUT_DIR="./results"
|
||||||
|
INTERFACE="veth1"
|
||||||
|
TIMEOUT="5s"
|
||||||
|
SLEEP_TIME="1"
|
||||||
|
|
||||||
|
# Parse arguments
|
||||||
|
while [[ $# -gt 0 ]]; do
|
||||||
|
case $1 in
|
||||||
|
-t|--tool-path)
|
||||||
|
TOOL_PATH="$2"
|
||||||
|
shift 2
|
||||||
|
;;
|
||||||
|
-d|--domains-file)
|
||||||
|
DOMAINS_FILE="$2"
|
||||||
|
shift 2
|
||||||
|
;;
|
||||||
|
-o|--output-dir)
|
||||||
|
OUTPUT_DIR="$2"
|
||||||
|
shift 2
|
||||||
|
;;
|
||||||
|
-I|--interface)
|
||||||
|
INTERFACE="$2"
|
||||||
|
shift 2
|
||||||
|
;;
|
||||||
|
-T|--timeout)
|
||||||
|
TIMEOUT="$2"
|
||||||
|
shift 2
|
||||||
|
;;
|
||||||
|
-s|--sleep)
|
||||||
|
SLEEP_TIME="$2"
|
||||||
|
shift 2
|
||||||
|
;;
|
||||||
|
--help)
|
||||||
|
echo "Usage: $0 [OPTIONS]"
|
||||||
|
echo ""
|
||||||
|
echo "Options:"
|
||||||
|
echo " -t, --tool-path PATH Path to qol tool (default: ./qol)"
|
||||||
|
echo " -d, --domains-file PATH Path to domains file (default: ./domains.txt)"
|
||||||
|
echo " -o, --output-dir PATH Output directory (default: ./results)"
|
||||||
|
echo " -I, --interface NAME Network interface (default: veth1)"
|
||||||
|
echo " -T, --timeout DURATION Timeout duration (default: 5s)"
|
||||||
|
echo " -s, --sleep SECONDS Sleep between runs (default: 1)"
|
||||||
|
echo " --help Show this help"
|
||||||
|
exit 0
|
||||||
|
;;
|
||||||
|
*)
|
||||||
|
echo "Unknown option: $1"
|
||||||
|
echo "Use --help for usage information"
|
||||||
|
exit 1
|
||||||
|
;;
|
||||||
|
esac
|
||||||
|
done
|
||||||
|
|
||||||
|
echo "Configuration:"
|
||||||
|
echo " Tool path: $TOOL_PATH"
|
||||||
|
echo " Domains file: $DOMAINS_FILE"
|
||||||
|
echo " Output dir: $OUTPUT_DIR"
|
||||||
|
echo " Interface: $INTERFACE"
|
||||||
|
echo " Timeout: $TIMEOUT"
|
||||||
|
echo " Sleep time: ${SLEEP_TIME}s"
|
||||||
|
echo ""
|
||||||
|
|
||||||
|
# Server definitions as associative arrays (name -> url)
|
||||||
|
declare -A CONN_SERVERS=(
|
||||||
|
["google-dotcp"]="dotcp://8.8.8.8:53"
|
||||||
|
["cloudflare-dotcp"]="dotcp://1.1.1.1:53"
|
||||||
|
["quad9-dotcp"]="dotcp://9.9.9.9:53"
|
||||||
|
["adguard-dotcp"]="dotcp://dns.adguard-dns.com:53"
|
||||||
|
["google-dot"]="tls://8.8.8.8:853"
|
||||||
|
["cloudflare-dot"]="tls://1.1.1.1:853"
|
||||||
|
["quad9-dot"]="tls://9.9.9.9:853"
|
||||||
|
["adguard-dot"]="tls://dns.adguard-dns.com:853"
|
||||||
|
["google-doh"]="https://dns.google/dns-query"
|
||||||
|
["cloudflare-doh"]="https://cloudflare-dns.com/dns-query"
|
||||||
|
["quad9-doh"]="https://dns10.quad9.net/dns-query"
|
||||||
|
["adguard-doh"]="https://dns.adguard-dns.com/dns-query"
|
||||||
|
)
|
||||||
|
|
||||||
|
declare -A QUIC_SERVERS=(
|
||||||
|
["google-doh3"]="doh3://dns.google/dns-query"
|
||||||
|
["cloudflare-doh3"]="doh3://cloudflare-dns.com/dns-query"
|
||||||
|
["adguard-doh3"]="doh3://dns.adguard-dns.com/dns-query"
|
||||||
|
["adguard-doq"]="doq://dns.adguard-dns.com:853"
|
||||||
|
)
|
||||||
|
|
||||||
|
declare -A CONNLESS_SERVERS=(
|
||||||
|
["google-udp"]="udp://8.8.8.8:53"
|
||||||
|
["cloudflare-udp"]="udp://1.1.1.1:53"
|
||||||
|
["quad9-udp"]="udp://9.9.9.9:53"
|
||||||
|
["adguard-udp"]="udp://dns.adguard-dns.com:53"
|
||||||
|
["adguard-dnscrypt"]="sdns://AQMAAAAAAAAAETk0LjE0MC4xNC4xNDo1NDQzINErR_JS3PLCu_iZEIbq95zkSV2LFsigxDIuUso_OQhzIjIuZG5zY3J5cHQuZGVmYXVsdC5uczEuYWRndWFyZC5jb20"
|
||||||
|
["quad9-dnscrypt"]="sdns://AQMAAAAAAAAAFDE0OS4xMTIuMTEyLjExMjo4NDQzIGfIR7jIdYzRICRVQ751Z0bfNN8dhMALjEcDaN-CHYY-GTIuZG5zY3J5cHQtY2VydC5xdWFkOS5uZXQ"
|
||||||
|
)
|
||||||
|
|
||||||
|
# Function to get flags suffix for filename
|
||||||
|
get_flags_suffix() {
|
||||||
|
local dnssec="$1"
|
||||||
|
local auth="$2"
|
||||||
|
local keepalive="$3"
|
||||||
|
|
||||||
|
local suffix=""
|
||||||
|
if [[ "$dnssec" == "true" ]]; then
|
||||||
|
if [[ "$auth" == "true" ]]; then
|
||||||
|
suffix="auth"
|
||||||
|
else
|
||||||
|
suffix="trust"
|
||||||
|
fi
|
||||||
|
fi
|
||||||
|
if [[ "$keepalive" == "true" ]]; then
|
||||||
|
if [[ -n "$suffix" ]]; then
|
||||||
|
suffix="${suffix}-persist"
|
||||||
|
else
|
||||||
|
suffix="persist"
|
||||||
|
fi
|
||||||
|
fi
|
||||||
|
echo "$suffix"
|
||||||
|
}
|
||||||
|
|
||||||
|
# Function to run with perf and capture CPU metrics
|
||||||
|
run_with_perf() {
|
||||||
|
local name="$1"
|
||||||
|
local url="$2"
|
||||||
|
local dnssec="$3"
|
||||||
|
local auth="$4"
|
||||||
|
local keepalive="$5"
|
||||||
|
|
||||||
|
local suffix=$(get_flags_suffix "$dnssec" "$auth" "$keepalive")
|
||||||
|
local base_name="${name}"
|
||||||
|
if [[ -n "$suffix" ]]; then
|
||||||
|
base_name="${name}-${suffix}"
|
||||||
|
fi
|
||||||
|
local cpu_csv_file="$OUTPUT_DIR/${name%%-*}/${base_name}.cpu.csv" # e.g., results/cloudflare/dot-trust-persist.cpu.csv
|
||||||
|
|
||||||
|
# Write header if needed
|
||||||
|
if [[ ! -f "$cpu_csv_file" ]]; then
|
||||||
|
echo "timestamp,wall_time_seconds,instructions,cycles,peak_rss_kb" > "$cpu_csv_file"
|
||||||
|
fi
|
||||||
|
|
||||||
|
# Build command arguments
|
||||||
|
local cmd_args=(
|
||||||
|
"$DOMAINS_FILE"
|
||||||
|
--output-dir "$OUTPUT_DIR"
|
||||||
|
--interface "$INTERFACE"
|
||||||
|
--timeout "$TIMEOUT"
|
||||||
|
-s "$url"
|
||||||
|
)
|
||||||
|
|
||||||
|
if [[ "$dnssec" == "true" ]]; then
|
||||||
|
cmd_args+=(--dnssec)
|
||||||
|
if [[ "$auth" == "true" ]]; then
|
||||||
|
cmd_args+=(--auth-dnssec)
|
||||||
|
fi
|
||||||
|
fi
|
||||||
|
|
||||||
|
if [[ "$keepalive" == "true" ]]; then
|
||||||
|
cmd_args+=(--keep-alive)
|
||||||
|
fi
|
||||||
|
|
||||||
|
# Create temp files for perf and time output
|
||||||
|
local perf_tmp=$(mktemp)
|
||||||
|
local time_tmp=$(mktemp)
|
||||||
|
|
||||||
|
# Run with perf stat and /usr/bin/time
|
||||||
|
local timestamp=$(date -Iseconds)
|
||||||
|
|
||||||
|
sudo perf stat -e instructions,cycles \
|
||||||
|
-o "$perf_tmp" \
|
||||||
|
/usr/bin/time -v \
|
||||||
|
"$TOOL_PATH" run "${cmd_args[@]}" 2>"$time_tmp" || true
|
||||||
|
|
||||||
|
# Parse perf output
|
||||||
|
local instructions=$(grep -oP '\d[\d,]*(?=\s+instructions)' "$perf_tmp" 2>/dev/null | tr -d ',' || echo "0")
|
||||||
|
local cycles=$(grep -oP '\d[\d,]*(?=\s+cycles)' "$perf_tmp" 2>/dev/null | tr -d ',' || echo "0")
|
||||||
|
local wall_time=$(grep -oP '\d+\.\d+(?= seconds time elapsed)' "$perf_tmp" 2>/dev/null || echo "0")
|
||||||
|
|
||||||
|
# Parse /usr/bin/time output for peak RSS
|
||||||
|
local peak_rss=$(grep "Maximum resident set size" "$time_tmp" 2>/dev/null | grep -oP '\d+' || echo "0")
|
||||||
|
|
||||||
|
# Append to CPU CSV
|
||||||
|
echo "${timestamp},${wall_time},${instructions},${cycles},${peak_rss}" >> "$cpu_csv_file"
|
||||||
|
|
||||||
|
# Cleanup
|
||||||
|
rm -f "$perf_tmp" "$time_tmp"
|
||||||
|
|
||||||
|
echo " -> CPU metrics saved to ${base_name}.cpu.csv"
|
||||||
|
}
|
||||||
|
|
||||||
|
# Function to run servers with given flags
|
||||||
|
run_server_group() {
|
||||||
|
local -n servers=$1
|
||||||
|
local dnssec="$2"
|
||||||
|
local auth="$3"
|
||||||
|
local keepalive="$4"
|
||||||
|
local desc="$5"
|
||||||
|
|
||||||
|
echo "Running: $desc"
|
||||||
|
|
||||||
|
for name in "${!servers[@]}"; do
|
||||||
|
local url="${servers[$name]}"
|
||||||
|
echo " Processing: $name ($url)"
|
||||||
|
run_with_perf "$name" "$url" "$dnssec" "$auth" "$keepalive"
|
||||||
|
sleep "$SLEEP_TIME"
|
||||||
|
done
|
||||||
|
}
|
||||||
|
|
||||||
|
echo "=== Running TCP-based protocols (TLS/HTTPS) ==="
|
||||||
|
|
||||||
|
# DNSSEC off, Keep off
|
||||||
|
run_server_group CONN_SERVERS "false" "false" "false" "no-dnssec, no-keepalive"
|
||||||
|
|
||||||
|
# DNSSEC off, Keep on
|
||||||
|
run_server_group CONN_SERVERS "false" "false" "true" "no-dnssec, keepalive"
|
||||||
|
|
||||||
|
# DNSSEC on (trust), Keep on
|
||||||
|
run_server_group CONN_SERVERS "true" "false" "true" "dnssec-trust, keepalive"
|
||||||
|
|
||||||
|
# DNSSEC on (auth), Keep on
|
||||||
|
run_server_group CONN_SERVERS "true" "true" "true" "dnssec-auth, keepalive"
|
||||||
|
|
||||||
|
echo ""
|
||||||
|
echo "=== Running QUIC-based protocols (DoH3/DoQ) ==="
|
||||||
|
|
||||||
|
# DNSSEC off
|
||||||
|
run_server_group QUIC_SERVERS "false" "false" "false" "no-dnssec"
|
||||||
|
|
||||||
|
# DNSSEC on (trust)
|
||||||
|
run_server_group QUIC_SERVERS "true" "false" "false" "dnssec-trust"
|
||||||
|
|
||||||
|
# DNSSEC on (auth)
|
||||||
|
run_server_group QUIC_SERVERS "true" "true" "false" "dnssec-auth"
|
||||||
|
|
||||||
|
echo ""
|
||||||
|
echo "=== Running connectionless protocols (UDP) ==="
|
||||||
|
|
||||||
|
# DNSSEC off
|
||||||
|
run_server_group CONNLESS_SERVERS "false" "false" "false" "no-dnssec"
|
||||||
|
|
||||||
|
# DNSSEC on (trust)
|
||||||
|
run_server_group CONNLESS_SERVERS "true" "false" "false" "dnssec-trust"
|
||||||
|
|
||||||
|
# DNSSEC on (auth)
|
||||||
|
run_server_group CONNLESS_SERVERS "true" "true" "false" "dnssec-auth"
|
||||||
|
|
||||||
|
echo ""
|
||||||
|
echo "All combinations completed!"
|
||||||
@@ -1,369 +0,0 @@
|
|||||||
package main
|
|
||||||
|
|
||||||
import (
|
|
||||||
"encoding/csv"
|
|
||||||
"fmt"
|
|
||||||
"log"
|
|
||||||
"os"
|
|
||||||
"path/filepath"
|
|
||||||
"strconv"
|
|
||||||
"strings"
|
|
||||||
"time"
|
|
||||||
|
|
||||||
"github.com/google/gopacket"
|
|
||||||
"github.com/google/gopacket/layers"
|
|
||||||
"github.com/google/gopacket/pcapgo"
|
|
||||||
)
|
|
||||||
|
|
||||||
type QueryRecord struct {
|
|
||||||
Domain string
|
|
||||||
QueryType string
|
|
||||||
Protocol string
|
|
||||||
DNSSec string
|
|
||||||
AuthDNSSec string
|
|
||||||
KeepAlive string
|
|
||||||
DNSServer string
|
|
||||||
Timestamp string
|
|
||||||
DurationNs int64
|
|
||||||
DurationMs float64
|
|
||||||
RequestSizeBytes int
|
|
||||||
ResponseSizeBytes int
|
|
||||||
ResponseCode string
|
|
||||||
Error string
|
|
||||||
BytesSent int64
|
|
||||||
BytesReceived int64
|
|
||||||
PacketsSent int64
|
|
||||||
PacketsReceived int64
|
|
||||||
TotalBytes int64
|
|
||||||
}
|
|
||||||
|
|
||||||
func parseRFC3339Nano(ts string) (time.Time, error) {
|
|
||||||
return time.Parse(time.RFC3339Nano, ts)
|
|
||||||
}
|
|
||||||
|
|
||||||
func processProviderFolder(providerPath string) error {
|
|
||||||
providerName := filepath.Base(providerPath)
|
|
||||||
fmt.Printf("\n=== Processing provider: %s ===\n", providerName)
|
|
||||||
|
|
||||||
files, err := os.ReadDir(providerPath)
|
|
||||||
if err != nil {
|
|
||||||
return err
|
|
||||||
}
|
|
||||||
|
|
||||||
processed := 0
|
|
||||||
skipped := 0
|
|
||||||
errors := 0
|
|
||||||
|
|
||||||
for _, file := range files {
|
|
||||||
if !strings.HasSuffix(file.Name(), ".csv") {
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
|
|
||||||
csvPath := filepath.Join(providerPath, file.Name())
|
|
||||||
pcapPath := strings.Replace(csvPath, ".csv", ".pcap", 1)
|
|
||||||
|
|
||||||
// Check if PCAP exists
|
|
||||||
if _, err := os.Stat(pcapPath); os.IsNotExist(err) {
|
|
||||||
fmt.Printf(" ⊗ Skipping: %s (no matching PCAP)\n", file.Name())
|
|
||||||
skipped++
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
|
|
||||||
// Check if already processed (has backup)
|
|
||||||
backupPath := csvPath + ".bak"
|
|
||||||
if _, err := os.Stat(backupPath); err == nil {
|
|
||||||
fmt.Printf(" ⊙ Skipping: %s (already processed, backup exists)\n", file.Name())
|
|
||||||
skipped++
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
|
|
||||||
fmt.Printf(" ↻ Processing: %s ... ", file.Name())
|
|
||||||
if err := processPair(csvPath, pcapPath); err != nil {
|
|
||||||
fmt.Printf("ERROR\n")
|
|
||||||
log.Printf(" Error: %v\n", err)
|
|
||||||
errors++
|
|
||||||
} else {
|
|
||||||
fmt.Printf("✓\n")
|
|
||||||
processed++
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
fmt.Printf(" Summary: %d processed, %d skipped, %d errors\n", processed, skipped, errors)
|
|
||||||
return nil
|
|
||||||
}
|
|
||||||
|
|
||||||
func processPair(csvPath, pcapPath string) error {
|
|
||||||
// Create backup
|
|
||||||
backupPath := csvPath + ".bak"
|
|
||||||
input, err := os.ReadFile(csvPath)
|
|
||||||
if err != nil {
|
|
||||||
return fmt.Errorf("backup read failed: %w", err)
|
|
||||||
}
|
|
||||||
if err := os.WriteFile(backupPath, input, 0644); err != nil {
|
|
||||||
return fmt.Errorf("backup write failed: %w", err)
|
|
||||||
}
|
|
||||||
|
|
||||||
// Read CSV records
|
|
||||||
records, err := readCSV(csvPath)
|
|
||||||
if err != nil {
|
|
||||||
return fmt.Errorf("CSV read failed: %w", err)
|
|
||||||
}
|
|
||||||
|
|
||||||
if len(records) == 0 {
|
|
||||||
return fmt.Errorf("no records in CSV")
|
|
||||||
}
|
|
||||||
|
|
||||||
// Read and parse PCAP
|
|
||||||
packets, err := readPCAPGo(pcapPath)
|
|
||||||
if err != nil {
|
|
||||||
return fmt.Errorf("PCAP read failed: %w", err)
|
|
||||||
}
|
|
||||||
|
|
||||||
// Enrich records with bandwidth data
|
|
||||||
enrichRecords(records, packets)
|
|
||||||
|
|
||||||
// Write enriched CSV
|
|
||||||
if err := writeCSV(csvPath, records); err != nil {
|
|
||||||
return fmt.Errorf("CSV write failed: %w", err)
|
|
||||||
}
|
|
||||||
|
|
||||||
return nil
|
|
||||||
}
|
|
||||||
|
|
||||||
func readCSV(path string) ([]*QueryRecord, error) {
|
|
||||||
f, err := os.Open(path)
|
|
||||||
if err != nil {
|
|
||||||
return nil, err
|
|
||||||
}
|
|
||||||
defer f.Close()
|
|
||||||
|
|
||||||
r := csv.NewReader(f)
|
|
||||||
rows, err := r.ReadAll()
|
|
||||||
if err != nil {
|
|
||||||
return nil, err
|
|
||||||
}
|
|
||||||
|
|
||||||
if len(rows) < 2 {
|
|
||||||
return nil, fmt.Errorf("CSV has no data rows")
|
|
||||||
}
|
|
||||||
|
|
||||||
records := make([]*QueryRecord, 0, len(rows)-1)
|
|
||||||
for i := 1; i < len(rows); i++ {
|
|
||||||
row := rows[i]
|
|
||||||
if len(row) < 14 {
|
|
||||||
log.Printf(" Warning: Skipping malformed row %d", i+1)
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
|
|
||||||
durationNs, _ := strconv.ParseInt(row[8], 10, 64)
|
|
||||||
durationMs, _ := strconv.ParseFloat(row[9], 64)
|
|
||||||
reqSize, _ := strconv.Atoi(row[10])
|
|
||||||
respSize, _ := strconv.Atoi(row[11])
|
|
||||||
|
|
||||||
records = append(records, &QueryRecord{
|
|
||||||
Domain: row[0],
|
|
||||||
QueryType: row[1],
|
|
||||||
Protocol: row[2],
|
|
||||||
DNSSec: row[3],
|
|
||||||
AuthDNSSec: row[4],
|
|
||||||
KeepAlive: row[5],
|
|
||||||
DNSServer: row[6],
|
|
||||||
Timestamp: row[7],
|
|
||||||
DurationNs: durationNs,
|
|
||||||
DurationMs: durationMs,
|
|
||||||
RequestSizeBytes: reqSize,
|
|
||||||
ResponseSizeBytes: respSize,
|
|
||||||
ResponseCode: row[12],
|
|
||||||
Error: row[13],
|
|
||||||
})
|
|
||||||
}
|
|
||||||
|
|
||||||
return records, nil
|
|
||||||
}
|
|
||||||
|
|
||||||
type PacketInfo struct {
|
|
||||||
Timestamp time.Time
|
|
||||||
Size int
|
|
||||||
IsSent bool
|
|
||||||
}
|
|
||||||
|
|
||||||
func readPCAPGo(path string) ([]PacketInfo, error) {
|
|
||||||
f, err := os.Open(path)
|
|
||||||
if err != nil {
|
|
||||||
return nil, err
|
|
||||||
}
|
|
||||||
defer f.Close()
|
|
||||||
|
|
||||||
reader, err := pcapgo.NewReader(f)
|
|
||||||
if err != nil {
|
|
||||||
return nil, err
|
|
||||||
}
|
|
||||||
|
|
||||||
var packets []PacketInfo
|
|
||||||
packetSource := gopacket.NewPacketSource(reader, reader.LinkType())
|
|
||||||
|
|
||||||
for packet := range packetSource.Packets() {
|
|
||||||
if packet.NetworkLayer() == nil {
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
|
|
||||||
isDNS := false
|
|
||||||
isSent := false
|
|
||||||
|
|
||||||
// Check UDP layer (DNS, DoQ, DoH3)
|
|
||||||
if udpLayer := packet.Layer(layers.LayerTypeUDP); udpLayer != nil {
|
|
||||||
udp := udpLayer.(*layers.UDP)
|
|
||||||
isDNS = udp.SrcPort == 53 || udp.DstPort == 53 ||
|
|
||||||
udp.SrcPort == 853 || udp.DstPort == 853 ||
|
|
||||||
udp.SrcPort == 443 || udp.DstPort == 443
|
|
||||||
isSent = udp.DstPort == 53 || udp.DstPort == 853 || udp.DstPort == 443
|
|
||||||
}
|
|
||||||
|
|
||||||
// Check TCP layer (DoT, DoH)
|
|
||||||
if tcpLayer := packet.Layer(layers.LayerTypeTCP); tcpLayer != nil {
|
|
||||||
tcp := tcpLayer.(*layers.TCP)
|
|
||||||
isDNS = tcp.SrcPort == 53 || tcp.DstPort == 53 ||
|
|
||||||
tcp.SrcPort == 853 || tcp.DstPort == 853 ||
|
|
||||||
tcp.SrcPort == 443 || tcp.DstPort == 443
|
|
||||||
isSent = tcp.DstPort == 53 || tcp.DstPort == 853 || tcp.DstPort == 443
|
|
||||||
}
|
|
||||||
|
|
||||||
if isDNS {
|
|
||||||
packets = append(packets, PacketInfo{
|
|
||||||
Timestamp: packet.Metadata().Timestamp,
|
|
||||||
Size: len(packet.Data()),
|
|
||||||
IsSent: isSent,
|
|
||||||
})
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return packets, nil
|
|
||||||
}
|
|
||||||
|
|
||||||
func enrichRecords(records []*QueryRecord, packets []PacketInfo) {
|
|
||||||
for _, rec := range records {
|
|
||||||
ts, err := parseRFC3339Nano(rec.Timestamp)
|
|
||||||
if err != nil {
|
|
||||||
log.Printf(" Warning: Failed to parse timestamp: %s", rec.Timestamp)
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
|
|
||||||
// Define time window for this query
|
|
||||||
windowStart := ts
|
|
||||||
windowEnd := ts.Add(time.Duration(rec.DurationNs))
|
|
||||||
|
|
||||||
var sent, recv, pktSent, pktRecv int64
|
|
||||||
|
|
||||||
// Match packets within the time window
|
|
||||||
for _, pkt := range packets {
|
|
||||||
if (pkt.Timestamp.Equal(windowStart) || pkt.Timestamp.After(windowStart)) &&
|
|
||||||
pkt.Timestamp.Before(windowEnd) {
|
|
||||||
if pkt.IsSent {
|
|
||||||
sent += int64(pkt.Size)
|
|
||||||
pktSent++
|
|
||||||
} else {
|
|
||||||
recv += int64(pkt.Size)
|
|
||||||
pktRecv++
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
rec.BytesSent = sent
|
|
||||||
rec.BytesReceived = recv
|
|
||||||
rec.PacketsSent = pktSent
|
|
||||||
rec.PacketsReceived = pktRecv
|
|
||||||
rec.TotalBytes = sent + recv
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
func writeCSV(path string, records []*QueryRecord) error {
|
|
||||||
f, err := os.Create(path)
|
|
||||||
if err != nil {
|
|
||||||
return err
|
|
||||||
}
|
|
||||||
defer f.Close()
|
|
||||||
|
|
||||||
w := csv.NewWriter(f)
|
|
||||||
defer w.Flush()
|
|
||||||
|
|
||||||
// Write header
|
|
||||||
header := []string{
|
|
||||||
"domain", "query_type", "protocol", "dnssec", "auth_dnssec",
|
|
||||||
"keep_alive", "dns_server", "timestamp", "duration_ns", "duration_ms",
|
|
||||||
"request_size_bytes", "response_size_bytes", "response_code", "error",
|
|
||||||
"bytes_sent", "bytes_received", "packets_sent", "packets_received", "total_bytes",
|
|
||||||
}
|
|
||||||
if err := w.Write(header); err != nil {
|
|
||||||
return err
|
|
||||||
}
|
|
||||||
|
|
||||||
// Write data rows
|
|
||||||
for _, rec := range records {
|
|
||||||
row := []string{
|
|
||||||
rec.Domain,
|
|
||||||
rec.QueryType,
|
|
||||||
rec.Protocol,
|
|
||||||
rec.DNSSec,
|
|
||||||
rec.AuthDNSSec,
|
|
||||||
rec.KeepAlive,
|
|
||||||
rec.DNSServer,
|
|
||||||
rec.Timestamp,
|
|
||||||
strconv.FormatInt(rec.DurationNs, 10),
|
|
||||||
strconv.FormatFloat(rec.DurationMs, 'f', -1, 64),
|
|
||||||
strconv.Itoa(rec.RequestSizeBytes),
|
|
||||||
strconv.Itoa(rec.ResponseSizeBytes),
|
|
||||||
rec.ResponseCode,
|
|
||||||
rec.Error,
|
|
||||||
strconv.FormatInt(rec.BytesSent, 10),
|
|
||||||
strconv.FormatInt(rec.BytesReceived, 10),
|
|
||||||
strconv.FormatInt(rec.PacketsSent, 10),
|
|
||||||
strconv.FormatInt(rec.PacketsReceived, 10),
|
|
||||||
strconv.FormatInt(rec.TotalBytes, 10),
|
|
||||||
}
|
|
||||||
if err := w.Write(row); err != nil {
|
|
||||||
return err
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return nil
|
|
||||||
}
|
|
||||||
|
|
||||||
func main() {
|
|
||||||
resultsDir := "results"
|
|
||||||
providers := []string{"adguard", "cloudflare", "google", "quad9"}
|
|
||||||
|
|
||||||
fmt.Println("╔═══════════════════════════════════════════════╗")
|
|
||||||
fmt.Println("║ DNS PCAP Preprocessor v1.0 ║")
|
|
||||||
fmt.Println("║ Enriching ALL CSVs with bandwidth metrics ║")
|
|
||||||
fmt.Println("╚═══════════════════════════════════════════════╝")
|
|
||||||
|
|
||||||
totalProcessed := 0
|
|
||||||
totalSkipped := 0
|
|
||||||
totalErrors := 0
|
|
||||||
|
|
||||||
for _, provider := range providers {
|
|
||||||
providerPath := filepath.Join(resultsDir, provider)
|
|
||||||
if _, err := os.Stat(providerPath); os.IsNotExist(err) {
|
|
||||||
fmt.Printf("\n⚠ Provider folder not found: %s\n", provider)
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
|
|
||||||
if err := processProviderFolder(providerPath); err != nil {
|
|
||||||
log.Printf("Error processing %s: %v\n", provider, err)
|
|
||||||
totalErrors++
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
fmt.Println("\n╔═══════════════════════════════════════════════╗")
|
|
||||||
fmt.Println("║ Preprocessing Complete! ║")
|
|
||||||
fmt.Println("╚═══════════════════════════════════════════════╝")
|
|
||||||
fmt.Printf("\nAll CSV files now have 5 additional columns:\n")
|
|
||||||
fmt.Printf(" • bytes_sent - Total bytes sent to DNS server\n")
|
|
||||||
fmt.Printf(" • bytes_received - Total bytes received from DNS server\n")
|
|
||||||
fmt.Printf(" • packets_sent - Number of packets sent\n")
|
|
||||||
fmt.Printf(" • packets_received - Number of packets received\n")
|
|
||||||
fmt.Printf(" • total_bytes - Sum of sent + received bytes\n")
|
|
||||||
fmt.Printf("\n📁 Backups saved as: *.csv.bak\n")
|
|
||||||
fmt.Printf("\n💡 Tip: The analysis script will filter which files to visualize,\n")
|
|
||||||
fmt.Printf(" but all files now have complete bandwidth metrics!\n")
|
|
||||||
}
|
|
||||||
@@ -1,367 +0,0 @@
|
|||||||
#!/usr/bin/env python3
|
|
||||||
"""
|
|
||||||
Advanced PCAP filter for DNS traffic (with IPv6 support).
|
|
||||||
|
|
||||||
Filters out:
|
|
||||||
- Local network traffic except test machine (IPv4: 10.0.0.50; IPv6: specific addresses)
|
|
||||||
- AdGuard DNS servers (for non-AdGuard captures)
|
|
||||||
- Non-DNS traffic based on protocol-specific ports
|
|
||||||
"""
|
|
||||||
|
|
||||||
import os
|
|
||||||
import subprocess
|
|
||||||
from pathlib import Path
|
|
||||||
import argparse
|
|
||||||
|
|
||||||
# Test machine IPs (IPv4 and IPv6 from your provided info)
|
|
||||||
TEST_IPV4 = '10.0.0.50'
|
|
||||||
TEST_IPV6_GLOBAL = '2001:818:e73e:ba00:5506:dfd4:ed8b:96e'
|
|
||||||
TEST_IPV6_LINKLOCAL = 'fe80::fe98:c62e:4463:9a2d'
|
|
||||||
|
|
||||||
# Port mappings
|
|
||||||
PORT_MAP = {
|
|
||||||
'udp': [53], # DNS-over-UDP
|
|
||||||
'tls': [53, 853], # DNS-over-TLS
|
|
||||||
'https': [53, 443], # DNS-over-HTTPS (DoH)
|
|
||||||
'doq': [53, 784, 8853], # DNS-over-QUIC
|
|
||||||
'doh3': [53, 443] # DNS-over-HTTP/3
|
|
||||||
}
|
|
||||||
|
|
||||||
# AdGuard DNS IPs to filter out (for non-AdGuard captures)
|
|
||||||
ADGUARD_IPS = [
|
|
||||||
'94.140.14.14',
|
|
||||||
'94.140.15.15',
|
|
||||||
'2a10:50c0::ad1:ff',
|
|
||||||
'2a10:50c0::ad2:ff'
|
|
||||||
]
|
|
||||||
|
|
||||||
def parse_filename(filename):
|
|
||||||
"""Extract protocol from filename"""
|
|
||||||
base = filename.replace('.pcap', '').replace('.csv', '')
|
|
||||||
parts = base.split('-')
|
|
||||||
|
|
||||||
if len(parts) < 1: # Minimum: protocol
|
|
||||||
return None
|
|
||||||
|
|
||||||
protocol = parts[0].lower()
|
|
||||||
return protocol
|
|
||||||
|
|
||||||
def extract_resolver_from_path(pcap_path):
|
|
||||||
"""Extract resolver name from directory structure"""
|
|
||||||
parts = Path(pcap_path).parts
|
|
||||||
|
|
||||||
for part in parts:
|
|
||||||
if part.lower() in ['cloudflare', 'google', 'quad9', 'adguard']:
|
|
||||||
return part.lower()
|
|
||||||
|
|
||||||
return None
|
|
||||||
|
|
||||||
def build_filter_expression(protocol, resolver):
    """
    Build tshark filter expression.

    Strategy:
    1. Only protocol-specific DNS ports
    2. Keep only traffic involving the test machine (IPv4/IPv6)
    3. Exclude AdGuard IPs for non-AdGuard captures
    """
    # Ports for this protocol; unknown protocols get the union of all ports.
    ports = PORT_MAP.get(protocol, [53, 443, 853, 784, 8853])

    # Port clause: UDP or TCP on any of the protocol's ports.
    port_filter = ' or '.join(
        f'(udp.port == {port} or tcp.port == {port})' for port in ports
    )

    # Test-machine clause: keep packets whose src or dst is one of ours.
    machine_terms = [f'(ip.addr == {TEST_IPV4})']
    for ipv6_addr in (TEST_IPV6_GLOBAL, TEST_IPV6_LINKLOCAL):
        if ipv6_addr:
            machine_terms.append(f'(ipv6.addr == {ipv6_addr})')
    machine_filter = ' or '.join(machine_terms)

    clauses = [f'({port_filter})', f'({machine_filter})']

    # AdGuard exclusion clause, only for captures of other resolvers.
    if resolver != 'adguard':
        exclusions = [
            f'!(ipv6.addr == {ip})' if ':' in ip else f'!(ip.addr == {ip})'
            for ip in ADGUARD_IPS
        ]
        if exclusions:
            clauses.append(f"({' and '.join(exclusions)})")

    return ' and '.join(clauses)
def filter_pcap(input_path, output_path, filter_expr, verbose=False):
    """Apply filter to PCAP file using tshark"""
    command = [
        'tshark',
        '-r', input_path,
        '-Y', filter_expr,
        '-w', output_path,
        '-F', 'pcap',
    ]

    try:
        if verbose:
            print(f" Filter: {filter_expr}")

        completed = subprocess.run(
            command,
            capture_output=True,
            text=True,
            timeout=300,
        )

        # tshark signals failure through its exit status.
        if completed.returncode != 0:
            print(f" ✗ Error: {completed.stderr.strip()}")
            return False

        if not os.path.exists(output_path):
            print(f" ✗ Output file not created")
            return False

        # 24 bytes is just the PCAP global header: nothing survived the filter.
        if os.path.getsize(output_path) < 24:
            print(f" ⚠ Warning: Output is empty")

        return True

    except subprocess.TimeoutExpired:
        print(f" ✗ Timeout (>5 minutes)")
        return False
    except Exception as e:
        print(f" ✗ Exception: {e}")
        return False
def find_pcap_files(root_dir):
    """Recursively find all PCAP files"""
    matches = [
        os.path.join(dirpath, name)
        for dirpath, _dirnames, filenames in os.walk(root_dir)
        for name in filenames
        if name.endswith('.pcap')
    ]
    return sorted(matches)
def format_bytes(bytes_val):
    """Format bytes as human readable"""
    value = bytes_val
    for unit in ('B', 'KB', 'MB', 'GB'):
        if value < 1024.0:
            return f"{value:.1f} {unit}"
        value /= 1024.0
    # Anything that survived four divisions is in terabyte range.
    return f"{value:.1f} TB"
def main():
    """CLI entry point: discover PCAPs, build per-file filters, run tshark.

    Returns:
        0 when every file filtered cleanly; 1 when tshark is missing, no
        PCAP files are found, or at least one file failed to filter.
    """
    parser = argparse.ArgumentParser(
        description='Advanced PCAP filter for DNS traffic (IPv4/IPv6)',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog='''
Filtering rules:
  1. Only include traffic on protocol-specific DNS ports
  2. Keep only packets involving the test machine (10.0.0.50 or its IPv6 addresses)
  3. Exclude AdGuard IPs for non-AdGuard captures

Protocol-specific ports:
  udp: 53
  tls: 53, 853
  https: 53, 443
  doq: 53, 784, 8853
  doh3: 53, 443

Examples:
  # Dry run
  %(prog)s ./results --dry-run

  # Filter with verbose output
  %(prog)s ./results --verbose

  # Custom output directory
  %(prog)s ./results --output ./cleaned
'''
    )

    parser.add_argument(
        'input_dir',
        help='Input directory containing PCAP files'
    )
    parser.add_argument(
        '-o', '--output',
        default='./results_filtered',
        help='Output directory (default: ./results_filtered)'
    )
    parser.add_argument(
        '--dry-run',
        action='store_true',
        help='Show what would be done without filtering'
    )
    parser.add_argument(
        '--limit',
        type=int,
        help='Only process first N files (for testing)'
    )
    parser.add_argument(
        '-v', '--verbose',
        action='store_true',
        help='Verbose output (show filter expressions)'
    )
    parser.add_argument(
        '--overwrite',
        action='store_true',
        help='Overwrite existing filtered files'
    )

    args = parser.parse_args()

    # Check for tshark: bail out early with install hints if missing.
    try:
        result = subprocess.run(
            ['tshark', '-v'],
            capture_output=True,
            check=True
        )
        if args.verbose:
            version = result.stdout.decode().split('\n')[0]
            print(f"Using: {version}\n")
    except (subprocess.CalledProcessError, FileNotFoundError):
        print("Error: tshark not found. Install Wireshark/tshark:")
        print(" Ubuntu/Debian: sudo apt-get install tshark")
        print(" macOS: brew install wireshark")
        return 1

    print("=" * 80)
    print("ADVANCED DNS PCAP FILTER (IPv4/IPv6)")
    print("=" * 80)
    print("Filters:")
    print(" 1. Protocol-specific DNS ports only")
    print(" 2. Keep only traffic involving test machine (10.0.0.50 / IPv6 addresses)")
    print(" 3. Exclude AdGuard IPs (for non-AdGuard captures)")
    print(f"\nInput: {args.input_dir}")
    print(f"Output: {args.output}")

    # Find PCAP files
    print(f"\nScanning for PCAP files...")
    pcap_files = find_pcap_files(args.input_dir)

    if not pcap_files:
        print(f"No PCAP files found in {args.input_dir}")
        return 1

    print(f"Found {len(pcap_files)} PCAP files")

    total_input_size = sum(os.path.getsize(f) for f in pcap_files)
    print(f"Total size: {format_bytes(total_input_size)}")

    if args.limit:
        pcap_files = pcap_files[:args.limit]
        print(f"Limiting to first {args.limit} files")

    if args.dry_run:
        print("\n*** DRY RUN MODE ***\n")
    else:
        print()

    # Process files, tracking per-outcome counts for the summary.
    success_count = 0
    skip_count = 0
    fail_count = 0
    total_output_size = 0

    for i, input_path in enumerate(pcap_files, 1):
        # Extract info from path: protocol from the filename, resolver
        # from the directory structure.
        filename = Path(input_path).name
        protocol = parse_filename(filename)
        resolver = extract_resolver_from_path(input_path)

        if not protocol:
            print(f"[{i}/{len(pcap_files)}] (unknown)")
            print(f" ⚠ Could not parse protocol, skipping")
            skip_count += 1
            continue

        # Create output path mirroring the input directory layout.
        rel_path = os.path.relpath(input_path, args.input_dir)
        output_path = os.path.join(args.output, rel_path)

        input_size = os.path.getsize(input_path)

        print(f"[{i}/{len(pcap_files)}] {rel_path}")
        print(f" Protocol: {protocol.upper()}")
        print(f" Resolver: {resolver or 'unknown'}")
        print(f" Size: {format_bytes(input_size)}")

        # Check if already filtered; skip unless --overwrite was given.
        if os.path.exists(output_path) and not args.overwrite:
            output_size = os.path.getsize(output_path)
            reduction = ((input_size - output_size) / input_size * 100) if input_size > 0 else 0
            print(f" ⊙ Already filtered: {format_bytes(output_size)} "
                  f"({reduction:.1f}% reduction)")
            skip_count += 1
            total_output_size += output_size
            continue

        # Build filter
        filter_expr = build_filter_expression(protocol, resolver)

        if args.dry_run:
            print(f" → Would filter")
            if args.verbose:
                print(f" Filter: {filter_expr}")
            continue

        # Create output directory
        os.makedirs(os.path.dirname(output_path), exist_ok=True)

        # Filter
        success = filter_pcap(input_path, output_path, filter_expr, args.verbose)

        if success:
            output_size = os.path.getsize(output_path)
            reduction = ((input_size - output_size) / input_size * 100) if input_size > 0 else 0
            print(f" ✓ Filtered: {format_bytes(output_size)} "
                  f"({reduction:.1f}% reduction)")
            success_count += 1
            total_output_size += output_size
        else:
            fail_count += 1

    # Summary
    print("\n" + "=" * 80)
    print("SUMMARY")
    print("=" * 80)

    if args.dry_run:
        print(f"Would process: {len(pcap_files)} files")
    else:
        print(f"Successful: {success_count}")
        print(f"Skipped: {skip_count} (already filtered or unparseable)")
        print(f"Failed: {fail_count}")
        print(f"Total: {len(pcap_files)}")

    if success_count > 0 or skip_count > 0:
        print(f"\nInput size: {format_bytes(total_input_size)}")
        print(f"Output size: {format_bytes(total_output_size)}")
        if total_input_size > 0:
            reduction = ((total_input_size - total_output_size) /
                         total_input_size * 100)
            print(f"Reduction: {reduction:.1f}%")
        print(f"\nOutput directory: {args.output}")

    return 0 if fail_count == 0 else 1
if __name__ == "__main__":
    # Propagate main()'s return value as the process exit status.
    raise SystemExit(main())
@@ -1,426 +0,0 @@
|
|||||||
#!/usr/bin/env python3
|
|
||||||
"""
|
|
||||||
Convert DNS CSV files to SQLite database.
|
|
||||||
Creates a single normalized table with unified DNSSEC handling.
|
|
||||||
"""
|
|
||||||
|
|
||||||
import sqlite3
|
|
||||||
import csv
|
|
||||||
from pathlib import Path
|
|
||||||
from dateutil import parser as date_parser
|
|
||||||
|
|
||||||
|
|
||||||
def create_database_schema(conn: sqlite3.Connection):
    """Create the database schema with indexes."""
    cursor = conn.cursor()

    # Main queries table: one denormalized row per DNS query, combining
    # resolver metadata, timing, sizes, and PCAP-derived network counters.
    cursor.execute("""
        CREATE TABLE IF NOT EXISTS dns_queries (
            id INTEGER PRIMARY KEY AUTOINCREMENT,

            -- Metadata
            provider TEXT NOT NULL,
            protocol TEXT NOT NULL,
            dnssec_mode TEXT NOT NULL CHECK(dnssec_mode IN ('off', 'auth', 'trust')),

            -- Query details
            domain TEXT NOT NULL,
            query_type TEXT NOT NULL,
            keep_alive BOOLEAN NOT NULL,
            dns_server TEXT NOT NULL,

            -- Timing
            timestamp TEXT NOT NULL,
            timestamp_unix REAL NOT NULL,
            duration_ns INTEGER NOT NULL,
            duration_ms REAL NOT NULL,

            -- Size metrics
            request_size_bytes INTEGER,
            response_size_bytes INTEGER,

            -- Network metrics (from PCAP)
            bytes_sent INTEGER DEFAULT 0,
            bytes_received INTEGER DEFAULT 0,
            packets_sent INTEGER DEFAULT 0,
            packets_received INTEGER DEFAULT 0,
            total_bytes INTEGER DEFAULT 0,

            -- Response
            response_code TEXT,
            error TEXT
        )
    """)

    # Secondary indexes for the common filter / group-by columns.
    index_ddl = (
        "CREATE INDEX IF NOT EXISTS idx_provider ON dns_queries(provider)",
        "CREATE INDEX IF NOT EXISTS idx_protocol ON dns_queries(protocol)",
        "CREATE INDEX IF NOT EXISTS idx_dnssec_mode ON dns_queries(dnssec_mode)",
        "CREATE INDEX IF NOT EXISTS idx_keep_alive ON dns_queries(keep_alive)",
        "CREATE INDEX IF NOT EXISTS idx_provider_protocol_dnssec ON dns_queries(provider, protocol, dnssec_mode)",
        "CREATE INDEX IF NOT EXISTS idx_timestamp ON dns_queries(timestamp_unix)",
        "CREATE INDEX IF NOT EXISTS idx_domain ON dns_queries(domain)",
    )
    for ddl in index_ddl:
        cursor.execute(ddl)

    conn.commit()
def parse_protocol_and_dnssec(filename: str) -> tuple[str, str, bool]:
    """
    Extract base protocol, DNSSEC mode, and keep_alive from filename.
    Returns (base_protocol, dnssec_mode, keep_alive)

    Examples:
        'udp.csv' -> ('udp', 'off', False)
        'udp-auth.csv' -> ('udp', 'auth', False)
        'tls.csv' -> ('tls', 'off', False)
        'tls-persist.csv' -> ('tls', 'off', True)
        'https-persist.csv' -> ('https', 'off', True)
        'https-auth-persist.csv' -> ('https', 'auth', True)
        'https-trust-persist.csv' -> ('https', 'trust', True)
        'doh3-auth.csv' -> ('doh3', 'auth', False)
        'doq.csv' -> ('doq', 'off', False)
    """
    # Use removesuffix rather than str.replace: replace() deletes *every*
    # occurrence, so a name like 'auth-auth.csv' collapsed to '' under
    # the old code. removesuffix strips only the trailing token.
    name = filename.removesuffix('.csv')

    # Check for persist suffix (keep_alive)
    keep_alive = name.endswith('-persist')
    if keep_alive:
        name = name.removesuffix('-persist')

    # Check for DNSSEC suffix
    dnssec_mode = 'off'
    if name.endswith('-auth'):
        dnssec_mode = 'auth'
        name = name.removesuffix('-auth')
    elif name.endswith('-trust'):
        dnssec_mode = 'trust'
        name = name.removesuffix('-trust')

    # For UDP, DoH3, and DoQ, keep_alive doesn't apply (connectionless)
    if name in ['udp', 'doh3', 'doq']:
        keep_alive = False

    return (name, dnssec_mode, keep_alive)
def str_to_bool(value: str) -> bool:
    """Convert string boolean to Python bool."""
    truthy_tokens = {'true', '1', 'yes'}
    return value.lower() in truthy_tokens
def import_csv_to_db(
    csv_path: Path,
    provider: str,
    conn: sqlite3.Connection
) -> int:
    """Import a CSV file into the database.

    Protocol, DNSSEC mode, and keep-alive are derived from the filename
    rather than from CSV columns. Malformed rows are skipped with a
    warning instead of aborting the whole import; the connection is
    committed once at the end.

    Args:
        csv_path: CSV file whose name encodes protocol/DNSSEC/keep-alive.
        provider: Resolver name stored on every inserted row.
        conn: Open SQLite connection with the dns_queries schema created.

    Returns:
        Number of rows successfully inserted.
    """
    protocol, dnssec_mode, keep_alive_from_filename = parse_protocol_and_dnssec(csv_path.name)

    cursor = conn.cursor()
    rows_imported = 0

    with open(csv_path, 'r', encoding='utf-8') as f:
        reader = csv.DictReader(f)

        for row in reader:
            try:
                # Parse timestamp to Unix epoch
                dt = date_parser.isoparse(row['timestamp'])
                timestamp_unix = dt.timestamp()

                # Use keep_alive from filename (more reliable than CSV)
                keep_alive = keep_alive_from_filename

                # Handle optional fields (may not exist in older CSVs);
                # `or 0` also covers present-but-empty values.
                bytes_sent = int(row.get('bytes_sent', 0) or 0)
                bytes_received = int(row.get('bytes_received', 0) or 0)
                packets_sent = int(row.get('packets_sent', 0) or 0)
                packets_received = int(row.get('packets_received', 0) or 0)
                total_bytes = int(row.get('total_bytes', 0) or 0)

                cursor.execute("""
                    INSERT INTO dns_queries (
                        provider, protocol, dnssec_mode,
                        domain, query_type, keep_alive,
                        dns_server, timestamp, timestamp_unix,
                        duration_ns, duration_ms,
                        request_size_bytes, response_size_bytes,
                        bytes_sent, bytes_received, packets_sent, packets_received, total_bytes,
                        response_code, error
                    ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
                """, (
                    provider,
                    protocol,
                    dnssec_mode,
                    row['domain'],
                    row['query_type'],
                    keep_alive,
                    row['dns_server'],
                    row['timestamp'],
                    timestamp_unix,
                    int(row['duration_ns']),
                    float(row['duration_ms']),
                    int(row.get('request_size_bytes') or 0),
                    int(row.get('response_size_bytes') or 0),
                    bytes_sent,
                    bytes_received,
                    packets_sent,
                    packets_received,
                    total_bytes,
                    row.get('response_code', ''),
                    row.get('error', '')
                ))

                rows_imported += 1

            except Exception as e:
                # Skip bad rows (missing keys, unparseable numbers/dates)
                # so one corrupt line doesn't sink the import.
                print(f" Warning: Skipping row - {e}")
                continue

    conn.commit()
    return rows_imported
def main():
    """Main import pipeline.

    Recreates dns.db from scratch, imports every provider CSV found under
    ./results, then prints summary statistics and example SQL queries.
    Returns None; prints an error and exits early if ./results is missing.
    """
    print("\n" + "="*60)
    print("CSV to SQLite Database Converter")
    print("="*60)

    results_dir = Path('results')
    db_path = Path('dns.db')

    if not results_dir.exists():
        print(f"\n❌ Error: '{results_dir}' directory not found")
        return

    # Remove existing database (full rebuild every run)
    if db_path.exists():
        print(f"\n⚠ Removing existing database: {db_path}")
        db_path.unlink()

    # Create database and schema
    print(f"\n📊 Creating database: {db_path}")
    conn = sqlite3.connect(db_path)
    create_database_schema(conn)
    print("✓ Schema created")

    # Import CSVs, one subdirectory per provider
    providers = ['adguard', 'cloudflare', 'google', 'quad9']
    total_rows = 0
    total_files = 0

    for provider in providers:
        provider_path = results_dir / provider

        if not provider_path.exists():
            print(f"\n⚠ Skipping {provider} - directory not found")
            continue

        print(f"\n{'='*60}")
        print(f"Importing: {provider.upper()}")
        print(f"{'='*60}")

        csv_files = sorted(provider_path.glob('*.csv'))
        provider_rows = 0
        provider_files = 0

        for csv_path in csv_files:
            # Skip backup files
            if '.bak' in csv_path.name:
                continue

            protocol, dnssec, keep_alive = parse_protocol_and_dnssec(csv_path.name)
            ka_str = "persistent" if keep_alive else "non-persist"
            print(f" 📄 {csv_path.name:30} → {protocol:8} (DNSSEC: {dnssec:5}, {ka_str})")

            rows = import_csv_to_db(csv_path, provider, conn)
            print(f" ✓ Imported {rows:,} rows")

            provider_rows += rows
            provider_files += 1

        print(f"\n Total: {provider_files} files, {provider_rows:,} rows")
        total_rows += provider_rows
        total_files += provider_files

    # Create summary
    print(f"\n{'='*60}")
    print("Database Summary")
    print(f"{'='*60}")

    cursor = conn.cursor()

    # Total counts
    cursor.execute("SELECT COUNT(*) FROM dns_queries")
    total_queries = cursor.fetchone()[0]

    cursor.execute("SELECT COUNT(DISTINCT provider) FROM dns_queries")
    unique_providers = cursor.fetchone()[0]

    cursor.execute("SELECT COUNT(DISTINCT protocol) FROM dns_queries")
    unique_protocols = cursor.fetchone()[0]

    cursor.execute("SELECT COUNT(DISTINCT domain) FROM dns_queries")
    unique_domains = cursor.fetchone()[0]

    print(f"\nTotal queries: {total_queries:,}")
    print(f"Providers: {unique_providers}")
    print(f"Protocols: {unique_protocols}")
    print(f"Unique domains: {unique_domains}")

    # Show breakdown by provider, protocol, DNSSEC, and keep_alive
    print(f"\nBreakdown by Provider, Protocol, DNSSEC & Keep-Alive:")
    print(f"{'-'*80}")

    cursor.execute("""
        SELECT provider, protocol, dnssec_mode, keep_alive, COUNT(*) as count
        FROM dns_queries
        GROUP BY provider, protocol, dnssec_mode, keep_alive
        ORDER BY provider, protocol, dnssec_mode, keep_alive
    """)

    # Blank line between provider groups for readability.
    current_provider = None
    for provider, protocol, dnssec, keep_alive, count in cursor.fetchall():
        if current_provider != provider:
            if current_provider is not None:
                print()
            current_provider = provider

        ka_str = "✓" if keep_alive else "✗"
        print(f" {provider:12} | {protocol:8} | {dnssec:5} | KA:{ka_str} | {count:6,} queries")

    # Protocol distribution
    print(f"\n{'-'*80}")
    print("Protocol Distribution:")
    print(f"{'-'*80}")

    cursor.execute("""
        SELECT protocol, COUNT(*) as count
        FROM dns_queries
        GROUP BY protocol
        ORDER BY protocol
    """)

    for protocol, count in cursor.fetchall():
        pct = (count / total_queries) * 100
        print(f" {protocol:8} | {count:8,} queries ({pct:5.1f}%)")

    # DNSSEC mode distribution
    print(f"\n{'-'*80}")
    print("DNSSEC Mode Distribution:")
    print(f"{'-'*80}")

    cursor.execute("""
        SELECT dnssec_mode, COUNT(*) as count
        FROM dns_queries
        GROUP BY dnssec_mode
        ORDER BY dnssec_mode
    """)

    for dnssec_mode, count in cursor.fetchall():
        pct = (count / total_queries) * 100
        print(f" {dnssec_mode:5} | {count:8,} queries ({pct:5.1f}%)")

    # Keep-Alive distribution
    print(f"\n{'-'*80}")
    print("Keep-Alive Distribution:")
    print(f"{'-'*80}")

    cursor.execute("""
        SELECT keep_alive, COUNT(*) as count
        FROM dns_queries
        GROUP BY keep_alive
    """)

    for keep_alive, count in cursor.fetchall():
        ka_label = "Persistent" if keep_alive else "Non-persistent"
        pct = (count / total_queries) * 100
        print(f" {ka_label:15} | {count:8,} queries ({pct:5.1f}%)")

    conn.close()

    print(f"\n{'='*60}")
    print(f"✓ Database created successfully: {db_path}")
    print(f" Total: {total_files} files, {total_rows:,} rows")
    print(f"{'='*60}\n")

    # Print usage examples (copy-paste SQL for Metabase dashboards)
    print("\n📖 Usage Examples for Metabase:")
    print(f"{'-'*60}")

    print("\n1. Compare protocols (DNSSEC off, persistent only):")
    print(""" SELECT provider, protocol,
        AVG(duration_ms) as avg_latency,
        AVG(total_bytes) as avg_bytes
    FROM dns_queries
    WHERE dnssec_mode = 'off' AND keep_alive = 1
    GROUP BY provider, protocol;""")

    print("\n2. DNSSEC impact on UDP:")
    print(""" SELECT provider, dnssec_mode,
        AVG(duration_ms) as avg_latency
    FROM dns_queries
    WHERE protocol = 'udp'
    GROUP BY provider, dnssec_mode;""")

    print("\n3. Keep-alive impact on TLS:")
    print(""" SELECT provider, keep_alive,
        AVG(duration_ms) as avg_latency,
        AVG(total_bytes) as avg_bytes
    FROM dns_queries
    WHERE protocol = 'tls' AND dnssec_mode = 'off'
    GROUP BY provider, keep_alive;""")

    print("\n4. Time series for line graphs:")
    print(""" SELECT timestamp_unix, duration_ms, total_bytes
    FROM dns_queries
    WHERE provider = 'cloudflare'
        AND protocol = 'https'
        AND dnssec_mode = 'off'
        AND keep_alive = 1
    ORDER BY timestamp_unix;""")

    print("\n5. Overall comparison table:")
    print(""" SELECT protocol, dnssec_mode, keep_alive,
        COUNT(*) as queries,
        AVG(duration_ms) as avg_latency,
        AVG(total_bytes) as avg_bytes
    FROM dns_queries
    GROUP BY protocol, dnssec_mode, keep_alive
    ORDER BY protocol, dnssec_mode, keep_alive;""")

    print(f"\n{'-'*60}\n")
# Script entry point.
if __name__ == '__main__':
    main()
@@ -1,274 +0,0 @@
|
|||||||
#!/usr/bin/env python3
|
|
||||||
"""
|
|
||||||
Merge DNS test files by configuration.
|
|
||||||
|
|
||||||
- Merges CSVs of same config (adds 'run_id' column for traceability)
|
|
||||||
- Optionally merges PCAPs using mergecap
|
|
||||||
- Flattens date structure
|
|
||||||
"""
|
|
||||||
|
|
||||||
import os
|
|
||||||
import csv
|
|
||||||
import subprocess
|
|
||||||
import shutil
|
|
||||||
from pathlib import Path
|
|
||||||
import argparse
|
|
||||||
from collections import defaultdict
|
|
||||||
|
|
||||||
def parse_filename(filename):
    """
    Extract config key from filename.
    Format: protocol[-flags]-timestamp.{csv,pcap}
    Config key: protocol[-flags] (ignores timestamp)
    """
    stem = filename.replace('.csv', '').replace('.pcap', '')
    tokens = stem.split('-')

    # Need at least a protocol plus a timestamp token.
    if len(tokens) < 2:
        return None

    # Everything before the final token forms the configuration key;
    # the final token is the run timestamp.
    return '-'.join(tokens[:-1]), tokens[-1]
def extract_resolver_from_path(file_path):
    """Extract resolver name from path"""
    resolvers = {'cloudflare', 'google', 'quad9', 'adguard'}
    for segment in Path(file_path).parts:
        candidate = segment.lower()
        if candidate in resolvers:
            return candidate
    return None
def find_files(root_dir, extension):
    """Find all files with given extension"""
    found = [
        os.path.join(dirpath, name)
        for dirpath, _dirnames, names in os.walk(root_dir)
        for name in names
        if name.endswith(extension)
    ]
    return sorted(found)
def merge_csvs(csv_files, output_path, fieldnames):
    """Merge multiple CSVs into one, adding 'run_id' column"""
    with open(output_path, 'w', newline='') as outfile:
        writer = csv.DictWriter(outfile, fieldnames=fieldnames + ['run_id'])
        writer.writeheader()

        for source in csv_files:
            # The run_id is the timestamp token parsed from the filename,
            # which keeps every merged row traceable to its original run.
            _, run_id = parse_filename(Path(source).name)

            with open(source, 'r', newline='') as infile:
                for record in csv.DictReader(infile):
                    record['run_id'] = run_id
                    writer.writerow(record)
def merge_pcaps(pcap_files, output_path):
    """Merge PCAP files using mergecap"""
    try:
        subprocess.run(
            ['mergecap', '-w', output_path] + pcap_files,
            capture_output=True,
            check=True,
        )
    except subprocess.CalledProcessError as e:
        print(f" ✗ mergecap error: {e.stderr.decode()}")
        return False
    except FileNotFoundError:
        print("Error: mergecap not found. Install Wireshark:")
        print(" Ubuntu: sudo apt install wireshark-common")
        print(" macOS: brew install wireshark")
        return False
    return True
def format_bytes(bytes_val):
    """Format bytes as human readable"""
    suffixes = ['B', 'KB', 'MB', 'GB']
    index = 0
    size = float(bytes_val)
    # Walk up the unit ladder until the value fits below 1024.
    while index < len(suffixes) and size >= 1024.0:
        size /= 1024.0
        index += 1
    if index < len(suffixes):
        return f"{size:.1f} {suffixes[index]}"
    return f"{size:.1f} TB"
def main():
    """Entry point: discover, group, and merge per-run CSV/PCAP files.

    Returns a process exit code: 0 on success (or dry run / user cancel),
    1 on bad input, no files found, or any failed PCAP merge.
    """
    parser = argparse.ArgumentParser(
        description='Merge DNS test files by configuration',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog='''
Merges files of same config across dates/timestamps.
Output: ./results_merged/[resolver]/[config].csv (merged)
        ./results_merged/[resolver]/[config].pcap (merged, if --merge-pcaps)

Examples:
  # Dry run to preview
  %(prog)s ./results --dry-run

  # Merge CSVs only (recommended)
  %(prog)s ./results

  # Merge CSVs and PCAPs
  %(prog)s ./results --merge-pcaps

  # Custom output directory
  %(prog)s ./results --output ./merged_data
'''
    )

    parser.add_argument(
        'input_dir',
        help='Input directory (e.g., ./results)'
    )
    parser.add_argument(
        '--output',
        default='./results_merged',
        help='Output directory (default: ./results_merged)'
    )
    parser.add_argument(
        '--merge-pcaps',
        action='store_true',
        help='Merge PCAP files (requires mergecap from Wireshark)'
    )
    parser.add_argument(
        '--dry-run',
        action='store_true',
        help='Show what would be done without merging'
    )
    parser.add_argument(
        '-y', '--yes',
        action='store_true',
        help='Skip confirmation prompt'
    )

    args = parser.parse_args()

    if not os.path.isdir(args.input_dir):
        print(f"Error: Input directory not found: {args.input_dir}")
        return 1

    # Find all files
    print("=" * 80)
    print("MERGE DNS TEST FILES")
    print("=" * 80)
    print(f"Input: {args.input_dir}")
    print(f"Output: {args.output}")
    print(f"Merge PCAPs: {'Yes' if args.merge_pcaps else 'No'}")

    csv_files = find_files(args.input_dir, '.csv')
    pcap_files = find_files(args.input_dir, '.pcap') if args.merge_pcaps else []

    if not csv_files and not pcap_files:
        print("\nNo CSV/PCAP files found")
        return 1

    print(f"\nFound {len(csv_files)} CSV files")
    if args.merge_pcaps:
        print(f"Found {len(pcap_files)} PCAP files")

    # Group files by resolver and config; key is the (resolver, config) pair
    # so runs of the same configuration land in the same merged output.
    csv_groups = defaultdict(list)
    pcap_groups = defaultdict(list)

    for csv_path in csv_files:
        config, _ = parse_filename(Path(csv_path).name)
        resolver = extract_resolver_from_path(csv_path)
        if config and resolver:
            key = (resolver, config)
            csv_groups[key].append(csv_path)

    for pcap_path in pcap_files:
        config, _ = parse_filename(Path(pcap_path).name)
        resolver = extract_resolver_from_path(pcap_path)
        if config and resolver:
            key = (resolver, config)
            pcap_groups[key].append(pcap_path)

    # Summary
    print("\nConfigs to merge:")
    print("-" * 80)
    for (resolver, config), files in sorted(csv_groups.items()):
        print(f" {resolver}/{config}: {len(files)} runs")

    total_runs = sum(len(files) for files in csv_groups.values())
    print(f"\nTotal configs: {len(csv_groups)}")
    print(f"Total runs: {total_runs}")

    if args.dry_run:
        print("\n*** DRY RUN MODE ***\n")
        for (resolver, config) in sorted(csv_groups.keys()):
            print(f"Would merge: {resolver}/{config} ({len(csv_groups[(resolver, config)])} CSVs)")
            if args.merge_pcaps and (resolver, config) in pcap_groups:
                print(f"Would merge: {resolver}/{config} ({len(pcap_groups[(resolver, config)])} PCAPs)")
        return 0

    # Confirmation
    if not args.yes:
        response = input(f"\nMerge all into {args.output}? [y/N] ")
        if response.lower() not in ['y', 'yes']:
            print("Cancelled")
            return 0

    # Merge
    print("\n" + "=" * 80)
    print("MERGING FILES")
    print("=" * 80)

    success_count = 0
    fail_count = 0
    total_queries = 0
    total_size = 0

    # Get standard CSV fieldnames (from first file).
    # BUG FIX: the original did next(iter(csv_files)) unconditionally, which
    # raises StopIteration when only PCAP files were found (the earlier guard
    # only rejects the case where BOTH lists are empty).
    fieldnames = []
    if csv_files:
        with open(csv_files[0], 'r') as f:
            reader = csv.DictReader(f)
            fieldnames = reader.fieldnames

    for (resolver, config), files in sorted(csv_groups.items()):
        print(f"\n{resolver}/{config} ({len(files)} runs)")

        # Merge CSVs
        output_csv = os.path.join(args.output, resolver, f"{config}.csv")
        os.makedirs(os.path.dirname(output_csv), exist_ok=True)

        merge_csvs(files, output_csv, fieldnames)

        # Count queries in merged file
        with open(output_csv, 'r') as f:
            query_count = sum(1 for _ in csv.reader(f)) - 1  # Minus header

        print(f" ✓ Merged CSV: {query_count:,} queries")
        total_queries += query_count
        success_count += 1

        # Merge PCAPs if requested
        if args.merge_pcaps and (resolver, config) in pcap_groups:
            output_pcap = os.path.join(args.output, resolver, f"{config}.pcap")
            pcap_list = pcap_groups[(resolver, config)]

            if merge_pcaps(pcap_list, output_pcap):
                merged_size = os.path.getsize(output_pcap)
                orig_size = sum(os.path.getsize(p) for p in pcap_list)
                print(f" ✓ Merged PCAP: {format_bytes(merged_size)} "
                      f"(from {format_bytes(orig_size)})")
                total_size += merged_size
            else:
                # Stray f-prefix removed: the string has no placeholders.
                print(" ✗ PCAP merge failed")
                fail_count += 1

    # Final summary
    print("\n" + "=" * 80)
    print("COMPLETE")
    print("=" * 80)
    print(f"Successful configs: {success_count}")
    print(f"Failed: {fail_count}")
    print(f"Total queries: {total_queries:,}")
    if args.merge_pcaps:
        print(f"Total PCAP size: {format_bytes(total_size)}")
    print(f"\nMerged files in: {args.output}")

    return 0 if fail_count == 0 else 1
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
|
||||||
exit(main())
|
|
||||||
Reference in New Issue
Block a user