diff --git a/flake.lock b/flake.lock deleted file mode 100644 index ed6abb9..0000000 --- a/flake.lock +++ /dev/null @@ -1,27 +0,0 @@ -{ - "nodes": { - "nixpkgs": { - "locked": { - "lastModified": 1760103332, - "narHash": "sha256-BMsGVfKl4Q80Pr9T1AkCRljO1bpwCmY8rTBVj8XGuhA=", - "owner": "NixOS", - "repo": "nixpkgs", - "rev": "870493f9a8cb0b074ae5b411b2f232015db19a65", - "type": "github" - }, - "original": { - "owner": "NixOS", - "ref": "nixpkgs-unstable", - "repo": "nixpkgs", - "type": "github" - } - }, - "root": { - "inputs": { - "nixpkgs": "nixpkgs" - } - } - }, - "root": "root", - "version": 7 -} diff --git a/flake.nix b/flake.nix deleted file mode 100644 index f6677be..0000000 --- a/flake.nix +++ /dev/null @@ -1,38 +0,0 @@ -{ - description = "A Nix-flake-based Go 1.22 development environment"; - - inputs.nixpkgs.url = "github:NixOS/nixpkgs/nixpkgs-unstable"; - - outputs = inputs: - let - goVersion = 25; # Change this to update the whole stack - - supportedSystems = [ "aarch64-darwin" ]; - forEachSupportedSystem = f: inputs.nixpkgs.lib.genAttrs supportedSystems (system: f { - pkgs = import inputs.nixpkgs { - inherit system; - overlays = [ inputs.self.overlays.default ]; - }; - }); - in - { - overlays.default = final: prev: { - go = final."go_1_${toString goVersion}"; - }; - - devShells = forEachSupportedSystem ({ pkgs }: { - default = pkgs.mkShell { - packages = with pkgs; [ - go - gotools - golangci-lint - (pkgs.python3.withPackages (python-pkgs: with python-pkgs; [ - pandas - matplotlib - seaborn - ])) - ]; - }; - }); - }; -} diff --git a/domains.txt b/input_data/domains.txt similarity index 100% rename from domains.txt rename to input_data/domains.txt diff --git a/internal/qol/measurement.go b/internal/qol/measurement.go index a747a1e..225161b 100644 --- a/internal/qol/measurement.go +++ b/internal/qol/measurement.go @@ -12,6 +12,7 @@ import ( "github.com/afonsofrancof/sdns-proxy/client" "github.com/afonsofrancof/sdns-proxy/internal/qol/capture" "github.com/afonsofrancof/sdns-proxy/internal/qol/results" + "github.com/afonsofrancof/sdns-proxy/internal/qol/stats" "github.com/google/gopacket/pcap" "github.com/miekg/dns" ) @@ -86,13 +87,16 @@ func (r *MeasurementRunner) runPerUpstream(upstream string, domains []string, qT defer dnsClient.Close() // Setup output files - csvPath, pcapPath := GenerateOutputPaths(r.config.OutputDir, upstream, r.config.DNSSEC, r.config.AuthoritativeDNSSEC, r.config.KeepAlive) + csvPath, pcapPath, memPath := GenerateOutputPaths(r.config.OutputDir, upstream, r.config.DNSSEC, r.config.AuthoritativeDNSSEC, r.config.KeepAlive) // Create directory if it doesn't exist if err := os.MkdirAll(filepath.Dir(csvPath), 0755); err != nil { return fmt.Errorf("failed to create output directory: %w", err) } + // Initialize runtime collector with memPath + runtimeCollector := stats.NewRuntimeCollector(memPath) + keepAliveStr := "" if r.config.KeepAlive { keepAliveStr = " (keep-alive)" @@ -118,7 +122,17 @@ func (r *MeasurementRunner) runPerUpstream(upstream string, domains []string, qT time.Sleep(time.Second) // Run measurements - return r.runQueries(dnsClient, upstream, domains, qType, writer, packetCapture) + err = r.runQueries(dnsClient, upstream, domains, qType, writer, packetCapture) + if err != nil { + return err + } + + // Write summed mem stats for the entire run + if err := runtimeCollector.WriteStats(); err != nil { + fmt.Fprintf(os.Stderr, "Warning: failed to write mem stats to %s: %v\n", memPath, err) + } + + return nil } func (r *MeasurementRunner) 
runQueries(dnsClient client.DNSClient, upstream string, diff --git a/internal/qol/stats/runtime.go b/internal/qol/stats/runtime.go new file mode 100644 index 0000000..e1d7e35 --- /dev/null +++ b/internal/qol/stats/runtime.go @@ -0,0 +1,94 @@ +package stats + +import ( + "encoding/csv" + "fmt" + "os" + "runtime" + "time" +) + +type RuntimeStats struct { + TotalAlloc uint64 + Mallocs uint64 + NumGC uint32 + AllocDelta uint64 + MallocsDelta uint64 + GCDelta uint32 +} + +type RuntimeCollector struct { + startStats runtime.MemStats + memPath string +} + +func NewRuntimeCollector(memPath string) *RuntimeCollector { + var stats runtime.MemStats + runtime.ReadMemStats(&stats) + + return &RuntimeCollector{ + startStats: stats, + memPath: memPath, + } +} + +func (rc *RuntimeCollector) Collect() RuntimeStats { + var current runtime.MemStats + runtime.ReadMemStats(&current) + + return RuntimeStats{ + TotalAlloc: current.TotalAlloc, + Mallocs: current.Mallocs, + NumGC: current.NumGC, + AllocDelta: current.TotalAlloc - rc.startStats.TotalAlloc, + MallocsDelta: current.Mallocs - rc.startStats.Mallocs, + GCDelta: current.NumGC - rc.startStats.NumGC, + } +} + +func (rc *RuntimeCollector) WriteStats() error { + stats := rc.Collect() + timestamp := time.Now().Format(time.RFC3339Nano) + + // Check if file exists + fileExists := false + if _, err := os.Stat(rc.memPath); err == nil { + fileExists = true + } + + // Open in append mode + file, err := os.OpenFile(rc.memPath, os.O_APPEND|os.O_CREATE|os.O_WRONLY, 0644) + if err != nil { + return fmt.Errorf("failed to open mem.csv: %w", err) + } + defer file.Close() + + writer := csv.NewWriter(file) + + // Write header if new file + if !fileExists { + header := []string{ + "timestamp", "total_alloc_bytes", "mallocs", "gc_cycles", + "alloc_delta", "mallocs_delta", "gc_delta", + } + if err := writer.Write(header); err != nil { + return fmt.Errorf("failed to write mem.csv header: %w", err) + } + } + + row := []string{ + timestamp, + fmt.Sprintf("%d", stats.TotalAlloc), + fmt.Sprintf("%d", stats.Mallocs), + fmt.Sprintf("%d", stats.NumGC), + fmt.Sprintf("%d", stats.AllocDelta), + fmt.Sprintf("%d", stats.MallocsDelta), + fmt.Sprintf("%d", stats.GCDelta), + } + if err := writer.Write(row); err != nil { + return fmt.Errorf("failed to write mem.csv row: %w", err) + } + + writer.Flush() + return writer.Error() +} diff --git a/internal/qol/utils.go b/internal/qol/utils.go index 9f7387a..b47530a 100644 --- a/internal/qol/utils.go +++ b/internal/qol/utils.go @@ -1,4 +1,3 @@ -// ./internal/qol/utils.go package qol import ( @@ -8,7 +7,7 @@ import ( "strings" ) -func GenerateOutputPaths(outputDir, upstream string, dnssec, authDNSSEC, keepAlive bool) (csvPath, pcapPath string) { +func GenerateOutputPaths(outputDir, upstream string, dnssec, authDNSSEC, keepAlive bool) (csvPath, pcapPath, memPath string) { proto := DetectProtocol(upstream) cleanServer := cleanServerName(upstream) @@ -32,8 +31,10 @@ func GenerateOutputPaths(outputDir, upstream string, dnssec, authDNSSEC, keepAli base = fmt.Sprintf("%s-%s", base, strings.Join(flags, "-")) } - return filepath.Join(subDir, base+".csv"), - filepath.Join(subDir, base+".pcap") + csvPath = filepath.Join(subDir, base+".csv") + pcapPath = filepath.Join(subDir, base+".pcap") + memPath = filepath.Join(subDir, base+".mem.csv") + return } func cleanServerName(server string) string { @@ -84,7 +85,6 @@ func DetectProtocol(upstream string) string { u, err := url.Parse(upstream) if err == nil && u.Scheme != "" { scheme := strings.ToLower(u.Scheme) - //
Normalize scheme names switch scheme { case "udp", "doudp": return "doudp" diff --git a/run.sh b/run.sh deleted file mode 100755 index 5f06faf..0000000 --- a/run.sh +++ /dev/null @@ -1,187 +0,0 @@ -#!/bin/bash - -# Exit on error -set -e - -# Default values -TOOL_PATH="./qol" -DOMAINS_FILE="./domains.txt" -OUTPUT_DIR="./results" -INTERFACE="veth1" -TIMEOUT="5s" -SLEEP_TIME="1" - -# Parse arguments -while [[ $# -gt 0 ]]; do - case $1 in - -t|--tool-path) - TOOL_PATH="$2" - shift 2 - ;; - -d|--domains-file) - DOMAINS_FILE="$2" - shift 2 - ;; - -o|--output-dir) - OUTPUT_DIR="$2" - shift 2 - ;; - -I|--interface) - INTERFACE="$2" - shift 2 - ;; - -T|--timeout) - TIMEOUT="$2" - shift 2 - ;; - -s|--sleep) - SLEEP_TIME="$2" - shift 2 - ;; - --help) - echo "Usage: $0 [OPTIONS]" - echo "" - echo "Options:" - echo " -t, --tool-path PATH Path to qol tool (default: ./qol)" - echo " -d, --domains-file PATH Path to domains file (default: ./domains.txt)" - echo " -o, --output-dir PATH Output directory (default: ./results)" - echo " -I, --interface NAME Network interface (default: veth1)" - echo " -T, --timeout DURATION Timeout duration (default: 5s)" - echo " -s, --sleep SECONDS Sleep between runs (default: 1)" - echo " --help Show this help" - exit 0 - ;; - *) - echo "Unknown option: $1" - echo "Use --help for usage information" - exit 1 - ;; - esac -done - -echo "Configuration:" -echo " Tool path: $TOOL_PATH" -echo " Domains file: $DOMAINS_FILE" -echo " Output dir: $OUTPUT_DIR" -echo " Interface: $INTERFACE" -echo " Timeout: $TIMEOUT" -echo " Sleep time: ${SLEEP_TIME}s" -echo "" - -# Connection-based protocols that benefit from keep-alive (TCP-based) -CONN_SERVERS=( - -s "dotcp://8.8.8.8:53" - -s "dotcp://1.1.1.1:53" - -s "dotcp://9.9.9.9:53" - -s "dotcp://dns.adguard-dns.com:53" - #-s "tls://8.8.8.8:853" - #-s "tls://1.1.1.1:853" - #-s "tls://9.9.9.9:853" - #-s "tls://dns.adguard-dns.com:853" - #-s "https://dns.google/dns-query" - #-s "https://cloudflare-dns.com/dns-query" - #-s "https://dns10.quad9.net/dns-query" - #-s "https://dns.adguard-dns.com/dns-query" -) - -# QUIC-based protocols (have built-in 0-RTT, keep-alive doesn't add value) -QUIC_SERVERS=( - #-s "doh3://dns.google/dns-query" - #-s "doh3://cloudflare-dns.com/dns-query" - #-s "doh3://dns.adguard-dns.com/dns-query" - #-s "doq://dns.adguard-dns.com:853" -) - -# Connectionless protocols (no keep-alive) -CONNLESS_SERVERS=( - # -s "udp://8.8.8.8:53" - # -s "udp://1.1.1.1:53" - # -s "udp://9.9.9.9:53" - # -s "udp://dns.adguard-dns.com:53" - -s "sdns://AQMAAAAAAAAAETk0LjE0MC4xNC4xNDo1NDQzINErR_JS3PLCu_iZEIbq95zkSV2LFsigxDIuUso_OQhzIjIuZG5zY3J5cHQuZGVmYXVsdC5uczEuYWRndWFyZC5jb20" - -s "sdns://AQMAAAAAAAAAFDE0OS4xMTIuMTEyLjExMjo4NDQzIGfIR7jIdYzRICRVQ751Z0bfNN8dhMALjEcDaN-CHYY-GTIuZG5zY3J5cHQtY2VydC5xdWFkOS5uZXQ" -) - -# Common args -COMMON_ARGS=( - "$DOMAINS_FILE" - --interface "$INTERFACE" - --timeout "$TIMEOUT" -) - -# Combinations for TCP-based connection protocols -CONN_COMBINATIONS=( - # DNSSEC off, Keep off - "" - - # DNSSEC off, Keep on - "--keep-alive" - - # DNSSEC on (trust), Keep on - "--dnssec --keep-alive" - - # DNSSEC on (auth), Keep on - "--dnssec --authoritative-dnssec --keep-alive" -) - -# Combinations for QUIC and connectionless protocols (no keep-alive) -NO_KEEPALIVE_COMBINATIONS=( - # DNSSEC off - "" - - # DNSSEC on (trust) - "--dnssec" - - # DNSSEC on (auth) - "--dnssec --authoritative-dnssec" -) - -echo "=== Running TCP-based protocols (TLS/HTTPS) ===" -for FLAGS in "${CONN_COMBINATIONS[@]}"; do - echo "Running: $FLAGS" - - 
FLAGS_ARRAY=($FLAGS) - - sudo "$TOOL_PATH" run \ - --output-dir "$OUTPUT_DIR" \ - "${COMMON_ARGS[@]}" \ - "${CONN_SERVERS[@]}" \ - "${FLAGS_ARRAY[@]}" || true - - sleep "$SLEEP_TIME" -done - -echo "" -echo "=== Running QUIC-based protocols (DoH3/DoQ) ===" -for FLAGS in "${NO_KEEPALIVE_COMBINATIONS[@]}"; do - echo "Running: $FLAGS" - - FLAGS_ARRAY=($FLAGS) - - sudo "$TOOL_PATH" run \ - --output-dir "$OUTPUT_DIR" \ - "${COMMON_ARGS[@]}" \ - "${QUIC_SERVERS[@]}" \ - "${FLAGS_ARRAY[@]}" || true - - sleep "$SLEEP_TIME" -done - -echo "" -echo "=== Running connectionless protocols (UDP) ===" -for FLAGS in "${NO_KEEPALIVE_COMBINATIONS[@]}"; do - echo "Running: $FLAGS" - - FLAGS_ARRAY=($FLAGS) - - sudo "$TOOL_PATH" run \ - --output-dir "$OUTPUT_DIR" \ - "${COMMON_ARGS[@]}" \ - "${CONNLESS_SERVERS[@]}" \ - "${FLAGS_ARRAY[@]}" || true - - sleep "$SLEEP_TIME" -done - -echo "" -echo "All combinations completed!" diff --git a/scripts/analysis/analyze_dns_metrics.py b/scripts/analysis/analyze_dns_metrics.py deleted file mode 100644 index 81e6a3b..0000000 --- a/scripts/analysis/analyze_dns_metrics.py +++ /dev/null @@ -1,498 +0,0 @@ -import pandas as pd -import matplotlib.pyplot as plt -import seaborn as sns -import numpy as np -from pathlib import Path -from scipy import stats -import warnings - -warnings.filterwarnings('ignore') - -# Set style for publication-quality plots -sns.set_style("whitegrid") -plt.rcParams['figure.dpi'] = 300 -plt.rcParams['savefig.dpi'] = 300 -plt.rcParams['font.size'] = 10 -plt.rcParams['figure.figsize'] = (12, 6) - -class DNSAnalyzer: - def __init__(self, results_dir='results'): - self.results_dir = Path(results_dir) - self.df = None - - def load_all_data(self): - """Load all CSV files from the results directory""" - data_frames = [] - - providers = ['adguard', 'cloudflare', 'google', 'quad9'] - - for provider in providers: - provider_path = self.results_dir / provider - if not provider_path.exists(): - continue - - for csv_file in provider_path.glob('*.csv'): - try: - df = pd.read_csv(csv_file) - df['provider'] = provider - df['test_config'] = csv_file.stem - data_frames.append(df) - except Exception as e: - print(f"Error loading {csv_file}: {e}") - - self.df = pd.concat(data_frames, ignore_index=True) - self._clean_and_enrich_data() - print(f"Loaded {len(self.df)} DNS queries across {len(data_frames)} test configurations") - - def _clean_and_enrich_data(self): - """Clean data and add useful columns""" - # Remove failed queries - self.df = self.df[self.df['error'].isna()] - - # Extract protocol base (remove -auth, -trust suffixes) - self.df['protocol_base'] = self.df['protocol'].str.replace('-auth|-trust', '', regex=True) - - # DNSSEC configuration - self.df['dnssec_mode'] = 'none' - self.df.loc[self.df['auth_dnssec'] == True, 'dnssec_mode'] = 'auth' - self.df.loc[(self.df['dnssec'] == True) & (self.df['auth_dnssec'] == False), 'dnssec_mode'] = 'trust' - - # Protocol categories - self.df['protocol_category'] = self.df['protocol_base'].map({ - 'udp': 'Plain DNS', - 'tls': 'DoT', - 'https': 'DoH', - 'doh3': 'DoH/3', - 'doq': 'DoQ' - }) - - # Connection persistence - self.df['persistence'] = self.df['keep_alive'].fillna(False) - - def generate_summary_statistics(self): - """Generate comprehensive summary statistics""" - print("\n" + "="*80) - print("SUMMARY STATISTICS") - print("="*80) - - # Overall statistics - print("\n--- Overall Performance ---") - print(f"Total queries: {len(self.df)}") - print(f"Mean latency: {self.df['duration_ms'].mean():.2f} ms") - print(f"Median 
latency: {self.df['duration_ms'].median():.2f} ms") - print(f"95th percentile: {self.df['duration_ms'].quantile(0.95):.2f} ms") - print(f"99th percentile: {self.df['duration_ms'].quantile(0.99):.2f} ms") - - # By protocol - print("\n--- Performance by Protocol ---") - protocol_stats = self.df.groupby('protocol_category')['duration_ms'].agg([ - ('count', 'count'), - ('mean', 'mean'), - ('median', 'median'), - ('std', 'std'), - ('p95', lambda x: x.quantile(0.95)), - ('p99', lambda x: x.quantile(0.99)) - ]).round(2) - print(protocol_stats) - - # By provider - print("\n--- Performance by Provider ---") - provider_stats = self.df.groupby('provider')['duration_ms'].agg([ - ('count', 'count'), - ('mean', 'mean'), - ('median', 'median'), - ('std', 'std'), - ('p95', lambda x: x.quantile(0.95)) - ]).round(2) - print(provider_stats) - - # DNSSEC impact - print("\n--- DNSSEC Validation Impact ---") - dnssec_stats = self.df.groupby('dnssec_mode')['duration_ms'].agg([ - ('count', 'count'), - ('mean', 'mean'), - ('median', 'median'), - ('overhead_vs_none', lambda x: x.mean()) - ]).round(2) - - # Calculate overhead percentage - baseline = dnssec_stats.loc['none', 'mean'] if 'none' in dnssec_stats.index else 0 - if baseline > 0: - dnssec_stats['overhead_pct'] = ((dnssec_stats['overhead_vs_none'] - baseline) / baseline * 100).round(1) - print(dnssec_stats) - - # Bandwidth analysis - print("\n--- Bandwidth Usage ---") - bandwidth_stats = self.df.groupby('protocol_category').agg({ - 'request_size_bytes': ['mean', 'median'], - 'response_size_bytes': ['mean', 'median'] - }).round(2) - print(bandwidth_stats) - - # Persistence impact (where applicable) - print("\n--- Connection Persistence Impact ---") - persist_protocols = self.df[self.df['protocol_base'].isin(['tls', 'https'])] - if len(persist_protocols) > 0: - persist_stats = persist_protocols.groupby(['protocol_base', 'persistence'])['duration_ms'].agg([ - ('mean', 'mean'), - ('median', 'median') - ]).round(2) - print(persist_stats) - - return { - 'protocol': protocol_stats, - 'provider': provider_stats, - 'dnssec': dnssec_stats, - 'bandwidth': bandwidth_stats - } - - def plot_latency_by_protocol(self, output_dir='plots'): - """Violin plot of latency distribution by protocol""" - Path(output_dir).mkdir(exist_ok=True) - - plt.figure(figsize=(14, 7)) - - # Order protocols logically - protocol_order = ['Plain DNS', 'DoT', 'DoH', 'DoH/3', 'DoQ'] - available_protocols = [p for p in protocol_order if p in self.df['protocol_category'].values] - - sns.violinplot(data=self.df, x='protocol_category', y='duration_ms', - order=available_protocols, inner='box', cut=0) - - plt.title('DNS Query Latency Distribution by Protocol', fontsize=14, fontweight='bold') - plt.xlabel('Protocol', fontsize=12) - plt.ylabel('Response Time (ms)', fontsize=12) - plt.xticks(rotation=0) - - # Add mean values as annotations - for i, protocol in enumerate(available_protocols): - mean_val = self.df[self.df['protocol_category'] == protocol]['duration_ms'].mean() - plt.text(i, mean_val, f'{mean_val:.1f}', ha='center', va='bottom', fontweight='bold') - - plt.tight_layout() - plt.savefig(f'{output_dir}/latency_by_protocol.png', bbox_inches='tight') - plt.close() - print(f"✓ Saved: latency_by_protocol.png") - - def plot_provider_comparison(self, output_dir='plots'): - """Box plot comparing providers across protocols""" - Path(output_dir).mkdir(exist_ok=True) - - fig, axes = plt.subplots(2, 2, figsize=(16, 12)) - fig.suptitle('Provider Performance Comparison by Protocol', fontsize=16, 
fontweight='bold') - - protocols = self.df['protocol_category'].unique() - protocols = [p for p in ['Plain DNS', 'DoT', 'DoH', 'DoH/3'] if p in protocols] - - for idx, protocol in enumerate(protocols[:4]): - ax = axes[idx // 2, idx % 2] - data = self.df[self.df['protocol_category'] == protocol] - - if len(data) > 0: - sns.boxplot(data=data, x='provider', y='duration_ms', ax=ax) - ax.set_title(f'{protocol}', fontsize=12, fontweight='bold') - ax.set_xlabel('Provider', fontsize=10) - ax.set_ylabel('Response Time (ms)', fontsize=10) - ax.tick_params(axis='x', rotation=45) - - plt.tight_layout() - plt.savefig(f'{output_dir}/provider_comparison.png', bbox_inches='tight') - plt.close() - print(f"✓ Saved: provider_comparison.png") - - def plot_dnssec_impact(self, output_dir='plots'): - """Compare DNSSEC validation methods (trust vs auth)""" - Path(output_dir).mkdir(exist_ok=True) - - # Filter for protocols that have DNSSEC variations - dnssec_data = self.df[self.df['dnssec_mode'] != 'none'].copy() - - if len(dnssec_data) == 0: - print("⚠ No DNSSEC data available") - return - - fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 6)) - - # Plot 1: Overall DNSSEC impact - protocol_order = ['Plain DNS', 'DoT', 'DoH', 'DoH/3', 'DoQ'] - available = [p for p in protocol_order if p in self.df['protocol_category'].values] - - sns.barplot(data=self.df, x='protocol_category', y='duration_ms', - hue='dnssec_mode', order=available, ax=ax1, ci=95) - ax1.set_title('DNSSEC Validation Overhead by Protocol', fontsize=12, fontweight='bold') - ax1.set_xlabel('Protocol', fontsize=10) - ax1.set_ylabel('Mean Response Time (ms)', fontsize=10) - ax1.legend(title='DNSSEC Mode', labels=['No DNSSEC', 'Auth (Full)', 'Trust (Resolver)']) - ax1.tick_params(axis='x', rotation=0) - - # Plot 2: Trust vs Auth comparison - comparison_data = dnssec_data.groupby(['protocol_category', 'dnssec_mode'])['duration_ms'].mean().reset_index() - pivot_data = comparison_data.pivot(index='protocol_category', columns='dnssec_mode', values='duration_ms') - - if 'auth' in pivot_data.columns and 'trust' in pivot_data.columns: - pivot_data['overhead_pct'] = ((pivot_data['auth'] - pivot_data['trust']) / pivot_data['trust'] * 100) - pivot_data['overhead_pct'].plot(kind='bar', ax=ax2, color='coral') - ax2.set_title('Auth vs Trust: Additional Overhead (%)', fontsize=12, fontweight='bold') - ax2.set_xlabel('Protocol', fontsize=10) - ax2.set_ylabel('Additional Overhead (%)', fontsize=10) - ax2.axhline(y=0, color='black', linestyle='--', linewidth=0.8) - ax2.tick_params(axis='x', rotation=45) - ax2.grid(axis='y', alpha=0.3) - - plt.tight_layout() - plt.savefig(f'{output_dir}/dnssec_impact.png', bbox_inches='tight') - plt.close() - print(f"✓ Saved: dnssec_impact.png") - - def plot_persistence_impact(self, output_dir='plots'): - """Analyze impact of connection persistence""" - Path(output_dir).mkdir(exist_ok=True) - - persist_data = self.df[self.df['protocol_base'].isin(['tls', 'https'])].copy() - - if len(persist_data) == 0: - print("⚠ No persistence data available") - return - - plt.figure(figsize=(12, 6)) - - sns.barplot(data=persist_data, x='protocol_base', y='duration_ms', - hue='persistence', ci=95) - - plt.title('Impact of Connection Persistence on Latency', fontsize=14, fontweight='bold') - plt.xlabel('Protocol', fontsize=12) - plt.ylabel('Mean Response Time (ms)', fontsize=12) - plt.legend(title='Keep-Alive', labels=['Disabled', 'Enabled']) - - # Calculate and annotate overhead reduction - for protocol in persist_data['protocol_base'].unique(): - 
protocol_data = persist_data[persist_data['protocol_base'] == protocol] - - no_persist = protocol_data[protocol_data['persistence'] == False]['duration_ms'].mean() - with_persist = protocol_data[protocol_data['persistence'] == True]['duration_ms'].mean() - - if not np.isnan(no_persist) and not np.isnan(with_persist): - reduction = ((no_persist - with_persist) / no_persist * 100) - print(f"{protocol}: {reduction:.1f}% reduction with persistence") - - plt.tight_layout() - plt.savefig(f'{output_dir}/persistence_impact.png', bbox_inches='tight') - plt.close() - print(f"✓ Saved: persistence_impact.png") - - def plot_bandwidth_overhead(self, output_dir='plots'): - """Visualize bandwidth usage by protocol""" - Path(output_dir).mkdir(exist_ok=True) - - bandwidth_data = self.df.groupby('protocol_category').agg({ - 'request_size_bytes': 'mean', - 'response_size_bytes': 'mean' - }).reset_index() - - bandwidth_data['total_bytes'] = (bandwidth_data['request_size_bytes'] + - bandwidth_data['response_size_bytes']) - - fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 6)) - - # Plot 1: Request vs Response sizes - x = np.arange(len(bandwidth_data)) - width = 0.35 - - ax1.bar(x - width/2, bandwidth_data['request_size_bytes'], width, - label='Request', alpha=0.8) - ax1.bar(x + width/2, bandwidth_data['response_size_bytes'], width, - label='Response', alpha=0.8) - - ax1.set_xlabel('Protocol', fontsize=12) - ax1.set_ylabel('Bytes', fontsize=12) - ax1.set_title('Average Request/Response Sizes', fontsize=12, fontweight='bold') - ax1.set_xticks(x) - ax1.set_xticklabels(bandwidth_data['protocol_category']) - ax1.legend() - ax1.grid(axis='y', alpha=0.3) - - # Plot 2: Total bandwidth overhead vs UDP baseline - udp_total = bandwidth_data[bandwidth_data['protocol_category'] == 'Plain DNS']['total_bytes'].values - if len(udp_total) > 0: - bandwidth_data['overhead_vs_udp'] = ((bandwidth_data['total_bytes'] - udp_total[0]) / udp_total[0] * 100) - - colors = ['green' if x < 0 else 'red' for x in bandwidth_data['overhead_vs_udp']] - ax2.bar(bandwidth_data['protocol_category'], bandwidth_data['overhead_vs_udp'], - color=colors, alpha=0.7) - ax2.axhline(y=0, color='black', linestyle='--', linewidth=0.8) - ax2.set_xlabel('Protocol', fontsize=12) - ax2.set_ylabel('Overhead vs Plain DNS (%)', fontsize=12) - ax2.set_title('Bandwidth Overhead', fontsize=12, fontweight='bold') - ax2.grid(axis='y', alpha=0.3) - - plt.tight_layout() - plt.savefig(f'{output_dir}/bandwidth_overhead.png', bbox_inches='tight') - plt.close() - print(f"✓ Saved: bandwidth_overhead.png") - - def plot_heatmap(self, output_dir='plots'): - """Heatmap of provider-protocol performance""" - Path(output_dir).mkdir(exist_ok=True) - - # Create pivot table - heatmap_data = self.df.groupby(['provider', 'protocol_category'])['duration_ms'].median().unstack() - - plt.figure(figsize=(12, 8)) - sns.heatmap(heatmap_data, annot=True, fmt='.1f', cmap='RdYlGn_r', - cbar_kws={'label': 'Median Latency (ms)'}) - - plt.title('DNS Provider-Protocol Performance Matrix', fontsize=14, fontweight='bold') - plt.xlabel('Protocol', fontsize=12) - plt.ylabel('Provider', fontsize=12) - - plt.tight_layout() - plt.savefig(f'{output_dir}/provider_protocol_heatmap.png', bbox_inches='tight') - plt.close() - print(f"✓ Saved: provider_protocol_heatmap.png") - - def plot_percentile_comparison(self, output_dir='plots'): - """Plot percentile comparison across protocols""" - Path(output_dir).mkdir(exist_ok=True) - - percentiles = [50, 75, 90, 95, 99] - protocol_order = ['Plain DNS', 'DoT', 'DoH', 
'DoH/3', 'DoQ'] - available = [p for p in protocol_order if p in self.df['protocol_category'].values] - - percentile_data = [] - for protocol in available: - data = self.df[self.df['protocol_category'] == protocol]['duration_ms'] - for p in percentiles: - percentile_data.append({ - 'protocol': protocol, - 'percentile': f'P{p}', - 'latency': np.percentile(data, p) - }) - - percentile_df = pd.DataFrame(percentile_data) - - plt.figure(figsize=(14, 7)) - sns.barplot(data=percentile_df, x='protocol', y='latency', hue='percentile', order=available) - - plt.title('Latency Percentiles by Protocol', fontsize=14, fontweight='bold') - plt.xlabel('Protocol', fontsize=12) - plt.ylabel('Response Time (ms)', fontsize=12) - plt.legend(title='Percentile', bbox_to_anchor=(1.05, 1), loc='upper left') - - plt.tight_layout() - plt.savefig(f'{output_dir}/percentile_comparison.png', bbox_inches='tight') - plt.close() - print(f"✓ Saved: percentile_comparison.png") - - def statistical_tests(self): - """Perform statistical significance tests""" - print("\n" + "="*80) - print("STATISTICAL TESTS") - print("="*80) - - # Test 1: Protocol differences (Kruskal-Wallis) - protocols = self.df['protocol_category'].unique() - if len(protocols) > 2: - groups = [self.df[self.df['protocol_category'] == p]['duration_ms'].values - for p in protocols] - h_stat, p_value = stats.kruskal(*groups) - print(f"\n--- Kruskal-Wallis Test (Protocol Differences) ---") - print(f"H-statistic: {h_stat:.4f}") - print(f"p-value: {p_value:.4e}") - print(f"Result: {'Significant' if p_value < 0.05 else 'Not significant'} differences between protocols") - - # Test 2: DNSSEC impact (Mann-Whitney U) - if 'none' in self.df['dnssec_mode'].values and 'auth' in self.df['dnssec_mode'].values: - none_data = self.df[self.df['dnssec_mode'] == 'none']['duration_ms'] - auth_data = self.df[self.df['dnssec_mode'] == 'auth']['duration_ms'] - - u_stat, p_value = stats.mannwhitneyu(none_data, auth_data, alternative='two-sided') - print(f"\n--- Mann-Whitney U Test (No DNSSEC vs Auth) ---") - print(f"U-statistic: {u_stat:.4f}") - print(f"p-value: {p_value:.4e}") - print(f"Result: {'Significant' if p_value < 0.05 else 'Not significant'} difference") - - # Test 3: Trust vs Auth comparison - if 'trust' in self.df['dnssec_mode'].values and 'auth' in self.df['dnssec_mode'].values: - trust_data = self.df[self.df['dnssec_mode'] == 'trust']['duration_ms'] - auth_data = self.df[self.df['dnssec_mode'] == 'auth']['duration_ms'] - - u_stat, p_value = stats.mannwhitneyu(trust_data, auth_data, alternative='two-sided') - print(f"\n--- Mann-Whitney U Test (Trust vs Auth) ---") - print(f"U-statistic: {u_stat:.4f}") - print(f"p-value: {p_value:.4e}") - print(f"Result: Auth is {'significantly' if p_value < 0.05 else 'not significantly'} slower than Trust") - - def generate_latex_table(self, output_dir='plots'): - """Generate LaTeX table for thesis""" - Path(output_dir).mkdir(exist_ok=True) - - # Summary table by protocol - summary = self.df.groupby('protocol_category')['duration_ms'].agg([ - ('Mean', 'mean'), - ('Median', 'median'), - ('Std Dev', 'std'), - ('P95', lambda x: x.quantile(0.95)), - ('P99', lambda x: x.quantile(0.99)) - ]).round(2) - - latex_code = summary.to_latex(float_format="%.2f") - - with open(f'{output_dir}/summary_table.tex', 'w') as f: - f.write(latex_code) - - print(f"✓ Saved: summary_table.tex") - print("\nLaTeX Table Preview:") - print(latex_code) - - def run_full_analysis(self): - """Run complete analysis pipeline""" - print("="*80) - print("DNS QoS Analysis - 
Starting Full Analysis") - print("="*80) - - # Load data - print("\n[1/10] Loading data...") - self.load_all_data() - - # Generate statistics - print("\n[2/10] Generating summary statistics...") - self.generate_summary_statistics() - - # Statistical tests - print("\n[3/10] Running statistical tests...") - self.statistical_tests() - - # Generate plots - print("\n[4/10] Creating latency by protocol plot...") - self.plot_latency_by_protocol() - - print("\n[5/10] Creating provider comparison plot...") - self.plot_provider_comparison() - - print("\n[6/10] Creating DNSSEC impact plot...") - self.plot_dnssec_impact() - - print("\n[7/10] Creating persistence impact plot...") - self.plot_persistence_impact() - - print("\n[8/10] Creating bandwidth overhead plot...") - self.plot_bandwidth_overhead() - - print("\n[9/10] Creating heatmap...") - self.plot_heatmap() - - print("\n[10/10] Creating percentile comparison...") - self.plot_percentile_comparison() - - # Generate LaTeX table - print("\n[Bonus] Generating LaTeX table...") - self.generate_latex_table() - - print("\n" + "="*80) - print("✓ Analysis Complete! Check the 'plots' directory for all visualizations.") - print("="*80) - - -if __name__ == "__main__": - analyzer = DNSAnalyzer(results_dir='results') - analyzer.run_full_analysis() diff --git a/scripts/analysis/analyze_simple.py b/scripts/analysis/analyze_simple.py deleted file mode 100644 index a82e024..0000000 --- a/scripts/analysis/analyze_simple.py +++ /dev/null @@ -1,536 +0,0 @@ -import pandas as pd -import matplotlib.pyplot as plt -import seaborn as sns -import numpy as np -from pathlib import Path -import datetime -from dateutil import parser as date_parser -import dpkt - -# Set style -sns.set_style("whitegrid") -plt.rcParams['figure.dpi'] = 300 -plt.rcParams['savefig.dpi'] = 300 -plt.rcParams['font.size'] = 10 - -class FastDNSAnalyzer: - def __init__(self, results_dir='results'): - self.results_dir = Path(results_dir) - self.all_data = [] - - def should_include_file(self, filename): - """Filter out DNSSEC and non-persist files""" - name = filename.stem - if 'auth' in name or 'trust' in name: - return False - if name in ['tls', 'https']: - return False - return True - - def parse_rfc3339_nano(self, timestamp_str): - """Parse RFC3339Nano timestamp with timezone""" - try: - dt = date_parser.parse(timestamp_str) - return dt.astimezone(datetime.timezone.utc).timestamp() - except Exception as e: - print(f" Error parsing timestamp {timestamp_str}: {e}") - return None - - def extract_bandwidth_from_pcap_fast(self, pcap_file, csv_data): - """Fast bandwidth extraction using dpkt""" - print(f" Analyzing pcap: {pcap_file.name}") - - try: - with open(pcap_file, 'rb') as f: - pcap = dpkt.pcap.Reader(f) - - # Build query time windows - query_windows = [] - for idx, row in csv_data.iterrows(): - start_time = self.parse_rfc3339_nano(row['timestamp']) - if start_time is None: - continue - - duration_seconds = row['duration_ns'] / 1_000_000_000 - end_time = start_time + duration_seconds - - query_windows.append({ - 'index': idx, - 'start': start_time, - 'end': end_time, - 'bytes_sent': 0, - 'bytes_received': 0, - 'packets_sent': 0, - 'packets_received': 0 - }) - - if not query_windows: - print(" ✗ No valid query windows") - return None - - # Sort windows for faster matching - query_windows.sort(key=lambda x: x['start']) - - # Process packets - packet_count = 0 - matched_count = 0 - - for timestamp, buf in pcap: - packet_count += 1 - packet_size = len(buf) - - # Quick parse to determine direction - try: - 
eth = dpkt.ethernet.Ethernet(buf) - - # Get IP layer - if isinstance(eth.data, dpkt.ip.IP): - ip = eth.data - elif isinstance(eth.data, dpkt.ip6.IP6): - ip = eth.data - else: - continue - - # Get transport layer - if isinstance(ip.data, dpkt.udp.UDP): - transport = ip.data - src_port = transport.sport - dst_port = transport.dport - elif isinstance(ip.data, dpkt.tcp.TCP): - transport = ip.data - src_port = transport.sport - dst_port = transport.dport - else: - continue - - # Determine direction (client port usually higher) - is_outbound = src_port > dst_port - - # Binary search for matching window - for window in query_windows: - if window['start'] <= timestamp <= window['end']: - if is_outbound: - window['bytes_sent'] += packet_size - window['packets_sent'] += 1 - else: - window['bytes_received'] += packet_size - window['packets_received'] += 1 - matched_count += 1 - break - elif timestamp < window['start']: - break # No more windows to check - - except Exception: - continue - - print(f" ✓ Processed {packet_count} packets, matched {matched_count}") - - # Convert to DataFrame - bandwidth_df = pd.DataFrame(query_windows) - return bandwidth_df[['index', 'bytes_sent', 'bytes_received', - 'packets_sent', 'packets_received']] - - except Exception as e: - print(f" ✗ Error reading pcap: {e}") - return None - - def load_data(self): - """Load all relevant CSV files and extract bandwidth from pcaps""" - print("Loading data and analyzing bandwidth...") - - for provider_dir in self.results_dir.iterdir(): - if not provider_dir.is_dir(): - continue - - provider = provider_dir.name - - for csv_file in provider_dir.glob('*.csv'): - if not self.should_include_file(csv_file): - continue - - try: - df = pd.read_csv(csv_file) - df['provider'] = provider - df['test_file'] = csv_file.stem - df['csv_path'] = str(csv_file) - - # Find corresponding pcap file - pcap_file = csv_file.with_suffix('.pcap') - if pcap_file.exists(): - print(f" Processing: {provider}/{csv_file.name}") - bandwidth_data = self.extract_bandwidth_from_pcap_fast(pcap_file, df) - - if bandwidth_data is not None and len(bandwidth_data) > 0: - # Merge bandwidth data - df = df.reset_index(drop=True) - for col in ['bytes_sent', 'bytes_received', 'packets_sent', 'packets_received']: - df[col] = 0 - - for _, row in bandwidth_data.iterrows(): - idx = int(row['index']) - if idx < len(df): - df.at[idx, 'bytes_sent'] = row['bytes_sent'] - df.at[idx, 'bytes_received'] = row['bytes_received'] - df.at[idx, 'packets_sent'] = row['packets_sent'] - df.at[idx, 'packets_received'] = row['packets_received'] - - df['total_bytes'] = df['bytes_sent'] + df['bytes_received'] - - print(f" ✓ Extracted bandwidth for {len(df)} queries") - else: - print(f" ⚠ Could not extract bandwidth data") - else: - print(f" ⚠ No pcap found for {csv_file.name}") - - self.all_data.append(df) - - except Exception as e: - print(f" ✗ Error loading {csv_file}: {e}") - import traceback - traceback.print_exc() - - print(f"\nTotal files loaded: {len(self.all_data)}") - - def create_line_graphs(self, output_dir='output/line_graphs'): - """Create line graphs for latency and bandwidth""" - Path(output_dir).mkdir(parents=True, exist_ok=True) - - print("\nGenerating line graphs...") - - for df in self.all_data: - provider = df['provider'].iloc[0] - test_name = df['test_file'].iloc[0] - - df['query_index'] = range(1, len(df) + 1) - - # Create figure with 2 subplots - fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(14, 10)) - - # Plot 1: Latency - ax1.plot(df['query_index'], df['duration_ms'], 
marker='o', - markersize=4, linewidth=1, alpha=0.7, color='steelblue') - mean_latency = df['duration_ms'].mean() - ax1.axhline(y=mean_latency, color='r', linestyle='--', - label=f'Mean: {mean_latency:.2f} ms', linewidth=2) - ax1.set_xlabel('Query Number', fontsize=12) - ax1.set_ylabel('Latency (ms)', fontsize=12) - ax1.set_title('Latency Over Time', fontsize=12, fontweight='bold') - ax1.legend() - ax1.grid(True, alpha=0.3) - - # Plot 2: Bandwidth - if 'total_bytes' in df.columns and df['total_bytes'].sum() > 0: - ax2.plot(df['query_index'], df['bytes_sent'], marker='s', - markersize=4, linewidth=1, alpha=0.7, - color='orange', label='Sent') - ax2.plot(df['query_index'], df['bytes_received'], marker='^', - markersize=4, linewidth=1, alpha=0.7, - color='green', label='Received') - - mean_sent = df['bytes_sent'].mean() - mean_received = df['bytes_received'].mean() - ax2.axhline(y=mean_sent, color='orange', linestyle='--', - linewidth=1.5, alpha=0.5) - ax2.axhline(y=mean_received, color='green', linestyle='--', - linewidth=1.5, alpha=0.5) - - ax2.set_xlabel('Query Number', fontsize=12) - ax2.set_ylabel('Bytes', fontsize=12) - ax2.set_title(f'Bandwidth Over Time (Mean: ↑{mean_sent:.0f}B ↓{mean_received:.0f}B)', - fontsize=12, fontweight='bold') - ax2.legend() - ax2.grid(True, alpha=0.3) - - fig.suptitle(f'{provider.upper()} - {test_name}', - fontsize=14, fontweight='bold') - plt.tight_layout() - - filename = f"{provider}_{test_name}.png" - plt.savefig(f'{output_dir}/{filename}', bbox_inches='tight') - plt.close() - - print(f" ✓ Created: {filename}") - - def get_protocol_name(self, test_file): - """Extract clean protocol name""" - name = test_file.replace('-persist', '') - - protocol_map = { - 'udp': 'Plain DNS (UDP)', - 'tls': 'DoT (DNS over TLS)', - 'https': 'DoH (DNS over HTTPS)', - 'doh3': 'DoH/3 (DNS over HTTP/3)', - 'doq': 'DoQ (DNS over QUIC)' - } - - return protocol_map.get(name, name.upper()) - - def create_resolver_comparison_bars(self, output_dir='output/comparisons'): - """Create bar graphs comparing resolvers for latency and bandwidth""" - Path(output_dir).mkdir(parents=True, exist_ok=True) - - print("\nGenerating resolver comparison graphs...") - - combined_df = pd.concat(self.all_data, ignore_index=True) - protocols = combined_df['test_file'].unique() - - for protocol in protocols: - protocol_data = combined_df[combined_df['test_file'] == protocol] - protocol_name = self.get_protocol_name(protocol) - - # Latency stats - latency_stats = protocol_data.groupby('provider')['duration_ms'].agg([ - ('mean', 'mean'), - ('median', 'median'), - ('std', 'std') - ]).reset_index() - - # Create latency comparison - fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 6)) - fig.suptitle(f'{protocol_name} - Latency Comparison', - fontsize=16, fontweight='bold') - - # Mean latency - bars1 = ax1.bar(latency_stats['provider'], latency_stats['mean'], - color='steelblue', alpha=0.8, edgecolor='black') - ax1.errorbar(latency_stats['provider'], latency_stats['mean'], - yerr=latency_stats['std'], fmt='none', color='black', - capsize=5, alpha=0.6) - - for bar in bars1: - height = bar.get_height() - ax1.text(bar.get_x() + bar.get_width()/2., height, - f'{height:.2f}', - ha='center', va='bottom', fontweight='bold') - - ax1.set_xlabel('Resolver', fontsize=12) - ax1.set_ylabel('Mean Latency (ms)', fontsize=12) - ax1.set_title('Mean Latency', fontsize=12) - ax1.grid(axis='y', alpha=0.3) - - # Median latency - bars2 = ax2.bar(latency_stats['provider'], latency_stats['median'], - color='coral', alpha=0.8, 
edgecolor='black') - - for bar in bars2: - height = bar.get_height() - ax2.text(bar.get_x() + bar.get_width()/2., height, - f'{height:.2f}', - ha='center', va='bottom', fontweight='bold') - - ax2.set_xlabel('Resolver', fontsize=12) - ax2.set_ylabel('Median Latency (ms)', fontsize=12) - ax2.set_title('Median Latency', fontsize=12) - ax2.grid(axis='y', alpha=0.3) - - plt.tight_layout() - plt.savefig(f'{output_dir}/latency_{protocol}.png', bbox_inches='tight') - plt.close() - print(f" ✓ Created: latency_{protocol}.png") - - # Bandwidth comparison - if 'total_bytes' in protocol_data.columns and protocol_data['total_bytes'].sum() > 0: - bandwidth_stats = protocol_data.groupby('provider').agg({ - 'bytes_sent': 'mean', - 'bytes_received': 'mean', - 'total_bytes': 'mean' - }).reset_index() - - fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 6)) - fig.suptitle(f'{protocol_name} - Bandwidth Comparison', - fontsize=16, fontweight='bold') - - # Sent vs Received - x = np.arange(len(bandwidth_stats)) - width = 0.35 - - bars1 = ax1.bar(x - width/2, bandwidth_stats['bytes_sent'], width, - label='Sent', color='orange', alpha=0.8, edgecolor='black') - bars2 = ax1.bar(x + width/2, bandwidth_stats['bytes_received'], width, - label='Received', color='green', alpha=0.8, edgecolor='black') - - ax1.set_xlabel('Resolver', fontsize=12) - ax1.set_ylabel('Bytes per Query', fontsize=12) - ax1.set_title('Average Bandwidth per Query', fontsize=12) - ax1.set_xticks(x) - ax1.set_xticklabels(bandwidth_stats['provider']) - ax1.legend() - ax1.grid(axis='y', alpha=0.3) - - # Total bandwidth - bars3 = ax2.bar(bandwidth_stats['provider'], bandwidth_stats['total_bytes'], - color='purple', alpha=0.8, edgecolor='black') - - for bar in bars3: - height = bar.get_height() - ax2.text(bar.get_x() + bar.get_width()/2., height, - f'{height:.0f}', - ha='center', va='bottom', fontweight='bold') - - ax2.set_xlabel('Resolver', fontsize=12) - ax2.set_ylabel('Total Bytes per Query', fontsize=12) - ax2.set_title('Total Bandwidth per Query', fontsize=12) - ax2.grid(axis='y', alpha=0.3) - - plt.tight_layout() - plt.savefig(f'{output_dir}/bandwidth_{protocol}.png', bbox_inches='tight') - plt.close() - print(f" ✓ Created: bandwidth_{protocol}.png") - - def generate_latex_tables(self, output_dir='output/tables'): - """Generate LaTeX tables with latency and bandwidth statistics""" - Path(output_dir).mkdir(parents=True, exist_ok=True) - - print("\nGenerating LaTeX tables...") - - combined_df = pd.concat(self.all_data, ignore_index=True) - - # Generate latency table for each resolver - for provider in combined_df['provider'].unique(): - provider_data = combined_df[combined_df['provider'] == provider] - - stats = provider_data.groupby('test_file')['duration_ms'].agg([ - ('Mean', 'mean'), - ('Median', 'median'), - ('Std Dev', 'std'), - ('P95', lambda x: x.quantile(0.95)), - ('P99', lambda x: x.quantile(0.99)) - ]).round(2) - - stats.index = stats.index.map(self.get_protocol_name) - stats.index.name = 'Protocol' - - latex_code = stats.to_latex( - caption=f'{provider.upper()} - Latency Statistics (ms)', - label=f'tab:{provider}_latency', - float_format="%.2f" - ) - - with open(f'{output_dir}/{provider}_latency.tex', 'w') as f: - f.write(latex_code) - - print(f" ✓ Created: {provider}_latency.tex") - - # Generate bandwidth table for each resolver - for provider in combined_df['provider'].unique(): - provider_data = combined_df[combined_df['provider'] == provider] - - if 'total_bytes' not in provider_data.columns or provider_data['total_bytes'].sum() == 0: 
- continue - - bandwidth_stats = provider_data.groupby('test_file').agg({ - 'bytes_sent': 'mean', - 'bytes_received': 'mean', - 'total_bytes': 'mean' - }).round(2) - - bandwidth_stats.columns = ['Avg Sent (B)', 'Avg Received (B)', 'Avg Total (B)'] - bandwidth_stats.index = bandwidth_stats.index.map(self.get_protocol_name) - bandwidth_stats.index.name = 'Protocol' - - latex_code = bandwidth_stats.to_latex( - caption=f'{provider.upper()} - Bandwidth Statistics', - label=f'tab:{provider}_bandwidth', - float_format="%.2f" - ) - - with open(f'{output_dir}/{provider}_bandwidth.tex', 'w') as f: - f.write(latex_code) - - print(f" ✓ Created: {provider}_bandwidth.tex") - - # Generate protocol efficiency table - print("\nGenerating protocol efficiency table...") - - if 'total_bytes' in combined_df.columns and combined_df['total_bytes'].sum() > 0: - protocol_bandwidth = combined_df.groupby('test_file').agg({ - 'bytes_sent': 'mean', - 'bytes_received': 'mean', - 'total_bytes': 'mean' - }).round(2) - - # Find UDP baseline - udp_baseline = None - for protocol in protocol_bandwidth.index: - if 'udp' in protocol: - udp_baseline = protocol_bandwidth.loc[protocol, 'total_bytes'] - break - - if udp_baseline and udp_baseline > 0: - protocol_bandwidth['Overhead vs UDP (%)'] = ( - (protocol_bandwidth['total_bytes'] - udp_baseline) / udp_baseline * 100 - ).round(1) - protocol_bandwidth['Efficiency (%)'] = ( - 100 / (1 + protocol_bandwidth['Overhead vs UDP (%)'] / 100) - ).round(1) - - protocol_bandwidth.columns = ['Avg Sent (B)', 'Avg Received (B)', - 'Avg Total (B)', 'Overhead (%)', 'Efficiency (%)'] - protocol_bandwidth.index = protocol_bandwidth.index.map(self.get_protocol_name) - protocol_bandwidth.index.name = 'Protocol' - - latex_code = protocol_bandwidth.to_latex( - caption='Protocol Bandwidth Efficiency Comparison', - label='tab:protocol_efficiency', - float_format="%.2f" - ) - - with open(f'{output_dir}/protocol_efficiency.tex', 'w') as f: - f.write(latex_code) - - print(f" ✓ Created: protocol_efficiency.tex") - print("\n--- Protocol Efficiency ---") - print(protocol_bandwidth.to_string()) - - # Generate combined comparison tables - for metric in ['Mean', 'Median', 'P95']: - comparison_stats = combined_df.groupby(['provider', 'test_file'])['duration_ms'].agg([ - ('Mean', 'mean'), - ('Median', 'median'), - ('P95', lambda x: x.quantile(0.95)) - ]).round(2) - - pivot_table = comparison_stats[metric].unstack(level=0) - pivot_table.index = pivot_table.index.map(self.get_protocol_name) - pivot_table.index.name = 'Protocol' - - latex_code = pivot_table.to_latex( - caption=f'Resolver Latency Comparison - {metric} (ms)', - label=f'tab:comparison_{metric.lower()}', - float_format="%.2f" - ) - - with open(f'{output_dir}/comparison_{metric.lower()}.tex', 'w') as f: - f.write(latex_code) - - print(f" ✓ Created: comparison_{metric.lower()}.tex") - - def run_analysis(self): - """Run the complete analysis""" - print("="*80) - print("Fast DNS QoS Analysis with Bandwidth") - print("="*80) - - self.load_data() - - if not self.all_data: - print("\n⚠ No data loaded.") - return - - print("\n" + "="*80) - self.create_line_graphs() - - print("\n" + "="*80) - self.create_resolver_comparison_bars() - - print("\n" + "="*80) - self.generate_latex_tables() - - print("\n" + "="*80) - print("✓ Analysis Complete!") - print("="*80) - - -if __name__ == "__main__": - analyzer = FastDNSAnalyzer(results_dir='results') - analyzer.run_analysis() diff --git a/scripts/tools/add_extra_metrics_to_csv.py 
b/scripts/post_processing/add_extra_metrics_to_csv.py similarity index 84% rename from scripts/tools/add_extra_metrics_to_csv.py rename to scripts/post_processing/add_extra_metrics_to_csv.py index ed09cba..3f6fdb8 100644 --- a/scripts/tools/add_extra_metrics_to_csv.py +++ b/scripts/post_processing/add_extra_metrics_to_csv.py @@ -15,6 +15,15 @@ import dpkt from dateutil import parser as date_parser +BANDWIDTH_COLUMNS = [ + 'bytes_sent', + 'bytes_received', + 'packets_sent', + 'packets_received', + 'total_bytes', +] + + class Packet(NamedTuple): """Lightweight packet representation.""" timestamp: float @@ -36,6 +45,36 @@ class QueryWindow: self.pkts_received = 0 +def is_already_processed(csv_path: Path) -> bool: + """ + Check if CSV has already been processed. + Returns True if bandwidth columns exist AND at least one row has non-zero data. + """ + try: + with open(csv_path, 'r', encoding='utf-8') as f: + reader = csv.DictReader(f) + + # Check if columns exist + if not reader.fieldnames: + return False + + if not all(col in reader.fieldnames for col in BANDWIDTH_COLUMNS): + return False + + # Check if any row has non-zero bandwidth data + for row in reader: + for col in BANDWIDTH_COLUMNS: + val = row.get(col, '').strip() + if val and val != '0': + return True + + # All rows have zero/empty values - not truly processed + return False + + except Exception: + return False + + def parse_csv_timestamp(ts_str: str) -> float: """Convert RFC3339Nano timestamp to Unix epoch (seconds).""" dt = date_parser.isoparse(ts_str) @@ -249,24 +288,20 @@ def write_enriched_csv( shutil.copy2(csv_path, backup_path) print(f" Backup: {backup_path.name}") - # Get fieldnames - original_fields = list(queries[0]['data'].keys()) - new_fields = [ - 'bytes_sent', - 'bytes_received', - 'packets_sent', - 'packets_received', - 'total_bytes', + # Get fieldnames - filter out any existing bandwidth columns to avoid dupes + original_fields = [ + f for f in queries[0]['data'].keys() + if f not in BANDWIDTH_COLUMNS ] - fieldnames = original_fields + new_fields + fieldnames = original_fields + BANDWIDTH_COLUMNS with open(csv_path, 'w', encoding='utf-8', newline='') as f: writer = csv.DictWriter(f, fieldnames=fieldnames) writer.writeheader() for q in queries: - row = q['data'].copy() - for field in new_fields: + row = {k: v for k, v in q['data'].items() if k not in BANDWIDTH_COLUMNS} + for field in BANDWIDTH_COLUMNS: row[field] = q[field] writer.writerow(row) @@ -281,6 +316,7 @@ def process_provider_directory(provider_path: Path): csv_files = sorted(provider_path.glob('*.csv')) processed = 0 + skipped = 0 total_time = 0 for csv_path in csv_files: @@ -294,6 +330,12 @@ def process_provider_directory(provider_path: Path): print(f"\n ⚠ Skipping {csv_path.name} - no matching PCAP") continue + # Check if already processed + if is_already_processed(csv_path): + print(f"\n ⏭ Skipping {csv_path.name} - already processed") + skipped += 1 + continue + print(f"\n 📁 {csv_path.name}") file_start = time.time() @@ -323,7 +365,8 @@ def process_provider_directory(provider_path: Path): print(f" ✓ Completed in {file_time:.2f}s") print(f"\n {'='*58}") - print(f" {provider_path.name}: {processed} files in {total_time:.2f}s") + print(f" {provider_path.name}: {processed} processed, {skipped} skipped") + print(f" Time: {total_time:.2f}s") print(f" {'='*58}") diff --git a/scripts/post_processing/merge_files.py b/scripts/post_processing/merge_files.py new file mode 100644 index 0000000..ab9d39f --- /dev/null +++ b/scripts/post_processing/merge_files.py @@ -0,0 
+1,207 @@ +#!/usr/bin/env python3 +""" +Merge all DNS test CSVs into a single unified CSV. +Extracts metadata from filenames and directory structure. +""" + +import csv +import os +from pathlib import Path +from dateutil import parser as date_parser +import argparse + + +def parse_config(filename: str) -> dict: + """ + Parse protocol, dnssec_mode, and keep_alive from filename. + + Examples: + doh3-auth.csv → protocol=doh3, dnssec=auth, persist=0 + tls-trust-persist.csv → protocol=tls, dnssec=trust, persist=1 + https.csv → protocol=https, dnssec=off, persist=0 + doudp-auth.csv → protocol=doudp, dnssec=auth, persist=0 + dnscrypt-trust.csv → protocol=dnscrypt, dnssec=trust, persist=0 + """ + base = filename.replace('.csv', '') + parts = base.split('-') + + protocol = parts[0] + dnssec_mode = 'off' + keep_alive = 0 + + for part in parts[1:]: + if part in ('auth', 'trust'): + dnssec_mode = part + elif part == 'persist': + keep_alive = 1 + + return { + 'protocol': protocol, + 'dnssec_mode': dnssec_mode, + 'keep_alive': keep_alive, + } + + +def parse_timestamp_unix(ts_str: str) -> float: + """Convert RFC3339 timestamp to Unix epoch.""" + try: + dt = date_parser.isoparse(ts_str) + return dt.timestamp() + except Exception: + return 0.0 + + +def ns_to_ms(duration_ns: str) -> float: + """Convert nanoseconds to milliseconds.""" + try: + return float(duration_ns) / 1_000_000 + except (ValueError, TypeError): + return 0.0 + + +def find_csv_files(input_dir: Path) -> list: + """Find all non-backup CSV files.""" + files = [] + for csv_path in input_dir.rglob('*.csv'): + if '.bak' in csv_path.name: + continue + files.append(csv_path) + return sorted(files) + + +def merge_all_csvs(input_dir: Path, output_path: Path): + """Merge all CSVs into a single file.""" + + csv_files = find_csv_files(input_dir) + + if not csv_files: + print("No CSV files found") + return + + print(f"Found {len(csv_files)} CSV files") + + # Output columns in desired order + output_columns = [ + 'id', + 'provider', + 'protocol', + 'dnssec_mode', + 'domain', + 'query_type', + 'keep_alive', + 'dns_server', + 'timestamp', + 'timestamp_unix', + 'duration_ns', + 'duration_ms', + 'request_size_bytes', + 'response_size_bytes', + 'bytes_sent', + 'bytes_received', + 'packets_sent', + 'packets_received', + 'total_bytes', + 'response_code', + 'error', + ] + + global_id = 0 + total_rows = 0 + + with open(output_path, 'w', newline='', encoding='utf-8') as outfile: + writer = csv.DictWriter(outfile, fieldnames=output_columns) + writer.writeheader() + + for csv_path in csv_files: + # Extract provider from path + provider = csv_path.parent.name.lower() + + # Parse config from filename + config = parse_config(csv_path.name) + + print(f" {provider}/{csv_path.name} ({config['protocol']}, {config['dnssec_mode']}, persist={config['keep_alive']})") + + file_rows = 0 + + with open(csv_path, 'r', newline='', encoding='utf-8') as infile: + reader = csv.DictReader(infile) + + for row in reader: + global_id += 1 + file_rows += 1 + + # Build output row + out_row = { + 'id': global_id, + 'provider': provider, + 'protocol': config['protocol'], + 'dnssec_mode': config['dnssec_mode'], + 'keep_alive': config['keep_alive'], + 'domain': row.get('domain', ''), + 'query_type': row.get('query_type', ''), + 'dns_server': row.get('dns_server', ''), + 'timestamp': row.get('timestamp', ''), + 'timestamp_unix': parse_timestamp_unix(row.get('timestamp', '')), + 'duration_ns': row.get('duration_ns', ''), + 'duration_ms': ns_to_ms(row.get('duration_ns', '')), + 'request_size_bytes': 
row.get('request_size_bytes', ''), + 'response_size_bytes': row.get('response_size_bytes', ''), + 'bytes_sent': row.get('bytes_sent', ''), + 'bytes_received': row.get('bytes_received', ''), + 'packets_sent': row.get('packets_sent', ''), + 'packets_received': row.get('packets_received', ''), + 'total_bytes': row.get('total_bytes', ''), + 'response_code': row.get('response_code', ''), + 'error': row.get('error', ''), + } + + writer.writerow(out_row) + + total_rows += file_rows + print(f" → {file_rows:,} rows") + + print(f"\n{'='*60}") + print(f"Output: {output_path}") + print(f"Total rows: {total_rows:,}") + print(f"{'='*60}") + + +def main(): + parser = argparse.ArgumentParser( + description='Merge all DNS test CSVs into a single file' + ) + parser.add_argument( + 'input_dir', + nargs='?', + default='.', + help='Input directory containing provider folders (default: .)' + ) + parser.add_argument( + '-o', '--output', + default='dns_results.csv', + help='Output CSV path (default: dns_results.csv)' + ) + + args = parser.parse_args() + + input_dir = Path(args.input_dir) + output_path = Path(args.output) + + if not input_dir.exists(): + print(f"Error: Input directory not found: {input_dir}") + return 1 + + print("="*60) + print("MERGE ALL DNS CSVs") + print("="*60) + print(f"Input: {input_dir}") + print(f"Output: {output_path}") + print() + + merge_all_csvs(input_dir, output_path) + + return 0 + + +if __name__ == '__main__': + exit(main()) diff --git a/scripts/shell/run.sh b/scripts/shell/run.sh new file mode 100755 index 0000000..cf0b78d --- /dev/null +++ b/scripts/shell/run.sh @@ -0,0 +1,95 @@ +package stats + +import ( + "encoding/csv" + "fmt" + "os" + "runtime" + "time" +) + +type RuntimeStats struct { + TotalAlloc uint64 + Mallocs uint64 + NumGC uint32 + AllocDelta uint64 + MallocsDelta uint64 + GCDelta uint32 +} + +type RuntimeCollector struct { + startStats runtime.MemStats + memPath string +} + +func NewRuntimeCollector(memPath string) *RuntimeCollector { + var stats runtime.MemStats + runtime.ReadMemStats(&stats) + + return &RuntimeCollector{ + startStats: stats, + memPath: memPath, + } +} + +func (rc *RuntimeCollector) Collect() RuntimeStats { + var current runtime.MemStats + runtime.ReadMemStats(¤t) + + return RuntimeStats{ + TotalAlloc: current.TotalAlloc, + Mallocs: current.Mallocs, + NumGC: current.NumGC, + AllocDelta: current.TotalAlloc - rc.startStats.TotalAlloc, + MallocsDelta: current.Mallocs - rc.startStats.Mallocs, + GCDelta: current.NumGC - rc.startStats.NumGC, + } +} + +func (rc *RuntimeCollector) WriteStats() error { + stats := rc.Collect() + timestamp := time.Now().Format(time.RFC3339Nano) + + // Check if file exists + fileExists := false + if _, err := os.Stat(rc.memPath); err == nil { + fileExists = true + } + + // Open in append mode + file, err := os.OpenFile(rc.memPath, os.O_APPEND|os.O_CREATE|os.O_WRONLY, 0644) + if err != nil { + return fmt.Errorf("failed to open mem.csv: %w", err) + } + defer file.Close() + + writer := csv.NewWriter(file) + + // Write header if new file + if !fileExists { + header := []string{ + "timestamp", "total_alloc_bytes", "mallocs", "gc_cycles", + "alloc_delta", "mallocs_delta", "gc_delta", + } + if err := writer.Write(header); err != nil { + return fmt.Errorf("failed to write mem.csv header: %w", err) + } + } + + // Write data row + row := []string{ + timestamp, + fmt.Sprintf("%d", stats.TotalAlloc), + fmt.Sprintf("%d", stats.Mallocs), + fmt.Sprintf("%d", stats.NumGC), + fmt.Sprintf("%d", stats.AllocDelta), + fmt.Sprintf("%d", 
stats.MallocsDelta), + fmt.Sprintf("%d", stats.GCDelta), + } + if err := writer.Write(row); err != nil { + return fmt.Errorf("failed to write mem.csv row: %w", err) + } + + writer.Flush() + return writer.Error() +} diff --git a/setup-netns.sh b/scripts/shell/setup-netns.sh similarity index 100% rename from setup-netns.sh rename to scripts/shell/setup-netns.sh diff --git a/scripts/tools/add_extra_metrics_to_csv.go b/scripts/tools/add_extra_metrics_to_csv.go deleted file mode 100644 index 1b79038..0000000 --- a/scripts/tools/add_extra_metrics_to_csv.go +++ /dev/null @@ -1,369 +0,0 @@ -package main - -import ( - "encoding/csv" - "fmt" - "log" - "os" - "path/filepath" - "strconv" - "strings" - "time" - - "github.com/google/gopacket" - "github.com/google/gopacket/layers" - "github.com/google/gopacket/pcapgo" -) - -type QueryRecord struct { - Domain string - QueryType string - Protocol string - DNSSec string - AuthDNSSec string - KeepAlive string - DNSServer string - Timestamp string - DurationNs int64 - DurationMs float64 - RequestSizeBytes int - ResponseSizeBytes int - ResponseCode string - Error string - BytesSent int64 - BytesReceived int64 - PacketsSent int64 - PacketsReceived int64 - TotalBytes int64 -} - -func parseRFC3339Nano(ts string) (time.Time, error) { - return time.Parse(time.RFC3339Nano, ts) -} - -func processProviderFolder(providerPath string) error { - providerName := filepath.Base(providerPath) - fmt.Printf("\n=== Processing provider: %s ===\n", providerName) - - files, err := os.ReadDir(providerPath) - if err != nil { - return err - } - - processed := 0 - skipped := 0 - errors := 0 - - for _, file := range files { - if !strings.HasSuffix(file.Name(), ".csv") { - continue - } - - csvPath := filepath.Join(providerPath, file.Name()) - pcapPath := strings.Replace(csvPath, ".csv", ".pcap", 1) - - // Check if PCAP exists - if _, err := os.Stat(pcapPath); os.IsNotExist(err) { - fmt.Printf(" ⊗ Skipping: %s (no matching PCAP)\n", file.Name()) - skipped++ - continue - } - - // Check if already processed (has backup) - backupPath := csvPath + ".bak" - if _, err := os.Stat(backupPath); err == nil { - fmt.Printf(" ⊙ Skipping: %s (already processed, backup exists)\n", file.Name()) - skipped++ - continue - } - - fmt.Printf(" ↻ Processing: %s ... 
", file.Name()) - if err := processPair(csvPath, pcapPath); err != nil { - fmt.Printf("ERROR\n") - log.Printf(" Error: %v\n", err) - errors++ - } else { - fmt.Printf("✓\n") - processed++ - } - } - - fmt.Printf(" Summary: %d processed, %d skipped, %d errors\n", processed, skipped, errors) - return nil -} - -func processPair(csvPath, pcapPath string) error { - // Create backup - backupPath := csvPath + ".bak" - input, err := os.ReadFile(csvPath) - if err != nil { - return fmt.Errorf("backup read failed: %w", err) - } - if err := os.WriteFile(backupPath, input, 0644); err != nil { - return fmt.Errorf("backup write failed: %w", err) - } - - // Read CSV records - records, err := readCSV(csvPath) - if err != nil { - return fmt.Errorf("CSV read failed: %w", err) - } - - if len(records) == 0 { - return fmt.Errorf("no records in CSV") - } - - // Read and parse PCAP - packets, err := readPCAPGo(pcapPath) - if err != nil { - return fmt.Errorf("PCAP read failed: %w", err) - } - - // Enrich records with bandwidth data - enrichRecords(records, packets) - - // Write enriched CSV - if err := writeCSV(csvPath, records); err != nil { - return fmt.Errorf("CSV write failed: %w", err) - } - - return nil -} - -func readCSV(path string) ([]*QueryRecord, error) { - f, err := os.Open(path) - if err != nil { - return nil, err - } - defer f.Close() - - r := csv.NewReader(f) - rows, err := r.ReadAll() - if err != nil { - return nil, err - } - - if len(rows) < 2 { - return nil, fmt.Errorf("CSV has no data rows") - } - - records := make([]*QueryRecord, 0, len(rows)-1) - for i := 1; i < len(rows); i++ { - row := rows[i] - if len(row) < 14 { - log.Printf(" Warning: Skipping malformed row %d", i+1) - continue - } - - durationNs, _ := strconv.ParseInt(row[8], 10, 64) - durationMs, _ := strconv.ParseFloat(row[9], 64) - reqSize, _ := strconv.Atoi(row[10]) - respSize, _ := strconv.Atoi(row[11]) - - records = append(records, &QueryRecord{ - Domain: row[0], - QueryType: row[1], - Protocol: row[2], - DNSSec: row[3], - AuthDNSSec: row[4], - KeepAlive: row[5], - DNSServer: row[6], - Timestamp: row[7], - DurationNs: durationNs, - DurationMs: durationMs, - RequestSizeBytes: reqSize, - ResponseSizeBytes: respSize, - ResponseCode: row[12], - Error: row[13], - }) - } - - return records, nil -} - -type PacketInfo struct { - Timestamp time.Time - Size int - IsSent bool -} - -func readPCAPGo(path string) ([]PacketInfo, error) { - f, err := os.Open(path) - if err != nil { - return nil, err - } - defer f.Close() - - reader, err := pcapgo.NewReader(f) - if err != nil { - return nil, err - } - - var packets []PacketInfo - packetSource := gopacket.NewPacketSource(reader, reader.LinkType()) - - for packet := range packetSource.Packets() { - if packet.NetworkLayer() == nil { - continue - } - - isDNS := false - isSent := false - - // Check UDP layer (DNS, DoQ, DoH3) - if udpLayer := packet.Layer(layers.LayerTypeUDP); udpLayer != nil { - udp := udpLayer.(*layers.UDP) - isDNS = udp.SrcPort == 53 || udp.DstPort == 53 || - udp.SrcPort == 853 || udp.DstPort == 853 || - udp.SrcPort == 443 || udp.DstPort == 443 - isSent = udp.DstPort == 53 || udp.DstPort == 853 || udp.DstPort == 443 - } - - // Check TCP layer (DoT, DoH) - if tcpLayer := packet.Layer(layers.LayerTypeTCP); tcpLayer != nil { - tcp := tcpLayer.(*layers.TCP) - isDNS = tcp.SrcPort == 53 || tcp.DstPort == 53 || - tcp.SrcPort == 853 || tcp.DstPort == 853 || - tcp.SrcPort == 443 || tcp.DstPort == 443 - isSent = tcp.DstPort == 53 || tcp.DstPort == 853 || tcp.DstPort == 443 - } - - if isDNS { - 
packets = append(packets, PacketInfo{ - Timestamp: packet.Metadata().Timestamp, - Size: len(packet.Data()), - IsSent: isSent, - }) - } - } - - return packets, nil -} - -func enrichRecords(records []*QueryRecord, packets []PacketInfo) { - for _, rec := range records { - ts, err := parseRFC3339Nano(rec.Timestamp) - if err != nil { - log.Printf(" Warning: Failed to parse timestamp: %s", rec.Timestamp) - continue - } - - // Define time window for this query - windowStart := ts - windowEnd := ts.Add(time.Duration(rec.DurationNs)) - - var sent, recv, pktSent, pktRecv int64 - - // Match packets within the time window - for _, pkt := range packets { - if (pkt.Timestamp.Equal(windowStart) || pkt.Timestamp.After(windowStart)) && - pkt.Timestamp.Before(windowEnd) { - if pkt.IsSent { - sent += int64(pkt.Size) - pktSent++ - } else { - recv += int64(pkt.Size) - pktRecv++ - } - } - } - - rec.BytesSent = sent - rec.BytesReceived = recv - rec.PacketsSent = pktSent - rec.PacketsReceived = pktRecv - rec.TotalBytes = sent + recv - } -} - -func writeCSV(path string, records []*QueryRecord) error { - f, err := os.Create(path) - if err != nil { - return err - } - defer f.Close() - - w := csv.NewWriter(f) - defer w.Flush() - - // Write header - header := []string{ - "domain", "query_type", "protocol", "dnssec", "auth_dnssec", - "keep_alive", "dns_server", "timestamp", "duration_ns", "duration_ms", - "request_size_bytes", "response_size_bytes", "response_code", "error", - "bytes_sent", "bytes_received", "packets_sent", "packets_received", "total_bytes", - } - if err := w.Write(header); err != nil { - return err - } - - // Write data rows - for _, rec := range records { - row := []string{ - rec.Domain, - rec.QueryType, - rec.Protocol, - rec.DNSSec, - rec.AuthDNSSec, - rec.KeepAlive, - rec.DNSServer, - rec.Timestamp, - strconv.FormatInt(rec.DurationNs, 10), - strconv.FormatFloat(rec.DurationMs, 'f', -1, 64), - strconv.Itoa(rec.RequestSizeBytes), - strconv.Itoa(rec.ResponseSizeBytes), - rec.ResponseCode, - rec.Error, - strconv.FormatInt(rec.BytesSent, 10), - strconv.FormatInt(rec.BytesReceived, 10), - strconv.FormatInt(rec.PacketsSent, 10), - strconv.FormatInt(rec.PacketsReceived, 10), - strconv.FormatInt(rec.TotalBytes, 10), - } - if err := w.Write(row); err != nil { - return err - } - } - - return nil -} - -func main() { - resultsDir := "results" - providers := []string{"adguard", "cloudflare", "google", "quad9"} - - fmt.Println("╔═══════════════════════════════════════════════╗") - fmt.Println("║ DNS PCAP Preprocessor v1.0 ║") - fmt.Println("║ Enriching ALL CSVs with bandwidth metrics ║") - fmt.Println("╚═══════════════════════════════════════════════╝") - - totalProcessed := 0 - totalSkipped := 0 - totalErrors := 0 - - for _, provider := range providers { - providerPath := filepath.Join(resultsDir, provider) - if _, err := os.Stat(providerPath); os.IsNotExist(err) { - fmt.Printf("\n⚠ Provider folder not found: %s\n", provider) - continue - } - - if err := processProviderFolder(providerPath); err != nil { - log.Printf("Error processing %s: %v\n", provider, err) - totalErrors++ - } - } - - fmt.Println("\n╔═══════════════════════════════════════════════╗") - fmt.Println("║ Preprocessing Complete! 
║") - fmt.Println("╚═══════════════════════════════════════════════╝") - fmt.Printf("\nAll CSV files now have 5 additional columns:\n") - fmt.Printf(" • bytes_sent - Total bytes sent to DNS server\n") - fmt.Printf(" • bytes_received - Total bytes received from DNS server\n") - fmt.Printf(" • packets_sent - Number of packets sent\n") - fmt.Printf(" • packets_received - Number of packets received\n") - fmt.Printf(" • total_bytes - Sum of sent + received bytes\n") - fmt.Printf("\n📁 Backups saved as: *.csv.bak\n") - fmt.Printf("\n💡 Tip: The analysis script will filter which files to visualize,\n") - fmt.Printf(" but all files now have complete bandwidth metrics!\n") -} diff --git a/scripts/tools/clean_pcaps.py b/scripts/tools/clean_pcaps.py deleted file mode 100644 index d7ca555..0000000 --- a/scripts/tools/clean_pcaps.py +++ /dev/null @@ -1,367 +0,0 @@ -#!/usr/bin/env python3 -""" -Advanced PCAP filter for DNS traffic (with IPv6 support). - -Filters out: -- Local network traffic except test machine (IPv4: 10.0.0.50; IPv6: specific addresses) -- AdGuard DNS servers (for non-AdGuard captures) -- Non-DNS traffic based on protocol-specific ports -""" - -import os -import subprocess -from pathlib import Path -import argparse - -# Test machine IPs (IPv4 and IPv6 from your provided info) -TEST_IPV4 = '10.0.0.50' -TEST_IPV6_GLOBAL = '2001:818:e73e:ba00:5506:dfd4:ed8b:96e' -TEST_IPV6_LINKLOCAL = 'fe80::fe98:c62e:4463:9a2d' - -# Port mappings -PORT_MAP = { - 'udp': [53], # DNS-over-UDP - 'tls': [53, 853], # DNS-over-TLS - 'https': [53, 443], # DNS-over-HTTPS (DoH) - 'doq': [53, 784, 8853], # DNS-over-QUIC - 'doh3': [53, 443] # DNS-over-HTTP/3 -} - -# AdGuard DNS IPs to filter out (for non-AdGuard captures) -ADGUARD_IPS = [ - '94.140.14.14', - '94.140.15.15', - '2a10:50c0::ad1:ff', - '2a10:50c0::ad2:ff' -] - -def parse_filename(filename): - """Extract protocol from filename""" - base = filename.replace('.pcap', '').replace('.csv', '') - parts = base.split('-') - - if len(parts) < 1: # Minimum: protocol - return None - - protocol = parts[0].lower() - return protocol - -def extract_resolver_from_path(pcap_path): - """Extract resolver name from directory structure""" - parts = Path(pcap_path).parts - - for part in parts: - if part.lower() in ['cloudflare', 'google', 'quad9', 'adguard']: - return part.lower() - - return None - -def build_filter_expression(protocol, resolver): - """ - Build tshark filter expression. - - Strategy: - 1. Only protocol-specific DNS ports - 2. Keep only traffic involving the test machine (IPv4/IPv6) - 3. 
Exclude AdGuard IPs for non-AdGuard captures - """ - - # Get ports for this protocol - ports = PORT_MAP.get(protocol, [53, 443, 853, 784, 8853]) - - # Build port filter (UDP or TCP on these ports) - port_conditions = [] - for port in ports: - port_conditions.append(f'(udp.port == {port} or tcp.port == {port})') - - port_filter = ' or '.join(port_conditions) - - # Build test machine filter (keep if src or dst is test machine IP) - machine_conditions = [f'(ip.addr == {TEST_IPV4})'] - if TEST_IPV6_GLOBAL: - machine_conditions.append(f'(ipv6.addr == {TEST_IPV6_GLOBAL})') - if TEST_IPV6_LINKLOCAL: - machine_conditions.append(f'(ipv6.addr == {TEST_IPV6_LINKLOCAL})') - - machine_filter = ' or '.join(machine_conditions) - - # Build AdGuard exclusion filter - adguard_exclusions = [] - if resolver != 'adguard': - for ip in ADGUARD_IPS: - if ':' in ip: # IPv6 - adguard_exclusions.append(f'!(ipv6.addr == {ip})') - else: # IPv4 - adguard_exclusions.append(f'!(ip.addr == {ip})') - - # Combine all filters - filters = [f'({port_filter})', f'({machine_filter})'] - - if adguard_exclusions: - adguard_filter = ' and '.join(adguard_exclusions) - filters.append(f'({adguard_filter})') - - final_filter = ' and '.join(filters) - - return final_filter - -def filter_pcap(input_path, output_path, filter_expr, verbose=False): - """Apply filter to PCAP file using tshark""" - - cmd = [ - 'tshark', - '-r', input_path, - '-Y', filter_expr, - '-w', output_path, - '-F', 'pcap' - ] - - try: - if verbose: - print(f" Filter: {filter_expr}") - - result = subprocess.run( - cmd, - capture_output=True, - text=True, - timeout=300 - ) - - if result.returncode != 0: - print(f" ✗ Error: {result.stderr.strip()}") - return False - - if not os.path.exists(output_path): - print(f" ✗ Output file not created") - return False - - output_size = os.path.getsize(output_path) - if output_size < 24: - print(f" ⚠ Warning: Output is empty") - - return True - - except subprocess.TimeoutExpired: - print(f" ✗ Timeout (>5 minutes)") - return False - except Exception as e: - print(f" ✗ Exception: {e}") - return False - -def find_pcap_files(root_dir): - """Recursively find all PCAP files""" - pcap_files = [] - for root, dirs, files in os.walk(root_dir): - for file in files: - if file.endswith('.pcap'): - full_path = os.path.join(root, file) - pcap_files.append(full_path) - return sorted(pcap_files) - -def format_bytes(bytes_val): - """Format bytes as human readable""" - for unit in ['B', 'KB', 'MB', 'GB']: - if bytes_val < 1024.0: - return f"{bytes_val:.1f} {unit}" - bytes_val /= 1024.0 - return f"{bytes_val:.1f} TB" - -def main(): - parser = argparse.ArgumentParser( - description='Advanced PCAP filter for DNS traffic (IPv4/IPv6)', - formatter_class=argparse.RawDescriptionHelpFormatter, - epilog=''' -Filtering rules: - 1. Only include traffic on protocol-specific DNS ports - 2. Keep only packets involving the test machine (10.0.0.50 or its IPv6 addresses) - 3. 
Exclude AdGuard IPs for non-AdGuard captures - -Protocol-specific ports: - udp: 53 - tls: 53, 853 - https: 53, 443 - doq: 53, 784, 8853 - doh3: 53, 443 - -Examples: - # Dry run - %(prog)s ./results --dry-run - - # Filter with verbose output - %(prog)s ./results --verbose - - # Custom output directory - %(prog)s ./results --output ./cleaned - ''' - ) - - parser.add_argument( - 'input_dir', - help='Input directory containing PCAP files' - ) - parser.add_argument( - '-o', '--output', - default='./results_filtered', - help='Output directory (default: ./results_filtered)' - ) - parser.add_argument( - '--dry-run', - action='store_true', - help='Show what would be done without filtering' - ) - parser.add_argument( - '--limit', - type=int, - help='Only process first N files (for testing)' - ) - parser.add_argument( - '-v', '--verbose', - action='store_true', - help='Verbose output (show filter expressions)' - ) - parser.add_argument( - '--overwrite', - action='store_true', - help='Overwrite existing filtered files' - ) - - args = parser.parse_args() - - # Check for tshark - try: - result = subprocess.run( - ['tshark', '-v'], - capture_output=True, - check=True - ) - if args.verbose: - version = result.stdout.decode().split('\n')[0] - print(f"Using: {version}\n") - except (subprocess.CalledProcessError, FileNotFoundError): - print("Error: tshark not found. Install Wireshark/tshark:") - print(" Ubuntu/Debian: sudo apt-get install tshark") - print(" macOS: brew install wireshark") - return 1 - - print("=" * 80) - print("ADVANCED DNS PCAP FILTER (IPv4/IPv6)") - print("=" * 80) - print("Filters:") - print(" 1. Protocol-specific DNS ports only") - print(" 2. Keep only traffic involving test machine (10.0.0.50 / IPv6 addresses)") - print(" 3. Exclude AdGuard IPs (for non-AdGuard captures)") - print(f"\nInput: {args.input_dir}") - print(f"Output: {args.output}") - - # Find PCAP files - print(f"\nScanning for PCAP files...") - pcap_files = find_pcap_files(args.input_dir) - - if not pcap_files: - print(f"No PCAP files found in {args.input_dir}") - return 1 - - print(f"Found {len(pcap_files)} PCAP files") - - total_input_size = sum(os.path.getsize(f) for f in pcap_files) - print(f"Total size: {format_bytes(total_input_size)}") - - if args.limit: - pcap_files = pcap_files[:args.limit] - print(f"Limiting to first {args.limit} files") - - if args.dry_run: - print("\n*** DRY RUN MODE ***\n") - else: - print() - - # Process files - success_count = 0 - skip_count = 0 - fail_count = 0 - total_output_size = 0 - - for i, input_path in enumerate(pcap_files, 1): - # Extract info from path - filename = Path(input_path).name - protocol = parse_filename(filename) - resolver = extract_resolver_from_path(input_path) - - if not protocol: - print(f"[{i}/{len(pcap_files)}] {filename}") - print(f" ⚠ Could not parse protocol, skipping") - skip_count += 1 - continue - - # Create output path - rel_path = os.path.relpath(input_path, args.input_dir) - output_path = os.path.join(args.output, rel_path) - - input_size = os.path.getsize(input_path) - - print(f"[{i}/{len(pcap_files)}] {rel_path}") - print(f" Protocol: {protocol.upper()}") - print(f" Resolver: {resolver or 'unknown'}") - print(f" Size: {format_bytes(input_size)}") - - # Check if already filtered - if os.path.exists(output_path) and not args.overwrite: - output_size = os.path.getsize(output_path) - reduction = ((input_size - output_size) / input_size * 100) if input_size > 0 else 0 - print(f" ⊙ Already filtered: {format_bytes(output_size)} " - f"({reduction:.1f}% 
reduction)") - skip_count += 1 - total_output_size += output_size - continue - - # Build filter - filter_expr = build_filter_expression(protocol, resolver) - - if args.dry_run: - print(f" → Would filter") - if args.verbose: - print(f" Filter: {filter_expr}") - continue - - # Create output directory - os.makedirs(os.path.dirname(output_path), exist_ok=True) - - # Filter - success = filter_pcap(input_path, output_path, filter_expr, args.verbose) - - if success: - output_size = os.path.getsize(output_path) - reduction = ((input_size - output_size) / input_size * 100) if input_size > 0 else 0 - print(f" ✓ Filtered: {format_bytes(output_size)} " - f"({reduction:.1f}% reduction)") - success_count += 1 - total_output_size += output_size - else: - fail_count += 1 - - # Summary - print("\n" + "=" * 80) - print("SUMMARY") - print("=" * 80) - - if args.dry_run: - print(f"Would process: {len(pcap_files)} files") - else: - print(f"Successful: {success_count}") - print(f"Skipped: {skip_count} (already filtered or unparseable)") - print(f"Failed: {fail_count}") - print(f"Total: {len(pcap_files)}") - - if success_count > 0 or skip_count > 0: - print(f"\nInput size: {format_bytes(total_input_size)}") - print(f"Output size: {format_bytes(total_output_size)}") - if total_input_size > 0: - reduction = ((total_input_size - total_output_size) / - total_input_size * 100) - print(f"Reduction: {reduction:.1f}%") - print(f"\nOutput directory: {args.output}") - - return 0 if fail_count == 0 else 1 - -if __name__ == "__main__": - exit(main()) diff --git a/scripts/tools/csvs_to_sqlite.py b/scripts/tools/csvs_to_sqlite.py deleted file mode 100644 index e501de9..0000000 --- a/scripts/tools/csvs_to_sqlite.py +++ /dev/null @@ -1,426 +0,0 @@ -#!/usr/bin/env python3 -""" -Convert DNS CSV files to SQLite database. -Creates a single normalized table with unified DNSSEC handling. 
-""" - -import sqlite3 -import csv -from pathlib import Path -from dateutil import parser as date_parser - - -def create_database_schema(conn: sqlite3.Connection): - """Create the database schema with indexes.""" - cursor = conn.cursor() - - # Main queries table - cursor.execute(""" - CREATE TABLE IF NOT EXISTS dns_queries ( - id INTEGER PRIMARY KEY AUTOINCREMENT, - - -- Metadata - provider TEXT NOT NULL, - protocol TEXT NOT NULL, - dnssec_mode TEXT NOT NULL CHECK(dnssec_mode IN ('off', 'auth', 'trust')), - - -- Query details - domain TEXT NOT NULL, - query_type TEXT NOT NULL, - keep_alive BOOLEAN NOT NULL, - dns_server TEXT NOT NULL, - - -- Timing - timestamp TEXT NOT NULL, - timestamp_unix REAL NOT NULL, - duration_ns INTEGER NOT NULL, - duration_ms REAL NOT NULL, - - -- Size metrics - request_size_bytes INTEGER, - response_size_bytes INTEGER, - - -- Network metrics (from PCAP) - bytes_sent INTEGER DEFAULT 0, - bytes_received INTEGER DEFAULT 0, - packets_sent INTEGER DEFAULT 0, - packets_received INTEGER DEFAULT 0, - total_bytes INTEGER DEFAULT 0, - - -- Response - response_code TEXT, - error TEXT - ) - """) - - # Create indexes for common queries - cursor.execute(""" - CREATE INDEX IF NOT EXISTS idx_provider - ON dns_queries(provider) - """) - - cursor.execute(""" - CREATE INDEX IF NOT EXISTS idx_protocol - ON dns_queries(protocol) - """) - - cursor.execute(""" - CREATE INDEX IF NOT EXISTS idx_dnssec_mode - ON dns_queries(dnssec_mode) - """) - - cursor.execute(""" - CREATE INDEX IF NOT EXISTS idx_keep_alive - ON dns_queries(keep_alive) - """) - - cursor.execute(""" - CREATE INDEX IF NOT EXISTS idx_provider_protocol_dnssec - ON dns_queries(provider, protocol, dnssec_mode) - """) - - cursor.execute(""" - CREATE INDEX IF NOT EXISTS idx_timestamp - ON dns_queries(timestamp_unix) - """) - - cursor.execute(""" - CREATE INDEX IF NOT EXISTS idx_domain - ON dns_queries(domain) - """) - - conn.commit() - - -def parse_protocol_and_dnssec(filename: str) -> tuple[str, str, bool]: - """ - Extract base protocol, DNSSEC mode, and keep_alive from filename. 
- Returns (base_protocol, dnssec_mode, keep_alive) - - Examples: - 'udp.csv' -> ('udp', 'off', False) - 'udp-auth.csv' -> ('udp', 'auth', False) - 'tls.csv' -> ('tls', 'off', False) - 'tls-persist.csv' -> ('tls', 'off', True) - 'https-persist.csv' -> ('https', 'off', True) - 'https-auth-persist.csv' -> ('https', 'auth', True) - 'https-trust-persist.csv' -> ('https', 'trust', True) - 'doh3-auth.csv' -> ('doh3', 'auth', False) - 'doq.csv' -> ('doq', 'off', False) - """ - name = filename.replace('.csv', '') - - # Check for persist suffix (keep_alive) - keep_alive = False - if name.endswith('-persist'): - keep_alive = True - name = name.replace('-persist', '') - - # Check for DNSSEC suffix - dnssec_mode = 'off' - if name.endswith('-auth'): - dnssec_mode = 'auth' - name = name.replace('-auth', '') - elif name.endswith('-trust'): - dnssec_mode = 'trust' - name = name.replace('-trust', '') - - # For UDP, DoH3, and DoQ, keep_alive doesn't apply (connectionless) - if name in ['udp', 'doh3', 'doq']: - keep_alive = False - - return (name, dnssec_mode, keep_alive) - - -def str_to_bool(value: str) -> bool: - """Convert string boolean to Python bool.""" - return value.lower() in ('true', '1', 'yes') - - -def import_csv_to_db( - csv_path: Path, - provider: str, - conn: sqlite3.Connection -) -> int: - """Import a CSV file into the database.""" - protocol, dnssec_mode, keep_alive_from_filename = parse_protocol_and_dnssec(csv_path.name) - - cursor = conn.cursor() - rows_imported = 0 - - with open(csv_path, 'r', encoding='utf-8') as f: - reader = csv.DictReader(f) - - for row in reader: - try: - # Parse timestamp to Unix epoch - dt = date_parser.isoparse(row['timestamp']) - timestamp_unix = dt.timestamp() - - # Use keep_alive from filename (more reliable than CSV) - keep_alive = keep_alive_from_filename - - # Handle optional fields (may not exist in older CSVs) - bytes_sent = int(row.get('bytes_sent', 0) or 0) - bytes_received = int(row.get('bytes_received', 0) or 0) - packets_sent = int(row.get('packets_sent', 0) or 0) - packets_received = int(row.get('packets_received', 0) or 0) - total_bytes = int(row.get('total_bytes', 0) or 0) - - cursor.execute(""" - INSERT INTO dns_queries ( - provider, protocol, dnssec_mode, - domain, query_type, keep_alive, - dns_server, timestamp, timestamp_unix, - duration_ns, duration_ms, - request_size_bytes, response_size_bytes, - bytes_sent, bytes_received, packets_sent, packets_received, total_bytes, - response_code, error - ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) 
- """, ( - provider, - protocol, - dnssec_mode, - row['domain'], - row['query_type'], - keep_alive, - row['dns_server'], - row['timestamp'], - timestamp_unix, - int(row['duration_ns']), - float(row['duration_ms']), - int(row.get('request_size_bytes') or 0), - int(row.get('response_size_bytes') or 0), - bytes_sent, - bytes_received, - packets_sent, - packets_received, - total_bytes, - row.get('response_code', ''), - row.get('error', '') - )) - - rows_imported += 1 - - except Exception as e: - print(f" Warning: Skipping row - {e}") - continue - - conn.commit() - return rows_imported - - -def main(): - """Main import pipeline.""" - print("\n" + "="*60) - print("CSV to SQLite Database Converter") - print("="*60) - - results_dir = Path('results') - db_path = Path('dns.db') - - if not results_dir.exists(): - print(f"\n❌ Error: '{results_dir}' directory not found") - return - - # Remove existing database - if db_path.exists(): - print(f"\n⚠ Removing existing database: {db_path}") - db_path.unlink() - - # Create database and schema - print(f"\n📊 Creating database: {db_path}") - conn = sqlite3.connect(db_path) - create_database_schema(conn) - print("✓ Schema created") - - # Import CSVs - providers = ['adguard', 'cloudflare', 'google', 'quad9'] - total_rows = 0 - total_files = 0 - - for provider in providers: - provider_path = results_dir / provider - - if not provider_path.exists(): - print(f"\n⚠ Skipping {provider} - directory not found") - continue - - print(f"\n{'='*60}") - print(f"Importing: {provider.upper()}") - print(f"{'='*60}") - - csv_files = sorted(provider_path.glob('*.csv')) - provider_rows = 0 - provider_files = 0 - - for csv_path in csv_files: - # Skip backup files - if '.bak' in csv_path.name: - continue - - protocol, dnssec, keep_alive = parse_protocol_and_dnssec(csv_path.name) - ka_str = "persistent" if keep_alive else "non-persist" - print(f" 📄 {csv_path.name:30} → {protocol:8} (DNSSEC: {dnssec:5}, {ka_str})") - - rows = import_csv_to_db(csv_path, provider, conn) - print(f" ✓ Imported {rows:,} rows") - - provider_rows += rows - provider_files += 1 - - print(f"\n Total: {provider_files} files, {provider_rows:,} rows") - total_rows += provider_rows - total_files += provider_files - - # Create summary - print(f"\n{'='*60}") - print("Database Summary") - print(f"{'='*60}") - - cursor = conn.cursor() - - # Total counts - cursor.execute("SELECT COUNT(*) FROM dns_queries") - total_queries = cursor.fetchone()[0] - - cursor.execute("SELECT COUNT(DISTINCT provider) FROM dns_queries") - unique_providers = cursor.fetchone()[0] - - cursor.execute("SELECT COUNT(DISTINCT protocol) FROM dns_queries") - unique_protocols = cursor.fetchone()[0] - - cursor.execute("SELECT COUNT(DISTINCT domain) FROM dns_queries") - unique_domains = cursor.fetchone()[0] - - print(f"\nTotal queries: {total_queries:,}") - print(f"Providers: {unique_providers}") - print(f"Protocols: {unique_protocols}") - print(f"Unique domains: {unique_domains}") - - # Show breakdown by provider, protocol, DNSSEC, and keep_alive - print(f"\nBreakdown by Provider, Protocol, DNSSEC & Keep-Alive:") - print(f"{'-'*80}") - - cursor.execute(""" - SELECT provider, protocol, dnssec_mode, keep_alive, COUNT(*) as count - FROM dns_queries - GROUP BY provider, protocol, dnssec_mode, keep_alive - ORDER BY provider, protocol, dnssec_mode, keep_alive - """) - - current_provider = None - for provider, protocol, dnssec, keep_alive, count in cursor.fetchall(): - if current_provider != provider: - if current_provider is not None: - print() - 
current_provider = provider - - ka_str = "✓" if keep_alive else "✗" - print(f" {provider:12} | {protocol:8} | {dnssec:5} | KA:{ka_str} | {count:6,} queries") - - # Protocol distribution - print(f"\n{'-'*80}") - print("Protocol Distribution:") - print(f"{'-'*80}") - - cursor.execute(""" - SELECT protocol, COUNT(*) as count - FROM dns_queries - GROUP BY protocol - ORDER BY protocol - """) - - for protocol, count in cursor.fetchall(): - pct = (count / total_queries) * 100 - print(f" {protocol:8} | {count:8,} queries ({pct:5.1f}%)") - - # DNSSEC mode distribution - print(f"\n{'-'*80}") - print("DNSSEC Mode Distribution:") - print(f"{'-'*80}") - - cursor.execute(""" - SELECT dnssec_mode, COUNT(*) as count - FROM dns_queries - GROUP BY dnssec_mode - ORDER BY dnssec_mode - """) - - for dnssec_mode, count in cursor.fetchall(): - pct = (count / total_queries) * 100 - print(f" {dnssec_mode:5} | {count:8,} queries ({pct:5.1f}%)") - - # Keep-Alive distribution - print(f"\n{'-'*80}") - print("Keep-Alive Distribution:") - print(f"{'-'*80}") - - cursor.execute(""" - SELECT keep_alive, COUNT(*) as count - FROM dns_queries - GROUP BY keep_alive - """) - - for keep_alive, count in cursor.fetchall(): - ka_label = "Persistent" if keep_alive else "Non-persistent" - pct = (count / total_queries) * 100 - print(f" {ka_label:15} | {count:8,} queries ({pct:5.1f}%)") - - conn.close() - - print(f"\n{'='*60}") - print(f"✓ Database created successfully: {db_path}") - print(f" Total: {total_files} files, {total_rows:,} rows") - print(f"{'='*60}\n") - - # Print usage examples - print("\n📖 Usage Examples for Metabase:") - print(f"{'-'*60}") - - print("\n1. Compare protocols (DNSSEC off, persistent only):") - print(""" SELECT provider, protocol, - AVG(duration_ms) as avg_latency, - AVG(total_bytes) as avg_bytes - FROM dns_queries - WHERE dnssec_mode = 'off' AND keep_alive = 1 - GROUP BY provider, protocol;""") - - print("\n2. DNSSEC impact on UDP:") - print(""" SELECT provider, dnssec_mode, - AVG(duration_ms) as avg_latency - FROM dns_queries - WHERE protocol = 'udp' - GROUP BY provider, dnssec_mode;""") - - print("\n3. Keep-alive impact on TLS:") - print(""" SELECT provider, keep_alive, - AVG(duration_ms) as avg_latency, - AVG(total_bytes) as avg_bytes - FROM dns_queries - WHERE protocol = 'tls' AND dnssec_mode = 'off' - GROUP BY provider, keep_alive;""") - - print("\n4. Time series for line graphs:") - print(""" SELECT timestamp_unix, duration_ms, total_bytes - FROM dns_queries - WHERE provider = 'cloudflare' - AND protocol = 'https' - AND dnssec_mode = 'off' - AND keep_alive = 1 - ORDER BY timestamp_unix;""") - - print("\n5. Overall comparison table:") - print(""" SELECT protocol, dnssec_mode, keep_alive, - COUNT(*) as queries, - AVG(duration_ms) as avg_latency, - AVG(total_bytes) as avg_bytes - FROM dns_queries - GROUP BY protocol, dnssec_mode, keep_alive - ORDER BY protocol, dnssec_mode, keep_alive;""") - - print(f"\n{'-'*60}\n") - - -if __name__ == '__main__': - main() diff --git a/scripts/tools/merge_files.py b/scripts/tools/merge_files.py deleted file mode 100644 index 8d6e300..0000000 --- a/scripts/tools/merge_files.py +++ /dev/null @@ -1,274 +0,0 @@ -#!/usr/bin/env python3 -""" -Merge DNS test files by configuration. 
- -- Merges CSVs of same config (adds 'run_id' column for traceability) -- Optionally merges PCAPs using mergecap -- Flattens date structure -""" - -import os -import csv -import subprocess -import shutil -from pathlib import Path -import argparse -from collections import defaultdict - -def parse_filename(filename): - """ - Extract config key from filename. - Format: protocol[-flags]-timestamp.{csv,pcap} - Config key: protocol[-flags] (ignores timestamp) - """ - base = filename.replace('.csv', '').replace('.pcap', '') - parts = base.split('-') - - if len(parts) < 2: - return None - - # Config is everything except timestamp - config = '-'.join(parts[:-1]) - timestamp = parts[-1] - - return config, timestamp - -def extract_resolver_from_path(file_path): - """Extract resolver name from path""" - parts = Path(file_path).parts - for part in parts: - if part.lower() in ['cloudflare', 'google', 'quad9', 'adguard']: - return part.lower() - return None - -def find_files(root_dir, extension): - """Find all files with given extension""" - files = [] - for root, dirs, filenames in os.walk(root_dir): - for filename in filenames: - if filename.endswith(extension): - full_path = os.path.join(root, filename) - files.append(full_path) - return sorted(files) - -def merge_csvs(csv_files, output_path, fieldnames): - """Merge multiple CSVs into one, adding 'run_id' column""" - with open(output_path, 'w', newline='') as outfile: - writer = csv.DictWriter(outfile, fieldnames=fieldnames + ['run_id']) - writer.writeheader() - - for csv_path in csv_files: - # Use timestamp as run_id - filename = Path(csv_path).name - _, timestamp = parse_filename(filename) - run_id = timestamp # Or add date if needed - - with open(csv_path, 'r', newline='') as infile: - reader = csv.DictReader(infile) - for row in reader: - row['run_id'] = run_id - writer.writerow(row) - -def merge_pcaps(pcap_files, output_path): - """Merge PCAP files using mergecap""" - cmd = ['mergecap', '-w', output_path] + pcap_files - try: - subprocess.run(cmd, capture_output=True, check=True) - return True - except subprocess.CalledProcessError as e: - print(f" ✗ mergecap error: {e.stderr.decode()}") - return False - except FileNotFoundError: - print("Error: mergecap not found. Install Wireshark:") - print(" Ubuntu: sudo apt install wireshark-common") - print(" macOS: brew install wireshark") - return False - -def format_bytes(bytes_val): - """Format bytes as human readable""" - for unit in ['B', 'KB', 'MB', 'GB']: - if bytes_val < 1024.0: - return f"{bytes_val:.1f} {unit}" - bytes_val /= 1024.0 - return f"{bytes_val:.1f} TB" - -def main(): - parser = argparse.ArgumentParser( - description='Merge DNS test files by configuration', - formatter_class=argparse.RawDescriptionHelpFormatter, - epilog=''' -Merges files of same config across dates/timestamps. 
-Output: ./results_merged/[resolver]/[config].csv (merged) - ./results_merged/[resolver]/[config].pcap (merged, if --merge-pcaps) - -Examples: - # Dry run to preview - %(prog)s ./results --dry-run - - # Merge CSVs only (recommended) - %(prog)s ./results - - # Merge CSVs and PCAPs - %(prog)s ./results --merge-pcaps - - # Custom output directory - %(prog)s ./results --output ./merged_data - ''' - ) - - parser.add_argument( - 'input_dir', - help='Input directory (e.g., ./results)' - ) - parser.add_argument( - '--output', - default='./results_merged', - help='Output directory (default: ./results_merged)' - ) - parser.add_argument( - '--merge-pcaps', - action='store_true', - help='Merge PCAP files (requires mergecap from Wireshark)' - ) - parser.add_argument( - '--dry-run', - action='store_true', - help='Show what would be done without merging' - ) - parser.add_argument( - '-y', '--yes', - action='store_true', - help='Skip confirmation prompt' - ) - - args = parser.parse_args() - - if not os.path.isdir(args.input_dir): - print(f"Error: Input directory not found: {args.input_dir}") - return 1 - - # Find all files - print("=" * 80) - print("MERGE DNS TEST FILES") - print("=" * 80) - print(f"Input: {args.input_dir}") - print(f"Output: {args.output}") - print(f"Merge PCAPs: {'Yes' if args.merge_pcaps else 'No'}") - - csv_files = find_files(args.input_dir, '.csv') - pcap_files = find_files(args.input_dir, '.pcap') if args.merge_pcaps else [] - - if not csv_files and not pcap_files: - print("\nNo CSV/PCAP files found") - return 1 - - print(f"\nFound {len(csv_files)} CSV files") - if args.merge_pcaps: - print(f"Found {len(pcap_files)} PCAP files") - - # Group files by resolver and config - csv_groups = defaultdict(list) - pcap_groups = defaultdict(list) - - for csv_path in csv_files: - config, _ = parse_filename(Path(csv_path).name) - resolver = extract_resolver_from_path(csv_path) - if config and resolver: - key = (resolver, config) - csv_groups[key].append(csv_path) - - for pcap_path in pcap_files: - config, _ = parse_filename(Path(pcap_path).name) - resolver = extract_resolver_from_path(pcap_path) - if config and resolver: - key = (resolver, config) - pcap_groups[key].append(pcap_path) - - # Summary - print("\nConfigs to merge:") - print("-" * 80) - for (resolver, config), files in sorted(csv_groups.items()): - print(f" {resolver}/{config}: {len(files)} runs") - - total_runs = sum(len(files) for files in csv_groups.values()) - print(f"\nTotal configs: {len(csv_groups)}") - print(f"Total runs: {total_runs}") - - if args.dry_run: - print("\n*** DRY RUN MODE ***\n") - for (resolver, config) in sorted(csv_groups.keys()): - print(f"Would merge: {resolver}/{config} ({len(csv_groups[(resolver, config)])} CSVs)") - if args.merge_pcaps and (resolver, config) in pcap_groups: - print(f"Would merge: {resolver}/{config} ({len(pcap_groups[(resolver, config)])} PCAPs)") - return 0 - - # Confirmation - if not args.yes: - response = input(f"\nMerge all into {args.output}? 
[y/N] ") - if response.lower() not in ['y', 'yes']: - print("Cancelled") - return 0 - - # Merge - print("\n" + "=" * 80) - print("MERGING FILES") - print("=" * 80) - - success_count = 0 - fail_count = 0 - total_queries = 0 - total_size = 0 - - # Get standard CSV fieldnames (from first file) - first_csv = next(iter(csv_files)) - with open(first_csv, 'r') as f: - reader = csv.DictReader(f) - fieldnames = reader.fieldnames - - for (resolver, config), files in sorted(csv_groups.items()): - print(f"\n{resolver}/{config} ({len(files)} runs)") - - # Merge CSVs - output_csv = os.path.join(args.output, resolver, f"{config}.csv") - os.makedirs(os.path.dirname(output_csv), exist_ok=True) - - merge_csvs(files, output_csv, fieldnames) - - # Count queries in merged file - with open(output_csv, 'r') as f: - query_count = sum(1 for _ in csv.reader(f)) - 1 # Minus header - - print(f" ✓ Merged CSV: {query_count:,} queries") - total_queries += query_count - success_count += 1 - - # Merge PCAPs if requested - if args.merge_pcaps and (resolver, config) in pcap_groups: - output_pcap = os.path.join(args.output, resolver, f"{config}.pcap") - pcap_list = pcap_groups[(resolver, config)] - - if merge_pcaps(pcap_list, output_pcap): - merged_size = os.path.getsize(output_pcap) - orig_size = sum(os.path.getsize(p) for p in pcap_list) - print(f" ✓ Merged PCAP: {format_bytes(merged_size)} " - f"(from {format_bytes(orig_size)})") - total_size += merged_size - else: - print(f" ✗ PCAP merge failed") - fail_count += 1 - - # Final summary - print("\n" + "=" * 80) - print("COMPLETE") - print("=" * 80) - print(f"Successful configs: {success_count}") - print(f"Failed: {fail_count}") - print(f"Total queries: {total_queries:,}") - if args.merge_pcaps: - print(f"Total PCAP size: {format_bytes(total_size)}") - print(f"\nMerged files in: {args.output}") - - return 0 if fail_count == 0 else 1 - -if __name__ == "__main__": - exit(main())
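
Note on using the merged output: once the new merge script has produced dns_results.csv, a quick sanity check of the merged data can be done with pandas. This is a minimal sketch, assuming pandas is installed and that the merged CSV keeps the column names written by the script above (total_bytes, response_code, error); the file path and grouping shown here are illustrative and not part of the repository.

#!/usr/bin/env python3
"""Sanity-check a merged dns_results.csv (illustrative sketch, not part of the repo)."""
import pandas as pd

# Path assumption: the merge script's default output name.
df = pd.read_csv("dns_results.csv")

# Rows per response code and mean on-the-wire bytes, using columns the merge script writes.
summary = df.groupby("response_code", dropna=False)["total_bytes"].agg(["count", "mean"])
print(summary)

# Queries that recorded an error string (empty cells are read as NaN by pandas).
print("errors:", df["error"].notna().sum())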