# File: sdns-proxy/scripts/analysis/analyze_dns_metrics.py
#
# Viewer metadata: 499 lines, 21 KiB, Python

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from pathlib import Path
from scipy import stats
import warnings
# Suppress every warning emitted while the script runs (pandas, seaborn, ...).
warnings.filterwarnings('ignore')

# Publication-quality plot defaults: seaborn grid theme plus high-DPI output.
sns.set_style("whitegrid")
plt.rcParams.update({
    'figure.dpi': 300,
    'savefig.dpi': 300,
    'font.size': 10,
    'figure.figsize': (12, 6),
})
class DNSAnalyzer:
    """Load DNS benchmark CSV files and turn them into statistics and plots.

    The analyzer reads ``<results_dir>/<provider>/*.csv``, drops failed
    queries, derives a few helper columns (protocol family, DNSSEC mode,
    connection persistence) and then offers one method per report: printed
    summary statistics, significance tests, several figures and a LaTeX
    table.

    The CSV files are expected to provide at least the columns ``error``,
    ``protocol``, ``dnssec``, ``auth_dnssec``, ``keep_alive``,
    ``duration_ms``, ``request_size_bytes`` and ``response_size_bytes``;
    these are the columns the methods below read.
    """

    # Provider sub-directories scanned when no explicit list is supplied.
    DEFAULT_PROVIDERS = ('adguard', 'cloudflare', 'google', 'quad9')

    # Display order shared by every per-protocol plot so figures line up.
    PROTOCOL_ORDER = ('Plain DNS', 'DoT', 'DoH', 'DoH/3', 'DoQ')

    # Mapping from the base protocol name in the CSV to its display label.
    PROTOCOL_LABELS = {
        'udp': 'Plain DNS',
        'tls': 'DoT',
        'https': 'DoH',
        'doh3': 'DoH/3',
        'doq': 'DoQ',
    }

    def __init__(self, results_dir='results', providers=None):
        """Create an analyzer.

        Args:
            results_dir: Directory containing one sub-directory per provider.
            providers: Optional iterable of provider directory names to scan.
                Defaults to ``DEFAULT_PROVIDERS``.
        """
        self.results_dir = Path(results_dir)
        if providers is None:
            self.providers = self.DEFAULT_PROVIDERS
        else:
            self.providers = tuple(providers)
        self.df = None

    # ------------------------------------------------------------------
    # Internal helpers
    # ------------------------------------------------------------------
    @staticmethod
    def _prepare_output_dir(output_dir):
        """Create *output_dir* (including parents) and return it as a Path."""
        out = Path(output_dir)
        out.mkdir(parents=True, exist_ok=True)
        return out

    def _available_protocols(self):
        """Return the protocol labels present in the data, in display order."""
        present = set(self.df['protocol_category'].dropna().unique())
        return [p for p in self.PROTOCOL_ORDER if p in present]

    # ------------------------------------------------------------------
    # Loading and cleaning
    # ------------------------------------------------------------------
    def load_all_data(self):
        """Load all CSV files from the results directory.

        Every file is tagged with its provider (the directory name) and its
        test configuration (the file stem). Files that cannot be parsed are
        reported and skipped.

        Raises:
            ValueError: If no CSV file could be loaded at all.
        """
        data_frames = []
        for provider in self.providers:
            provider_path = self.results_dir / provider
            if not provider_path.is_dir():
                continue
            # Sorted so the row order of the combined frame is reproducible.
            for csv_file in sorted(provider_path.glob('*.csv')):
                try:
                    frame = pd.read_csv(csv_file)
                except Exception as e:
                    # Best effort: one broken file must not abort the run.
                    print(f"Error loading {csv_file}: {e}")
                    continue
                frame['provider'] = provider
                frame['test_config'] = csv_file.stem
                data_frames.append(frame)

        if not data_frames:
            # pd.concat([]) would raise an opaque "No objects to concatenate".
            raise ValueError(
                f"No CSV result files found under '{self.results_dir}' "
                f"for providers: {', '.join(self.providers)}"
            )

        self.df = pd.concat(data_frames, ignore_index=True)
        self._clean_and_enrich_data()
        print(f"Loaded {len(self.df)} DNS queries across {len(data_frames)} test configurations")

    def _clean_and_enrich_data(self):
        """Drop failed queries and add the derived analysis columns.

        Adds ``protocol_base``, ``dnssec_mode`` (``none``/``auth``/``trust``),
        ``protocol_category`` (display label, NaN for unknown protocols) and
        ``persistence`` (boolean keep-alive flag, missing treated as False).
        """
        # Work on an explicit copy: assigning columns on a filtered slice is
        # chained assignment and may silently not write through.
        df = self.df[self.df['error'].isna()].copy()

        # Extract protocol base (remove -auth, -trust suffixes)
        df['protocol_base'] = df['protocol'].str.replace(r'-auth|-trust', '', regex=True)

        # DNSSEC configuration: 'trust' is applied last, so it only affects
        # rows where auth_dnssec is explicitly False.
        df['dnssec_mode'] = 'none'
        df.loc[df['auth_dnssec'].eq(True), 'dnssec_mode'] = 'auth'
        df.loc[df['dnssec'].eq(True) & df['auth_dnssec'].eq(False), 'dnssec_mode'] = 'trust'

        # Protocol categories
        df['protocol_category'] = df['protocol_base'].map(self.PROTOCOL_LABELS)

        # Connection persistence; cast so the column is a real bool dtype
        # even when the CSV column contained missing values.
        df['persistence'] = df['keep_alive'].fillna(False).astype(bool)

        self.df = df

    # ------------------------------------------------------------------
    # Statistics
    # ------------------------------------------------------------------
    def generate_summary_statistics(self):
        """Print summary statistics and return the underlying tables.

        Returns:
            dict with the DataFrames ``protocol``, ``provider``, ``dnssec``
            and ``bandwidth``. In ``dnssec`` the column ``overhead_vs_none``
            is the mean latency minus the mean of the ``none`` mode (ms) and
            ``overhead_pct`` is the same value relative to that baseline;
            ``overhead_pct`` is only present when a positive baseline exists.
        """
        latency = self.df['duration_ms']

        print("\n" + "=" * 80)
        print("SUMMARY STATISTICS")
        print("=" * 80)

        # Overall statistics
        print("\n--- Overall Performance ---")
        print(f"Total queries: {len(self.df)}")
        print(f"Mean latency: {latency.mean():.2f} ms")
        print(f"Median latency: {latency.median():.2f} ms")
        print(f"95th percentile: {latency.quantile(0.95):.2f} ms")
        print(f"99th percentile: {latency.quantile(0.99):.2f} ms")

        # By protocol
        print("\n--- Performance by Protocol ---")
        protocol_stats = self.df.groupby('protocol_category')['duration_ms'].agg(
            count='count',
            mean='mean',
            median='median',
            std='std',
            p95=lambda x: x.quantile(0.95),
            p99=lambda x: x.quantile(0.99),
        ).round(2)
        print(protocol_stats)

        # By provider
        print("\n--- Performance by Provider ---")
        provider_stats = self.df.groupby('provider')['duration_ms'].agg(
            count='count',
            mean='mean',
            median='median',
            std='std',
            p95=lambda x: x.quantile(0.95),
        ).round(2)
        print(provider_stats)

        # DNSSEC impact
        print("\n--- DNSSEC Validation Impact ---")
        dnssec_stats = self.df.groupby('dnssec_mode')['duration_ms'].agg(
            count='count',
            mean='mean',
            median='median',
        )
        if 'none' in dnssec_stats.index:
            baseline = dnssec_stats.loc['none', 'mean']
        else:
            baseline = float('nan')
        # Absolute overhead in ms relative to the no-DNSSEC baseline.
        dnssec_stats['overhead_vs_none'] = dnssec_stats['mean'] - baseline
        if baseline > 0:
            dnssec_stats['overhead_pct'] = (
                dnssec_stats['overhead_vs_none'] / baseline * 100
            ).round(1)
        dnssec_stats = dnssec_stats.round(2)
        print(dnssec_stats)

        # Bandwidth analysis
        print("\n--- Bandwidth Usage ---")
        bandwidth_stats = self.df.groupby('protocol_category').agg({
            'request_size_bytes': ['mean', 'median'],
            'response_size_bytes': ['mean', 'median'],
        }).round(2)
        print(bandwidth_stats)

        # Persistence impact (where applicable)
        print("\n--- Connection Persistence Impact ---")
        persist_protocols = self.df[self.df['protocol_base'].isin(['tls', 'https'])]
        if len(persist_protocols) > 0:
            persist_stats = persist_protocols.groupby(
                ['protocol_base', 'persistence']
            )['duration_ms'].agg(mean='mean', median='median').round(2)
            print(persist_stats)

        return {
            'protocol': protocol_stats,
            'provider': provider_stats,
            'dnssec': dnssec_stats,
            'bandwidth': bandwidth_stats,
        }

    def statistical_tests(self):
        """Run and print non-parametric significance tests.

        Returns:
            dict mapping a test key (``kruskal``, ``none_vs_auth``,
            ``trust_vs_auth``) to ``{'statistic': ..., 'p_value': ...}`` for
            every test that could be run on the loaded data.
        """
        results = {}
        print("\n" + "=" * 80)
        print("STATISTICAL TESTS")
        print("=" * 80)

        # Test 1: Protocol differences (Kruskal-Wallis).
        # groupby skips rows whose protocol could not be mapped (NaN label);
        # an empty sample would otherwise turn the whole test into NaN.
        by_protocol = self.df.groupby('protocol_category')['duration_ms']
        groups = [values.values for _, values in by_protocol if len(values) > 0]
        if len(groups) >= 2:
            h_stat, p_value = stats.kruskal(*groups)
            results['kruskal'] = {'statistic': h_stat, 'p_value': p_value}
            print("\n--- Kruskal-Wallis Test (Protocol Differences) ---")
            print(f"H-statistic: {h_stat:.4f}")
            print(f"p-value: {p_value:.4e}")
            print(f"Result: {'Significant' if p_value < 0.05 else 'Not significant'} differences between protocols")

        modes = set(self.df['dnssec_mode'].unique())
        auth_data = self.df.loc[self.df['dnssec_mode'] == 'auth', 'duration_ms']

        # Test 2: DNSSEC impact (Mann-Whitney U)
        if 'none' in modes and 'auth' in modes:
            none_data = self.df.loc[self.df['dnssec_mode'] == 'none', 'duration_ms']
            u_stat, p_value = stats.mannwhitneyu(none_data, auth_data, alternative='two-sided')
            results['none_vs_auth'] = {'statistic': u_stat, 'p_value': p_value}
            print("\n--- Mann-Whitney U Test (No DNSSEC vs Auth) ---")
            print(f"U-statistic: {u_stat:.4f}")
            print(f"p-value: {p_value:.4e}")
            print(f"Result: {'Significant' if p_value < 0.05 else 'Not significant'} difference")

        # Test 3: Trust vs Auth comparison.
        # The printed conclusion is directional ("slower"), so the test must
        # be one-sided: is auth latency stochastically greater than trust?
        if 'trust' in modes and 'auth' in modes:
            trust_data = self.df.loc[self.df['dnssec_mode'] == 'trust', 'duration_ms']
            u_stat, p_value = stats.mannwhitneyu(auth_data, trust_data, alternative='greater')
            results['trust_vs_auth'] = {'statistic': u_stat, 'p_value': p_value}
            print("\n--- Mann-Whitney U Test (Trust vs Auth) ---")
            print(f"U-statistic: {u_stat:.4f}")
            print(f"p-value: {p_value:.4e}")
            print(f"Result: Auth is {'significantly' if p_value < 0.05 else 'not significantly'} slower than Trust")

        return results

    # ------------------------------------------------------------------
    # Plots
    # ------------------------------------------------------------------
    def plot_latency_by_protocol(self, output_dir='plots'):
        """Save a violin plot of the latency distribution per protocol."""
        out = self._prepare_output_dir(output_dir)
        available_protocols = self._available_protocols()

        plt.figure(figsize=(14, 7))
        sns.violinplot(data=self.df, x='protocol_category', y='duration_ms',
                       order=available_protocols, inner='box', cut=0)
        plt.title('DNS Query Latency Distribution by Protocol', fontsize=14, fontweight='bold')
        plt.xlabel('Protocol', fontsize=12)
        plt.ylabel('Response Time (ms)', fontsize=12)
        plt.xticks(rotation=0)

        # Add mean values as annotations
        means = self.df.groupby('protocol_category')['duration_ms'].mean()
        for i, protocol in enumerate(available_protocols):
            mean_val = means[protocol]
            plt.text(i, mean_val, f'{mean_val:.1f}', ha='center', va='bottom', fontweight='bold')

        plt.tight_layout()
        plt.savefig(out / 'latency_by_protocol.png', bbox_inches='tight')
        plt.close()
        print("✓ Saved: latency_by_protocol.png")

    def plot_provider_comparison(self, output_dir='plots'):
        """Save one box plot per protocol comparing the providers.

        The grid grows with the number of protocols present (two columns,
        at least two rows), and panels without data are hidden.
        """
        out = self._prepare_output_dir(output_dir)
        protocols = self._available_protocols()
        if not protocols:
            print("⚠ No protocol data available")
            return

        ncols = 2
        # Ceiling division; keep the 2x2 layout for up to four protocols.
        nrows = max(2, -(-len(protocols) // ncols))
        fig, axes = plt.subplots(nrows, ncols, figsize=(16, 6 * nrows), squeeze=False)
        fig.suptitle('Provider Performance Comparison by Protocol', fontsize=16, fontweight='bold')

        panels = axes.ravel()
        for ax, protocol in zip(panels, protocols):
            data = self.df[self.df['protocol_category'] == protocol]
            sns.boxplot(data=data, x='provider', y='duration_ms', ax=ax)
            ax.set_title(f'{protocol}', fontsize=12, fontweight='bold')
            ax.set_xlabel('Provider', fontsize=10)
            ax.set_ylabel('Response Time (ms)', fontsize=10)
            ax.tick_params(axis='x', rotation=45)
        for unused in panels[len(protocols):]:
            unused.set_visible(False)

        plt.tight_layout()
        plt.savefig(out / 'provider_comparison.png', bbox_inches='tight')
        plt.close()
        print("✓ Saved: provider_comparison.png")

    def plot_dnssec_impact(self, output_dir='plots'):
        """Save a figure comparing DNSSEC validation modes (trust vs auth)."""
        out = self._prepare_output_dir(output_dir)

        # Filter for protocols that have DNSSEC variations
        dnssec_data = self.df[self.df['dnssec_mode'] != 'none'].copy()
        if len(dnssec_data) == 0:
            print("⚠ No DNSSEC data available")
            return

        fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 6))
        available = self._available_protocols()

        # Plot 1: Overall DNSSEC impact.
        # Plot the display labels directly and pin the hue order, so legend
        # entries cannot be attached to the wrong bars.
        mode_labels = {
            'none': 'No DNSSEC',
            'auth': 'Auth (Full)',
            'trust': 'Trust (Resolver)',
        }
        present_modes = set(self.df['dnssec_mode'].unique())
        hue_order = [label for mode, label in mode_labels.items() if mode in present_modes]
        plot_df = self.df.assign(dnssec_label=self.df['dnssec_mode'].map(mode_labels))
        # The default error bar is a 95% confidence interval in both old and
        # new seaborn, so the deprecated ``ci=95`` argument is not needed.
        sns.barplot(data=plot_df, x='protocol_category', y='duration_ms',
                    hue='dnssec_label', hue_order=hue_order, order=available, ax=ax1)
        ax1.set_title('DNSSEC Validation Overhead by Protocol', fontsize=12, fontweight='bold')
        ax1.set_xlabel('Protocol', fontsize=10)
        ax1.set_ylabel('Mean Response Time (ms)', fontsize=10)
        ax1.legend(title='DNSSEC Mode')
        ax1.tick_params(axis='x', rotation=0)

        # Plot 2: Trust vs Auth comparison
        comparison_data = dnssec_data.groupby(
            ['protocol_category', 'dnssec_mode']
        )['duration_ms'].mean().reset_index()
        pivot_data = comparison_data.pivot(
            index='protocol_category', columns='dnssec_mode', values='duration_ms'
        )
        pivot_data = pivot_data.reindex([p for p in available if p in pivot_data.index])
        if 'auth' in pivot_data.columns and 'trust' in pivot_data.columns:
            pivot_data['overhead_pct'] = (
                (pivot_data['auth'] - pivot_data['trust']) / pivot_data['trust'] * 100
            )
            pivot_data['overhead_pct'].plot(kind='bar', ax=ax2, color='coral')
        ax2.set_title('Auth vs Trust: Additional Overhead (%)', fontsize=12, fontweight='bold')
        ax2.set_xlabel('Protocol', fontsize=10)
        ax2.set_ylabel('Additional Overhead (%)', fontsize=10)
        ax2.axhline(y=0, color='black', linestyle='--', linewidth=0.8)
        ax2.tick_params(axis='x', rotation=45)
        ax2.grid(axis='y', alpha=0.3)

        plt.tight_layout()
        plt.savefig(out / 'dnssec_impact.png', bbox_inches='tight')
        plt.close()
        print("✓ Saved: dnssec_impact.png")

    def plot_persistence_impact(self, output_dir='plots'):
        """Save a bar plot of keep-alive impact and print the reduction."""
        out = self._prepare_output_dir(output_dir)
        persist_data = self.df[self.df['protocol_base'].isin(['tls', 'https'])].copy()
        if len(persist_data) == 0:
            print("⚠ No persistence data available")
            return

        # Plot readable labels with a pinned order so the legend always
        # matches the bars, whichever value appears first in the data.
        keepalive_labels = {False: 'Disabled', True: 'Enabled'}
        persist_data['keepalive_label'] = persist_data['persistence'].map(keepalive_labels)
        present = set(persist_data['keepalive_label'].dropna().unique())
        hue_order = [label for label in keepalive_labels.values() if label in present]

        plt.figure(figsize=(12, 6))
        # Default error bar is the 95% confidence interval (see seaborn docs).
        sns.barplot(data=persist_data, x='protocol_base', y='duration_ms',
                    hue='keepalive_label', hue_order=hue_order)
        plt.title('Impact of Connection Persistence on Latency', fontsize=14, fontweight='bold')
        plt.xlabel('Protocol', fontsize=12)
        plt.ylabel('Mean Response Time (ms)', fontsize=12)
        plt.legend(title='Keep-Alive')

        # Report the relative latency reduction per protocol
        for protocol in persist_data['protocol_base'].unique():
            protocol_data = persist_data[persist_data['protocol_base'] == protocol]
            is_persistent = protocol_data['persistence']
            no_persist = protocol_data.loc[~is_persistent, 'duration_ms'].mean()
            with_persist = protocol_data.loc[is_persistent, 'duration_ms'].mean()
            # Skip protocols missing one variant or with a zero baseline.
            if np.isnan(no_persist) or np.isnan(with_persist) or no_persist == 0:
                continue
            reduction = (no_persist - with_persist) / no_persist * 100
            print(f"{protocol}: {reduction:.1f}% reduction with persistence")

        plt.tight_layout()
        plt.savefig(out / 'persistence_impact.png', bbox_inches='tight')
        plt.close()
        print("✓ Saved: persistence_impact.png")

    def plot_bandwidth_overhead(self, output_dir='plots'):
        """Save a figure of request/response sizes and overhead vs plain DNS."""
        out = self._prepare_output_dir(output_dir)

        # Mean sizes per protocol, in the same order as the other figures.
        bandwidth_data = (
            self.df.groupby('protocol_category')[['request_size_bytes', 'response_size_bytes']]
            .mean()
            .reindex(self._available_protocols())
            .rename_axis('protocol_category')
            .reset_index()
        )
        bandwidth_data['total_bytes'] = (
            bandwidth_data['request_size_bytes'] + bandwidth_data['response_size_bytes']
        )

        fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 6))

        # Plot 1: Request vs Response sizes
        positions = np.arange(len(bandwidth_data))
        width = 0.35
        ax1.bar(positions - width / 2, bandwidth_data['request_size_bytes'], width,
                label='Request', alpha=0.8)
        ax1.bar(positions + width / 2, bandwidth_data['response_size_bytes'], width,
                label='Response', alpha=0.8)
        ax1.set_xlabel('Protocol', fontsize=12)
        ax1.set_ylabel('Bytes', fontsize=12)
        ax1.set_title('Average Request/Response Sizes', fontsize=12, fontweight='bold')
        ax1.set_xticks(positions)
        ax1.set_xticklabels(bandwidth_data['protocol_category'])
        ax1.legend()
        ax1.grid(axis='y', alpha=0.3)

        # Plot 2: Total bandwidth overhead vs UDP baseline
        is_udp = bandwidth_data['protocol_category'] == 'Plain DNS'
        udp_total = bandwidth_data.loc[is_udp, 'total_bytes'].values
        # Without a non-zero baseline the overhead is undefined; leave the
        # panel empty instead of dividing by zero.
        if len(udp_total) > 0 and udp_total[0] != 0:
            bandwidth_data['overhead_vs_udp'] = (
                (bandwidth_data['total_bytes'] - udp_total[0]) / udp_total[0] * 100
            )
            colors = ['green' if pct < 0 else 'red' for pct in bandwidth_data['overhead_vs_udp']]
            ax2.bar(bandwidth_data['protocol_category'], bandwidth_data['overhead_vs_udp'],
                    color=colors, alpha=0.7)
            ax2.axhline(y=0, color='black', linestyle='--', linewidth=0.8)
        ax2.set_xlabel('Protocol', fontsize=12)
        ax2.set_ylabel('Overhead vs Plain DNS (%)', fontsize=12)
        ax2.set_title('Bandwidth Overhead', fontsize=12, fontweight='bold')
        ax2.grid(axis='y', alpha=0.3)

        plt.tight_layout()
        plt.savefig(out / 'bandwidth_overhead.png', bbox_inches='tight')
        plt.close()
        print("✓ Saved: bandwidth_overhead.png")

    def plot_heatmap(self, output_dir='plots'):
        """Save a provider x protocol heatmap of the median latency."""
        out = self._prepare_output_dir(output_dir)

        # Create pivot table, columns in the shared protocol display order
        heatmap_data = (
            self.df.groupby(['provider', 'protocol_category'])['duration_ms']
            .median()
            .unstack()
            .reindex(columns=self._available_protocols())
        )

        plt.figure(figsize=(12, 8))
        sns.heatmap(heatmap_data, annot=True, fmt='.1f', cmap='RdYlGn_r',
                    cbar_kws={'label': 'Median Latency (ms)'})
        plt.title('DNS Provider-Protocol Performance Matrix', fontsize=14, fontweight='bold')
        plt.xlabel('Protocol', fontsize=12)
        plt.ylabel('Provider', fontsize=12)
        plt.tight_layout()
        plt.savefig(out / 'provider_protocol_heatmap.png', bbox_inches='tight')
        plt.close()
        print("✓ Saved: provider_protocol_heatmap.png")

    def plot_percentile_comparison(self, output_dir='plots'):
        """Save a grouped bar plot of latency percentiles per protocol."""
        out = self._prepare_output_dir(output_dir)
        percentiles = [50, 75, 90, 95, 99]
        available = self._available_protocols()

        # Series.quantile is used (as in the summary tables) so the figure
        # and the printed statistics agree.
        percentile_data = []
        for protocol in available:
            data = self.df.loc[self.df['protocol_category'] == protocol, 'duration_ms']
            percentile_data.extend(
                {
                    'protocol': protocol,
                    'percentile': f'P{p}',
                    'latency': data.quantile(p / 100),
                }
                for p in percentiles
            )
        percentile_df = pd.DataFrame(percentile_data)

        plt.figure(figsize=(14, 7))
        sns.barplot(data=percentile_df, x='protocol', y='latency', hue='percentile', order=available)
        plt.title('Latency Percentiles by Protocol', fontsize=14, fontweight='bold')
        plt.xlabel('Protocol', fontsize=12)
        plt.ylabel('Response Time (ms)', fontsize=12)
        plt.legend(title='Percentile', bbox_to_anchor=(1.05, 1), loc='upper left')
        plt.tight_layout()
        plt.savefig(out / 'percentile_comparison.png', bbox_inches='tight')
        plt.close()
        print("✓ Saved: percentile_comparison.png")

    # ------------------------------------------------------------------
    # Export and orchestration
    # ------------------------------------------------------------------
    def generate_latex_table(self, output_dir='plots'):
        """Write the per-protocol latency summary as a LaTeX table."""
        out = self._prepare_output_dir(output_dir)

        # Summary table by protocol ('Std Dev' contains a space, so the
        # named aggregations are passed as an unpacked dict).
        summary = self.df.groupby('protocol_category')['duration_ms'].agg(**{
            'Mean': 'mean',
            'Median': 'median',
            'Std Dev': 'std',
            'P95': lambda x: x.quantile(0.95),
            'P99': lambda x: x.quantile(0.99),
        }).round(2)

        latex_code = summary.to_latex(float_format="%.2f")
        (out / 'summary_table.tex').write_text(latex_code, encoding='utf-8')
        print("✓ Saved: summary_table.tex")
        print("\nLaTeX Table Preview:")
        print(latex_code)

    def run_full_analysis(self, output_dir='plots'):
        """Run the complete pipeline: load, statistics, tests, plots, table.

        Args:
            output_dir: Directory that receives every figure and the LaTeX
                table.
        """
        print("=" * 80)
        print("DNS QoS Analysis - Starting Full Analysis")
        print("=" * 80)

        # Load data
        print("\n[1/10] Loading data...")
        self.load_all_data()

        # Generate statistics
        print("\n[2/10] Generating summary statistics...")
        self.generate_summary_statistics()

        # Statistical tests
        print("\n[3/10] Running statistical tests...")
        self.statistical_tests()

        # Generate plots
        print("\n[4/10] Creating latency by protocol plot...")
        self.plot_latency_by_protocol(output_dir)
        print("\n[5/10] Creating provider comparison plot...")
        self.plot_provider_comparison(output_dir)
        print("\n[6/10] Creating DNSSEC impact plot...")
        self.plot_dnssec_impact(output_dir)
        print("\n[7/10] Creating persistence impact plot...")
        self.plot_persistence_impact(output_dir)
        print("\n[8/10] Creating bandwidth overhead plot...")
        self.plot_bandwidth_overhead(output_dir)
        print("\n[9/10] Creating heatmap...")
        self.plot_heatmap(output_dir)
        print("\n[10/10] Creating percentile comparison...")
        self.plot_percentile_comparison(output_dir)

        # Generate LaTeX table
        print("\n[Bonus] Generating LaTeX table...")
        self.generate_latex_table(output_dir)

        print("\n" + "=" * 80)
        print(f"✓ Analysis Complete! Check the '{output_dir}' directory for all visualizations.")
        print("=" * 80)
def main():
    """Run the full DNS analysis against the default 'results' directory."""
    DNSAnalyzer(results_dir='results').run_full_analysis()


if __name__ == "__main__":
    main()