# sdns-proxy/scripts/analysis/analyze_simple.py

import bisect
import datetime
from pathlib import Path

import dpkt
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from dateutil import parser as date_parser

# Global plot styling: seaborn grid theme plus resolution and font defaults
sns.set_style("whitegrid")
plt.rcParams.update({
    'figure.dpi': 300,
    'savefig.dpi': 300,
    'font.size': 10,
})

class FastDNSAnalyzer:
    def __init__(self, results_dir='results'):
        self.results_dir = Path(results_dir)
        self.all_data = []

    def should_include_file(self, filename):
        """Filter out DNSSEC and non-persist files"""
        name = filename.stem
        if 'auth' in name or 'trust' in name:
            return False
        if name in ['tls', 'https']:
            return False
        return True

    def parse_rfc3339_nano(self, timestamp_str):
        """Convert a timestamp string into seconds since the Unix epoch.

        The string is parsed with ``dateutil``, converted to UTC and turned
        into a POSIX timestamp.

        Args:
            timestamp_str: Timestamp text to parse.

        Returns:
            The epoch time as a float, or ``None`` (after printing a message)
            when any step of the conversion raises.
        """
        try:
            parsed = date_parser.parse(timestamp_str)
            as_utc = parsed.astimezone(datetime.timezone.utc)
            epoch_seconds = as_utc.timestamp()
        except Exception as e:
            print(f"    Error parsing timestamp {timestamp_str}: {e}")
            return None
        return epoch_seconds

    def extract_bandwidth_from_pcap_fast(self, pcap_file, csv_data):
        """Fast bandwidth extraction using dpkt"""
        print(f"    Analyzing pcap: {pcap_file.name}")

        try:
            with open(pcap_file, 'rb') as f:
                pcap = dpkt.pcap.Reader(f)

                # Build query time windows
                query_windows = []
                for idx, row in csv_data.iterrows():
                    start_time = self.parse_rfc3339_nano(row['timestamp'])
                    if start_time is None:
                        continue

                    duration_seconds = row['duration_ns'] / 1_000_000_000
                    end_time = start_time + duration_seconds

                    query_windows.append({
                        'index': idx,
                        'start': start_time,
                        'end': end_time,
                        'bytes_sent': 0,
                        'bytes_received': 0,
                        'packets_sent': 0,
                        'packets_received': 0
                    })

                if not query_windows:
                    print("    ✗ No valid query windows")
                    return None

                # Sort windows for faster matching
                query_windows.sort(key=lambda x: x['start'])

                # Process packets
                packet_count = 0
                matched_count = 0

                for timestamp, buf in pcap:
                    packet_count += 1
                    packet_size = len(buf)

                    # Quick parse to determine direction
                    try:
                        eth = dpkt.ethernet.Ethernet(buf)

                        # Get IP layer
                        if isinstance(eth.data, dpkt.ip.IP):
                            ip = eth.data
                        elif isinstance(eth.data, dpkt.ip6.IP6):
                            ip = eth.data
                        else:
                            continue

                        # Get transport layer
                        if isinstance(ip.data, dpkt.udp.UDP):
                            transport = ip.data
                            src_port = transport.sport
                            dst_port = transport.dport
                        elif isinstance(ip.data, dpkt.tcp.TCP):
                            transport = ip.data
                            src_port = transport.sport
                            dst_port = transport.dport
                        else:
                            continue

                        # Determine direction (client port usually higher)
                        is_outbound = src_port > dst_port

                        # Binary search for matching window
                        for window in query_windows:
                            if window['start'] <= timestamp <= window['end']:
                                if is_outbound:
                                    window['bytes_sent'] += packet_size
                                    window['packets_sent'] += 1
                                else:
                                    window['bytes_received'] += packet_size
                                    window['packets_received'] += 1
                                matched_count += 1
                                break
                            elif timestamp < window['start']:
                                break  # No more windows to check

                    except Exception:
                        continue

                print(f"    ✓ Processed {packet_count} packets, matched {matched_count}")

                # Convert to DataFrame
                bandwidth_df = pd.DataFrame(query_windows)
                return bandwidth_df[['index', 'bytes_sent', 'bytes_received',
                                   'packets_sent', 'packets_received']]

        except Exception as e:
            print(f"    ✗ Error reading pcap: {e}")
            return None

    def load_data(self):
        """Load all relevant CSV files and extract bandwidth from pcaps"""
        print("Loading data and analyzing bandwidth...")

        for provider_dir in self.results_dir.iterdir():
            if not provider_dir.is_dir():
                continue

            provider = provider_dir.name

            for csv_file in provider_dir.glob('*.csv'):
                if not self.should_include_file(csv_file):
                    continue

                try:
                    df = pd.read_csv(csv_file)
                    df['provider'] = provider
                    df['test_file'] = csv_file.stem
                    df['csv_path'] = str(csv_file)

                    # Find corresponding pcap file
                    pcap_file = csv_file.with_suffix('.pcap')
                    if pcap_file.exists():
                        print(f"  Processing: {provider}/{csv_file.name}")
                        bandwidth_data = self.extract_bandwidth_from_pcap_fast(pcap_file, df)

                        if bandwidth_data is not None and len(bandwidth_data) > 0:
                            # Merge bandwidth data
                            df = df.reset_index(drop=True)
                            for col in ['bytes_sent', 'bytes_received', 'packets_sent', 'packets_received']:
                                df[col] = 0

                            for _, row in bandwidth_data.iterrows():
                                idx = int(row['index'])
                                if idx < len(df):
                                    df.at[idx, 'bytes_sent'] = row['bytes_sent']
                                    df.at[idx, 'bytes_received'] = row['bytes_received']
                                    df.at[idx, 'packets_sent'] = row['packets_sent']
                                    df.at[idx, 'packets_received'] = row['packets_received']

                            df['total_bytes'] = df['bytes_sent'] + df['bytes_received']

                            print(f"    ✓ Extracted bandwidth for {len(df)} queries")
                        else:
                            print(f"    ⚠ Could not extract bandwidth data")
                    else:
                        print(f"  ⚠ No pcap found for {csv_file.name}")

                    self.all_data.append(df)

                except Exception as e:
                    print(f"  ✗ Error loading {csv_file}: {e}")
                    import traceback
                    traceback.print_exc()

        print(f"\nTotal files loaded: {len(self.all_data)}")

    def create_line_graphs(self, output_dir='output/line_graphs'):
        """Create line graphs for latency and bandwidth"""
        Path(output_dir).mkdir(parents=True, exist_ok=True)

        print("\nGenerating line graphs...")

        for df in self.all_data:
            provider = df['provider'].iloc[0]
            test_name = df['test_file'].iloc[0]

            df['query_index'] = range(1, len(df) + 1)

            # Create figure with 2 subplots
            fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(14, 10))

            # Plot 1: Latency
            ax1.plot(df['query_index'], df['duration_ms'], marker='o',
                    markersize=4, linewidth=1, alpha=0.7, color='steelblue')
            mean_latency = df['duration_ms'].mean()
            ax1.axhline(y=mean_latency, color='r', linestyle='--',
                       label=f'Mean: {mean_latency:.2f} ms', linewidth=2)
            ax1.set_xlabel('Query Number', fontsize=12)
            ax1.set_ylabel('Latency (ms)', fontsize=12)
            ax1.set_title('Latency Over Time', fontsize=12, fontweight='bold')
            ax1.legend()
            ax1.grid(True, alpha=0.3)

            # Plot 2: Bandwidth
            if 'total_bytes' in df.columns and df['total_bytes'].sum() > 0:
                ax2.plot(df['query_index'], df['bytes_sent'], marker='s',
                        markersize=4, linewidth=1, alpha=0.7,
                        color='orange', label='Sent')
                ax2.plot(df['query_index'], df['bytes_received'], marker='^',
                        markersize=4, linewidth=1, alpha=0.7,
                        color='green', label='Received')

                mean_sent = df['bytes_sent'].mean()
                mean_received = df['bytes_received'].mean()
                ax2.axhline(y=mean_sent, color='orange', linestyle='--',
                           linewidth=1.5, alpha=0.5)
                ax2.axhline(y=mean_received, color='green', linestyle='--',
                           linewidth=1.5, alpha=0.5)

                ax2.set_xlabel('Query Number', fontsize=12)
                ax2.set_ylabel('Bytes', fontsize=12)
                ax2.set_title(f'Bandwidth Over Time (Mean: ↑{mean_sent:.0f}B ↓{mean_received:.0f}B)',
                             fontsize=12, fontweight='bold')
                ax2.legend()
                ax2.grid(True, alpha=0.3)

            fig.suptitle(f'{provider.upper()} - {test_name}',
                        fontsize=14, fontweight='bold')
            plt.tight_layout()

            filename = f"{provider}_{test_name}.png"
            plt.savefig(f'{output_dir}/{filename}', bbox_inches='tight')
            plt.close()

            print(f"  ✓ Created: {filename}")

    def get_protocol_name(self, test_file):
        """Translate a test-file stem into a display label.

        Every ``-persist`` occurrence is removed first. Known stems map to a
        descriptive label; any other stem is returned upper-cased.

        Args:
            test_file: Stem of the result file, e.g. ``udp`` or ``tls-persist``.

        Returns:
            The label string for the protocol.
        """
        key = test_file.replace('-persist', '')

        labels = {
            'udp': 'Plain DNS (UDP)',
            'tls': 'DoT (DNS over TLS)',
            'https': 'DoH (DNS over HTTPS)',
            'doh3': 'DoH/3 (DNS over HTTP/3)',
            'doq': 'DoQ (DNS over QUIC)'
        }

        if key in labels:
            return labels[key]
        return key.upper()

    def create_resolver_comparison_bars(self, output_dir='output/comparisons'):
        """Create bar graphs comparing resolvers for latency and bandwidth"""
        Path(output_dir).mkdir(parents=True, exist_ok=True)

        print("\nGenerating resolver comparison graphs...")

        combined_df = pd.concat(self.all_data, ignore_index=True)
        protocols = combined_df['test_file'].unique()

        for protocol in protocols:
            protocol_data = combined_df[combined_df['test_file'] == protocol]
            protocol_name = self.get_protocol_name(protocol)

            # Latency stats
            latency_stats = protocol_data.groupby('provider')['duration_ms'].agg([
                ('mean', 'mean'),
                ('median', 'median'),
                ('std', 'std')
            ]).reset_index()

            # Create latency comparison
            fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 6))
            fig.suptitle(f'{protocol_name} - Latency Comparison',
                        fontsize=16, fontweight='bold')

            # Mean latency
            bars1 = ax1.bar(latency_stats['provider'], latency_stats['mean'],
                           color='steelblue', alpha=0.8, edgecolor='black')
            ax1.errorbar(latency_stats['provider'], latency_stats['mean'],
                        yerr=latency_stats['std'], fmt='none', color='black',
                        capsize=5, alpha=0.6)

            for bar in bars1:
                height = bar.get_height()
                ax1.text(bar.get_x() + bar.get_width()/2., height,
                        f'{height:.2f}',
                        ha='center', va='bottom', fontweight='bold')

            ax1.set_xlabel('Resolver', fontsize=12)
            ax1.set_ylabel('Mean Latency (ms)', fontsize=12)
            ax1.set_title('Mean Latency', fontsize=12)
            ax1.grid(axis='y', alpha=0.3)

            # Median latency
            bars2 = ax2.bar(latency_stats['provider'], latency_stats['median'],
                           color='coral', alpha=0.8, edgecolor='black')

            for bar in bars2:
                height = bar.get_height()
                ax2.text(bar.get_x() + bar.get_width()/2., height,
                        f'{height:.2f}',
                        ha='center', va='bottom', fontweight='bold')

            ax2.set_xlabel('Resolver', fontsize=12)
            ax2.set_ylabel('Median Latency (ms)', fontsize=12)
            ax2.set_title('Median Latency', fontsize=12)
            ax2.grid(axis='y', alpha=0.3)

            plt.tight_layout()
            plt.savefig(f'{output_dir}/latency_{protocol}.png', bbox_inches='tight')
            plt.close()
            print(f"  ✓ Created: latency_{protocol}.png")

            # Bandwidth comparison
            if 'total_bytes' in protocol_data.columns and protocol_data['total_bytes'].sum() > 0:
                bandwidth_stats = protocol_data.groupby('provider').agg({
                    'bytes_sent': 'mean',
                    'bytes_received': 'mean',
                    'total_bytes': 'mean'
                }).reset_index()

                fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 6))
                fig.suptitle(f'{protocol_name} - Bandwidth Comparison',
                            fontsize=16, fontweight='bold')

                # Sent vs Received
                x = np.arange(len(bandwidth_stats))
                width = 0.35

                bars1 = ax1.bar(x - width/2, bandwidth_stats['bytes_sent'], width,
                               label='Sent', color='orange', alpha=0.8, edgecolor='black')
                bars2 = ax1.bar(x + width/2, bandwidth_stats['bytes_received'], width,
                               label='Received', color='green', alpha=0.8, edgecolor='black')

                ax1.set_xlabel('Resolver', fontsize=12)
                ax1.set_ylabel('Bytes per Query', fontsize=12)
                ax1.set_title('Average Bandwidth per Query', fontsize=12)
                ax1.set_xticks(x)
                ax1.set_xticklabels(bandwidth_stats['provider'])
                ax1.legend()
                ax1.grid(axis='y', alpha=0.3)

                # Total bandwidth
                bars3 = ax2.bar(bandwidth_stats['provider'], bandwidth_stats['total_bytes'],
                               color='purple', alpha=0.8, edgecolor='black')

                for bar in bars3:
                    height = bar.get_height()
                    ax2.text(bar.get_x() + bar.get_width()/2., height,
                            f'{height:.0f}',
                            ha='center', va='bottom', fontweight='bold')

                ax2.set_xlabel('Resolver', fontsize=12)
                ax2.set_ylabel('Total Bytes per Query', fontsize=12)
                ax2.set_title('Total Bandwidth per Query', fontsize=12)
                ax2.grid(axis='y', alpha=0.3)

                plt.tight_layout()
                plt.savefig(f'{output_dir}/bandwidth_{protocol}.png', bbox_inches='tight')
                plt.close()
                print(f"  ✓ Created: bandwidth_{protocol}.png")

    def generate_latex_tables(self, output_dir='output/tables'):
        """Generate LaTeX tables with latency and bandwidth statistics"""
        Path(output_dir).mkdir(parents=True, exist_ok=True)

        print("\nGenerating LaTeX tables...")

        combined_df = pd.concat(self.all_data, ignore_index=True)

        # Generate latency table for each resolver
        for provider in combined_df['provider'].unique():
            provider_data = combined_df[combined_df['provider'] == provider]

            stats = provider_data.groupby('test_file')['duration_ms'].agg([
                ('Mean', 'mean'),
                ('Median', 'median'),
                ('Std Dev', 'std'),
                ('P95', lambda x: x.quantile(0.95)),
                ('P99', lambda x: x.quantile(0.99))
            ]).round(2)

            stats.index = stats.index.map(self.get_protocol_name)
            stats.index.name = 'Protocol'

            latex_code = stats.to_latex(
                caption=f'{provider.upper()} - Latency Statistics (ms)',
                label=f'tab:{provider}_latency',
                float_format="%.2f"
            )

            with open(f'{output_dir}/{provider}_latency.tex', 'w') as f:
                f.write(latex_code)

            print(f"  ✓ Created: {provider}_latency.tex")

        # Generate bandwidth table for each resolver
        for provider in combined_df['provider'].unique():
            provider_data = combined_df[combined_df['provider'] == provider]

            if 'total_bytes' not in provider_data.columns or provider_data['total_bytes'].sum() == 0:
                continue

            bandwidth_stats = provider_data.groupby('test_file').agg({
                'bytes_sent': 'mean',
                'bytes_received': 'mean',
                'total_bytes': 'mean'
            }).round(2)

            bandwidth_stats.columns = ['Avg Sent (B)', 'Avg Received (B)', 'Avg Total (B)']
            bandwidth_stats.index = bandwidth_stats.index.map(self.get_protocol_name)
            bandwidth_stats.index.name = 'Protocol'

            latex_code = bandwidth_stats.to_latex(
                caption=f'{provider.upper()} - Bandwidth Statistics',
                label=f'tab:{provider}_bandwidth',
                float_format="%.2f"
            )

            with open(f'{output_dir}/{provider}_bandwidth.tex', 'w') as f:
                f.write(latex_code)

            print(f"  ✓ Created: {provider}_bandwidth.tex")

        # Generate protocol efficiency table
        print("\nGenerating protocol efficiency table...")

        if 'total_bytes' in combined_df.columns and combined_df['total_bytes'].sum() > 0:
            protocol_bandwidth = combined_df.groupby('test_file').agg({
                'bytes_sent': 'mean',
                'bytes_received': 'mean',
                'total_bytes': 'mean'
            }).round(2)

            # Find UDP baseline
            udp_baseline = None
            for protocol in protocol_bandwidth.index:
                if 'udp' in protocol:
                    udp_baseline = protocol_bandwidth.loc[protocol, 'total_bytes']
                    break

            if udp_baseline and udp_baseline > 0:
                protocol_bandwidth['Overhead vs UDP (%)'] = (
                    (protocol_bandwidth['total_bytes'] - udp_baseline) / udp_baseline * 100
                ).round(1)
                protocol_bandwidth['Efficiency (%)'] = (
                    100 / (1 + protocol_bandwidth['Overhead vs UDP (%)'] / 100)
                ).round(1)

            protocol_bandwidth.columns = ['Avg Sent (B)', 'Avg Received (B)',
                                         'Avg Total (B)', 'Overhead (%)', 'Efficiency (%)']
            protocol_bandwidth.index = protocol_bandwidth.index.map(self.get_protocol_name)
            protocol_bandwidth.index.name = 'Protocol'

            latex_code = protocol_bandwidth.to_latex(
                caption='Protocol Bandwidth Efficiency Comparison',
                label='tab:protocol_efficiency',
                float_format="%.2f"
            )

            with open(f'{output_dir}/protocol_efficiency.tex', 'w') as f:
                f.write(latex_code)

            print(f"  ✓ Created: protocol_efficiency.tex")
            print("\n--- Protocol Efficiency ---")
            print(protocol_bandwidth.to_string())

        # Generate combined comparison tables
        for metric in ['Mean', 'Median', 'P95']:
            comparison_stats = combined_df.groupby(['provider', 'test_file'])['duration_ms'].agg([
                ('Mean', 'mean'),
                ('Median', 'median'),
                ('P95', lambda x: x.quantile(0.95))
            ]).round(2)

            pivot_table = comparison_stats[metric].unstack(level=0)
            pivot_table.index = pivot_table.index.map(self.get_protocol_name)
            pivot_table.index.name = 'Protocol'

            latex_code = pivot_table.to_latex(
                caption=f'Resolver Latency Comparison - {metric} (ms)',
                label=f'tab:comparison_{metric.lower()}',
                float_format="%.2f"
            )

            with open(f'{output_dir}/comparison_{metric.lower()}.tex', 'w') as f:
                f.write(latex_code)

            print(f"  ✓ Created: comparison_{metric.lower()}.tex")

    def run_analysis(self):
        """Run the complete analysis"""
        print("="*80)
        print("Fast DNS QoS Analysis with Bandwidth")
        print("="*80)

        self.load_data()

        if not self.all_data:
            print("\n⚠ No data loaded.")
            return

        print("\n" + "="*80)
        self.create_line_graphs()

        print("\n" + "="*80)
        self.create_resolver_comparison_bars()

        print("\n" + "="*80)
        self.generate_latex_tables()

        print("\n" + "="*80)
        print("✓ Analysis Complete!")
        print("="*80)


if __name__ == "__main__":
    # Script entry point: analyse the 'results' directory.
    FastDNSAnalyzer(results_dir='results').run_analysis()