feat(dns): add dnscrypt and dns over tcp

This commit is contained in:
2026-02-04 22:08:05 +00:00
parent 5d9b630d13
commit 92351a80a9
12 changed files with 2576 additions and 568 deletions

View File

@@ -1,250 +1,362 @@
#!/usr/bin/env python3
"""
Fast PCAP preprocessor for DNS QoS analysis.
Adds network metrics from PCAP files to DNS CSV files
(bytes/packets sent and received, total bytes per query).
Loads each PCAP into memory first, then uses binary search for matching.
Uses the LAN IP to determine direction (LAN = sent, non-LAN = received).
"""
import csv
import os
import argparse
import re
import shutil
from pathlib import Path
from datetime import datetime, timezone
from scapy.all import rdpcap
from typing import Dict, List, NamedTuple
import time
def parse_timestamp(ts_str):
    """Parse an RFC3339Nano timestamp string.

    Accepts fractional seconds of any precision and either a numeric
    UTC offset (e.g. ``+02:00``) or the ``Z`` suffix that RFC3339 allows.

    Args:
        ts_str: Timestamp such as ``2024-01-02T03:04:05.123456789+00:00``.

    Returns:
        Tuple of (timezone-aware ``datetime`` truncated to microsecond
        precision, full fractional part as integer nanoseconds).

    Raises:
        ValueError: If ``ts_str`` does not match the expected format.
    """
    match = re.match(
        r'(\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2})\.(\d+)([\+\-]\d{2}:\d{2}|Z)',
        ts_str
    )
    if not match:
        raise ValueError(f"Invalid timestamp format: {ts_str}")
    base, nanos, tz = match.groups()
    # fromisoformat() only understands microsecond precision, and older
    # Pythons reject 'Z', so truncate/pad the fraction and normalize the
    # offset before handing it over.
    if tz == 'Z':
        tz = '+00:00'
    micros = nanos[:6].ljust(6, '0')
    iso_str = f"{base}.{micros}{tz}"
    dt = datetime.fromisoformat(iso_str)
    # Keep full precision separately: pad the fraction to 9 digits (ns).
    full_nanos = int(nanos.ljust(9, '0'))
    return dt, full_nanos
import dpkt
from dateutil import parser as date_parser
def read_pcap(pcap_path):
"""Read PCAP and return list of (timestamp_epoch, size)."""
class Packet(NamedTuple):
    """Minimal per-packet record kept in memory for window matching.

    Only timestamp, size and direction are retained (rather than full
    decoded packets) so large captures stay cheap to hold and sort.
    """
    timestamp: float  # capture time as Unix epoch seconds
    size: int  # frame length in bytes
    is_outbound: bool  # True if from LAN, False if from internet
class QueryWindow:
    """Per-query time window with byte/packet counters.

    __slots__ keeps instances compact when many queries are loaded;
    traffic counters start at zero and are filled in during matching.
    """
    __slots__ = ['index', 'start', 'end', 'sent', 'received', 'pkts_sent', 'pkts_received']

    def __init__(self, index: int, start: float, end: float):
        # Identity and window bounds come from the caller.
        self.index = index
        self.start = start
        self.end = end
        # Every counter begins empty.
        self.sent = self.received = 0
        self.pkts_sent = self.pkts_received = 0
def parse_csv_timestamp(ts_str: str) -> float:
"""Convert RFC3339Nano timestamp to Unix epoch (seconds)."""
dt = date_parser.isoparse(ts_str)
return dt.timestamp()
def is_lan_ip(ip_bytes: bytes) -> bool:
    """Return True when a raw 4-byte IPv4 address is private or loopback.

    Recognizes 10.0.0.0/8, 172.16.0.0/12, 192.168.0.0/16 and 127.0.0.0/8.
    Anything that is not exactly four bytes long is treated as non-LAN.
    """
    if len(ip_bytes) != 4:
        return False
    first, second = ip_bytes[0], ip_bytes[1]
    return (
        first in (10, 127)                        # 10.0.0.0/8, 127.0.0.0/8
        or (first == 172 and 16 <= second <= 31)  # 172.16.0.0/12
        or (first == 192 and second == 168)       # 192.168.0.0/16
    )
def load_pcap_into_memory(pcap_path: Path) -> List[Packet]:
    """Load all packets from a PCAP/PCAPNG file into memory.

    Keeps only (timestamp, size, direction) per packet so large captures
    are cheap to hold; direction is derived from whether the source IPv4
    address is a LAN address (see is_lan_ip).

    Fixes over the previous revision:
      * removed the leftover scapy-based loop that also appended bare
        (ts, len) tuples — those have no .timestamp attribute and broke
        the final sort;
      * removed the duplicated error print;
      * the pcapng fallback now catches ValueError (what dpkt raises on
        a bad pcap magic) instead of a bare except.

    Args:
        pcap_path: Path to a .pcap or .pcapng capture file.

    Returns:
        Packets sorted by timestamp (required by the binary search in
        find_packets_in_window); empty list when the file cannot be read.
    """
    packets = []
    print(f" Loading PCAP into memory...")
    start_time = time.time()
    try:
        with open(pcap_path, 'rb') as f:
            try:
                pcap = dpkt.pcap.Reader(f)
            except ValueError:
                # Not classic pcap magic; retry as pcapng format.
                f.seek(0)
                pcap = dpkt.pcapng.Reader(f)
            for ts, buf in pcap:
                try:
                    packet_time = float(ts)
                    packet_size = len(buf)
                    # Decode the Ethernet frame to find the source IP;
                    # default to outbound when direction is unknown.
                    eth = dpkt.ethernet.Ethernet(buf)
                    is_outbound = True
                    if isinstance(eth.data, dpkt.ip.IP):
                        is_outbound = is_lan_ip(eth.data.src)
                    packets.append(Packet(
                        timestamp=packet_time,
                        size=packet_size,
                        is_outbound=is_outbound
                    ))
                except (dpkt.dpkt.NeedData, dpkt.dpkt.UnpackError, AttributeError):
                    # Skip truncated or undecodable frames.
                    continue
    except Exception as e:
        print(f" Error reading PCAP: {e}")
        return []
    elapsed = time.time() - start_time
    print(f" Loaded {len(packets):,} packets in {elapsed:.2f}s")
    # Sort by timestamp so callers can binary-search windows.
    packets.sort(key=lambda p: p.timestamp)
    return packets
def find_packets_in_window(packets, start_ts, start_nanos, duration_ns):
    """Sum packet bytes and counts inside an exact time window.

    Args:
        packets: Iterable of (epoch_seconds, length) tuples.
        start_ts: Timezone-aware datetime of the window start, carrying
            microsecond precision (as returned by parse_timestamp).
        start_nanos: Full fractional second of the start in nanoseconds.
        duration_ns: Window length in nanoseconds.

    Returns:
        Tuple of (total_bytes, packet_count) for packets whose timestamp
        falls inside [start, start + duration].
    """
    start_epoch = start_ts.timestamp()
    # start_ts already includes the microseconds, so only the
    # sub-microsecond remainder of the nanosecond fraction is added.
    # (The previous `start_nanos % 1_000_000` double-counted the
    # microseconds already present in start_ts.)
    start_epoch += (start_nanos % 1_000) / 1_000_000_000
    end_epoch = start_epoch + (duration_ns / 1_000_000_000)
    total_bytes = 0
    packet_count = 0
    for pkt_ts, pkt_len in packets:
        if start_epoch <= pkt_ts <= end_epoch:
            total_bytes += pkt_len
            packet_count += 1
    return total_bytes, packet_count
def find_packets_in_window(
    packets: "List[Packet]",
    start_time: float,
    end_time: float,
    left_hint: int = 0
) -> "tuple[List[Packet], int]":
    """Binary search for all packets within a time window.

    This region of the file was diff residue: the deleted CSV-enhancing
    routine was interleaved line-by-line with this function. Only the
    surviving binary-search implementation is kept here.

    Args:
        packets: Packets sorted ascending by ``timestamp``.
        start_time: Window start, Unix epoch seconds (inclusive).
        end_time: Window end, Unix epoch seconds (inclusive).
        left_hint: Lower bound for the search — callers processing
            queries sorted by start time pass the previous result's
            hint to avoid rescanning earlier packets.

    Returns:
        Tuple of (matching_packets, left_index_hint_for_next_search).
    """
    if not packets:
        return [], 0
    # Binary search for the first packet with timestamp >= start_time.
    left, right = left_hint, len(packets) - 1
    first_idx = len(packets)
    while left <= right:
        mid = (left + right) // 2
        if packets[mid].timestamp >= start_time:
            first_idx = mid
            right = mid - 1
        else:
            left = mid + 1
    # No packets fall inside the window.
    if first_idx >= len(packets) or packets[first_idx].timestamp > end_time:
        return [], first_idx
    # Collect the contiguous run of packets inside the window.
    matching = []
    idx = first_idx
    while idx < len(packets) and packets[idx].timestamp <= end_time:
        matching.append(packets[idx])
        idx += 1
    return matching, first_idx
def main():
parser = argparse.ArgumentParser(
description='Add PCAP network metrics to DNS CSV files'
)
parser.add_argument('input_dir', help='Input directory (e.g., results)')
parser.add_argument(
'--output',
default='./results_enriched',
help='Output directory (default: ./results_enriched)'
)
parser.add_argument(
'--dry-run',
action='store_true',
help='Preview files without processing'
)
parser.add_argument(
'--debug',
action='store_true',
help='Show detailed timing information'
)
def load_csv_queries(csv_path: Path) -> List[Dict]:
    """Read a DNS result CSV into per-query window dicts.

    Each entry keeps the raw CSV row under 'data' together with the
    query's absolute start/end times in epoch seconds. Rows that cannot
    be parsed are skipped with a warning.
    """
    windows = []
    with open(csv_path, 'r', encoding='utf-8') as handle:
        for record in csv.DictReader(handle):
            try:
                begin = parse_csv_timestamp(record['timestamp'])
                length_s = float(record['duration_ns']) / 1e9
            except Exception as e:
                print(f" Warning: Skipping row - {e}")
                continue
            windows.append({
                'data': record,
                'start_time': begin,
                'end_time': begin + length_s,
            })
    return windows
def match_packets_to_queries(
    packets: "List[Packet]",
    queries: "List[Dict]"
) -> "List[Dict]":
    """Match packets to query windows using binary search.

    This region of the file was diff residue: the middle of the deleted
    argparse-based main() was interleaved with this function; only the
    surviving implementation is kept. One robustness fix: the traffic
    counters are now initialized BEFORE the early return, so downstream
    writers always find the metric keys even when no packets were loaded.

    Args:
        packets: Packets sorted ascending by timestamp.
        queries: Query dicts with 'start_time'/'end_time' epoch seconds;
            mutated in place with bytes/packet counters.

    Returns:
        The same ``queries`` list, enriched with bytes_sent,
        bytes_received, packets_sent, packets_received and total_bytes.
    """
    # Initialize metrics so every query carries the columns regardless
    # of whether any packets match (or exist at all).
    for q in queries:
        q['bytes_sent'] = 0
        q['bytes_received'] = 0
        q['packets_sent'] = 0
        q['packets_received'] = 0
        q['total_bytes'] = 0
    if not queries or not packets:
        return queries
    print(f" Matching packets to queries...")
    start_time = time.time()
    # Sort queries by start time so the left_hint optimization holds:
    # each search can start where the previous one began.
    queries_sorted = sorted(enumerate(queries), key=lambda x: x[1]['start_time'])
    matched_packets = 0
    left_hint = 0
    for _, q in queries_sorted:
        matching, left_hint = find_packets_in_window(
            packets,
            q['start_time'],
            q['end_time'],
            left_hint
        )
        for pkt in matching:
            matched_packets += 1
            if pkt.is_outbound:
                q['bytes_sent'] += pkt.size
                q['packets_sent'] += 1
            else:
                q['bytes_received'] += pkt.size
                q['packets_received'] += 1
        q['total_bytes'] = q['bytes_sent'] + q['bytes_received']
    elapsed = time.time() - start_time
    print(f" Matched {matched_packets:,} packets in {elapsed:.2f}s")
    # Summary statistics for the operator.
    total_sent = sum(q['bytes_sent'] for q in queries)
    total_recv = sum(q['bytes_received'] for q in queries)
    queries_with_data = sum(1 for q in queries if q['total_bytes'] > 0)
    print(f" Total: {total_sent:,} bytes sent, {total_recv:,} bytes received")
    print(f" Queries with data: {queries_with_data}/{len(queries)}")
    return queries
def write_enriched_csv(
    csv_path: Path, queries: List[Dict], backup: bool = True
):
    """Write the enriched CSV with bandwidth columns.

    Overwrites ``csv_path`` in place with the original columns plus the
    metrics computed by match_packets_to_queries, optionally keeping a
    one-time ``.csv.bak`` copy of the original file.

    Args:
        csv_path: CSV file to rewrite.
        queries: Query dicts carrying 'data' (original row) plus the
            metric keys listed in ``new_fields`` below.
        backup: When True, copy the original to .csv.bak first (an
            existing backup is never overwritten).
    """
    if not queries:
        # Nothing to write; previously this raised IndexError on
        # queries[0] below.
        print(f" Skipped (no queries): {csv_path.name}")
        return
    if backup and csv_path.exists():
        backup_path = csv_path.with_suffix('.csv.bak')
        if not backup_path.exists():  # Don't overwrite existing backup
            shutil.copy2(csv_path, backup_path)
            print(f" Backup: {backup_path.name}")
    # Original columns first, then the computed bandwidth columns.
    original_fields = list(queries[0]['data'].keys())
    new_fields = [
        'bytes_sent',
        'bytes_received',
        'packets_sent',
        'packets_received',
        'total_bytes',
    ]
    fieldnames = original_fields + new_fields
    with open(csv_path, 'w', encoding='utf-8', newline='') as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        for q in queries:
            row = q['data'].copy()
            for field in new_fields:
                row[field] = q[field]
            writer.writerow(row)
    print(f" Written: {csv_path.name}")
def process_provider_directory(provider_path: Path):
    """Process all CSV/PCAP pairs in a provider directory.

    This region of the file was diff residue: the tail of the deleted
    argparse-based main() (which referenced an undefined ``args``) was
    interleaved with this function; only the surviving implementation
    is kept. For each CSV with a sibling .pcap it loads the capture,
    matches packets to query windows and rewrites the CSV in place.

    Args:
        provider_path: Directory holding <name>.csv / <name>.pcap pairs.
    """
    print(f"\n{'='*60}")
    print(f"Processing: {provider_path.name.upper()}")
    print(f"{'='*60}")
    csv_files = sorted(provider_path.glob('*.csv'))
    processed = 0
    total_time = 0
    for csv_path in csv_files:
        # Skip backup files left behind by earlier runs.
        if '.bak' in csv_path.name:
            continue
        pcap_path = csv_path.with_suffix('.pcap')
        if not pcap_path.exists():
            print(f"\n ⚠ Skipping {csv_path.name} - no matching PCAP")
            continue
        print(f"\n 📁 {csv_path.name}")
        file_start = time.time()
        # Load PCAP into memory first.
        packets = load_pcap_into_memory(pcap_path)
        if not packets:
            print(f" ⚠ No packets found in PCAP")
            continue
        # Load CSV queries.
        queries = load_csv_queries(csv_path)
        if not queries:
            print(f" ⚠ No valid queries found")
            continue
        print(f" Loaded {len(queries):,} queries")
        # Match packets to queries and rewrite the CSV.
        enriched_queries = match_packets_to_queries(packets, queries)
        write_enriched_csv(csv_path, enriched_queries)
        file_time = time.time() - file_start
        total_time += file_time
        processed += 1
        print(f" ✓ Completed in {file_time:.2f}s")
    print(f"\n {'='*58}")
    print(f" {provider_path.name}: {processed} files in {total_time:.2f}s")
    print(f" {'='*58}")
if __name__ == "__main__":
exit(main())
def main(results_dir='results', providers=None):
    """Main preprocessing pipeline.

    Walks the results directory and enriches each provider's CSV/PCAP
    pairs. Parameterized (with backward-compatible defaults) so the
    hard-coded layout can be overridden from tests or other callers.

    Args:
        results_dir: Directory containing one subdirectory per provider.
        providers: Provider subdirectory names to process; defaults to
            the four DNS providers used in the experiment.
    """
    overall_start = time.time()
    print("\n" + "="*60)
    print("DNS PCAP PREPROCESSOR - Memory-Optimized Edition")
    print("="*60)
    results_dir = Path(results_dir)
    if not results_dir.exists():
        print(f"\n❌ Error: '{results_dir}' directory not found")
        return
    if providers is None:
        providers = ['adguard', 'cloudflare', 'google', 'quad9']
    for provider in providers:
        provider_path = results_dir / provider
        if provider_path.exists():
            process_provider_directory(provider_path)
        else:
            print(f"\n⚠ Warning: Provider directory not found: {provider}")
    overall_time = time.time() - overall_start
    print("\n" + "="*60)
    print(f"✓ PREPROCESSING COMPLETE")
    print(f" Total time: {overall_time:.2f}s ({overall_time/60:.1f} minutes)")
    print("="*60 + "\n")
if __name__ == '__main__':
main()