feat(scripts): add scripts to process data

commit 4cec2fabd4
parent 319c9d0767
Date: 2025-10-11 23:12:31 +01:00

8 changed files with 932 additions and 8 deletions


@@ -0,0 +1,289 @@
import csv
import os
import statistics
from collections import defaultdict
from pathlib import Path
def map_server_to_resolver(server):
"""Map server address/domain to resolver name"""
server_lower = server.lower()
if '1.1.1.1' in server_lower or 'cloudflare' in server_lower:
return 'Cloudflare'
elif '8.8.8.8' in server_lower or 'google' in server_lower:
return 'Google'
elif '9.9.9.9' in server_lower or 'quad9' in server_lower:
return 'Quad9'
elif 'adguard' in server_lower:
return 'AdGuard'
else:
return server # Fallback to original server name
def extract_from_new_format(filename):
"""Parse new filename format: protocol[-flags]-timestamp.csv"""
base = filename.replace('.csv', '')
parts = base.split('-')
if len(parts) < 2:
return None, None, None, None
protocol = parts[0]
timestamp = parts[-1]
# Flags are everything between protocol and timestamp
flags_str = '-'.join(parts[1:-1])
# Determine DNSSEC status
if 'auth' in flags_str:
dnssec_status = 'auth' # Authoritative DNSSEC
elif 'trust' in flags_str:
dnssec_status = 'trust' # Trust-based DNSSEC
else:
dnssec_status = 'off'
keepalive_status = 'on' if 'persist' in flags_str else 'off'
return protocol, dnssec_status, keepalive_status, flags_str
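# Worked example (hypothetical filename, matching the format above):
#   extract_from_new_format("doq-auth-persist-20251011T2300.csv")
#   parts   -> ["doq", "auth", "persist", "20251011T2300"]
#   returns -> ("doq", "auth", "on", "auth-persist")
# Note this assumes the trailing timestamp itself contains no '-'.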
def extract_server_info_from_csv(row):
"""Extract DNSSEC info from CSV row data"""
dnssec = row.get('dnssec', 'false').lower() == 'true'
auth_dnssec = row.get('auth_dnssec', 'false').lower() == 'true'
keepalive = row.get('keep_alive', 'false').lower() == 'true'
if dnssec:
if auth_dnssec:
dnssec_status = 'auth'
else:
dnssec_status = 'trust'
else:
dnssec_status = 'off'
keepalive_status = 'on' if keepalive else 'off'
return dnssec_status, keepalive_status
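# Resulting status mapping from the two CSV booleans:
#   dnssec=false                     -> dnssec_status 'off'
#   dnssec=true,  auth_dnssec=false  -> dnssec_status 'trust'
#   dnssec=true,  auth_dnssec=true   -> dnssec_status 'auth'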
def extract_server_info(file_path, row):
"""Extract info using directory structure, filename, and CSV data"""
path = Path(file_path)
# First try to get DNSSEC info from CSV row (most accurate)
try:
csv_dnssec_status, csv_keepalive_status = extract_server_info_from_csv(row)
protocol = row.get('protocol', '').lower()
# Get server from directory structure
parts = path.parts
if len(parts) >= 4:
potential_date = parts[-2]
# Check if it's a date like YYYY-MM-DD
if len(potential_date) == 10 and potential_date[4] == '-' and potential_date[7] == '-' and potential_date.replace('-', '').isdigit():
server = parts[-3] # resolver folder (e.g., cloudflare)
return protocol, server, csv_dnssec_status, csv_keepalive_status
# Fallback to DNS server field
server = row.get('dns_server', '')
return protocol, server, csv_dnssec_status, csv_keepalive_status
except (KeyError, ValueError):
pass
# Fallback to filename parsing
filename = path.name
protocol, dnssec_status, keepalive_status, flags = extract_from_new_format(filename)
if protocol:
# Get server from directory structure
parts = path.parts
if len(parts) >= 4:
potential_date = parts[-2]
if len(potential_date) == 10 and potential_date[4] == '-' and potential_date[7] == '-' and potential_date.replace('-', '').isdigit():
server = parts[-3]
return protocol, server, dnssec_status, keepalive_status
# Fallback to DNS server field
server = row.get('dns_server', '')
return protocol, server, dnssec_status, keepalive_status
return None, None, None, None
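# Directory layout assumed by the checks above (folder names illustrative):
#   <root>/<resolver>/<YYYY-MM-DD>/<protocol>[-flags]-<timestamp>.csv
#   e.g. results/cloudflare/2025-10-11/doq-auth-persist-<timestamp>.csv
# parts[-3] is the resolver folder, parts[-2] the run date, parts[-1] the file.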
def get_dnssec_display_name(dnssec_status):
"""Convert DNSSEC status to display name"""
if dnssec_status == 'auth':
return 'DNSSEC (Authoritative)'
elif dnssec_status == 'trust':
return 'DNSSEC (Trust-based)'
else:
return 'No DNSSEC'
def analyze_dns_data(root_directory, output_file):
"""Analyze DNS data and generate metrics"""
# Dictionary to store measurements: {(resolver, protocol, dnssec, keepalive): [durations]}
measurements = defaultdict(list)
# Walk through all directories
for root, dirs, files in os.walk(root_directory):
for file in files:
if file.endswith('.csv'):
file_path = os.path.join(root, file)
print(f"Processing: {file_path}")
try:
with open(file_path, 'r', newline='') as csvfile:
reader = csv.DictReader(csvfile)
for row_num, row in enumerate(reader, 2): # Start at 2 since header is row 1
try:
protocol, server, dnssec_status, keepalive_status = extract_server_info(file_path, row)
if protocol and server:
resolver = map_server_to_resolver(server)
duration_ms = float(row.get('duration_ms', 0))
# Only include successful queries
if row.get('response_code', '') in ['NOERROR', '']:
key = (resolver, protocol, dnssec_status, keepalive_status)
measurements[key].append(duration_ms)
except (ValueError, TypeError) as e:
print(f"Data parse error in {file_path} row {row_num}: {e}")
continue
except Exception as e:
print(f"Error processing file {file_path}: {e}")
continue
# Calculate statistics grouped by resolver first, then by configuration
resolver_results = defaultdict(lambda: defaultdict(lambda: defaultdict(list)))
for (resolver, protocol, dnssec, keepalive), durations in measurements.items():
if durations:
stats = {
'protocol': protocol.upper(),
'dnssec': dnssec,
'keepalive': keepalive,
'total_queries': len(durations),
'avg_latency_ms': round(statistics.mean(durations), 3),
'median_latency_ms': round(statistics.median(durations), 3),
'min_latency_ms': round(min(durations), 3),
'max_latency_ms': round(max(durations), 3),
'std_dev_ms': round(statistics.stdev(durations) if len(durations) > 1 else 0, 3),
'p95_latency_ms': round(statistics.quantiles(durations, n=20)[18], 3) if len(durations) >= 20 else round(max(durations), 3),
'p99_latency_ms': round(statistics.quantiles(durations, n=100)[98], 3) if len(durations) >= 100 else round(max(durations), 3)
}
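# Percentile note: statistics.quantiles(data, n=20) returns 19 cut points, so
# index 18 is the 95th percentile; with n=100, index 98 is the 99th percentile.
# Small samples fall back to max() as a conservative stand-in.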
# Group by resolver -> dnssec -> keepalive -> protocol
resolver_results[resolver][dnssec][keepalive].append(stats)
# Sort each configuration's results by average latency
for resolver in resolver_results:
for dnssec in resolver_results[resolver]:
for keepalive in resolver_results[resolver][dnssec]:
resolver_results[resolver][dnssec][keepalive].sort(key=lambda x: x['avg_latency_ms'])
# Write to CSV with all data
all_results = []
for resolver in resolver_results:
for dnssec in resolver_results[resolver]:
for keepalive in resolver_results[resolver][dnssec]:
for result in resolver_results[resolver][dnssec][keepalive]:
result['resolver'] = resolver
all_results.append(result)
with open(output_file, 'w', newline='') as csvfile:
fieldnames = [
'resolver', 'protocol', 'dnssec', 'keepalive', 'total_queries',
'avg_latency_ms', 'median_latency_ms', 'min_latency_ms',
'max_latency_ms', 'std_dev_ms', 'p95_latency_ms', 'p99_latency_ms'
]
writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
writer.writeheader()
writer.writerows(all_results)
print(f"\nAnalysis complete! Full results written to {output_file}")
print(f"Total measurements: {sum(len(durations) for durations in measurements.values())}")
def print_configuration_table(resolver, dnssec_status, keepalive_status, results):
"""Print a formatted table for a specific configuration"""
ka_indicator = "PERSISTENT" if keepalive_status == 'on' else "NEW CONN"
dnssec_display = get_dnssec_display_name(dnssec_status)
print(f"\n {dnssec_display} - {ka_indicator}")
print(" " + "-" * 90)
print(f" {'Protocol':<12} {'Queries':<8} {'Avg(ms)':<10} {'Median(ms)':<12} {'Min(ms)':<10} {'Max(ms)':<10} {'P95(ms)':<10}")
print(" " + "-" * 90)
for result in results:
print(f" {result['protocol']:<12} {result['total_queries']:<8} "
f"{result['avg_latency_ms']:<10} {result['median_latency_ms']:<12} "
f"{result['min_latency_ms']:<10} {result['max_latency_ms']:<10} "
f"{result['p95_latency_ms']:<10}")
# Print results grouped by resolver first
print(f"\n{'=' * 100}")
print("DNS RESOLVER PERFORMANCE COMPARISON")
print(f"{'=' * 100}")
for resolver in sorted(resolver_results.keys()):
print(f"\n{resolver} DNS Resolver")
print("=" * 100)
# Order configurations logically
config_order = [
('off', 'off'), # No DNSSEC, New connections
('off', 'on'), # No DNSSEC, Persistent
('trust', 'off'), # Trust DNSSEC, New connections
('trust', 'on'), # Trust DNSSEC, Persistent
('auth', 'off'), # Auth DNSSEC, New connections
('auth', 'on'), # Auth DNSSEC, Persistent
]
for dnssec_status, keepalive_status in config_order:
if dnssec_status in resolver_results[resolver] and keepalive_status in resolver_results[resolver][dnssec_status]:
results = resolver_results[resolver][dnssec_status][keepalive_status]
if results: # Only print if there are results
print_configuration_table(resolver, dnssec_status, keepalive_status, results)
# Summary comparison across resolvers
print(f"\n{'=' * 100}")
print("CROSS-RESOLVER PROTOCOL COMPARISON")
print(f"{'=' * 100}")
# Group by protocol and configuration for cross-resolver comparison
protocol_comparison = defaultdict(lambda: defaultdict(list))
for resolver in resolver_results:
for dnssec in resolver_results[resolver]:
for keepalive in resolver_results[resolver][dnssec]:
for result in resolver_results[resolver][dnssec][keepalive]:
config_key = f"{get_dnssec_display_name(dnssec)} - {'PERSISTENT' if keepalive == 'on' else 'NEW CONN'}"
protocol_comparison[result['protocol']][config_key].append({
'resolver': resolver,
'avg_latency_ms': result['avg_latency_ms'],
'total_queries': result['total_queries']
})
for protocol in sorted(protocol_comparison.keys()):
print(f"\n{protocol} Protocol Comparison")
print("-" * 100)
for config in sorted(protocol_comparison[protocol].keys()):
resolvers_data = protocol_comparison[protocol][config]
if resolvers_data:
print(f"\n {config}")
print(" " + "-" * 60)
print(f" {'Resolver':<15} {'Avg Latency (ms)':<20} {'Queries':<10}")
print(" " + "-" * 60)
# Sort by average latency
resolvers_data.sort(key=lambda x: x['avg_latency_ms'])
for data in resolvers_data:
print(f" {data['resolver']:<15} {data['avg_latency_ms']:<20} {data['total_queries']:<10}")
if __name__ == "__main__":
root_dir = "."
output_file = "dns_metrics.csv"
analyze_dns_data(root_dir, output_file)
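A minimal sketch of consuming the metrics file this script writes. The working directory and the idea of picking a "fastest" configuration are assumptions; only the dns_metrics.csv name and its column names come from the code above.

import csv

# Load the summary produced by analyze_dns_data() and print the lowest-latency
# configuration per resolver.
with open("dns_metrics.csv", newline="") as f:
    rows = list(csv.DictReader(f))

fastest = {}
for row in rows:
    key = row["resolver"]
    if key not in fastest or float(row["avg_latency_ms"]) < float(fastest[key]["avg_latency_ms"]):
        fastest[key] = row

for resolver, row in sorted(fastest.items()):
    print(f"{resolver}: {row['protocol']} "
          f"(DNSSEC={row['dnssec']}, keepalive={row['keepalive']}) "
          f"avg {row['avg_latency_ms']} ms")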

View File

@@ -0,0 +1,284 @@
#!/usr/bin/env python3
"""
Add network metrics from PCAP files to DNS CSV files.
Adds: pcap_network_bytes_in, pcap_network_bytes_out, pcap_overhead_bytes
"""
import csv
import os
import argparse
from pathlib import Path
from datetime import datetime, timezone
import dpkt
import socket
# Test machine IPs
TEST_IPS = {
'10.0.0.50',
'2001:818:e73e:ba00:5506:dfd4:ed8b:96e',
'fe80::fe98:c62e:4463:9a2d'
}
def inet_to_str(inet):
"""Convert inet bytes to IP string"""
try:
return socket.inet_ntop(socket.AF_INET, inet)
except ValueError:
try:
return socket.inet_ntop(socket.AF_INET6, inet)
except ValueError:
return None
def read_pcap(pcap_path):
"""Read PCAP and return list of (timestamp_ns, size, src_ip, dst_ip)"""
packets = []
with open(pcap_path, 'rb') as f:
try:
pcap = dpkt.pcap.Reader(f)
except ValueError:  # not a classic libpcap header; fall back to pcapng
f.seek(0)
pcap = dpkt.pcapng.Reader(f)
for ts, buf in pcap:
try:
# Convert PCAP timestamp (float seconds) to nanoseconds
timestamp_ns = int(ts * 1_000_000_000)
size = len(buf)
eth = dpkt.ethernet.Ethernet(buf)
src_ip = dst_ip = None
if isinstance(eth.data, dpkt.ip.IP):
src_ip = inet_to_str(eth.data.src)
dst_ip = inet_to_str(eth.data.dst)
elif isinstance(eth.data, dpkt.ip6.IP6):
src_ip = inet_to_str(eth.data.src)
dst_ip = inet_to_str(eth.data.dst)
if src_ip and dst_ip:
packets.append((timestamp_ns, size, src_ip, dst_ip))
except (dpkt.dpkt.NeedData, dpkt.dpkt.UnpackError):
continue
return packets
def find_packets_in_window(packets, start_ns, duration_ns):
"""Find packets within exact time window (nanosecond precision)"""
end_ns = start_ns + duration_ns
matching = []
for timestamp_ns, size, src_ip, dst_ip in packets:
if start_ns <= timestamp_ns <= end_ns:
matching.append((size, src_ip, dst_ip))
return matching
def calculate_metrics(packets):
"""Calculate network metrics from packets"""
bytes_in = 0
bytes_out = 0
for size, src_ip, dst_ip in packets:
if dst_ip in TEST_IPS:
bytes_in += size
elif src_ip in TEST_IPS:
bytes_out += size
return {
'pcap_network_bytes_in': bytes_in,
'pcap_network_bytes_out': bytes_out,
'pcap_overhead_bytes': bytes_in + bytes_out
}
def parse_timestamp_to_ns(ts_str):
"""Parse ISO timestamp to nanoseconds since epoch"""
try:
dt = datetime.fromisoformat(ts_str.replace('Z', '+00:00'))
if dt.tzinfo is not None:
dt = dt.astimezone(timezone.utc)
# Convert to nanoseconds since epoch
return int(dt.timestamp() * 1_000_000_000)
except ValueError:
return None
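# Precision note: fromisoformat() carries at most microseconds and timestamp()
# goes through a float, so the value returned here is nanoseconds since the
# epoch with roughly microsecond accuracy -- comparable to the PCAP
# timestamps, which dpkt also yields as float seconds.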
def enhance_csv(csv_path, pcap_path, output_path, debug=False):
"""Add PCAP metrics to CSV"""
if not os.path.exists(pcap_path):
print(f"⚠️ PCAP not found: {pcap_path}")
return False
print(f"Processing: {os.path.basename(csv_path)}")
# Read PCAP
try:
packets = read_pcap(pcap_path)
print(f" Loaded {len(packets)} packets")
if packets and debug:
first_pcap_ns = packets[0][0]
last_pcap_ns = packets[-1][0]
print(f" First PCAP packet: {first_pcap_ns} ns")
print(f" Last PCAP packet: {last_pcap_ns} ns")
print(f" PCAP duration: {(last_pcap_ns - first_pcap_ns) / 1e9:.3f}s")
except Exception as e:
print(f" ❌ Error reading PCAP: {e}")
return False
if not packets:
print(" ❌ No packets found")
return False
# Read CSV
with open(csv_path, 'r', newline='') as f:
reader = csv.DictReader(f)
fieldnames = list(reader.fieldnames) + [
'pcap_network_bytes_in',
'pcap_network_bytes_out',
'pcap_overhead_bytes'
]
rows = list(reader)
if rows and debug:
first_csv_ns = parse_timestamp_to_ns(rows[0]['timestamp'])
last_csv_ns = parse_timestamp_to_ns(rows[-1]['timestamp'])
if first_csv_ns and last_csv_ns:
print(f" First CSV query: {first_csv_ns} ns")
print(f" Last CSV query: {last_csv_ns} ns")
print(f" CSV duration: {(last_csv_ns - first_csv_ns) / 1e9:.3f}s")
# Check alignment
offset_ns = packets[0][0] - first_csv_ns
print(f" Time offset (PCAP - CSV): {offset_ns / 1e9:.3f}s")
# Enhance rows
enhanced = []
matched = 0
for i, row in enumerate(rows):
ts_ns = parse_timestamp_to_ns(row['timestamp'])
if not ts_ns:
continue
duration_ns = int(row.get('duration_ns', 0))
matching_packets = find_packets_in_window(packets, ts_ns, duration_ns)
metrics = calculate_metrics(matching_packets)
row.update(metrics)
enhanced.append(row)
if metrics['pcap_overhead_bytes'] > 0:
matched += 1
# Debug first few queries
if debug and i < 3:
print(f" Query {i}: {row['domain']}")
print(f" Start: {ts_ns} ns")
print(f" Duration: {duration_ns} ns ({duration_ns / 1e6:.3f}ms)")
print(f" End: {ts_ns + duration_ns} ns")
print(f" Matched packets: {len(matching_packets)}")
print(f" Bytes: {metrics['pcap_overhead_bytes']}")
print(f" Matched: {matched}/{len(rows)} queries")
if matched == 0:
print(" ⚠️ WARNING: No queries matched any packets!")
print(" This might indicate timestamp misalignment.")
# Write output
os.makedirs(os.path.dirname(output_path), exist_ok=True)
with open(output_path, 'w', newline='') as f:
writer = csv.DictWriter(f, fieldnames=fieldnames, extrasaction='ignore')
writer.writeheader()
writer.writerows(enhanced)
print(f" ✓ Saved: {output_path}")
return True
def main():
parser = argparse.ArgumentParser(
description='Add PCAP network metrics to DNS CSV files'
)
parser.add_argument('input_dir', help='Input directory (e.g., results_merged)')
parser.add_argument(
'--output',
default='./results_enhanced',
help='Output directory (default: ./results_enhanced)'
)
parser.add_argument(
'--dry-run',
action='store_true',
help='Preview files without processing'
)
parser.add_argument(
'--debug',
action='store_true',
help='Show detailed timing information'
)
args = parser.parse_args()
print("=" * 60)
print("ENHANCE DNS CSVs WITH PCAP METRICS")
print("=" * 60)
print(f"Input: {args.input_dir}")
print(f"Output: {args.output}")
if args.debug:
print("Debug: ENABLED")
print()
# Find CSV files
csv_files = list(Path(args.input_dir).rglob('*.csv'))
if not csv_files:
print("❌ No CSV files found")
return 1
print(f"Found {len(csv_files)} CSV files\n")
if args.dry_run:
print("DRY RUN - would process:")
for csv_path in csv_files:
pcap_path = csv_path.with_suffix('.pcap')
print(f" {csv_path.relative_to(args.input_dir)}")
print(f" PCAP: {'' if pcap_path.exists() else ''}")
return 0
# Process files
success = 0
failed = 0
for csv_path in csv_files:
pcap_path = csv_path.with_suffix('.pcap')
rel_path = csv_path.relative_to(args.input_dir)
output_path = Path(args.output) / rel_path
if enhance_csv(str(csv_path), str(pcap_path), str(output_path),
args.debug):
success += 1
else:
failed += 1
print()
# Summary
print("=" * 60)
print(f"✓ Success: {success}")
print(f"✗ Failed: {failed}")
print(f"Total: {len(csv_files)}")
print(f"\nOutput: {args.output}")
return 0 if failed == 0 else 1
if __name__ == "__main__":
exit(main())
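A short post-run check sketch. The ./results_enhanced path is just the script's default output directory; the three pcap_* column names are the ones added by enhance_csv() above.

import csv
from pathlib import Path

# Sum the PCAP-derived traffic across every enhanced CSV.
total_in = total_out = 0
for csv_path in Path("./results_enhanced").rglob("*.csv"):
    with open(csv_path, newline="") as f:
        for row in csv.DictReader(f):
            total_in += int(row.get("pcap_network_bytes_in", 0) or 0)
            total_out += int(row.get("pcap_network_bytes_out", 0) or 0)

print(f"bytes in:  {total_in}")
print(f"bytes out: {total_out}")
print(f"overhead:  {total_in + total_out}")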


@@ -0,0 +1,367 @@
#!/usr/bin/env python3
"""
Advanced PCAP filter for DNS traffic (with IPv6 support).
Filters out:
- Local network traffic except test machine (IPv4: 10.0.0.50; IPv6: specific addresses)
- AdGuard DNS servers (for non-AdGuard captures)
- Non-DNS traffic based on protocol-specific ports
"""
import os
import subprocess
from pathlib import Path
import argparse
# Test machine IPs (IPv4 plus global and link-local IPv6 addresses of the capture host)
TEST_IPV4 = '10.0.0.50'
TEST_IPV6_GLOBAL = '2001:818:e73e:ba00:5506:dfd4:ed8b:96e'
TEST_IPV6_LINKLOCAL = 'fe80::fe98:c62e:4463:9a2d'
# Port mappings
PORT_MAP = {
'udp': [53], # DNS-over-UDP
'tls': [53, 853], # DNS-over-TLS
'https': [53, 443], # DNS-over-HTTPS (DoH)
'doq': [53, 784, 8853], # DNS-over-QUIC
'doh3': [53, 443] # DNS-over-HTTP/3
}
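# Note: RFC 9250 assigns UDP port 853 to DNS-over-QUIC; if the resolvers under
# test use the standard port rather than the legacy 784/8853, 853 may need to
# be added to the 'doq' entry above so that traffic is not filtered out.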
# AdGuard DNS IPs to filter out (for non-AdGuard captures)
ADGUARD_IPS = [
'94.140.14.14',
'94.140.15.15',
'2a10:50c0::ad1:ff',
'2a10:50c0::ad2:ff'
]
def parse_filename(filename):
"""Extract protocol from filename"""
base = filename.replace('.pcap', '').replace('.csv', '')
parts = base.split('-')
if len(parts) < 1: # Minimum: protocol
return None
protocol = parts[0].lower()
return protocol
def extract_resolver_from_path(pcap_path):
"""Extract resolver name from directory structure"""
parts = Path(pcap_path).parts
for part in parts:
if part.lower() in ['cloudflare', 'google', 'quad9', 'adguard']:
return part.lower()
return None
def build_filter_expression(protocol, resolver):
"""
Build tshark filter expression.
Strategy:
1. Only protocol-specific DNS ports
2. Keep only traffic involving the test machine (IPv4/IPv6)
3. Exclude AdGuard IPs for non-AdGuard captures
"""
# Get ports for this protocol
ports = PORT_MAP.get(protocol, [53, 443, 853, 784, 8853])
# Build port filter (UDP or TCP on these ports)
port_conditions = []
for port in ports:
port_conditions.append(f'(udp.port == {port} or tcp.port == {port})')
port_filter = ' or '.join(port_conditions)
# Build test machine filter (keep if src or dst is test machine IP)
machine_conditions = [f'(ip.addr == {TEST_IPV4})']
if TEST_IPV6_GLOBAL:
machine_conditions.append(f'(ipv6.addr == {TEST_IPV6_GLOBAL})')
if TEST_IPV6_LINKLOCAL:
machine_conditions.append(f'(ipv6.addr == {TEST_IPV6_LINKLOCAL})')
machine_filter = ' or '.join(machine_conditions)
# Build AdGuard exclusion filter
adguard_exclusions = []
if resolver != 'adguard':
for ip in ADGUARD_IPS:
if ':' in ip: # IPv6
adguard_exclusions.append(f'!(ipv6.addr == {ip})')
else: # IPv4
adguard_exclusions.append(f'!(ip.addr == {ip})')
# Combine all filters
filters = [f'({port_filter})', f'({machine_filter})']
if adguard_exclusions:
adguard_filter = ' and '.join(adguard_exclusions)
filters.append(f'({adguard_filter})')
final_filter = ' and '.join(filters)
return final_filter
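# Example of the expression this builds for protocol='tls', resolver='cloudflare'
# (wrapped and IPv6 addresses abbreviated here; the real string is one line):
#   ((udp.port == 53 or tcp.port == 53) or (udp.port == 853 or tcp.port == 853))
#   and ((ip.addr == 10.0.0.50) or (ipv6.addr == 2001:818:...) or (ipv6.addr == fe80::...))
#   and (!(ip.addr == 94.140.14.14) and !(ip.addr == 94.140.15.15)
#        and !(ipv6.addr == 2a10:50c0::ad1:ff) and !(ipv6.addr == 2a10:50c0::ad2:ff))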
def filter_pcap(input_path, output_path, filter_expr, verbose=False):
"""Apply filter to PCAP file using tshark"""
cmd = [
'tshark',
'-r', input_path,
'-Y', filter_expr,
'-w', output_path,
'-F', 'pcap'
]
try:
if verbose:
print(f" Filter: {filter_expr}")
result = subprocess.run(
cmd,
capture_output=True,
text=True,
timeout=300
)
if result.returncode != 0:
print(f" ✗ Error: {result.stderr.strip()}")
return False
if not os.path.exists(output_path):
print(f" ✗ Output file not created")
return False
output_size = os.path.getsize(output_path)
if output_size < 24:
print(f" ⚠ Warning: Output is empty")
return True
except subprocess.TimeoutExpired:
print(f" ✗ Timeout (>5 minutes)")
return False
except Exception as e:
print(f" ✗ Exception: {e}")
return False
def find_pcap_files(root_dir):
"""Recursively find all PCAP files"""
pcap_files = []
for root, dirs, files in os.walk(root_dir):
for file in files:
if file.endswith('.pcap'):
full_path = os.path.join(root, file)
pcap_files.append(full_path)
return sorted(pcap_files)
def format_bytes(bytes_val):
"""Format bytes as human readable"""
for unit in ['B', 'KB', 'MB', 'GB']:
if bytes_val < 1024.0:
return f"{bytes_val:.1f} {unit}"
bytes_val /= 1024.0
return f"{bytes_val:.1f} TB"
def main():
parser = argparse.ArgumentParser(
description='Advanced PCAP filter for DNS traffic (IPv4/IPv6)',
formatter_class=argparse.RawDescriptionHelpFormatter,
epilog='''
Filtering rules:
1. Only include traffic on protocol-specific DNS ports
2. Keep only packets involving the test machine (10.0.0.50 or its IPv6 addresses)
3. Exclude AdGuard IPs for non-AdGuard captures
Protocol-specific ports:
udp: 53
tls: 53, 853
https: 53, 443
doq: 53, 784, 8853
doh3: 53, 443
Examples:
# Dry run
%(prog)s ./results --dry-run
# Filter with verbose output
%(prog)s ./results --verbose
# Custom output directory
%(prog)s ./results --output ./cleaned
'''
)
parser.add_argument(
'input_dir',
help='Input directory containing PCAP files'
)
parser.add_argument(
'-o', '--output',
default='./results_filtered',
help='Output directory (default: ./results_filtered)'
)
parser.add_argument(
'--dry-run',
action='store_true',
help='Show what would be done without filtering'
)
parser.add_argument(
'--limit',
type=int,
help='Only process first N files (for testing)'
)
parser.add_argument(
'-v', '--verbose',
action='store_true',
help='Verbose output (show filter expressions)'
)
parser.add_argument(
'--overwrite',
action='store_true',
help='Overwrite existing filtered files'
)
args = parser.parse_args()
# Check for tshark
try:
result = subprocess.run(
['tshark', '-v'],
capture_output=True,
check=True
)
if args.verbose:
version = result.stdout.decode().split('\n')[0]
print(f"Using: {version}\n")
except (subprocess.CalledProcessError, FileNotFoundError):
print("Error: tshark not found. Install Wireshark/tshark:")
print(" Ubuntu/Debian: sudo apt-get install tshark")
print(" macOS: brew install wireshark")
return 1
print("=" * 80)
print("ADVANCED DNS PCAP FILTER (IPv4/IPv6)")
print("=" * 80)
print("Filters:")
print(" 1. Protocol-specific DNS ports only")
print(" 2. Keep only traffic involving test machine (10.0.0.50 / IPv6 addresses)")
print(" 3. Exclude AdGuard IPs (for non-AdGuard captures)")
print(f"\nInput: {args.input_dir}")
print(f"Output: {args.output}")
# Find PCAP files
print(f"\nScanning for PCAP files...")
pcap_files = find_pcap_files(args.input_dir)
if not pcap_files:
print(f"No PCAP files found in {args.input_dir}")
return 1
print(f"Found {len(pcap_files)} PCAP files")
total_input_size = sum(os.path.getsize(f) for f in pcap_files)
print(f"Total size: {format_bytes(total_input_size)}")
if args.limit:
pcap_files = pcap_files[:args.limit]
print(f"Limiting to first {args.limit} files")
if args.dry_run:
print("\n*** DRY RUN MODE ***\n")
else:
print()
# Process files
success_count = 0
skip_count = 0
fail_count = 0
total_output_size = 0
for i, input_path in enumerate(pcap_files, 1):
# Extract info from path
filename = Path(input_path).name
protocol = parse_filename(filename)
resolver = extract_resolver_from_path(input_path)
if not protocol:
print(f"[{i}/{len(pcap_files)}] {filename}")
print(f" ⚠ Could not parse protocol, skipping")
skip_count += 1
continue
# Create output path
rel_path = os.path.relpath(input_path, args.input_dir)
output_path = os.path.join(args.output, rel_path)
input_size = os.path.getsize(input_path)
print(f"[{i}/{len(pcap_files)}] {rel_path}")
print(f" Protocol: {protocol.upper()}")
print(f" Resolver: {resolver or 'unknown'}")
print(f" Size: {format_bytes(input_size)}")
# Check if already filtered
if os.path.exists(output_path) and not args.overwrite:
output_size = os.path.getsize(output_path)
reduction = ((input_size - output_size) / input_size * 100) if input_size > 0 else 0
print(f" ⊙ Already filtered: {format_bytes(output_size)} "
f"({reduction:.1f}% reduction)")
skip_count += 1
total_output_size += output_size
continue
# Build filter
filter_expr = build_filter_expression(protocol, resolver)
if args.dry_run:
print(f" → Would filter")
if args.verbose:
print(f" Filter: {filter_expr}")
continue
# Create output directory
os.makedirs(os.path.dirname(output_path), exist_ok=True)
# Filter
success = filter_pcap(input_path, output_path, filter_expr, args.verbose)
if success:
output_size = os.path.getsize(output_path)
reduction = ((input_size - output_size) / input_size * 100) if input_size > 0 else 0
print(f" ✓ Filtered: {format_bytes(output_size)} "
f"({reduction:.1f}% reduction)")
success_count += 1
total_output_size += output_size
else:
fail_count += 1
# Summary
print("\n" + "=" * 80)
print("SUMMARY")
print("=" * 80)
if args.dry_run:
print(f"Would process: {len(pcap_files)} files")
else:
print(f"Successful: {success_count}")
print(f"Skipped: {skip_count} (already filtered or unparseable)")
print(f"Failed: {fail_count}")
print(f"Total: {len(pcap_files)}")
if success_count > 0 or skip_count > 0:
print(f"\nInput size: {format_bytes(total_input_size)}")
print(f"Output size: {format_bytes(total_output_size)}")
if total_input_size > 0:
reduction = ((total_input_size - total_output_size) /
total_input_size * 100)
print(f"Reduction: {reduction:.1f}%")
print(f"\nOutput directory: {args.output}")
return 0 if fail_count == 0 else 1
if __name__ == "__main__":
exit(main())
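A small verification sketch for after a filtering run, assuming an unmodified ./results tree next to the script's default ./results_filtered output (both paths are assumptions based on the defaults above).

import os
from pathlib import Path

# Compare per-file sizes between the raw and filtered capture trees.
raw_root, filtered_root = Path("./results"), Path("./results_filtered")
for filtered in sorted(filtered_root.rglob("*.pcap")):
    raw = raw_root / filtered.relative_to(filtered_root)
    if raw.exists():
        before, after = os.path.getsize(raw), os.path.getsize(filtered)
        saved = (before - after) / before * 100 if before else 0.0
        print(f"{filtered.relative_to(filtered_root)}: "
              f"{before} -> {after} bytes ({saved:.1f}% smaller)")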


@@ -0,0 +1,274 @@
#!/usr/bin/env python3
"""
Merge DNS test files by configuration.
- Merges CSVs of same config (adds 'run_id' column for traceability)
- Optionally merges PCAPs using mergecap
- Flattens date structure
"""
import os
import csv
import subprocess
import shutil
from pathlib import Path
import argparse
from collections import defaultdict
def parse_filename(filename):
"""
Extract config key from filename.
Format: protocol[-flags]-timestamp.{csv,pcap}
Config key: protocol[-flags] (ignores timestamp)
"""
base = filename.replace('.csv', '').replace('.pcap', '')
parts = base.split('-')
if len(parts) < 2:
return None
# Config is everything except timestamp
config = '-'.join(parts[:-1])
timestamp = parts[-1]
return config, timestamp
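# Example (hypothetical filename in the documented format):
#   parse_filename("doq-auth-persist-20251011T2300.csv")
#   -> ("doq-auth-persist", "20251011T2300")
# Everything before the final '-' is the config key, so runs of the same
# config from different days merge into one output file.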
def extract_resolver_from_path(file_path):
"""Extract resolver name from path"""
parts = Path(file_path).parts
for part in parts:
if part.lower() in ['cloudflare', 'google', 'quad9', 'adguard']:
return part.lower()
return None
def find_files(root_dir, extension):
"""Find all files with given extension"""
files = []
for root, dirs, filenames in os.walk(root_dir):
for filename in filenames:
if filename.endswith(extension):
full_path = os.path.join(root, filename)
files.append(full_path)
return sorted(files)
def merge_csvs(csv_files, output_path, fieldnames):
"""Merge multiple CSVs into one, adding 'run_id' column"""
with open(output_path, 'w', newline='') as outfile:
writer = csv.DictWriter(outfile, fieldnames=fieldnames + ['run_id'])
writer.writeheader()
for csv_path in csv_files:
# Use timestamp as run_id
filename = Path(csv_path).name
_, timestamp = parse_filename(filename)
run_id = timestamp # Or add date if needed
with open(csv_path, 'r', newline='') as infile:
reader = csv.DictReader(infile)
for row in reader:
row['run_id'] = run_id
writer.writerow(row)
def merge_pcaps(pcap_files, output_path):
"""Merge PCAP files using mergecap"""
cmd = ['mergecap', '-w', output_path] + pcap_files
try:
subprocess.run(cmd, capture_output=True, check=True)
return True
except subprocess.CalledProcessError as e:
print(f" ✗ mergecap error: {e.stderr.decode()}")
return False
except FileNotFoundError:
print("Error: mergecap not found. Install Wireshark:")
print(" Ubuntu: sudo apt install wireshark-common")
print(" macOS: brew install wireshark")
return False
def format_bytes(bytes_val):
"""Format bytes as human readable"""
for unit in ['B', 'KB', 'MB', 'GB']:
if bytes_val < 1024.0:
return f"{bytes_val:.1f} {unit}"
bytes_val /= 1024.0
return f"{bytes_val:.1f} TB"
def main():
parser = argparse.ArgumentParser(
description='Merge DNS test files by configuration',
formatter_class=argparse.RawDescriptionHelpFormatter,
epilog='''
Merges files of same config across dates/timestamps.
Output: ./results_merged/[resolver]/[config].csv (merged)
./results_merged/[resolver]/[config].pcap (merged, if --merge-pcaps)
Examples:
# Dry run to preview
%(prog)s ./results --dry-run
# Merge CSVs only (recommended)
%(prog)s ./results
# Merge CSVs and PCAPs
%(prog)s ./results --merge-pcaps
# Custom output directory
%(prog)s ./results --output ./merged_data
'''
)
parser.add_argument(
'input_dir',
help='Input directory (e.g., ./results)'
)
parser.add_argument(
'--output',
default='./results_merged',
help='Output directory (default: ./results_merged)'
)
parser.add_argument(
'--merge-pcaps',
action='store_true',
help='Merge PCAP files (requires mergecap from Wireshark)'
)
parser.add_argument(
'--dry-run',
action='store_true',
help='Show what would be done without merging'
)
parser.add_argument(
'-y', '--yes',
action='store_true',
help='Skip confirmation prompt'
)
args = parser.parse_args()
if not os.path.isdir(args.input_dir):
print(f"Error: Input directory not found: {args.input_dir}")
return 1
# Find all files
print("=" * 80)
print("MERGE DNS TEST FILES")
print("=" * 80)
print(f"Input: {args.input_dir}")
print(f"Output: {args.output}")
print(f"Merge PCAPs: {'Yes' if args.merge_pcaps else 'No'}")
csv_files = find_files(args.input_dir, '.csv')
pcap_files = find_files(args.input_dir, '.pcap') if args.merge_pcaps else []
if not csv_files and not pcap_files:
print("\nNo CSV/PCAP files found")
return 1
print(f"\nFound {len(csv_files)} CSV files")
if args.merge_pcaps:
print(f"Found {len(pcap_files)} PCAP files")
# Group files by resolver and config
csv_groups = defaultdict(list)
pcap_groups = defaultdict(list)
for csv_path in csv_files:
config, _ = parse_filename(Path(csv_path).name)
resolver = extract_resolver_from_path(csv_path)
if config and resolver:
key = (resolver, config)
csv_groups[key].append(csv_path)
for pcap_path in pcap_files:
config, _ = parse_filename(Path(pcap_path).name)
resolver = extract_resolver_from_path(pcap_path)
if config and resolver:
key = (resolver, config)
pcap_groups[key].append(pcap_path)
# Summary
print("\nConfigs to merge:")
print("-" * 80)
for (resolver, config), files in sorted(csv_groups.items()):
print(f" {resolver}/{config}: {len(files)} runs")
total_runs = sum(len(files) for files in csv_groups.values())
print(f"\nTotal configs: {len(csv_groups)}")
print(f"Total runs: {total_runs}")
if args.dry_run:
print("\n*** DRY RUN MODE ***\n")
for (resolver, config) in sorted(csv_groups.keys()):
print(f"Would merge: {resolver}/{config} ({len(csv_groups[(resolver, config)])} CSVs)")
if args.merge_pcaps and (resolver, config) in pcap_groups:
print(f"Would merge: {resolver}/{config} ({len(pcap_groups[(resolver, config)])} PCAPs)")
return 0
# Confirmation
if not args.yes:
response = input(f"\nMerge all into {args.output}? [y/N] ")
if response.lower() not in ['y', 'yes']:
print("Cancelled")
return 0
# Merge
print("\n" + "=" * 80)
print("MERGING FILES")
print("=" * 80)
success_count = 0
fail_count = 0
total_queries = 0
total_size = 0
# Get standard CSV fieldnames (from first file)
first_csv = next(iter(csv_files))
with open(first_csv, 'r') as f:
reader = csv.DictReader(f)
fieldnames = reader.fieldnames
for (resolver, config), files in sorted(csv_groups.items()):
print(f"\n{resolver}/{config} ({len(files)} runs)")
# Merge CSVs
output_csv = os.path.join(args.output, resolver, f"{config}.csv")
os.makedirs(os.path.dirname(output_csv), exist_ok=True)
merge_csvs(files, output_csv, fieldnames)
# Count queries in merged file
with open(output_csv, 'r') as f:
query_count = sum(1 for _ in csv.reader(f)) - 1 # Minus header
print(f" ✓ Merged CSV: {query_count:,} queries")
total_queries += query_count
success_count += 1
# Merge PCAPs if requested
if args.merge_pcaps and (resolver, config) in pcap_groups:
output_pcap = os.path.join(args.output, resolver, f"{config}.pcap")
pcap_list = pcap_groups[(resolver, config)]
if merge_pcaps(pcap_list, output_pcap):
merged_size = os.path.getsize(output_pcap)
orig_size = sum(os.path.getsize(p) for p in pcap_list)
print(f" ✓ Merged PCAP: {format_bytes(merged_size)} "
f"(from {format_bytes(orig_size)})")
total_size += merged_size
else:
print(f" ✗ PCAP merge failed")
fail_count += 1
# Final summary
print("\n" + "=" * 80)
print("COMPLETE")
print("=" * 80)
print(f"Successful configs: {success_count}")
print(f"Failed: {fail_count}")
print(f"Total queries: {total_queries:,}")
if args.merge_pcaps:
print(f"Total PCAP size: {format_bytes(total_size)}")
print(f"\nMerged files in: {args.output}")
return 0 if fail_count == 0 else 1
if __name__ == "__main__":
exit(main())
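A quick sanity-check sketch for the merged output. The ./results_merged path is just the script's default; the run_id column is the one added by merge_csvs() above.

import csv
from pathlib import Path

# For each merged CSV, report how many distinct runs were folded into it.
for merged in sorted(Path("./results_merged").rglob("*.csv")):
    with open(merged, newline="") as f:
        run_ids = {row["run_id"] for row in csv.DictReader(f)}
    print(f"{merged.relative_to('./results_merged')}: {len(run_ids)} runs")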