feat(scripts): add scripts to process data
This commit is contained in:
1
.gitignore
vendored
1
.gitignore
vendored
@@ -17,3 +17,4 @@
|
||||
**/tls-key-log.txt
|
||||
/results
|
||||
/results.bak
|
||||
/results_merged
|
||||
|
||||
6
flake.lock
generated
6
flake.lock
generated
@@ -2,11 +2,11 @@
|
||||
"nodes": {
|
||||
"nixpkgs": {
|
||||
"locked": {
|
||||
"lastModified": 1758916627,
|
||||
"narHash": "sha256-fB2ISCc+xn+9hZ6gOsABxSBcsCgLCjbJ5bC6U9bPzQ4=",
|
||||
"lastModified": 1760103332,
|
||||
"narHash": "sha256-BMsGVfKl4Q80Pr9T1AkCRljO1bpwCmY8rTBVj8XGuhA=",
|
||||
"owner": "NixOS",
|
||||
"repo": "nixpkgs",
|
||||
"rev": "53614373268559d054c080d070cfc732dbe68ac4",
|
||||
"rev": "870493f9a8cb0b074ae5b411b2f232015db19a65",
|
||||
"type": "github"
|
||||
},
|
||||
"original": {
|
||||
|
||||
@@ -155,8 +155,6 @@ func (r *MeasurementRunner) runQueries(dnsClient client.DNSClient, upstream stri
|
||||
|
||||
r.printQueryResult(metric)
|
||||
|
||||
// For keep-alive connections, add smaller delays between queries
|
||||
// to better utilize the persistent connection
|
||||
if r.config.KeepAlive {
|
||||
time.Sleep(5 * time.Millisecond)
|
||||
} else {
|
||||
@@ -182,7 +180,6 @@ func (r *MeasurementRunner) performQuery(dnsClient client.DNSClient, domain, ups
|
||||
AuthoritativeDNSSEC: r.config.AuthoritativeDNSSEC,
|
||||
KeepAlive: r.config.KeepAlive,
|
||||
DNSServer: upstream,
|
||||
Timestamp: time.Now(),
|
||||
}
|
||||
|
||||
msg := new(dns.Msg)
|
||||
@@ -199,6 +196,7 @@ func (r *MeasurementRunner) performQuery(dnsClient client.DNSClient, domain, ups
|
||||
metric.RequestSize = len(packed)
|
||||
|
||||
start := time.Now()
|
||||
metric.Timestamp = start
|
||||
resp, err := dnsClient.Query(msg)
|
||||
metric.Duration = time.Since(start).Nanoseconds()
|
||||
metric.DurationMs = float64(metric.Duration) / 1e6
|
||||
|
||||
@@ -67,7 +67,7 @@ func (mw *MetricsWriter) WriteMetric(metric DNSMetric) error {
|
||||
strconv.FormatBool(metric.AuthoritativeDNSSEC),
|
||||
strconv.FormatBool(metric.KeepAlive),
|
||||
metric.DNSServer,
|
||||
metric.Timestamp.Format(time.RFC3339),
|
||||
metric.Timestamp.Format(time.RFC3339Nano),
|
||||
strconv.FormatInt(metric.Duration, 10),
|
||||
strconv.FormatFloat(metric.DurationMs, 'f', 3, 64),
|
||||
strconv.Itoa(metric.RequestSize),
|
||||
|
||||
@@ -10,7 +10,7 @@ def map_server_to_resolver(server):
|
||||
|
||||
if '1.1.1.1' in server_lower or 'cloudflare' in server_lower:
|
||||
return 'Cloudflare'
|
||||
elif '8.8.8.8' in server_lower or 'dns.google' in server_lower:
|
||||
elif '8.8.8.8' in server_lower or 'google' in server_lower:
|
||||
return 'Google'
|
||||
elif '9.9.9.9' in server_lower or 'quad9' in server_lower:
|
||||
return 'Quad9'
|
||||
284
scripts/tools/add_extra_metrics_to_csv.py
Normal file
284
scripts/tools/add_extra_metrics_to_csv.py
Normal file
@@ -0,0 +1,284 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Add network metrics from PCAP files to DNS CSV files.
|
||||
Adds: pcap_network_bytes_in, pcap_network_bytes_out, pcap_overhead_bytes
|
||||
"""
|
||||
|
||||
import csv
|
||||
import os
|
||||
import argparse
|
||||
from pathlib import Path
|
||||
from datetime import datetime, timezone
|
||||
import dpkt
|
||||
import socket
|
||||
|
||||
# Test machine IPs
|
||||
TEST_IPS = {
|
||||
'10.0.0.50',
|
||||
'2001:818:e73e:ba00:5506:dfd4:ed8b:96e',
|
||||
'fe80::fe98:c62e:4463:9a2d'
|
||||
}
|
||||
|
||||
|
||||
def inet_to_str(inet):
|
||||
"""Convert inet bytes to IP string"""
|
||||
try:
|
||||
return socket.inet_ntop(socket.AF_INET, inet)
|
||||
except ValueError:
|
||||
try:
|
||||
return socket.inet_ntop(socket.AF_INET6, inet)
|
||||
except ValueError:
|
||||
return None
|
||||
|
||||
|
||||
def read_pcap(pcap_path):
|
||||
"""Read PCAP and return list of (timestamp_ns, size, src_ip, dst_ip)"""
|
||||
packets = []
|
||||
|
||||
with open(pcap_path, 'rb') as f:
|
||||
try:
|
||||
pcap = dpkt.pcap.Reader(f)
|
||||
except:
|
||||
f.seek(0)
|
||||
pcap = dpkt.pcapng.Reader(f)
|
||||
|
||||
for ts, buf in pcap:
|
||||
try:
|
||||
# Convert PCAP timestamp (float seconds) to nanoseconds
|
||||
timestamp_ns = int(ts * 1_000_000_000)
|
||||
size = len(buf)
|
||||
eth = dpkt.ethernet.Ethernet(buf)
|
||||
|
||||
src_ip = dst_ip = None
|
||||
|
||||
if isinstance(eth.data, dpkt.ip.IP):
|
||||
src_ip = inet_to_str(eth.data.src)
|
||||
dst_ip = inet_to_str(eth.data.dst)
|
||||
elif isinstance(eth.data, dpkt.ip6.IP6):
|
||||
src_ip = inet_to_str(eth.data.src)
|
||||
dst_ip = inet_to_str(eth.data.dst)
|
||||
|
||||
if src_ip and dst_ip:
|
||||
packets.append((timestamp_ns, size, src_ip, dst_ip))
|
||||
|
||||
except (dpkt.dpkt.NeedData, dpkt.dpkt.UnpackError):
|
||||
continue
|
||||
|
||||
return packets
|
||||
|
||||
|
||||
def find_packets_in_window(packets, start_ns, duration_ns):
|
||||
"""Find packets within exact time window (nanosecond precision)"""
|
||||
end_ns = start_ns + duration_ns
|
||||
|
||||
matching = []
|
||||
for timestamp_ns, size, src_ip, dst_ip in packets:
|
||||
if start_ns <= timestamp_ns <= end_ns:
|
||||
matching.append((size, src_ip, dst_ip))
|
||||
|
||||
return matching
|
||||
|
||||
|
||||
def calculate_metrics(packets):
|
||||
"""Calculate network metrics from packets"""
|
||||
bytes_in = 0
|
||||
bytes_out = 0
|
||||
|
||||
for size, src_ip, dst_ip in packets:
|
||||
if dst_ip in TEST_IPS:
|
||||
bytes_in += size
|
||||
elif src_ip in TEST_IPS:
|
||||
bytes_out += size
|
||||
|
||||
return {
|
||||
'pcap_network_bytes_in': bytes_in,
|
||||
'pcap_network_bytes_out': bytes_out,
|
||||
'pcap_overhead_bytes': bytes_in + bytes_out
|
||||
}
|
||||
|
||||
|
||||
def parse_timestamp_to_ns(ts_str):
|
||||
"""Parse ISO timestamp to nanoseconds since epoch"""
|
||||
try:
|
||||
dt = datetime.fromisoformat(ts_str.replace('Z', '+00:00'))
|
||||
if dt.tzinfo is not None:
|
||||
dt = dt.astimezone(timezone.utc)
|
||||
# Convert to nanoseconds since epoch
|
||||
return int(dt.timestamp() * 1_000_000_000)
|
||||
except ValueError:
|
||||
return None
|
||||
|
||||
|
||||
def enhance_csv(csv_path, pcap_path, output_path, debug=False):
|
||||
"""Add PCAP metrics to CSV"""
|
||||
if not os.path.exists(pcap_path):
|
||||
print(f"⚠️ PCAP not found: {pcap_path}")
|
||||
return False
|
||||
|
||||
print(f"Processing: {os.path.basename(csv_path)}")
|
||||
|
||||
# Read PCAP
|
||||
try:
|
||||
packets = read_pcap(pcap_path)
|
||||
print(f" Loaded {len(packets)} packets")
|
||||
|
||||
if packets and debug:
|
||||
first_pcap_ns = packets[0][0]
|
||||
last_pcap_ns = packets[-1][0]
|
||||
print(f" First PCAP packet: {first_pcap_ns} ns")
|
||||
print(f" Last PCAP packet: {last_pcap_ns} ns")
|
||||
print(f" PCAP duration: {(last_pcap_ns - first_pcap_ns) / 1e9:.3f}s")
|
||||
|
||||
except Exception as e:
|
||||
print(f" ❌ Error reading PCAP: {e}")
|
||||
return False
|
||||
|
||||
if not packets:
|
||||
print(" ❌ No packets found")
|
||||
return False
|
||||
|
||||
# Read CSV
|
||||
with open(csv_path, 'r', newline='') as f:
|
||||
reader = csv.DictReader(f)
|
||||
fieldnames = list(reader.fieldnames) + [
|
||||
'pcap_network_bytes_in',
|
||||
'pcap_network_bytes_out',
|
||||
'pcap_overhead_bytes'
|
||||
]
|
||||
rows = list(reader)
|
||||
|
||||
if rows and debug:
|
||||
first_csv_ns = parse_timestamp_to_ns(rows[0]['timestamp'])
|
||||
last_csv_ns = parse_timestamp_to_ns(rows[-1]['timestamp'])
|
||||
if first_csv_ns and last_csv_ns:
|
||||
print(f" First CSV query: {first_csv_ns} ns")
|
||||
print(f" Last CSV query: {last_csv_ns} ns")
|
||||
print(f" CSV duration: {(last_csv_ns - first_csv_ns) / 1e9:.3f}s")
|
||||
|
||||
# Check alignment
|
||||
offset_ns = packets[0][0] - first_csv_ns
|
||||
print(f" Time offset (PCAP - CSV): {offset_ns / 1e9:.3f}s")
|
||||
|
||||
# Enhance rows
|
||||
enhanced = []
|
||||
matched = 0
|
||||
|
||||
for i, row in enumerate(rows):
|
||||
ts_ns = parse_timestamp_to_ns(row['timestamp'])
|
||||
if not ts_ns:
|
||||
continue
|
||||
|
||||
duration_ns = int(row.get('duration_ns', 0))
|
||||
|
||||
matching_packets = find_packets_in_window(packets, ts_ns, duration_ns)
|
||||
|
||||
metrics = calculate_metrics(matching_packets)
|
||||
row.update(metrics)
|
||||
enhanced.append(row)
|
||||
|
||||
if metrics['pcap_overhead_bytes'] > 0:
|
||||
matched += 1
|
||||
|
||||
# Debug first few queries
|
||||
if debug and i < 3:
|
||||
print(f" Query {i}: {row['domain']}")
|
||||
print(f" Start: {ts_ns} ns")
|
||||
print(f" Duration: {duration_ns} ns ({duration_ns / 1e6:.3f}ms)")
|
||||
print(f" End: {ts_ns + duration_ns} ns")
|
||||
print(f" Matched packets: {len(matching_packets)}")
|
||||
print(f" Bytes: {metrics['pcap_overhead_bytes']}")
|
||||
|
||||
print(f" Matched: {matched}/{len(rows)} queries")
|
||||
|
||||
if matched == 0:
|
||||
print(" ⚠️ WARNING: No queries matched any packets!")
|
||||
print(" This might indicate timestamp misalignment.")
|
||||
|
||||
# Write output
|
||||
os.makedirs(os.path.dirname(output_path), exist_ok=True)
|
||||
with open(output_path, 'w', newline='') as f:
|
||||
writer = csv.DictWriter(f, fieldnames=fieldnames, extrasaction='ignore')
|
||||
writer.writeheader()
|
||||
writer.writerows(enhanced)
|
||||
|
||||
print(f" ✓ Saved: {output_path}")
|
||||
return True
|
||||
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser(
|
||||
description='Add PCAP network metrics to DNS CSV files'
|
||||
)
|
||||
parser.add_argument('input_dir', help='Input directory (e.g., results_merged)')
|
||||
parser.add_argument(
|
||||
'--output',
|
||||
default='./results_enhanced',
|
||||
help='Output directory (default: ./results_enhanced)'
|
||||
)
|
||||
parser.add_argument(
|
||||
'--dry-run',
|
||||
action='store_true',
|
||||
help='Preview files without processing'
|
||||
)
|
||||
parser.add_argument(
|
||||
'--debug',
|
||||
action='store_true',
|
||||
help='Show detailed timing information'
|
||||
)
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
print("=" * 60)
|
||||
print("ENHANCE DNS CSVs WITH PCAP METRICS")
|
||||
print("=" * 60)
|
||||
print(f"Input: {args.input_dir}")
|
||||
print(f"Output: {args.output}")
|
||||
if args.debug:
|
||||
print("Debug: ENABLED")
|
||||
print()
|
||||
|
||||
# Find CSV files
|
||||
csv_files = list(Path(args.input_dir).rglob('*.csv'))
|
||||
|
||||
if not csv_files:
|
||||
print("❌ No CSV files found")
|
||||
return 1
|
||||
|
||||
print(f"Found {len(csv_files)} CSV files\n")
|
||||
|
||||
if args.dry_run:
|
||||
print("DRY RUN - would process:")
|
||||
for csv_path in csv_files:
|
||||
pcap_path = csv_path.with_suffix('.pcap')
|
||||
print(f" {csv_path.relative_to(args.input_dir)}")
|
||||
print(f" PCAP: {'✓' if pcap_path.exists() else '✗'}")
|
||||
return 0
|
||||
|
||||
# Process files
|
||||
success = 0
|
||||
failed = 0
|
||||
|
||||
for csv_path in csv_files:
|
||||
pcap_path = csv_path.with_suffix('.pcap')
|
||||
rel_path = csv_path.relative_to(args.input_dir)
|
||||
output_path = Path(args.output) / rel_path
|
||||
|
||||
if enhance_csv(str(csv_path), str(pcap_path), str(output_path),
|
||||
args.debug):
|
||||
success += 1
|
||||
else:
|
||||
failed += 1
|
||||
print()
|
||||
|
||||
# Summary
|
||||
print("=" * 60)
|
||||
print(f"✓ Success: {success}")
|
||||
print(f"✗ Failed: {failed}")
|
||||
print(f"Total: {len(csv_files)}")
|
||||
print(f"\nOutput: {args.output}")
|
||||
|
||||
return 0 if failed == 0 else 1
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
exit(main())
|
||||
367
scripts/tools/clean_pcaps.py
Normal file
367
scripts/tools/clean_pcaps.py
Normal file
@@ -0,0 +1,367 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Advanced PCAP filter for DNS traffic (with IPv6 support).
|
||||
|
||||
Filters out:
|
||||
- Local network traffic except test machine (IPv4: 10.0.0.50; IPv6: specific addresses)
|
||||
- AdGuard DNS servers (for non-AdGuard captures)
|
||||
- Non-DNS traffic based on protocol-specific ports
|
||||
"""
|
||||
|
||||
import os
|
||||
import subprocess
|
||||
from pathlib import Path
|
||||
import argparse
|
||||
|
||||
# Test machine IPs (IPv4 and IPv6 from your provided info)
|
||||
TEST_IPV4 = '10.0.0.50'
|
||||
TEST_IPV6_GLOBAL = '2001:818:e73e:ba00:5506:dfd4:ed8b:96e'
|
||||
TEST_IPV6_LINKLOCAL = 'fe80::fe98:c62e:4463:9a2d'
|
||||
|
||||
# Port mappings
|
||||
PORT_MAP = {
|
||||
'udp': [53], # DNS-over-UDP
|
||||
'tls': [53, 853], # DNS-over-TLS
|
||||
'https': [53, 443], # DNS-over-HTTPS (DoH)
|
||||
'doq': [53, 784, 8853], # DNS-over-QUIC
|
||||
'doh3': [53, 443] # DNS-over-HTTP/3
|
||||
}
|
||||
|
||||
# AdGuard DNS IPs to filter out (for non-AdGuard captures)
|
||||
ADGUARD_IPS = [
|
||||
'94.140.14.14',
|
||||
'94.140.15.15',
|
||||
'2a10:50c0::ad1:ff',
|
||||
'2a10:50c0::ad2:ff'
|
||||
]
|
||||
|
||||
def parse_filename(filename):
|
||||
"""Extract protocol from filename"""
|
||||
base = filename.replace('.pcap', '').replace('.csv', '')
|
||||
parts = base.split('-')
|
||||
|
||||
if len(parts) < 1: # Minimum: protocol
|
||||
return None
|
||||
|
||||
protocol = parts[0].lower()
|
||||
return protocol
|
||||
|
||||
def extract_resolver_from_path(pcap_path):
|
||||
"""Extract resolver name from directory structure"""
|
||||
parts = Path(pcap_path).parts
|
||||
|
||||
for part in parts:
|
||||
if part.lower() in ['cloudflare', 'google', 'quad9', 'adguard']:
|
||||
return part.lower()
|
||||
|
||||
return None
|
||||
|
||||
def build_filter_expression(protocol, resolver):
|
||||
"""
|
||||
Build tshark filter expression.
|
||||
|
||||
Strategy:
|
||||
1. Only protocol-specific DNS ports
|
||||
2. Keep only traffic involving the test machine (IPv4/IPv6)
|
||||
3. Exclude AdGuard IPs for non-AdGuard captures
|
||||
"""
|
||||
|
||||
# Get ports for this protocol
|
||||
ports = PORT_MAP.get(protocol, [53, 443, 853, 784, 8853])
|
||||
|
||||
# Build port filter (UDP or TCP on these ports)
|
||||
port_conditions = []
|
||||
for port in ports:
|
||||
port_conditions.append(f'(udp.port == {port} or tcp.port == {port})')
|
||||
|
||||
port_filter = ' or '.join(port_conditions)
|
||||
|
||||
# Build test machine filter (keep if src or dst is test machine IP)
|
||||
machine_conditions = [f'(ip.addr == {TEST_IPV4})']
|
||||
if TEST_IPV6_GLOBAL:
|
||||
machine_conditions.append(f'(ipv6.addr == {TEST_IPV6_GLOBAL})')
|
||||
if TEST_IPV6_LINKLOCAL:
|
||||
machine_conditions.append(f'(ipv6.addr == {TEST_IPV6_LINKLOCAL})')
|
||||
|
||||
machine_filter = ' or '.join(machine_conditions)
|
||||
|
||||
# Build AdGuard exclusion filter
|
||||
adguard_exclusions = []
|
||||
if resolver != 'adguard':
|
||||
for ip in ADGUARD_IPS:
|
||||
if ':' in ip: # IPv6
|
||||
adguard_exclusions.append(f'!(ipv6.addr == {ip})')
|
||||
else: # IPv4
|
||||
adguard_exclusions.append(f'!(ip.addr == {ip})')
|
||||
|
||||
# Combine all filters
|
||||
filters = [f'({port_filter})', f'({machine_filter})']
|
||||
|
||||
if adguard_exclusions:
|
||||
adguard_filter = ' and '.join(adguard_exclusions)
|
||||
filters.append(f'({adguard_filter})')
|
||||
|
||||
final_filter = ' and '.join(filters)
|
||||
|
||||
return final_filter
|
||||
|
||||
def filter_pcap(input_path, output_path, filter_expr, verbose=False):
|
||||
"""Apply filter to PCAP file using tshark"""
|
||||
|
||||
cmd = [
|
||||
'tshark',
|
||||
'-r', input_path,
|
||||
'-Y', filter_expr,
|
||||
'-w', output_path,
|
||||
'-F', 'pcap'
|
||||
]
|
||||
|
||||
try:
|
||||
if verbose:
|
||||
print(f" Filter: {filter_expr}")
|
||||
|
||||
result = subprocess.run(
|
||||
cmd,
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=300
|
||||
)
|
||||
|
||||
if result.returncode != 0:
|
||||
print(f" ✗ Error: {result.stderr.strip()}")
|
||||
return False
|
||||
|
||||
if not os.path.exists(output_path):
|
||||
print(f" ✗ Output file not created")
|
||||
return False
|
||||
|
||||
output_size = os.path.getsize(output_path)
|
||||
if output_size < 24:
|
||||
print(f" ⚠ Warning: Output is empty")
|
||||
|
||||
return True
|
||||
|
||||
except subprocess.TimeoutExpired:
|
||||
print(f" ✗ Timeout (>5 minutes)")
|
||||
return False
|
||||
except Exception as e:
|
||||
print(f" ✗ Exception: {e}")
|
||||
return False
|
||||
|
||||
def find_pcap_files(root_dir):
|
||||
"""Recursively find all PCAP files"""
|
||||
pcap_files = []
|
||||
for root, dirs, files in os.walk(root_dir):
|
||||
for file in files:
|
||||
if file.endswith('.pcap'):
|
||||
full_path = os.path.join(root, file)
|
||||
pcap_files.append(full_path)
|
||||
return sorted(pcap_files)
|
||||
|
||||
def format_bytes(bytes_val):
|
||||
"""Format bytes as human readable"""
|
||||
for unit in ['B', 'KB', 'MB', 'GB']:
|
||||
if bytes_val < 1024.0:
|
||||
return f"{bytes_val:.1f} {unit}"
|
||||
bytes_val /= 1024.0
|
||||
return f"{bytes_val:.1f} TB"
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser(
|
||||
description='Advanced PCAP filter for DNS traffic (IPv4/IPv6)',
|
||||
formatter_class=argparse.RawDescriptionHelpFormatter,
|
||||
epilog='''
|
||||
Filtering rules:
|
||||
1. Only include traffic on protocol-specific DNS ports
|
||||
2. Keep only packets involving the test machine (10.0.0.50 or its IPv6 addresses)
|
||||
3. Exclude AdGuard IPs for non-AdGuard captures
|
||||
|
||||
Protocol-specific ports:
|
||||
udp: 53
|
||||
tls: 53, 853
|
||||
https: 53, 443
|
||||
doq: 53, 784, 8853
|
||||
doh3: 53, 443
|
||||
|
||||
Examples:
|
||||
# Dry run
|
||||
%(prog)s ./results --dry-run
|
||||
|
||||
# Filter with verbose output
|
||||
%(prog)s ./results --verbose
|
||||
|
||||
# Custom output directory
|
||||
%(prog)s ./results --output ./cleaned
|
||||
'''
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
'input_dir',
|
||||
help='Input directory containing PCAP files'
|
||||
)
|
||||
parser.add_argument(
|
||||
'-o', '--output',
|
||||
default='./results_filtered',
|
||||
help='Output directory (default: ./results_filtered)'
|
||||
)
|
||||
parser.add_argument(
|
||||
'--dry-run',
|
||||
action='store_true',
|
||||
help='Show what would be done without filtering'
|
||||
)
|
||||
parser.add_argument(
|
||||
'--limit',
|
||||
type=int,
|
||||
help='Only process first N files (for testing)'
|
||||
)
|
||||
parser.add_argument(
|
||||
'-v', '--verbose',
|
||||
action='store_true',
|
||||
help='Verbose output (show filter expressions)'
|
||||
)
|
||||
parser.add_argument(
|
||||
'--overwrite',
|
||||
action='store_true',
|
||||
help='Overwrite existing filtered files'
|
||||
)
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
# Check for tshark
|
||||
try:
|
||||
result = subprocess.run(
|
||||
['tshark', '-v'],
|
||||
capture_output=True,
|
||||
check=True
|
||||
)
|
||||
if args.verbose:
|
||||
version = result.stdout.decode().split('\n')[0]
|
||||
print(f"Using: {version}\n")
|
||||
except (subprocess.CalledProcessError, FileNotFoundError):
|
||||
print("Error: tshark not found. Install Wireshark/tshark:")
|
||||
print(" Ubuntu/Debian: sudo apt-get install tshark")
|
||||
print(" macOS: brew install wireshark")
|
||||
return 1
|
||||
|
||||
print("=" * 80)
|
||||
print("ADVANCED DNS PCAP FILTER (IPv4/IPv6)")
|
||||
print("=" * 80)
|
||||
print("Filters:")
|
||||
print(" 1. Protocol-specific DNS ports only")
|
||||
print(" 2. Keep only traffic involving test machine (10.0.0.50 / IPv6 addresses)")
|
||||
print(" 3. Exclude AdGuard IPs (for non-AdGuard captures)")
|
||||
print(f"\nInput: {args.input_dir}")
|
||||
print(f"Output: {args.output}")
|
||||
|
||||
# Find PCAP files
|
||||
print(f"\nScanning for PCAP files...")
|
||||
pcap_files = find_pcap_files(args.input_dir)
|
||||
|
||||
if not pcap_files:
|
||||
print(f"No PCAP files found in {args.input_dir}")
|
||||
return 1
|
||||
|
||||
print(f"Found {len(pcap_files)} PCAP files")
|
||||
|
||||
total_input_size = sum(os.path.getsize(f) for f in pcap_files)
|
||||
print(f"Total size: {format_bytes(total_input_size)}")
|
||||
|
||||
if args.limit:
|
||||
pcap_files = pcap_files[:args.limit]
|
||||
print(f"Limiting to first {args.limit} files")
|
||||
|
||||
if args.dry_run:
|
||||
print("\n*** DRY RUN MODE ***\n")
|
||||
else:
|
||||
print()
|
||||
|
||||
# Process files
|
||||
success_count = 0
|
||||
skip_count = 0
|
||||
fail_count = 0
|
||||
total_output_size = 0
|
||||
|
||||
for i, input_path in enumerate(pcap_files, 1):
|
||||
# Extract info from path
|
||||
filename = Path(input_path).name
|
||||
protocol = parse_filename(filename)
|
||||
resolver = extract_resolver_from_path(input_path)
|
||||
|
||||
if not protocol:
|
||||
print(f"[{i}/{len(pcap_files)}] {filename}")
|
||||
print(f" ⚠ Could not parse protocol, skipping")
|
||||
skip_count += 1
|
||||
continue
|
||||
|
||||
# Create output path
|
||||
rel_path = os.path.relpath(input_path, args.input_dir)
|
||||
output_path = os.path.join(args.output, rel_path)
|
||||
|
||||
input_size = os.path.getsize(input_path)
|
||||
|
||||
print(f"[{i}/{len(pcap_files)}] {rel_path}")
|
||||
print(f" Protocol: {protocol.upper()}")
|
||||
print(f" Resolver: {resolver or 'unknown'}")
|
||||
print(f" Size: {format_bytes(input_size)}")
|
||||
|
||||
# Check if already filtered
|
||||
if os.path.exists(output_path) and not args.overwrite:
|
||||
output_size = os.path.getsize(output_path)
|
||||
reduction = ((input_size - output_size) / input_size * 100) if input_size > 0 else 0
|
||||
print(f" ⊙ Already filtered: {format_bytes(output_size)} "
|
||||
f"({reduction:.1f}% reduction)")
|
||||
skip_count += 1
|
||||
total_output_size += output_size
|
||||
continue
|
||||
|
||||
# Build filter
|
||||
filter_expr = build_filter_expression(protocol, resolver)
|
||||
|
||||
if args.dry_run:
|
||||
print(f" → Would filter")
|
||||
if args.verbose:
|
||||
print(f" Filter: {filter_expr}")
|
||||
continue
|
||||
|
||||
# Create output directory
|
||||
os.makedirs(os.path.dirname(output_path), exist_ok=True)
|
||||
|
||||
# Filter
|
||||
success = filter_pcap(input_path, output_path, filter_expr, args.verbose)
|
||||
|
||||
if success:
|
||||
output_size = os.path.getsize(output_path)
|
||||
reduction = ((input_size - output_size) / input_size * 100) if input_size > 0 else 0
|
||||
print(f" ✓ Filtered: {format_bytes(output_size)} "
|
||||
f"({reduction:.1f}% reduction)")
|
||||
success_count += 1
|
||||
total_output_size += output_size
|
||||
else:
|
||||
fail_count += 1
|
||||
|
||||
# Summary
|
||||
print("\n" + "=" * 80)
|
||||
print("SUMMARY")
|
||||
print("=" * 80)
|
||||
|
||||
if args.dry_run:
|
||||
print(f"Would process: {len(pcap_files)} files")
|
||||
else:
|
||||
print(f"Successful: {success_count}")
|
||||
print(f"Skipped: {skip_count} (already filtered or unparseable)")
|
||||
print(f"Failed: {fail_count}")
|
||||
print(f"Total: {len(pcap_files)}")
|
||||
|
||||
if success_count > 0 or skip_count > 0:
|
||||
print(f"\nInput size: {format_bytes(total_input_size)}")
|
||||
print(f"Output size: {format_bytes(total_output_size)}")
|
||||
if total_input_size > 0:
|
||||
reduction = ((total_input_size - total_output_size) /
|
||||
total_input_size * 100)
|
||||
print(f"Reduction: {reduction:.1f}%")
|
||||
print(f"\nOutput directory: {args.output}")
|
||||
|
||||
return 0 if fail_count == 0 else 1
|
||||
|
||||
if __name__ == "__main__":
|
||||
exit(main())
|
||||
274
scripts/tools/merge_files.py
Normal file
274
scripts/tools/merge_files.py
Normal file
@@ -0,0 +1,274 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Merge DNS test files by configuration.
|
||||
|
||||
- Merges CSVs of same config (adds 'run_id' column for traceability)
|
||||
- Optionally merges PCAPs using mergecap
|
||||
- Flattens date structure
|
||||
"""
|
||||
|
||||
import os
|
||||
import csv
|
||||
import subprocess
|
||||
import shutil
|
||||
from pathlib import Path
|
||||
import argparse
|
||||
from collections import defaultdict
|
||||
|
||||
def parse_filename(filename):
|
||||
"""
|
||||
Extract config key from filename.
|
||||
Format: protocol[-flags]-timestamp.{csv,pcap}
|
||||
Config key: protocol[-flags] (ignores timestamp)
|
||||
"""
|
||||
base = filename.replace('.csv', '').replace('.pcap', '')
|
||||
parts = base.split('-')
|
||||
|
||||
if len(parts) < 2:
|
||||
return None
|
||||
|
||||
# Config is everything except timestamp
|
||||
config = '-'.join(parts[:-1])
|
||||
timestamp = parts[-1]
|
||||
|
||||
return config, timestamp
|
||||
|
||||
def extract_resolver_from_path(file_path):
|
||||
"""Extract resolver name from path"""
|
||||
parts = Path(file_path).parts
|
||||
for part in parts:
|
||||
if part.lower() in ['cloudflare', 'google', 'quad9', 'adguard']:
|
||||
return part.lower()
|
||||
return None
|
||||
|
||||
def find_files(root_dir, extension):
|
||||
"""Find all files with given extension"""
|
||||
files = []
|
||||
for root, dirs, filenames in os.walk(root_dir):
|
||||
for filename in filenames:
|
||||
if filename.endswith(extension):
|
||||
full_path = os.path.join(root, filename)
|
||||
files.append(full_path)
|
||||
return sorted(files)
|
||||
|
||||
def merge_csvs(csv_files, output_path, fieldnames):
|
||||
"""Merge multiple CSVs into one, adding 'run_id' column"""
|
||||
with open(output_path, 'w', newline='') as outfile:
|
||||
writer = csv.DictWriter(outfile, fieldnames=fieldnames + ['run_id'])
|
||||
writer.writeheader()
|
||||
|
||||
for csv_path in csv_files:
|
||||
# Use timestamp as run_id
|
||||
filename = Path(csv_path).name
|
||||
_, timestamp = parse_filename(filename)
|
||||
run_id = timestamp # Or add date if needed
|
||||
|
||||
with open(csv_path, 'r', newline='') as infile:
|
||||
reader = csv.DictReader(infile)
|
||||
for row in reader:
|
||||
row['run_id'] = run_id
|
||||
writer.writerow(row)
|
||||
|
||||
def merge_pcaps(pcap_files, output_path):
|
||||
"""Merge PCAP files using mergecap"""
|
||||
cmd = ['mergecap', '-w', output_path] + pcap_files
|
||||
try:
|
||||
subprocess.run(cmd, capture_output=True, check=True)
|
||||
return True
|
||||
except subprocess.CalledProcessError as e:
|
||||
print(f" ✗ mergecap error: {e.stderr.decode()}")
|
||||
return False
|
||||
except FileNotFoundError:
|
||||
print("Error: mergecap not found. Install Wireshark:")
|
||||
print(" Ubuntu: sudo apt install wireshark-common")
|
||||
print(" macOS: brew install wireshark")
|
||||
return False
|
||||
|
||||
def format_bytes(bytes_val):
|
||||
"""Format bytes as human readable"""
|
||||
for unit in ['B', 'KB', 'MB', 'GB']:
|
||||
if bytes_val < 1024.0:
|
||||
return f"{bytes_val:.1f} {unit}"
|
||||
bytes_val /= 1024.0
|
||||
return f"{bytes_val:.1f} TB"
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser(
|
||||
description='Merge DNS test files by configuration',
|
||||
formatter_class=argparse.RawDescriptionHelpFormatter,
|
||||
epilog='''
|
||||
Merges files of same config across dates/timestamps.
|
||||
Output: ./results_merged/[resolver]/[config].csv (merged)
|
||||
./results_merged/[resolver]/[config].pcap (merged, if --merge-pcaps)
|
||||
|
||||
Examples:
|
||||
# Dry run to preview
|
||||
%(prog)s ./results --dry-run
|
||||
|
||||
# Merge CSVs only (recommended)
|
||||
%(prog)s ./results
|
||||
|
||||
# Merge CSVs and PCAPs
|
||||
%(prog)s ./results --merge-pcaps
|
||||
|
||||
# Custom output directory
|
||||
%(prog)s ./results --output ./merged_data
|
||||
'''
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
'input_dir',
|
||||
help='Input directory (e.g., ./results)'
|
||||
)
|
||||
parser.add_argument(
|
||||
'--output',
|
||||
default='./results_merged',
|
||||
help='Output directory (default: ./results_merged)'
|
||||
)
|
||||
parser.add_argument(
|
||||
'--merge-pcaps',
|
||||
action='store_true',
|
||||
help='Merge PCAP files (requires mergecap from Wireshark)'
|
||||
)
|
||||
parser.add_argument(
|
||||
'--dry-run',
|
||||
action='store_true',
|
||||
help='Show what would be done without merging'
|
||||
)
|
||||
parser.add_argument(
|
||||
'-y', '--yes',
|
||||
action='store_true',
|
||||
help='Skip confirmation prompt'
|
||||
)
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
if not os.path.isdir(args.input_dir):
|
||||
print(f"Error: Input directory not found: {args.input_dir}")
|
||||
return 1
|
||||
|
||||
# Find all files
|
||||
print("=" * 80)
|
||||
print("MERGE DNS TEST FILES")
|
||||
print("=" * 80)
|
||||
print(f"Input: {args.input_dir}")
|
||||
print(f"Output: {args.output}")
|
||||
print(f"Merge PCAPs: {'Yes' if args.merge_pcaps else 'No'}")
|
||||
|
||||
csv_files = find_files(args.input_dir, '.csv')
|
||||
pcap_files = find_files(args.input_dir, '.pcap') if args.merge_pcaps else []
|
||||
|
||||
if not csv_files and not pcap_files:
|
||||
print("\nNo CSV/PCAP files found")
|
||||
return 1
|
||||
|
||||
print(f"\nFound {len(csv_files)} CSV files")
|
||||
if args.merge_pcaps:
|
||||
print(f"Found {len(pcap_files)} PCAP files")
|
||||
|
||||
# Group files by resolver and config
|
||||
csv_groups = defaultdict(list)
|
||||
pcap_groups = defaultdict(list)
|
||||
|
||||
for csv_path in csv_files:
|
||||
config, _ = parse_filename(Path(csv_path).name)
|
||||
resolver = extract_resolver_from_path(csv_path)
|
||||
if config and resolver:
|
||||
key = (resolver, config)
|
||||
csv_groups[key].append(csv_path)
|
||||
|
||||
for pcap_path in pcap_files:
|
||||
config, _ = parse_filename(Path(pcap_path).name)
|
||||
resolver = extract_resolver_from_path(pcap_path)
|
||||
if config and resolver:
|
||||
key = (resolver, config)
|
||||
pcap_groups[key].append(pcap_path)
|
||||
|
||||
# Summary
|
||||
print("\nConfigs to merge:")
|
||||
print("-" * 80)
|
||||
for (resolver, config), files in sorted(csv_groups.items()):
|
||||
print(f" {resolver}/{config}: {len(files)} runs")
|
||||
|
||||
total_runs = sum(len(files) for files in csv_groups.values())
|
||||
print(f"\nTotal configs: {len(csv_groups)}")
|
||||
print(f"Total runs: {total_runs}")
|
||||
|
||||
if args.dry_run:
|
||||
print("\n*** DRY RUN MODE ***\n")
|
||||
for (resolver, config) in sorted(csv_groups.keys()):
|
||||
print(f"Would merge: {resolver}/{config} ({len(csv_groups[(resolver, config)])} CSVs)")
|
||||
if args.merge_pcaps and (resolver, config) in pcap_groups:
|
||||
print(f"Would merge: {resolver}/{config} ({len(pcap_groups[(resolver, config)])} PCAPs)")
|
||||
return 0
|
||||
|
||||
# Confirmation
|
||||
if not args.yes:
|
||||
response = input(f"\nMerge all into {args.output}? [y/N] ")
|
||||
if response.lower() not in ['y', 'yes']:
|
||||
print("Cancelled")
|
||||
return 0
|
||||
|
||||
# Merge
|
||||
print("\n" + "=" * 80)
|
||||
print("MERGING FILES")
|
||||
print("=" * 80)
|
||||
|
||||
success_count = 0
|
||||
fail_count = 0
|
||||
total_queries = 0
|
||||
total_size = 0
|
||||
|
||||
# Get standard CSV fieldnames (from first file)
|
||||
first_csv = next(iter(csv_files))
|
||||
with open(first_csv, 'r') as f:
|
||||
reader = csv.DictReader(f)
|
||||
fieldnames = reader.fieldnames
|
||||
|
||||
for (resolver, config), files in sorted(csv_groups.items()):
|
||||
print(f"\n{resolver}/{config} ({len(files)} runs)")
|
||||
|
||||
# Merge CSVs
|
||||
output_csv = os.path.join(args.output, resolver, f"{config}.csv")
|
||||
os.makedirs(os.path.dirname(output_csv), exist_ok=True)
|
||||
|
||||
merge_csvs(files, output_csv, fieldnames)
|
||||
|
||||
# Count queries in merged file
|
||||
with open(output_csv, 'r') as f:
|
||||
query_count = sum(1 for _ in csv.reader(f)) - 1 # Minus header
|
||||
|
||||
print(f" ✓ Merged CSV: {query_count:,} queries")
|
||||
total_queries += query_count
|
||||
success_count += 1
|
||||
|
||||
# Merge PCAPs if requested
|
||||
if args.merge_pcaps and (resolver, config) in pcap_groups:
|
||||
output_pcap = os.path.join(args.output, resolver, f"{config}.pcap")
|
||||
pcap_list = pcap_groups[(resolver, config)]
|
||||
|
||||
if merge_pcaps(pcap_list, output_pcap):
|
||||
merged_size = os.path.getsize(output_pcap)
|
||||
orig_size = sum(os.path.getsize(p) for p in pcap_list)
|
||||
print(f" ✓ Merged PCAP: {format_bytes(merged_size)} "
|
||||
f"(from {format_bytes(orig_size)})")
|
||||
total_size += merged_size
|
||||
else:
|
||||
print(f" ✗ PCAP merge failed")
|
||||
fail_count += 1
|
||||
|
||||
# Final summary
|
||||
print("\n" + "=" * 80)
|
||||
print("COMPLETE")
|
||||
print("=" * 80)
|
||||
print(f"Successful configs: {success_count}")
|
||||
print(f"Failed: {fail_count}")
|
||||
print(f"Total queries: {total_queries:,}")
|
||||
if args.merge_pcaps:
|
||||
print(f"Total PCAP size: {format_bytes(total_size)}")
|
||||
print(f"\nMerged files in: {args.output}")
|
||||
|
||||
return 0 if fail_count == 0 else 1
|
||||
|
||||
if __name__ == "__main__":
|
||||
exit(main())
|
||||
Reference in New Issue
Block a user