mirror of https://github.com/20kaushik02/real-time-traffic-analysis-clickhouse.git
synced 2025-12-06 09:44:06 +00:00
aggregating data
This commit is contained in:
parent afdddbaf54
commit e6c0182724
preprocessing/pcap_aggregation.sh (normal file, 41 lines added)
@@ -0,0 +1,41 @@
#!/bin/bash

data_year=2023
data_month=10

# some info: check each day's compressed trace size (HTTP HEAD only, no download)

total_size=0
for data_day in {01..31}; do
    pcap_size=$(curl -sI "http://mawi.nezu.wide.ad.jp/mawi/samplepoint-F/${data_year}/${data_year}${data_month}${data_day}1400.pcap.gz" |
        grep Content-Length |
        awk '{printf "%.3f", $2/1024/1024/1024}')
    echo "${data_year}-${data_month}-${data_day} - ${pcap_size} GB"
    total_size=$(echo "$total_size + $pcap_size" | bc -l)
done

echo "Total size (compressed) of ${data_year}-${data_month} - ${total_size} GB"
# Total size (compressed) of 2023-10 - 193.292 GB

# extracting data: download each day's trace, sample it to CSV, then delete the pcap

mkdir -p csv_files

for data_day in {01..31}; do
    if [[ ! -f "${data_year}${data_month}${data_day}1400.pcap.gz" ]]; then
        wget "http://mawi.nezu.wide.ad.jp/mawi/samplepoint-F/${data_year}/${data_year}${data_month}${data_day}1400.pcap.gz"
    fi
    gzip -d "${data_year}${data_month}${data_day}1400.pcap.gz"

    # 10000 packets from each day
    python3 pcap_processor.py \
        --pcap_file "${data_year}${data_month}${data_day}1400.pcap" \
        --out_file "csv_files/${data_day}.csv" \
        --sample \
        --stream_size 10000

    rm "${data_year}${data_month}${data_day}1400.pcap"
done

# merge all CSVs together (NR == 1 keeps only the first file's header row)
awk '(NR == 1) || (FNR > 1)' csv_files/*.csv > csv_files/merged.csv
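
Each day contributes at most 10000 sampled packets (the --stream_size 10000 passed above), and the awk merge keeps only the first file's header row, so csv_files/merged.csv should end up with one header plus on the order of 31 * 10000 data rows. A minimal sanity check in Python, assuming it is run from the same directory as the script so the relative path resolves; this check is not part of the repository:

import csv

# Hypothetical check of the merged output produced by pcap_aggregation.sh above.
with open("csv_files/merged.csv", newline="") as f:
    reader = csv.reader(f)
    header = next(reader)            # the single header row kept by the awk merge
    n_rows = sum(1 for _ in reader)  # remaining data rows

print(f"columns: {header}")
print(f"data rows: {n_rows}")        # expect roughly 31 * 10000 for 2023-10
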
pcap_processor.py

@@ -7,22 +7,9 @@ from scapy.utils import PcapReader
 from scapy.layers.inet import IP, TCP, UDP
 
 from kafka import KafkaProducer
-import json
 
 dbg_print = lambda *x: DEBUG and print(f"[DEBUG] {x}")
 
-# Kafka Configuration
-KAFKA_TOPIC = 'pcap_stream'
-KAFKA_SERVER = 'localhost:9092' # Adjust to your Kafka server
-#KAFKA_SERVER = 'kafka_service:9092'
-
-# Initialize Kafka Producer
-producer = KafkaProducer(
-    bootstrap_servers=KAFKA_SERVER,
-    #value_serializer=lambda v: json.dumps(v).encode('utf-8') # Encode data as JSON
-    value_serializer=lambda v: v.encode('utf-8') if isinstance(v, str) else str(v).encode('utf-8') #remove intermediate JSON encoding
-)
-
 
 def pkt_filter(pkt: Packet) -> bool:
     """filter to include/exclude a packet"""
@@ -87,11 +74,12 @@ def create_pkt_object(pkt: Packet) -> dict:
         "dst_addr": pkt[IP].dst,
         "src_port": pkt[l4_proto].sport,
         "dst_port": pkt[l4_proto].dport,
-        "pkt_len": len(pkt)
+        "pkt_len": len(pkt),
     }
 
     return res_json
 
+
 def prep_csv(out_file: str):
     with open(out_file, "w", newline="") as csvfile:
         writer = csv.writer(csvfile)
@@ -120,7 +108,9 @@ if __name__ == "__main__":
     argp = ArgumentParser()
     argp.add_argument("-f", "--pcap_file", required=True, dest="_pcap")
     argp.add_argument("-o", "--out_file", required=False, dest="_out")
-    argp.add_argument("--stream_size", required=False, default=10000, dest="_streamsize")
+    argp.add_argument(
+        "--stream_size", required=False, default=100000, dest="_streamsize"
+    )
     argp.add_argument(
         "-x",
         "--sample",
@@ -151,12 +141,11 @@ if __name__ == "__main__":
     out_file = args._out
     streaming = args._stream
     sample = args._sample
-    samplesize = int(args._streamsize)
 
     DEBUG = args._debug
 
-    sample_size = samplesize #1000000
-    batch_size = 100 #100000
+    sample_size = int(args._streamsize) # 100000
+    batch_size = 10000 # 10000
 
     pcap_rdr = PcapReader(pcap_file)
     if not streaming:
@@ -180,13 +169,26 @@ if __name__ == "__main__":
                 pkts_write_csv(pkts, out_file)
                 pkts = []
             else:
-                # direct streaming to kafka goes here
+                # Kafka Configuration
+                KAFKA_TOPIC = "pcap_stream"
+                KAFKA_SERVER = "localhost:9092" # Adjust to your Kafka server
+                # KAFKA_SERVER = 'kafka_service:9092'
+
+                # Initialize Kafka Producer
+                producer = KafkaProducer(
+                    bootstrap_servers=KAFKA_SERVER,
+                    # value_serializer=lambda v: json.dumps(v).encode('utf-8') # Encode data as JSON
+                    value_serializer=lambda v: (
+                        v.encode("utf-8") if isinstance(v, str) else str(v).encode("utf-8")
+                    ), # remove intermediate JSON encoding
+                )
+
                 packet_data = create_pkt_object(pkt)
                 producer.send(KAFKA_TOPIC, packet_data)
                 print(f"streamed packet at index {idx} ")
-                if idx > sample_size: break
+                if idx > sample_size:
+                    break
 
         # flush remaining
         if not streaming and len(pkts) > 0:
             pkts_write_csv(pkts, out_file)
-
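
With this change the Kafka setup lives only in the streaming branch, and values are sent as plain str()-encoded bytes rather than JSON, so whatever reads the pcap_stream topic receives the textual form of each packet dict. A minimal consumer sketch, assuming kafka-python and the topic/broker values shown in the diff; it is illustrative only and not part of this commit:

from kafka import KafkaConsumer

# Illustrative consumer; the topic name and broker address come from the diff,
# everything else here is an assumption.
consumer = KafkaConsumer(
    "pcap_stream",
    bootstrap_servers="localhost:9092",
    auto_offset_reset="earliest",
)

for msg in consumer:
    # no value_deserializer is configured, so msg.value is the raw bytes written
    # by the producer's str()/encode serializer (not JSON)
    print(msg.value.decode("utf-8"))

Because the intermediate JSON encoding was removed, a consumer can no longer json.loads these values directly; it either parses the dict-repr text or the producer's value_serializer would have to switch back to json.dumps.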