added kafka streaming part

This commit is contained in:
Akash Sivakumar 2024-11-03 18:33:11 -07:00
parent 599741326e
commit d8631a5cfc
2 changed files with 58 additions and 3 deletions

View File

@ -14,3 +14,16 @@
- TCP/UDP - ports - sport, dport - TCP/UDP - ports - sport, dport
- Packet size - in bytes - Packet size - in bytes
- `sample_output.csv` contains a partial subset of `202310081400.pcap`, ~600K packets - `sample_output.csv` contains a partial subset of `202310081400.pcap`, ~600K packets
# Streaming from pcap file using Kafka
- Run pcap_processor.py file
- Arguments
- -f or --pcap_file: pcap file path, mandatory argument
- -o or --out_file: output csv file path
- -x or --sample: boolean value indicating if data has to be sampled
- -s or --stream: boolean value indicating if kafka streaming should happen
- --stream_size: integer indicating number of sampled packets
- -d or --debug: boolean value indicating if program is run in debug mode
python pcap_processor.py -f C:/Users/akash/storage/Asu/sem3/dds/project/202310081400.pcap -s --sample-size 1000

View File

@ -6,8 +6,21 @@ from scapy.packet import Packet
from scapy.utils import PcapReader from scapy.utils import PcapReader
from scapy.layers.inet import IP, TCP, UDP from scapy.layers.inet import IP, TCP, UDP
from kafka import KafkaProducer
import json
dbg_print = lambda *x: DEBUG and print(f"[DEBUG] {x}") dbg_print = lambda *x: DEBUG and print(f"[DEBUG] {x}")
# Kafka Configuration
KAFKA_TOPIC = 'pcap_stream'
KAFKA_SERVER = 'localhost:9092' # Adjust to your Kafka server
# Initialize Kafka Producer
producer = KafkaProducer(
bootstrap_servers=KAFKA_SERVER,
value_serializer=lambda v: json.dumps(v).encode('utf-8') # Encode data as JSON
)
def pkt_filter(pkt: Packet) -> bool: def pkt_filter(pkt: Packet) -> bool:
"""filter to include/exclude a packet""" """filter to include/exclude a packet"""
@ -53,6 +66,30 @@ def pkt_extract(pkt: Packet) -> list:
return pkt_attrs return pkt_attrs
def create_pkt_object(pkt: Packet) -> dict:
"""create a dictionary of packet attributes"""
"""key: attribute name, value: attribute value"""
l4_proto = None
if TCP in pkt:
l4_proto = TCP
elif UDP in pkt:
l4_proto = UDP
pkt_time_str = str(datetime.fromtimestamp(float(pkt.time), timezone.utc))
res_json = {
"time": float(pkt.time),
"l4_proto": pkt.getlayer(l4_proto).name,
"src_addr": pkt[IP].src,
"dst_addr": pkt[IP].dst,
"src_port": pkt[l4_proto].sport,
"dst_port": pkt[l4_proto].dport,
"pkt_len": len(pkt)
}
return res_json
def prep_csv(out_file: str): def prep_csv(out_file: str):
with open(out_file, "w", newline="") as csvfile: with open(out_file, "w", newline="") as csvfile:
writer = csv.writer(csvfile) writer = csv.writer(csvfile)
@ -81,6 +118,7 @@ if __name__ == "__main__":
argp = ArgumentParser() argp = ArgumentParser()
argp.add_argument("-f", "--pcap_file", required=True, dest="_pcap") argp.add_argument("-f", "--pcap_file", required=True, dest="_pcap")
argp.add_argument("-o", "--out_file", required=False, dest="_out") argp.add_argument("-o", "--out_file", required=False, dest="_out")
argp.add_argument("--stream_size", required=False, default=10000, dest="_streamsize")
argp.add_argument( argp.add_argument(
"-x", "-x",
"--sample", "--sample",
@ -111,11 +149,12 @@ if __name__ == "__main__":
out_file = args._out out_file = args._out
streaming = args._stream streaming = args._stream
sample = args._sample sample = args._sample
samplesize = int(args._streamsize)
DEBUG = args._debug DEBUG = args._debug
sample_size = 1000000 sample_size = samplesize #1000000
batch_size = 100000 batch_size = 100 #100000
pcap_rdr = PcapReader(pcap_file) pcap_rdr = PcapReader(pcap_file)
if not streaming: if not streaming:
@ -140,7 +179,10 @@ if __name__ == "__main__":
pkts = [] pkts = []
else: else:
# direct streaming to kafka goes here # direct streaming to kafka goes here
pass packet_data = create_pkt_object(pkt)
producer.send(KAFKA_TOPIC, packet_data)
print(f"streamed packet at index {idx} ")
if idx > sample_size: break
# flush remaining # flush remaining
if not streaming and len(pkts) > 0: if not streaming and len(pkts) > 0: