From 6d29dcf343ecf260a254ceee769f0f238512da71 Mon Sep 17 00:00:00 2001 From: Akash Sivakumar <73591598+Akash-0818@users.noreply.github.com> Date: Sun, 3 Nov 2024 18:47:05 -0700 Subject: [PATCH 1/3] Update README.md --- preprocessing/README.md | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/preprocessing/README.md b/preprocessing/README.md index ab835fb..59df36e 100644 --- a/preprocessing/README.md +++ b/preprocessing/README.md @@ -15,7 +15,22 @@ - Packet size - in bytes - `sample_output.csv` contains a partial subset of `202310081400.pcap`, ~600K packets +# Setting up Kafka +- Download and install kafka [from here](https://kafka.apache.org/downloads) +- Run all commands in separate terminals from installation location +- Zookeeper: + - Windows: `.\bin\windows\zookeeper-server-start.bat .\config\zookeeper.properties` + - Mac: `bin/zookeeper-server-start.sh config/zookeeper.properties` +- Kafka Broker: + - Windows: `.\bin\windows\kafka-server-start.bat .\config\server.properties` + - Mac: `bin/kafka-server-start.sh config/server.properties` +- Creating a Kafka topic: +- Windows: `.\bin\windows\kafka-topics.bat --create --topic %topicname% --bootstrap-server localhost:9092 --partitions 1 --replication-factor 1` +- Mac: `bin/kafka-topics.sh --create --topic %topicname% --bootstrap-server localhost:9092 --partitions 1 --replication-factor 1` + + # Streaming from pcap file using Kafka +- Start zookeeper and Kafka broker whenever python code is run after machine reboot - Run pcap_processor.py file - Arguments - -f or --pcap_file: pcap file path, mandatory argument @@ -26,4 +41,4 @@ - -d or --debug: boolean value indicating if program is run in debug mode -python pcap_processor.py -f C:/Users/akash/storage/Asu/sem3/dds/project/202310081400.pcap -s --sample-size 1000 \ No newline at end of file +python pcap_processor.py -f C:/Users/akash/storage/Asu/sem3/dds/project/202310081400.pcap -s --sample-size 1000 From 
b4a777c3686106bcf6e309475d75cf9e7314031d Mon Sep 17 00:00:00 2001 From: Akash Sivakumar <73591598+Akash-0818@users.noreply.github.com> Date: Sun, 3 Nov 2024 18:47:39 -0700 Subject: [PATCH 2/3] Update README.md --- preprocessing/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/preprocessing/README.md b/preprocessing/README.md index 59df36e..f7c3851 100644 --- a/preprocessing/README.md +++ b/preprocessing/README.md @@ -25,8 +25,8 @@ - Windows: `.\bin\windows\kafka-server-start.bat .\config\server.properties` - Mac: `bin/kafka-server-start.sh config/server.properties` - Creating a Kafka topic: -- Windows: `.\bin\windows\kafka-topics.bat --create --topic %topicname% --bootstrap-server localhost:9092 --partitions 1 --replication-factor 1` -- Mac: `bin/kafka-topics.sh --create --topic %topicname% --bootstrap-server localhost:9092 --partitions 1 --replication-factor 1` + - Windows: `.\bin\windows\kafka-topics.bat --create --topic %topicname% --bootstrap-server localhost:9092 --partitions 1 --replication-factor 1` + - Mac: `bin/kafka-topics.sh --create --topic %topicname% --bootstrap-server localhost:9092 --partitions 1 --replication-factor 1` # Streaming from pcap file using Kafka From 38afa8d9fd16661dec5959f3e0743a09e7dbfdd7 Mon Sep 17 00:00:00 2001 From: Kaushik Narayan R Date: Mon, 4 Nov 2024 18:02:12 -0700 Subject: [PATCH 3/3] ip2location data --- preprocessing/README.md | 7 ++++++ preprocessing/ip2loc_prep.py | 49 ++++++++++++++++++++++++++++++++++++ 2 files changed, 56 insertions(+) create mode 100644 preprocessing/ip2loc_prep.py diff --git a/preprocessing/README.md b/preprocessing/README.md index f7c3851..bfd5208 100644 --- a/preprocessing/README.md +++ b/preprocessing/README.md @@ -1,5 +1,7 @@ # Data filtering, preprocessing and selection for further use +## Traffic data + - IP packet traces are taken [from here](https://mawi.wide.ad.jp/mawi/samplepoint-F/2023/) - Filtering - L4 - Limit to TCP and UDP @@ -15,6 +17,11 @@ - Packet size 
"""Prepare the IP2Location LITE DB3 CSV for use as a geolocation lookup table.

Reads ``IP2LOCATION-LITE-DB3.csv``, drops the country-code column, converts
the integer range bounds to dotted-quad IPv4 strings and writes the result,
with a header row, to ``geoip.csv``.
"""

import csv
import socket
import struct
from typing import Iterable, Optional, Sequence

# Number of data rows to emit when ``sample`` is enabled.
sample_size = 100
# Number of converted rows buffered before they are flushed to the writer.
batch_size = 10000

# When True only the first ``sample_size`` rows are converted.
sample = True

INPUT_PATH = "IP2LOCATION-LITE-DB3.csv"
OUTPUT_PATH = "geoip.csv"
HEADER = ["ip_from", "ip_to", "country", "region", "city"]


def int_to_ipv4(num: int) -> str:
    """Return the dotted-quad string for a 32-bit integer IPv4 address.

    Raises:
        struct.error: if *num* does not fit in an unsigned 32-bit integer.
    """
    # "!L" packs as a big-endian (network order) unsigned 32-bit value,
    # which is the byte layout inet_ntoa expects.
    return socket.inet_ntoa(struct.pack("!L", num))


def convert_records(
    rows: Iterable[Sequence[str]],
    writer,
    limit: Optional[int] = None,
    batch: Optional[int] = None,
) -> int:
    """Write the header and the converted *rows* to *writer*.

    Args:
        rows: input records; columns 0 and 1 are the integer range bounds,
            column 2 (country code) is skipped, columns 3-5 are written out
            as country, region and city.
        writer: a ``csv.writer``-like object (``writerow`` / ``writerows``).
        limit: maximum number of data rows to write; ``None`` means all.
        batch: rows buffered per ``writerows`` call; defaults to the
            module-level ``batch_size``.

    Returns:
        The number of data rows written (header excluded).
    """
    if batch is None:
        batch = batch_size

    writer.writerow(HEADER)

    pending = []
    count = 0
    for record in rows:
        # Check the limit *before* converting so exactly ``limit`` rows are
        # emitted (the old post-append ``idx > sample_size`` test let two
        # extra rows through).
        if limit is not None and count >= limit:
            break
        if not record:
            # csv.reader yields [] for blank lines; nothing to convert.
            continue
        pending.append(
            [
                int_to_ipv4(int(record[0])),
                int_to_ipv4(int(record[1])),
                record[3],
                record[4],
                record[5],
            ]
        )
        count += 1
        if len(pending) >= batch:
            writer.writerows(pending)
            pending = []

    if pending:
        writer.writerows(pending)
    return count


def main() -> None:
    """Convert ``INPUT_PATH`` into ``OUTPUT_PATH`` using the module settings."""
    # newline="" is what the csv module requires for both reading and
    # writing; the explicit encoding keeps non-ASCII place names intact
    # regardless of the platform default.
    # NOTE(review): assumes the IP2Location CSV is UTF-8 - confirm.
    with open(INPUT_PATH, "r", encoding="utf-8", newline="") as input_file, open(
        OUTPUT_PATH, "w", encoding="utf-8", newline=""
    ) as output_file:
        convert_records(
            csv.reader(input_file),
            csv.writer(output_file),
            limit=sample_size if sample else None,
        )


if __name__ == "__main__":
    main()