mirror of
https://github.com/20kaushik02/real-time-traffic-analysis-clickhouse.git
synced 2025-12-06 10:44:06 +00:00
ip2location data
This commit is contained in:
parent
b4a777c368
commit
38afa8d9fd
@ -1,5 +1,7 @@
|
|||||||
# Data filtering, preprocessing and selection for further use
|
# Data filtering, preprocessing and selection for further use
|
||||||
|
|
||||||
|
## Traffic data
|
||||||
|
|
||||||
- IP packet traces are taken [from here](https://mawi.wide.ad.jp/mawi/samplepoint-F/2023/)
|
- IP packet traces are taken [from here](https://mawi.wide.ad.jp/mawi/samplepoint-F/2023/)
|
||||||
- Filtering
|
- Filtering
|
||||||
- L4 - Limit to TCP and UDP
|
- L4 - Limit to TCP and UDP
|
||||||
@ -15,6 +17,11 @@
|
|||||||
- Packet size - in bytes
|
- Packet size - in bytes
|
||||||
- `sample_output.csv` contains a partial subset of `202310081400.pcap`, ~600K packets
|
- `sample_output.csv` contains a partial subset of `202310081400.pcap`, ~600K packets
|
||||||
|
|
||||||
|
## IP geolocation database
|
||||||
|
|
||||||
|
- This project uses the IP2Location LITE database for [IP geolocation](https://lite.ip2location.com)
|
||||||
|
- `preprocessing/ip2loc_prep.py` does a bit of preprocessing: it leaves out the country code column and converts the IP range bounds from decimal (integer) format to dotted string format
|
||||||
|
|
||||||
# Setting up Kafka
|
# Setting up Kafka
|
||||||
- Download and install kafka [from here](https://kafka.apache.org/downloads)
|
- Download and install kafka [from here](https://kafka.apache.org/downloads)
|
||||||
- Run all commands in separate terminals from installation location
|
- Run all commands in separate terminals from installation location
|
||||||
|
|||||||
49
preprocessing/ip2loc_prep.py
Normal file
49
preprocessing/ip2loc_prep.py
Normal file
@ -0,0 +1,49 @@
|
|||||||
|
import struct
|
||||||
|
import socket
|
||||||
|
import csv
|
||||||
|
|
||||||
|
# Row-index threshold at which the conversion loop stops when `sample` is on.
sample_size = 100
# Number of converted rows to buffer between writes to the output file.
batch_size = 10000

# When True, only a small sample of the database is converted instead of
# the whole file.
sample = True
|
||||||
|
|
||||||
|
|
||||||
|
def int_to_ipv4(num: int) -> str:
    """Return the dotted-quad IPv4 string for an address given as an integer.

    The integer is packed as an unsigned 32-bit value in network byte order
    and then formatted by ``socket.inet_ntoa``.

    Raises:
        struct.error: if *num* does not fit in an unsigned 32-bit integer.
    """
    packed = struct.pack("!L", num)
    return socket.inet_ntoa(packed)
|
||||||
|
|
||||||
|
|
||||||
|
# Convert the IP2Location LITE DB3 CSV into geoip.csv: the two IP range bounds
# become dotted strings and columns 3-5 are copied through as country, region
# and city. Column 2 is skipped (the project README describes this as leaving
# out the country code).
# newline="" on both files follows the csv module's recommendation so that
# the csv reader/writer handle line endings themselves.
with open("IP2LOCATION-LITE-DB3.csv", "r", newline="") as input_file, open(
    "geoip.csv", "w", newline=""
) as output_file:
    reader = csv.reader(input_file)
    writer = csv.writer(output_file)

    # header row
    writer.writerow(
        [
            "ip_from",
            "ip_to",
            "country",
            "region",
            "city",
        ]
    )

    records = []
    for idx, record in enumerate(reader):
        # Check the cut-off *before* buffering the row so that sample mode
        # emits exactly `sample_size` data rows (checking afterwards with
        # `>` let two extra rows through).
        if sample and idx >= sample_size:
            break

        records.append(
            [
                int_to_ipv4(int(record[0])),
                int_to_ipv4(int(record[1])),
                record[3],
                record[4],
                record[5],
            ]
        )

        # Flush in fixed-size batches to bound memory use on the full file.
        if len(records) >= batch_size:
            writer.writerows(records)
            records = []

    # Write whatever is left over from the last, partially filled batch.
    if records:
        writer.writerows(records)
|
||||||
Loading…
x
Reference in New Issue
Block a user