Mirror of https://github.com/20kaushik02/real-time-traffic-analysis-clickhouse.git (synced 2026-01-25 08:04:04 +00:00)
Merge branch 'preprocessing' into integration_2
@@ -49,3 +49,4 @@
 
 python pcap_processor.py -f C:/Users/akash/storage/Asu/sem3/dds/project/202310081400.pcap -s --stream_size 1000
 
+python pcap_processor.py -c sample_output.csv -s --stream_size 1000
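For context, here is a minimal sketch of the argument parsing these two commands imply. The -f, -c, -s and --stream_size flags come from the commands above; the help strings and all dest names except _streamsize (which appears in pcap_processor.py later in this diff) are assumptions, not the project's actual code.

import argparse

# Sketch only: a CLI consistent with the README commands above.
# Long option behavior and dest names other than _streamsize are assumptions.
parser = argparse.ArgumentParser(description="pcap preprocessing and Kafka streaming")
parser.add_argument("-f", dest="_pcapfile", help="raw .pcap capture to process")
parser.add_argument("-c", dest="_csvfile", help="preprocessed CSV to stream as-is")
parser.add_argument("-s", dest="_stream", action="store_true", help="stream output to Kafka")
parser.add_argument("--stream_size", dest="_streamsize", type=int, default=1000,
                    help="maximum number of packets to stream")
args = parser.parse_args()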
252127  preprocessing/geoip.csv  (new file)
File diff suppressed because it is too large
@@ -5,14 +5,15 @@ import csv
 sample_size = 100
 batch_size = 10000
 
-sample = True
+sample = False
 
 
 def int_to_ipv4(num: int) -> str:
     return socket.inet_ntoa(struct.pack("!L", num))
 
 
-with open("IP2LOCATION-LITE-DB3.csv", "r") as input_file, open(
+# with open("IP2LOCATION-LITE-DB3.csv", "r") as input_file, open(
+with open("IP2LOCATION-LITE-DB1.csv", "r") as input_file, open(
     "geoip.csv", "w", newline=""
 ) as output_file:
     reader = csv.reader(input_file)
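A quick illustration of the helper above, with arbitrarily chosen example values:

import socket
import struct

def int_to_ipv4(num: int) -> str:
    # pack the integer big-endian into 4 bytes, then render it dotted-quad
    return socket.inet_ntoa(struct.pack("!L", num))

print(int_to_ipv4(16777216))    # 1.0.0.0
print(int_to_ipv4(3221225985))  # 192.0.2.1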
@@ -21,11 +22,11 @@ with open("IP2LOCATION-LITE-DB3.csv", "r") as input_file, open(
     # header row
     writer.writerow(
         [
-            "ip_from",
-            "ip_to",
+            "ip_range_start",
+            "ip_range_end",
             "country",
-            "region",
-            "city",
+            # "region",
+            # "city",
         ]
     )
 
@@ -35,8 +36,8 @@ with open("IP2LOCATION-LITE-DB3.csv", "r") as input_file, open(
             int_to_ipv4(int(record[0])),
             int_to_ipv4(int(record[1])),
             record[3],
-            record[4],
-            record[5],
+            # record[4],
+            # record[5],
         ]
         records.append(new_record)
         if sample and idx > sample_size:
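Taken together, the three hunks in this file amount to roughly the following conversion loop. This is a reconstruction for readability, not the file verbatim: the writer setup, the loop header, and the final writerows call are assumptions, and the column order ip_from, ip_to, country_code, country is the usual IP2Location LITE DB1 CSV layout (DB1 carries country only, which is presumably why the region and city fields are commented out).

# Sketch of the DB1 -> geoip.csv conversion implied by the diff above.
writer = csv.writer(output_file)       # assumed; not shown in the hunks
records = []
for idx, record in enumerate(reader):  # loop header assumed from context lines
    new_record = [
        int_to_ipv4(int(record[0])),   # ip_range_start as a dotted quad
        int_to_ipv4(int(record[1])),   # ip_range_end as a dotted quad
        record[3],                     # country name; DB1 has no region/city columns
    ]
    records.append(new_record)
    if sample and idx > sample_size:
        break
writer.writerows(records)              # assumed flush of the accumulated rows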
@@ -19,7 +19,7 @@ class KafkaClient:
         self.topic_name = topic_name
         if mode == 'producer':
             self.client = KafkaProducer(
-                bootstrap_servers=['localhost:9092'],
+                bootstrap_servers=['kafka:9092'],
                 max_request_size = 200000000,
                 #api_version=(0,11,5),
                 value_serializer=lambda x: json.dumps(x).encode('utf-8'))
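For reference, a self-contained sketch of a kafka-python producer configured the same way as this hunk; the topic name and payload here are placeholders, not the project's. Switching the bootstrap address from localhost:9092 to kafka:9092 suggests the producer now reaches the broker by its service name on a Docker network rather than over the host loopback.

import json
from kafka import KafkaProducer

# Matches the settings in the hunk above; topic and message are placeholders.
producer = KafkaProducer(
    bootstrap_servers=['kafka:9092'],
    max_request_size=200000000,
    value_serializer=lambda x: json.dumps(x).encode('utf-8'),
)
producer.send("demo_topic", {"example": "payload"})
producer.flush()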
@@ -192,11 +192,12 @@ if __name__ == "__main__":
 
     DEBUG = args._debug
 
-    sample_size = int(args._samplesize) #1000000
+    sample_size = int(args._streamsize) # 100000
     batch_size = 100 #100000
 
     # if preprocessed data ready for streaming
     if csv_file:
+        #print("true")
         with open(csv_file, newline="") as f:
             csv_rdr = csv.reader(f)
             next(csv_rdr) # skip headers
@@ -206,10 +207,10 @@ if __name__ == "__main__":
                 # direct streaming to kafka goes here
                 producer.client.send(KAFKA_TOPIC, row_to_dict(row))
                 dbg_print(row_to_dict(row))
-                dbg_print("streamed packet", idx)
+                print("streamed packet", idx)
                 if idx > sample_size:
                     break
-            dbg_print(f"total streamed: {idx}")
+            print(f"total streamed: {idx}")
 
     # otherwise, process packets
     else:
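Stitching the last two hunks together, the CSV streaming branch now reads roughly as follows. This is a reconstruction from the context lines above, not the file verbatim; row_to_dict, dbg_print, KAFKA_TOPIC and producer are the project's own names and their definitions are not part of this diff.

    # Rough shape of the csv_file branch after this commit.
    if csv_file:
        #print("true")
        with open(csv_file, newline="") as f:
            csv_rdr = csv.reader(f)
            next(csv_rdr)  # skip headers
            for idx, row in enumerate(csv_rdr):  # loop header assumed from context
                # direct streaming to kafka goes here
                producer.client.send(KAFKA_TOPIC, row_to_dict(row))
                dbg_print(row_to_dict(row))
                print("streamed packet", idx)
                if idx > sample_size:
                    break
            print(f"total streamed: {idx}")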