mirror of https://github.com/20kaushik02/real-time-traffic-analysis-clickhouse.git
kafka ch integration done

commit 75bca2d704 (parent 6fb0ff4b4c)
@@ -19,7 +19,7 @@ SETTINGS storage_policy = 'hot_cold';
 CREATE TABLE ip_region_map (
     ip_range_start IPv4,
     ip_range_end IPv4,
-    region String,
+    region LowCardinality(String),
     INDEX region_idx region TYPE bloom_filter
 ) ENGINE = ReplicatedMergeTree(
     '/clickhouse/tables/{shard}/ip_region_map',
clickhouse/ddl/main/01_table_create.sql (new file, +28 lines)
@@ -0,0 +1,28 @@
+-- local table creation
+CREATE TABLE traffic_records (
+    time_stamp DateTime64 (6, 'Japan') CODEC (Delta, ZSTD),
+    l4_protocol Enum8 ('TCP' = 1, 'UDP' = 2),
+    src_ip IPv4,
+    dst_ip IPv4,
+    src_port UInt16 CODEC (ZSTD),
+    dst_port UInt16 CODEC (ZSTD),
+    pkt_len UInt16 CODEC (ZSTD),
+    INDEX port_idx src_port TYPE bloom_filter GRANULARITY 10
+) ENGINE = ReplicatedMergeTree(
+    '/clickhouse/tables/{shard}/traffic_records',
+    '{replica}'
+)
+ORDER BY time_stamp
+TTL toDateTime(time_stamp) + INTERVAL 15 DAY TO VOLUME 'cold_vol'
+SETTINGS storage_policy = 'hot_cold';
+
+CREATE TABLE ip_region_map (
+    ip_range_start IPv4,
+    ip_range_end IPv4,
+    region LowCardinality(String),
+    INDEX region_idx region TYPE bloom_filter
+) ENGINE = ReplicatedMergeTree(
+    '/clickhouse/tables/{shard}/ip_region_map',
+    '{replica}'
+)
+ORDER BY ip_range_start;
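
Once this DDL has run, the two tables can be smoke-tested with queries like the ones below. This is a minimal sketch, not part of the commit: it assumes the clickhouse-connect Python client and a ClickHouse HTTP port reachable on localhost:8123, and the sample IP is made up.

# Sketch only; clickhouse-connect, host/port, and the sample IP are assumptions.
import clickhouse_connect

client = clickhouse_connect.get_client(host="localhost", port=8123)

# Packets per minute from the replicated fact table.
per_minute = client.query(
    "SELECT toStartOfMinute(time_stamp) AS minute, count() AS pkts "
    "FROM traffic_records "
    "WHERE time_stamp >= now() - INTERVAL 1 HOUR "
    "GROUP BY minute ORDER BY minute"
)
print(per_minute.result_rows)

# Range lookup in ip_region_map; the ORDER BY ip_range_start key is what keeps
# this scan cheap, while the bloom_filter index only helps filters on region.
region = client.query(
    "SELECT region FROM ip_region_map "
    "WHERE toIPv4('203.0.113.7') BETWEEN ip_range_start AND ip_range_end "
    "LIMIT 1"
)
print(region.result_rows)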
clickhouse/ddl/main/03_create_kafka_table.sql (new file, +22 lines)
@@ -0,0 +1,22 @@
+CREATE TABLE traffic_records_kafka_queue (
+    time Float64,
+    l4_proto String,
+    src_addr String,
+    dst_addr String,
+    src_port UInt16,
+    dst_port UInt16,
+    pkt_len UInt32
+) ENGINE = Kafka() SETTINGS kafka_broker_list = 'kafka:9092',
+kafka_topic_list = 'traffic_records_stream',
+kafka_group_name = 'clickhouse_consumer',
+kafka_format = 'JSONEachRow',
+kafka_num_consumers = 1;
+CREATE MATERIALIZED VIEW traffic_records_kafka_view TO traffic_records_all AS
+SELECT time AS time_stamp,
+    l4_proto AS l4_protocol,
+    src_addr AS src_ip,
+    dst_addr AS dst_ip,
+    src_port,
+    dst_port,
+    pkt_len
+FROM traffic_records_kafka_queue;
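
One way to exercise this path end to end is to publish a single JSONEachRow message whose keys match traffic_records_kafka_queue; the Kafka engine table consumes it and the materialized view forwards it into traffic_records_all. The sketch below uses kafka-python (already used by the repo's streamer); the field values are made up for illustration.

# Probe message for the Kafka -> ClickHouse path (sketch; sample values only).
import json
import time

from kafka import KafkaProducer

producer = KafkaProducer(
    bootstrap_servers=["kafka:9092"],
    value_serializer=lambda v: json.dumps(v).encode("utf-8"),
)

msg = {
    "time": time.time(),      # Float64 epoch seconds
    "l4_proto": "TCP",
    "src_addr": "10.0.0.1",
    "dst_addr": "10.0.0.2",
    "src_port": 443,
    "dst_port": 51234,
    "pkt_len": 1500,
}
producer.send("traffic_records_stream", msg)
producer.flush()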
@@ -41,8 +41,7 @@ services:
     container_name: clickhouse-server1
     volumes:
       - ../clickhouse/node1-config/:/etc/clickhouse-server/config.d/
-      - ../clickhouse/ddl/common/table_create.sql:/docker-entrypoint-initdb.d/common_table_create.sql
-      - ../clickhouse/ddl/distr/table_create.sql:/docker-entrypoint-initdb.d/distr_table_create.sql
+      - ../clickhouse/ddl/main:/docker-entrypoint-initdb.d
       - clickhouse_server1_data:/var/lib/clickhouse
       - clickhouse_server1_TTL:/clickhouse_data/server1
     networks:
@@ -79,7 +78,7 @@ services:
     container_name: clickhouse-server2
     volumes:
       - ../clickhouse/node2-config/:/etc/clickhouse-server/config.d/
-      - ../clickhouse/ddl/common/table_create.sql:/docker-entrypoint-initdb.d/common_table_create.sql
+      - ../clickhouse/ddl/common:/docker-entrypoint-initdb.d
       - clickhouse_server2_data:/var/lib/clickhouse
       - clickhouse_server2_TTL:/clickhouse_data/server2
     networks:
@@ -28,6 +28,8 @@ services:
       KAFKA_BROKER_ID: 1
       KAFKA_MESSAGE_MAX_BYTES: 200000000
       KAFKA_REPLICA_FETCH_MAX_BYTES: 200000000
+      KAFKA_OFFSETS_TOPIC_REPLICATION_FACTOR: 1
+    command: sh -c "/etc/confluent/docker/run && kafka-topics --create --bootstrap-server kafka:9092 --replication-factor 1 --partitions 1 --topic traffic_records_stream"
     networks:
       data-network:
         aliases:
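
Before streaming, it can be worth confirming that the traffic_records_stream topic is actually visible on the broker. The sketch below is not part of the commit; it assumes kafka-python and that it runs somewhere the compose-network name kafka:9092 resolves.

# Check that the topic exists (sketch; assumes kafka-python and access to kafka:9092).
from kafka import KafkaConsumer

consumer = KafkaConsumer(bootstrap_servers=["kafka:9092"])
print("topic present:", "traffic_records_stream" in consumer.topics())
consumer.close()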
@@ -12,36 +12,33 @@ import json
 dbg_print = lambda *x: DEBUG and print(f"[DEBUG] {x}")
 
 
+# Kafka Configuration
+KAFKA_TOPIC = "traffic_records_stream"
+KAFKA_SERVER = "kafka:9092"  # Adjust to your Kafka server
+
+
 class KafkaClient:
-    def __init__(self, topic_name=None, mode='producer'):
+    def __init__(self, topic_name=None, mode="producer"):
         self.mode = mode
         self.topic_name = topic_name
-        if mode == 'producer':
+        if mode == "producer":
             self.client = KafkaProducer(
-                bootstrap_servers=['kafka:9092'],
+                bootstrap_servers=[KAFKA_SERVER],
                 max_request_size=200000000,
                 # api_version=(0,11,5),
-                value_serializer=lambda x: json.dumps(x).encode('utf-8'))
-        elif mode == 'consumer' and topic_name is not None:
+                value_serializer=lambda x: json.dumps(x).encode("utf-8"),
+            )
+        elif mode == "consumer" and topic_name is not None:
             self.client = KafkaConsumer(
                 topic_name,
-                bootstrap_servers=['localhost:9092'],
+                bootstrap_servers=["localhost:9092"],
                 api_version=(0, 11, 5),
-                value_deserializer=lambda x: json.loads(x.decode('utf-8')))
+                value_deserializer=lambda x: json.loads(x.decode("utf-8")),
+            )
         else:
             raise ValueError("Consumer mode requires a topic_name")
 
-# Kafka Configuration
-KAFKA_TOPIC = 'pcap_stream_new'
-KAFKA_SERVER = 'kafka:9092'  # Adjust to your Kafka server
-#KAFKA_SERVER = 'kafka_service:9092'
-
-# Initialize Kafka Producer
-# producer = KafkaProducer(
-#     bootstrap_servers=KAFKA_SERVER,
-#     value_serializer=lambda v: v.encode('utf-8') if isinstance(v, str) else str(v).encode('utf-8') #remove intermediate JSON encoding
-# )
+
 producer = KafkaClient(topic_name=KAFKA_TOPIC)
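
The consuming side of the refactored class is driven the same way; a usage sketch (note the consumer branch still hard-codes localhost:9092, so this assumes it runs where that address reaches the broker):

# Consumer-side usage sketch for KafkaClient.
consumer = KafkaClient(topic_name=KAFKA_TOPIC, mode="consumer")
for message in consumer.client:  # blocks, yields ConsumerRecord objects
    record = message.value       # already a dict via the JSON deserializer
    dbg_print(record)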
@@ -108,7 +105,7 @@ def create_pkt_object(pkt: Packet) -> dict:
         "dst_addr": pkt[IP].dst,
         "src_port": pkt[l4_proto].sport,
         "dst_port": pkt[l4_proto].dport,
-        "pkt_len": len(pkt)
+        "pkt_len": len(pkt),
     }
 
     return res_json
@@ -157,7 +154,9 @@ if __name__ == "__main__":
     argp.add_argument("-f", "--pcap_file", required=False, dest="_pcap")
     argp.add_argument("-c", "--csv_file", required=False, dest="_csv")
     argp.add_argument("-o", "--out_file", required=False, dest="_out")
-    argp.add_argument("--stream_size", required=False, default=10000, dest="_streamsize")
+    argp.add_argument(
+        "--stream_size", required=False, default=10000, dest="_streamsize"
+    )
     argp.add_argument(
         "-x",
         "--sample",
@@ -197,17 +196,17 @@ if __name__ == "__main__":
     # if preprocessed data ready for streaming
     if csv_file:
         #print("true")
         with open(csv_file, newline="") as f:
             csv_rdr = csv.reader(f)
             next(csv_rdr)  # skip headers
             pkts = []
 
             print("started stream from csv")
             for idx, row in enumerate(csv_rdr):
                 # direct streaming to kafka goes here
                 producer.client.send(KAFKA_TOPIC, row_to_dict(row))
                 dbg_print(row_to_dict(row))
-                print("streamed packet", idx)
+                dbg_print("streamed packet", idx)
                 if sample and idx > sample_size:
                     break
             print(f"total streamed: {idx}")
@@ -222,6 +221,8 @@ if __name__ == "__main__":
         pkts = []
         cnt = 0
+        seen_count = 0
 
         print("started stream from pcap")
         for idx, pkt in enumerate(pcap_rdr):
+            seen_count += 1
             # filter packets
@@ -244,7 +245,8 @@ if __name__ == "__main__":
                 producer.client.send(KAFKA_TOPIC, packet_data)
                 cnt += 1
                 # print(f"streamed packet at index {idx} ")
-                if idx > sample_size: break
+                if idx > sample_size:
+                    break
 
     print(f"total seen: {seen_count-1}")
     print(f"total streamed: {cnt}")
@@ -18,14 +18,11 @@ if ($downStack) {
 elseif ($MasterNode) {
     Write-Output "[+] swarm master"
 
-    # cleanup
-    docker stack rm $stackName
-    docker service rm registry
-
     # data streaming
     Set-Location $scriptDir/../preprocessing
     docker service create --name registry -p 5000:5000 registry:2
-    docker build -t 127.0.0.1:5000/data-streamer:latest --no-cache --push -f Dockerfile.python .
+    # docker build -t 127.0.0.1:5000/data-streamer:latest --no-cache --push -f Dockerfile.python .
+    docker build -t 127.0.0.1:5000/data-streamer:latest --push -f Dockerfile.python .
 
     # execute
     Set-Location $scriptDir