mirror of
https://github.com/20kaushik02/real-time-traffic-analysis-clickhouse.git
synced 2025-12-06 09:34:07 +00:00
preprocessing data
This commit is contained in:
parent
a96c4320b0
commit
957202d86f
15
preprocessing/README.md
Normal file
15
preprocessing/README.md
Normal file
@ -0,0 +1,15 @@
|
|||||||
|
# Data filtering, preprocessing and selection for further use
|
||||||
|
|
||||||
|
- IP packet traces are taken [from here](https://mawi.wide.ad.jp/mawi/samplepoint-F/2023/), specifically the traces from 2023/10/01 to 2023/10/31 (date range yet to be confirmed)
|
||||||
|
- Filtering - TODO
|
||||||
|
- L4 - Limit to TCP and UDP
|
||||||
|
- maybe GRE for VPN usage?
|
||||||
|
- L3 - IPv6 is only around 10%, let's drop it
|
||||||
|
- Selection (of fields):
|
||||||
|
- Timestamp - note: capture window is from 0500-0515 UTC
|
||||||
|
- IP
|
||||||
|
- addresses - src, dst
|
||||||
|
- protocol - 6 (TCP) or 17 (UDP). Could be stored as a boolean to save space
|
||||||
|
- TCP/UDP
|
||||||
|
- ports - sport, dport
|
||||||
|
- Packet size - in bytes - could exclude L2?
|
||||||
41
preprocessing/scratch.py
Normal file
41
preprocessing/scratch.py
Normal file
@ -0,0 +1,41 @@
|
|||||||
|
from datetime import datetime, timezone
|
||||||
|
|
||||||
|
from scapy.utils import PcapReader
|
||||||
|
from scapy.layers.inet import IP, TCP, UDP
|
||||||
|
|
||||||
|
# Scratch script: print a one-line summary of each IPv4 TCP/UDP packet found
# among the first `sample_size` packets of the capture file.

# Upper bound on the number of packets *examined* (not the number printed);
# packets that are filtered out still count towards this limit.
sample_size = 100

# Open the reader as a context manager so the capture file is closed even if
# packet parsing raises part-way through.
with PcapReader("202310081400.pcap") as pcap_rdr:
    for idx, pkt in enumerate(pcap_rdr):
        # Check the limit first so it applies to every packet, including the
        # ones skipped by the filters below.
        if idx >= sample_size:
            break

        # Keep IPv4 only. Plain conditionals are used instead of `assert`,
        # because assertions are stripped when Python runs with -O.
        if IP not in pkt or pkt[IP].version != 4:
            continue

        # Keep TCP and UDP only; remember which layer carries the ports.
        if TCP in pkt:
            proto_name, l4 = "TCP", pkt[TCP]
        elif UDP in pkt:
            proto_name, l4 = "UDP", pkt[UDP]
        else:
            continue

        # For debugging, pkt.show() dumps the fully dissected packet.
        print(
            "[{}] {} {}:{} -> {}:{} - {} bytes".format(
                # Capture timestamp rendered as a timezone-aware UTC datetime.
                datetime.fromtimestamp(float(pkt.time), timezone.utc),
                proto_name,
                pkt[IP].src,
                l4.sport,
                pkt[IP].dst,
                l4.dport,
                len(pkt),
            )
        )
|
||||||
Loading…
x
Reference in New Issue
Block a user