"""Scratch script: print a sample of IPv4 TCP/UDP packets from a pcap trace.

For each of the first ``SAMPLE_SIZE`` packets in ``PCAP_PATH`` that is
IPv4 and carries TCP or UDP, print one line with the UTC timestamp, the
L4 protocol, source and destination address:port, and the packet length.

Preprocessing plan (from preprocessing/README.md, added in the same change):

- Source: IP packet traces taken from
  https://mawi.wide.ad.jp/mawi/samplepoint-F/2023/, specifically
  2023/10/01-2023/10/31 (yet to be confirmed).
- Filtering (TODO):
    - L4: limit to TCP and UDP; maybe GRE for VPN usage?
    - L3: IPv6 is only around 10% of the traffic, so drop it.
- Selection of fields:
    - Timestamp (note: the capture window is 05:00-05:15 UTC).
    - IP: addresses (src, dst); protocol, 6 (TCP) or 17 (UDP) - could be
      stored as a boolean to save space.
    - TCP: ports (sport, dport). This script reads the UDP ports as well.
    - Packet size in bytes - could exclude L2?
"""
from datetime import datetime, timezone
from itertools import islice

from scapy.utils import PcapReader
from scapy.layers.inet import IP, TCP, UDP

PCAP_PATH = "202310081400.pcap"
# Number of packets read from the start of the trace (matching or not).
SAMPLE_SIZE = 100

_LINE_FORMAT = "[{}] {} {}:{} -> {}:{} - {} bytes"


def describe_packet(pkt):
    """Return a one-line summary of *pkt*, or None if it should be skipped.

    A packet is summarised only when it has an IPv4 layer and a TCP or UDP
    layer; TCP takes precedence if both are present. Explicit conditions are
    used instead of ``assert`` so the filter still works under ``python -O``,
    which strips assert statements.
    """
    if IP not in pkt or pkt[IP].version != 4:
        return None

    if TCP in pkt:
        proto_name, transport = "TCP", pkt[TCP]
    elif UDP in pkt:
        proto_name, transport = "UDP", pkt[UDP]
    else:
        return None

    ip_layer = pkt[IP]
    return _LINE_FORMAT.format(
        datetime.fromtimestamp(float(pkt.time), timezone.utc),
        proto_name,
        ip_layer.src,
        transport.sport,
        ip_layer.dst,
        transport.dport,
        len(pkt),
    )


def main():
    """Print summaries for the first ``SAMPLE_SIZE`` packets of the trace."""
    # The context manager closes the capture file even if parsing fails.
    with PcapReader(PCAP_PATH) as reader:
        # islice bounds the number of packets *read*, so a long run of
        # filtered-out packets cannot push the loop past the sample limit.
        for pkt in islice(reader, SAMPLE_SIZE):
            line = describe_packet(pkt)
            if line is not None:
                print(line)


if __name__ == "__main__":
    main()