TCP-RL/TCP-RL-Agent.py
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import sys
import argparse
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import tensorflow as tf
from ns3gym import ns3env
from tcp_base import TcpTimeBased, TcpEventBased
try:
    w_file = open('run.log', 'w')
except OSError:
    # fall back to stdout if the log file cannot be opened
    w_file = sys.stdout
parser = argparse.ArgumentParser(description='Start simulation script on/off')
parser.add_argument('--start',
                    type=int,
                    default=1,
                    help='Start ns-3 simulation script 0/1, Default: 1')
parser.add_argument('--iterations',
                    type=int,
                    default=1,
                    help='Number of iterations, Default: 1')
parser.add_argument('--steps',
                    type=int,
                    default=100,
                    help='Number of steps, Default: 100')
parser.add_argument('--debug',
                    type=int,
                    default=0,
                    help='Show debug output 0/1, Default: 0')
args = parser.parse_args()
startSim = bool(args.start)
iterationNum = int(args.iterations)
maxSteps = int(args.steps)
port = 5555
simTime = maxSteps / 10.0 # seconds
stepTime = simTime / 200.0 # seconds
seed = 12
simArgs = {"--duration": simTime,}
dashes = "-"*18
input("[{}Press Enter to start{}]".format(dashes, dashes))
# create environment
env = ns3env.Ns3Env(port=port, stepTime=stepTime, startSim=startSim, simSeed=seed, simArgs=simArgs)
ob_space = env.observation_space
ac_space = env.action_space
# TODO: the next action is currently selected inside the training loop rather than via
# get_action(), because the decaying epsilon-greedy policy needs access to the live model.
# That logic should eventually move into an `RLTCP` class that inherits from the Tcp base
# class (as in tcp_base.py); the class can then live in tcp_base.py and be used as the agent here.
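# A hypothetical shape for that refactor (illustration only, not used in this script;
# the get_action signature is assumed to match the base class in tcp_base.py):
#
#     class RLTCP(TcpTimeBased):
#         """Would own the Keras model and the epsilon-greedy policy."""
#         def get_action(self, obs, reward, done, info):
#             # epsilon-greedy over self.model.predict(obs), mirroring the loop below
#             ...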
def get_agent(state):
    socketUuid = state[0]
    tcpEnvType = state[1]
    tcpAgent = get_agent.tcpAgents.get(socketUuid, None)
    if tcpAgent is None:
        # create a new agent based on the selected env type
        if tcpEnvType == 0:
            # event-based = 0
            tcpAgent = TcpEventBased()
        else:
            # time-based = 1
            tcpAgent = TcpTimeBased()
        tcpAgent.set_spaces(get_agent.ob_space, get_agent.ac_space)
        get_agent.tcpAgents[socketUuid] = tcpAgent
    return tcpAgent

# initialize agent variables (unused until the TODO above is resolved)
get_agent.tcpAgents = {}
get_agent.ob_space = ob_space
get_agent.ac_space = ac_space
def modeler(input_size, output_size):
    """
    Builds a fully connected classifier network mapping `input_size` state
    features to a softmax over `output_size` actions.
    """
    model = tf.keras.Sequential()
    # input layer, sized to the mean of the input and output dimensions
    model.add(tf.keras.layers.Dense((input_size + output_size) // 2,
                                    input_shape=(input_size,), activation='relu'))
    # optional hidden layer of the same mean size (disabled)
    # model.add(tf.keras.layers.Dense((input_size + output_size) // 2, activation='relu'))
    # output layer: maps the previous layer's units to output_size action scores
    model.add(tf.keras.layers.Dense(output_size, activation='softmax'))
    return model
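# The network is used below as a Q-value approximator: model.predict(state) returns one
# score per action, the argmax is taken as the greedy action when exploiting, and fit()
# nudges the chosen action's output toward the Q-learning target.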
state_size = ob_space.shape[0] - 4  # ignore the 4 leading env attributes
action_size = 3
# map each action index to a congestion-window delta in bytes
action_mapping = {0: 0, 1: 600, 2: -150}
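# Action 0 keeps the congestion window unchanged, action 1 grows it by 600 bytes,
# and action 2 shrinks it by 150 bytes; ssThresh is recomputed as cWnd/2 each step below.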
# build model
model = modeler(state_size, action_size)
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-2),
    loss='categorical_crossentropy',
    metrics=['accuracy']
)
model.summary()
# initialize decaying epsilon-greedy algorithm
# fine-tune to ensure balance of exploration and exploitation
epsilon = 1.0
epsilon_decay_param = iterationNum * 5
min_epsilon = 0.1
epsilon_decay = (((epsilon_decay_param*maxSteps) - 1.0) / (epsilon_decay_param*maxSteps))
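# epsilon is multiplied by epsilon_decay after every step, so it decays roughly as
# exp(-steps_taken / (epsilon_decay_param * maxSteps)) until it drops below min_epsilon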
# initialize Q-learning's discount factor
discount_factor = 0.95
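# Each training step below regresses the network's output for the chosen action toward the
# one-step Bellman target: target = reward + discount_factor * max_a' Q(next_state, a')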
rewardsum = 0
rew_history = []
cWnd_history = []
pred_cWnd_history = []
rtt_history = []
done = False
pretty_slash = ['\\', '|', '/', '-']
for iteration in range(iterationNum):
    # set the initial state
    state = env.reset()
    # drop the env attributes: socket UUID, env type, sim time, node ID
    state = state[4:]
    cWnd = state[1]
    init_cWnd = cWnd
    state = np.reshape(state, [1, state_size])
    try:
        for step in range(maxSteps):
            pretty_index = step % 4
            print("\r{}\r[{}] Logging to file {} {}".format(
                ' ' * (25 + len(w_file.name)),
                pretty_slash[pretty_index],
                w_file.name,
                '.' * (pretty_index + 1)
            ), end='')
            print("[+] Step: {}".format(step + 1), file=w_file)
            # epsilon-greedy selection
            if step == 0 or np.random.rand() < epsilon:
                # explore a new situation
                action_index = np.random.randint(0, action_size)
                print("\t[*] Random exploration. Selected action: {}".format(action_index), file=w_file)
            else:
                # exploit gained knowledge
                action_index = np.argmax(model.predict(state)[0])
                print("\t[*] Exploiting gained knowledge. Selected action: {}".format(action_index), file=w_file)
            # calculate the new congestion window; clamp at zero so a negative
            # delta can never produce a negative cWnd
            new_cWnd = max(cWnd + action_mapping[action_index], 0)
            new_ssThresh = int(cWnd / 2)
            actions = [new_ssThresh, new_cWnd]
            # take the action on the environment and get feedback
            next_state, reward, done, _ = env.step(actions)
            rewardsum += reward
            next_state = next_state[4:]
            cWnd = next_state[1]
            rtt = next_state[7]
            print("\t[#] Next state: ", next_state, file=w_file)
            print("\t[!] Reward: ", reward, file=w_file)
            next_state = np.reshape(next_state, [1, state_size])
            # train incrementally (DQN-style function approximation with a neural network):
            # regress the selected action's output toward the Bellman target
            target = reward
            if not done:
                target = reward + discount_factor * np.amax(model.predict(next_state)[0])
            target_f = model.predict(state)
            target_f[0][action_index] = target
            model.fit(state, target_f, epochs=1, verbose=0)
            # update state
            state = next_state
            if done:
                print("[X] Stopping: step: {}, reward sum: {}, epsilon: {:.2}"
                      .format(step + 1, rewardsum, epsilon),
                      file=w_file)
                break
            # decay epsilon toward min_epsilon
            if epsilon > min_epsilon:
                epsilon *= epsilon_decay
            # record information for the plots below
            rew_history.append(rewardsum)
            rtt_history.append(rtt)
            cWnd_history.append(cWnd)
            pred_cWnd_history.append(new_cWnd)
        print("\n[O] Iteration over.", file=w_file)
        print("[-] Final epsilon value: ", epsilon, file=w_file)
        print("[-] Final reward sum: ", rewardsum, file=w_file)
        print()
    finally:
        print()
    if iteration + 1 == iterationNum:
        break
    # if str(input("[?] Continue to next iteration? [Y/n]: ") or "Y").lower() != "y":
    #     break
mpl.rcdefaults()
mpl.rcParams.update({'font.size': 12})
fig, ax = plt.subplots(2, 2, figsize=(4,2))
plt.tight_layout(pad=0.3)
ax[0, 0].plot(range(len(cWnd_history)), cWnd_history, marker="", linestyle="-")
ax[0, 0].set_title('Congestion windows')
ax[0, 0].set_xlabel('Steps')
ax[0, 0].set_ylabel('Actual CWND')
ax[0, 1].plot(range(len(pred_cWnd_history)), pred_cWnd_history, marker="", linestyle="-")
ax[0, 1].set_title('Predicted values')
ax[0, 1].set_xlabel('Steps')
ax[0, 1].set_ylabel('Predicted CWND')
ax[1, 0].plot(range(len(rtt_history)), rtt_history, marker="", linestyle="-")
ax[1, 0].set_title('RTT over time')
ax[1, 0].set_xlabel('Steps')
ax[1, 0].set_ylabel('RTT (microseconds)')
ax[1, 1].plot(range(len(rew_history)), rew_history, marker="", linestyle="-")
ax[1, 1].set_title('Reward sum plot')
ax[1, 1].set_xlabel('Steps')
ax[1, 1].set_ylabel('Reward sum')
plt.show()
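# shut down the ns3-gym environment (and the underlying ns-3 process) now that plotting is done
env.close()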