#!/usr/bin/env python3 # -*- coding: utf-8 -*- import sys import argparse import numpy as np import matplotlib as mpl import matplotlib.pyplot as plt import tensorflow as tf from ns3gym import ns3env from tcp_base import TcpTimeBased, TcpEventBased try: w_file = open('run.log', 'w') except: w_file = sys.stdout parser = argparse.ArgumentParser(description='Start simulation script on/off') parser.add_argument('--start', type=int, default=1, help='Start ns-3 simulation script 0/1, Default: 1') parser.add_argument('--iterations', type=int, default=1, help='Number of iterations, Default: 1') parser.add_argument('--steps', type=int, default=100, help='Number of steps, Default 100') parser.add_argument('--debug', type=int, default=0, help='Show debug output 0/1, Default 0') args = parser.parse_args() startSim = bool(args.start) iterationNum = int(args.iterations) maxSteps = int(args.steps) port = 5555 simTime = maxSteps / 10.0 # seconds stepTime = simTime / 200.0 # seconds seed = 12 simArgs = {"--duration": simTime,} dashes = "-"*18 input("[{}Press Enter to start{}]".format(dashes, dashes)) # create environment env = ns3env.Ns3Env(port=port, stepTime=stepTime, startSim=startSim, simSeed=seed, simArgs=simArgs) ob_space = env.observation_space ac_space = env.action_space # TODO: right now, the next action is selected inside the loop, rather than using get_action. # this is because we use the decaying epsilon-greedy algo which needs to use the live model # somehow change or put that logic in an `RLTCP` class that inherits from the Tcp class, like in tcp_base.py, # then move the class to tcp_base.py and use that agent here def get_agent(state): socketUuid = state[0] tcpEnvType = state[1] tcpAgent = get_agent.tcpAgents.get(socketUuid, None) if tcpAgent is None: # get a new agent based on the selected env type if tcpEnvType == 0: # event-based = 0 tcpAgent = TcpEventBased() else: # time-based = 1 tcpAgent = TcpTimeBased() tcpAgent.set_spaces(get_agent.ob_space, get_agent.ac_space) get_agent.tcpAgents[socketUuid] = tcpAgent return tcpAgent # initialize agent variables # (useless until the above todo is fixed) get_agent.tcpAgents = {} get_agent.ob_space = ob_space get_agent.ac_space = ac_space def modeler(input_size, output_size): """ Designs a fully connected neural network. """ model = tf.keras.Sequential() # input layer model.add(tf.keras.layers.Dense((input_size + output_size) // 2, input_shape=(input_size,), activation='relu')) # hidden layer of mean size of input and output # model.add(tf.keras.layers.Dense((input_size + output_size) // 2, activation='relu')) # output layer # maps previous layer of input_size units to output_size units # this is a classifier network model.add(tf.keras.layers.Dense(output_size, activation='softmax')) return model state_size = ob_space.shape[0] - 4 # ignoring 4 env attributes action_size = 3 action_mapping = {} # dict faster than list action_mapping[0] = 0 action_mapping[1] = 600 action_mapping[2] = -150 # build model model = modeler(state_size, action_size) model.compile( optimizer = tf.keras.optimizers.Adam(learning_rate=1e-2), loss='categorical_crossentropy', metrics=['accuracy'] ) model.summary() # initialize decaying epsilon-greedy algorithm # fine-tune to ensure balance of exploration and exploitation epsilon = 1.0 epsilon_decay_param = iterationNum * 5 min_epsilon = 0.1 epsilon_decay = (((epsilon_decay_param*maxSteps) - 1.0) / (epsilon_decay_param*maxSteps)) # initialize Q-learning's discount factor discount_factor = 0.95 rewardsum = 0 rew_history = [] cWnd_history = [] pred_cWnd_history = [] rtt_history = [] done = False pretty_slash = ['\\', '|', '/', '-'] for iteration in range(iterationNum): # set initial state state = env.reset() # ignore env attributes: socketID, env type, sim time, nodeID state = state[4:] cWnd = state[1] init_cWnd = cWnd state = np.reshape(state, [1, state_size]) try: for step in range(maxSteps): pretty_index = step % 4 print("\r{}\r[{}] Logging to file {} {}".format( ' '*(25+len(w_file.name)), pretty_slash[pretty_index], w_file.name, '.'*(pretty_index+1) ), end='') print("[+] Step: {}".format(step+1), file=w_file) # Epsilon-greedy selection if step == 0 or np.random.rand(1) < epsilon: # explore new situation action_index = np.random.randint(0, action_size) print("\t[*] Random exploration. Selected action: {}".format(action_index), file=w_file) else: # exploit gained knowledge action_index = np.argmax(model.predict(state)[0]) print("\t[*] Exploiting gained knowledge. Selected action: {}".format(action_index), file=w_file) # Calculate action # Note: prevent new_cWnd from falling too low to avoid negative values new_cWnd = cWnd + action_mapping[action_index] new_ssThresh = int(cWnd/2) actions = [new_ssThresh, new_cWnd] # Take action step on environment and get feedback next_state, reward, done, _ = env.step(actions) rewardsum += reward next_state = next_state[4:] cWnd = next_state[1] rtt = next_state[7] print("\t[#] Next state: ", next_state, file=w_file) print("\t[!] Reward: ", reward, file=w_file) next_state = np.reshape(next_state, [1, state_size]) # Train incrementally # DQN - function approximation using neural networks target = reward if not done: target = (reward + discount_factor * np.amax(model.predict(next_state)[0])) target_f = model.predict(state) target_f[0][action_index] = target model.fit(state, target_f, epochs=1, verbose=0) # Update state state = next_state if done: print("[X] Stopping: step: {}, reward sum: {}, epsilon: {:.2}" .format(step+1, rewardsum, epsilon), file=w_file) break if epsilon > min_epsilon: epsilon *= epsilon_decay # Record information rew_history.append(rewardsum) rtt_history.append(rtt) cWnd_history.append(cWnd) pred_cWnd_history.append(new_cWnd) print("\n[O] Iteration over.", file=w_file) print("[-] Final epsilon value: ", epsilon, file=w_file) print("[-] Final reward sum: ", rewardsum, file=w_file) print() finally: print() if iteration+1 == iterationNum: break # if str(input("[?] Continue to next iteration? [Y/n]: ") or "Y").lower() != "y": # break mpl.rcdefaults() mpl.rcParams.update({'font.size': 12}) fig, ax = plt.subplots(2, 2, figsize=(4,2)) plt.tight_layout(pad=0.3) ax[0, 0].plot(range(len(cWnd_history)), cWnd_history, marker="", linestyle="-") ax[0, 0].set_title('Congestion windows') ax[0, 0].set_xlabel('Steps') ax[0, 0].set_ylabel('Actual CWND') ax[0, 1].plot(range(len(pred_cWnd_history)), pred_cWnd_history, marker="", linestyle="-") ax[0, 1].set_title('Predicted values') ax[0, 1].set_xlabel('Steps') ax[0, 1].set_ylabel('Predicted CWND') ax[1, 0].plot(range(len(rtt_history)), rtt_history, marker="", linestyle="-") ax[1, 0].set_title('RTT over time') ax[1, 0].set_xlabel('Steps') ax[1, 0].set_ylabel('RTT (microseconds)') ax[1, 1].plot(range(len(rew_history)), rew_history, marker="", linestyle="-") ax[1, 1].set_title('Reward sum plot') ax[1, 1].set_xlabel('Steps') ax[1, 1].set_ylabel('Reward sum') plt.show()