package agents.rlboa;

import genius.core.StrategyParameters;
import genius.core.bidding.BidDetails;
import genius.core.boaframework.NegotiationSession;
import genius.core.boaframework.OfferingStrategy;
import genius.core.boaframework.OpponentModel;
import genius.core.boaframework.OutcomeSpace;
import genius.core.boaframework.SortedOutcomeSpace;
import genius.core.misc.Range;

import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Random;

public class QlearningStrategy extends OfferingStrategy {

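    // The Q-table maps a hashed state to one Q-value per action available in
    // that state. After the opening bid, the action encoding used throughout
    // this class is: 0 = concede (move one bin down), 1 = retract (move one
    // bin up), 2 = stay in the current bin.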
    protected HashMap<Integer, ArrayList<Double>> qTable;
    protected ArrayList<Integer> actions = new ArrayList<Integer>();
    protected int bins;
    protected Double eps;
    protected Double alpha;
    protected Double gamma;
    protected AbstractState state;
    protected String mode;
    protected int timeBins;
    protected Range minMaxBin;

    public QlearningStrategy(NegotiationSession negotiationSession, OpponentModel opponentModel) {
        super.init(negotiationSession, null);
        this.opponentModel = opponentModel;
        this.endNegotiation = false;
        this.state = State.INITIAL;

        OutcomeSpace outcomeSpace = new SortedOutcomeSpace(negotiationSession.getUtilitySpace());
        this.negotiationSession.setOutcomeSpace(outcomeSpace);
    }

    public ArrayList<Integer> getActions() {
        return this.actions;
    }

    /**
     * @return int representing the last action taken by the strategy
     * @throws IndexOutOfBoundsException if called before any action has been performed
     */
    public int getLastAction() throws IndexOutOfBoundsException {
        return this.actions.get(this.actions.size() - 1);
    }

    public void setMinMaxBin(Range minMaxBin) {
        this.minMaxBin = minMaxBin;
    }

    public Range getMinMaxBin() {
        return this.minMaxBin;
    }

    protected void initQTable() {
        this.qTable = new HashMap<Integer, ArrayList<Double>>();

        // The initial state has a different action space
        this.qTable.putIfAbsent(this.state.hash(), new ArrayList<Double>(Collections.nCopies(this.state.getActionSize(), 0.0)));
    }

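    /**
     * Initialize the Q-table, either from a previously learned table (for
     * example to continue training, or to exploit a trained policy) or from
     * scratch when none is provided.
     * @param qTable previously learned Q-table, or null to start fresh
     */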
    public void initQtable(HashMap<Integer, ArrayList<Double>> qTable) {
        if (qTable != null) {
            this.qTable = qTable;
        }
        else {
            this.initQTable();
        }
    }

    @Override
    public BidDetails determineOpeningBid() {
        // Open the negotiation with a free bid (one of N bins)
        int targetBin = this.determineOpeningBin();
        return this.pickBidInBin(targetBin);
    }

    @Override
    public BidDetails determineNextBid() {
        // HACK(?): this QlearningStrategy works for all states that represent the
        // world in bins, so we needed a way to recognize these; hence the
        // BinnedRepresentation interface.
        int targetBin = this.determineTargetBin(((BinnedRepresentation) this.state).getMyBin());
        return this.pickBidInBin(targetBin);
    }

    @Override
    public String getName() {
        return "Q-Offering";
    }

    /**
     * Check if the bid falls inside the lower and upper bounds
     * @param lower lower bound of utility (inclusive)
     * @param upper upper bound of utility (exclusive)
     * @param bidDetails bid to check (has util and time)
     * @return boolean
     */
    private boolean isInBin(double lower, double upper, BidDetails bidDetails) {
        double myUtil = bidDetails.getMyUndiscountedUtil();
        return myUtil < upper && myUtil >= lower;
    }

    /**
     * Make the opponent model select a bid that is in the provided target bin
     * @param targetBin index of the bin in which to pick a bid
     * @return BidDetails of the selected bid
     */
    protected BidDetails pickBidInBin(int targetBin) {

        double lowerBound = targetBin * this.getBinSize();
        double upperBound = lowerBound + this.getBinSize();

        // getBidsinRange can return bids that are outside of the requested range
        // (false positives), so filter those out explicitly
        List<BidDetails> bidsInRange = this.negotiationSession.getOutcomeSpace().getBidsinRange(new Range(lowerBound, upperBound));
        bidsInRange.removeIf(bid -> !this.isInBin(lowerBound, upperBound, bid));

        // If no bids are possible within this bin, recursively choose another bin:
        // if we conceded this round, concede further; if we retracted, retract
        // further; if we stayed, move randomly up or down.
        if (bidsInRange.isEmpty()) {

            Random random = new Random();
            int newBin = 0;
            int direction = -1;

            // Check if this is the opening action; if it is, just pick a bin randomly
            if (this.actions.size() > 1) {
                direction = this.actions.get(this.actions.size() - 1);
            } else {
                newBin = random.nextInt(this.bins);
            }

            // conceded last time
            if (direction == 0) {
                newBin = determineTargetBin(targetBin - 1);
            }

            // retracted last time
            if (direction == 1) {
                newBin = determineTargetBin(targetBin + 1);
            }

            // stayed last time
            if (direction == 2) {
                int randomUpOrDown = random.nextBoolean() ? 1 : -1;
                newBin = determineTargetBin(targetBin + randomUpOrDown);
            }

            return this.pickBidInBin(newBin);
        }

        return this.maxBidForOpponent(bidsInRange);
    }

    /**
     * This is the general action function for the RL agent. We determine a bin by
     * either moving up (retracting the offer), doing nothing, or moving down
     * (conceding the offer).
     * @param currentBin the bin our last bid was in
     * @return index of the bin to bid in next
     */
    protected int determineTargetBin(int currentBin) {
        int targetBin = currentBin;
        ArrayList<Double> defaultActionValues = new ArrayList<Double>(Collections.nCopies(this.state.getActionSize(), 0.0));

        List<Double> qValues = this.qTable.getOrDefault(this.state.hash(), defaultActionValues);
        int action = this.epsilonGreedy(qValues);
        this.actions.add(action);

        // Apply the action to the current bin (i.e. move down, move up, or stay)
        switch (action) {
            case 0: targetBin--;
                break;
            case 1: targetBin++;
                break;
            case 2: break;
        }

        // Clamp to the range of relevant bins
        targetBin = Math.min(targetBin, (int) this.minMaxBin.getUpperbound());
        targetBin = Math.max(targetBin, (int) this.minMaxBin.getLowerbound());

        return targetBin;
    }

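    /**
     * Choose the opening bin epsilon-greedily from the Q-values of the current
     * (initial) state. In the initial state every bin is a separate action, so
     * the chosen action index is itself the bin index.
     * @return index of the bin in which to place the opening bid
     */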
    protected int determineOpeningBin() {
        ArrayList<Double> defaultInitialActionValues = new ArrayList<Double>(Collections.nCopies(this.state.getActionSize(), 0.0));
        List<Double> qValues = this.qTable.getOrDefault(this.state.hash(), defaultInitialActionValues);
        int action = this.epsilonGreedy(qValues);
        this.actions.add(action);

        return action;
    }

    /**
     * @param list List of doubles
     * @return The index of the highest value in the list, with ties broken randomly
     */
    protected int indifferentArgMax(List<Double> list) {
        double maximum = Collections.max(list);

        List<Integer> maximaIdxs = new ArrayList<Integer>();

        // collect the indices of all occurrences of the maximum
        for (int i = 0; i < list.size(); i++) {
            if (list.get(i) == maximum) {
                maximaIdxs.add(i);
            }
        }

        // pick a random index from the list (this is the indifferent part)
        Random rnd = new Random();
        int choice = rnd.nextInt(maximaIdxs.size());

        return maximaIdxs.get(choice);
    }

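    /**
     * Epsilon-greedy action selection: with probability epsilon (and only while
     * training) explore by picking a uniformly random action; otherwise exploit
     * by picking an action with the highest Q-value, breaking ties randomly via
     * {@link #indifferentArgMax(List)}.
     * @param qValues Q-values of the actions available in the current state
     * @return index of the selected action
     */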
    protected int epsilonGreedy(List<Double> qValues) {
        int action;

        // With probability epsilon, pick a random action; exploration only
        // happens while training
        if (Math.random() < this.eps && this.isTraining()) {
            Random random = new Random();
            action = random.nextInt(qValues.size());
        }
        else {
            action = this.indifferentArgMax(qValues);
        }

        return action;
    }

    /**
     * @return The number of bins in which each utility axis is divided
     */
    int getNBins() {
        return this.bins;
    }

    /**
     * @return The width of the bins in which each utility axis is divided
     */
    protected double getBinSize() {
        return 1.0 / this.getNBins();
    }

    /**
     * Setter for the state property
     * @param state new {@link State}
     */
    protected void setState(State state) {
        this.state = state;
    }

    /**
     * Getter for the state property
     * @return the current {@link AbstractState}
     */
    protected AbstractState getState() {
        return this.state;
    }

    /**
     * Determine the bid with the highest expected utility for the opponent from
     * a list of bids
     * @param bids candidate bids
     * @return BidDetails representing the maximum bid
     */
    protected BidDetails maxBidForOpponent(List<BidDetails> bids) {
        BidDetails maxBid = null;

        for (BidDetails bid : bids) {
            if (maxBid == null) {
                maxBid = bid;
            }
            else if (this.opponentModel.getBidEvaluation(bid.getBid()) > this.opponentModel.getBidEvaluation(maxBid.getBid())) {
                maxBid = bid;
            }
        }

        return maxBid;
    }

    /**
     * Gets called by the Negotiator when a relevant negotiation event occurs
     * @param reward reward observed since the last action
     * @param newState state the negotiation has transitioned into
     */
    public void observeEnvironment(double reward, AbstractState newState) {

        // Only start updating after an action has been performed, and only
        // update while training is enabled
        if (this.actions.size() > 0 && this.isTraining()) {
            this.updateQFuction(this.state, this.getLastAction(), reward, newState);
        }
        this.state = newState;
    }

    public HashMap<Integer, ArrayList<Double>> getQTable() {
        return this.qTable;
    }

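    /**
     * Standard Q-learning update rule:
     *   Q(s,a) &lt;- Q(s,a) + alpha * (reward + gamma * max_a' Q(s',a') - Q(s,a))
     * @param state state in which the action was taken
     * @param action index of the action that was taken
     * @param reward reward observed after taking the action
     * @param newState state reached after taking the action
     */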
    protected void updateQFuction(AbstractState state, int action, double reward, AbstractState newState) {
        // Initialize states in the Q-table if they are new. If the agent hasn't
        // made an opening bid yet, the action values are initialized to the
        // number of bins; otherwise to just 3 values (up/down/stay).
        ArrayList<Double> stateDefaultActionValues = new ArrayList<Double>(Collections.nCopies(state.getActionSize(), 0.0));
        ArrayList<Double> newStateDefaultActionValues = new ArrayList<Double>(Collections.nCopies(newState.getActionSize(), 0.0));

        // Make entries in the qTable if they don't exist yet
        this.qTable.putIfAbsent(state.hash(), stateDefaultActionValues);
        this.qTable.putIfAbsent(newState.hash(), newStateDefaultActionValues);

        // Gamma is fixed to 1.0 here: discounting already enters through the
        // reward, which comes from the (possibly discounted) preference profile.
        Double gamma = 1.0;

        // Perform the update
        Double Qnext = this.maxActionValue(newState);
        Double newActionValue = this.qFunction(state, action) + this.alpha * (reward + gamma * Qnext - this.qFunction(state, action));
        this.qTable.get(state.hash()).set(action, newActionValue);
    }

    /**
     * Determine max_a Q(s,a)
     * @param state The state for which to retrieve the maximum action value
     * @return Value of the optimal action that can be taken in the provided state
     */
    protected Double maxActionValue(AbstractState state) {
        return Collections.max(this.qTable.get(state.hash()));
    }

    /**
     * Get the Q-value associated with the provided (state, action) pair.
     * @param state state in which the action was taken
     * @param action index of the action
     * @return the Q-value Q(state, action)
     */
    protected Double qFunction(AbstractState state, int action) {
        ArrayList<Double> actionValues = this.qTable.get(state.hash());
        return actionValues.get(action);
    }

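    /**
     * Read the Q-learning hyperparameters from the strategy configuration.
     * A hypothetical parameter string matching the keys read below (values
     * chosen purely for illustration) could look like:
     *   epsilon=0.1;alpha=0.3;bins=10;_mode=train;time_bins=5
     * @param properties strategy parameters supplied by the BOA framework
     */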
    public void setHyperparameters(StrategyParameters properties) {
        this.eps = properties.getValueAsDouble("epsilon");
        this.alpha = properties.getValueAsDouble("alpha");
        this.bins = (int) properties.getValueAsDouble("bins");
        this.mode = properties.getValueAsString("_mode");
        this.timeBins = (int) properties.getValueAsDouble("time_bins");
    }

    protected boolean isTraining() {
        return this.mode.equals("train");
    }
}