source: src/main/java/uva/projectai/y2018/jasparon/QlearningStrategy.java@126

Last change on this file since 126 was 126, checked in by Aron Hammond, 6 years ago

Added function to calculate opposition to MultiLateralAnalysis.java

Moved code that adds RLBOA listeners to RLBOAUtils in the misc package

Added input for strategyParameters to SessionPanel (gui)

!! Close SessionInfo after the tournament; leaving it open caused /tmp/ to fill up with GeniusData files

Our own package:

  • Added opponents and strategies that are mentioned in the report
  • Changed the class hierarchy; agents can now extend RLBOAagentBilateral to inherit RL functionality.
  • States extend AbstractState
File size: 10.8 KB
package uva.projectai.y2018.jasparon;

import genius.core.StrategyParameters;
import genius.core.bidding.BidDetails;
import genius.core.boaframework.NegotiationSession;
import genius.core.boaframework.OfferingStrategy;
import genius.core.boaframework.OpponentModel;
import genius.core.boaframework.OutcomeSpace;
import genius.core.boaframework.SortedOutcomeSpace;
import genius.core.misc.Range;

import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Random;

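/**
 * Offering strategy that learns a bidding policy with tabular Q-learning.
 * The utility axis is divided into bins; the agent opens in a bin chosen by the
 * learned policy and afterwards moves down (concede), up (retract) or stays,
 * following an epsilon-greedy policy over the Q-table.
 */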
public class QlearningStrategy extends OfferingStrategy {

    protected HashMap<Integer, ArrayList<Double>> qTable;
    protected ArrayList<Integer> actions = new ArrayList<Integer>();
    protected int bins;
    protected Double eps;
    protected Double alpha;
    protected Double gamma;
    protected AbstractState state;
    protected String mode;
    protected int timeBins;
    protected Range minMaxBin;

    public QlearningStrategy(NegotiationSession negotiationSession, OpponentModel opponentModel) {
        super.init(negotiationSession, null);
        this.opponentModel = opponentModel;
        this.endNegotiation = false;
        this.state = State.INITIAL;

        OutcomeSpace outcomeSpace = new SortedOutcomeSpace(negotiationSession.getUtilitySpace());
        this.negotiationSession.setOutcomeSpace(outcomeSpace);
    }

    public ArrayList<Integer> getActions() {
        return this.actions;
    }

    /**
     * @return int representing the last action taken by the strategy
     * @throws IndexOutOfBoundsException if called before any action has been performed
     */
    public int getLastAction() throws IndexOutOfBoundsException {
        return this.actions.get(this.actions.size() - 1);
    }

    public void setMinMaxBin(Range minMaxBin) {
        this.minMaxBin = minMaxBin;
    }

    public Range getMinMaxBin() {
        return this.minMaxBin;
    }

    protected void initQTable() {
        this.qTable = new HashMap<Integer, ArrayList<Double>>();

        // Initial state has a different action space
        this.qTable.putIfAbsent(this.state.hash(),
                new ArrayList<Double>(Collections.nCopies(this.state.getActionSize(), 0.0)));
    }

    public void initQtable(HashMap<Integer, ArrayList<Double>> qTable) {
        if (qTable != null) {
            this.qTable = qTable;
        }
        else {
            this.initQTable();
        }
    }

    @Override
    public BidDetails determineOpeningBid() {
        // Open the negotiation with a free bid (one of N bins)
        int targetBin = this.determineOpeningBin();
        return this.pickBidInBin(targetBin);
    }

    @Override
    public BidDetails determineNextBid() {
        // HACK(?): this QlearningStrategy works for all states that represent the world
        // in bins, so we need a way to recognize these states; hence the
        // BinnedRepresentation interface.
        int targetBin = this.determineTargetBin(((BinnedRepresentation) this.state).getMyBin());
        return this.pickBidInBin(targetBin);
    }

    @Override
    public String getName() {
        return "Q-Offering";
    }

    /**
     * Check if the bid falls inside the lower and upper bounds
     * @param lower lower bound of utility (inclusive)
     * @param upper upper bound of utility (exclusive)
     * @param bidDetails bid to check (has util and time)
     * @return true if the bid's utility lies in [lower, upper)
     */
    private boolean isInBin(double lower, double upper, BidDetails bidDetails) {
        double myUtil = bidDetails.getMyUndiscountedUtil();
        return myUtil < upper && myUtil >= lower;
    }

    /**
     * Make the opponent model select a bid that is in the provided target bin
     * @param targetBin index of the bin in which to pick a bid
     * @return BidDetails of the selected bid
     */
    protected BidDetails pickBidInBin(int targetBin) {

        double lowerBound = targetBin * this.getBinSize();
        double upperBound = lowerBound + this.getBinSize();
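        // e.g. with bins = 10, targetBin 3 maps to the utility range [0.3, 0.4)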

        // getBidsinRange sometimes returns bids that are outside of the requested range
        // (false positives), so filter those out explicitly
        List<BidDetails> bidsInRange = this.negotiationSession.getOutcomeSpace().getBidsinRange(new Range(lowerBound, upperBound));
        bidsInRange.removeIf(bid -> !this.isInBin(lowerBound, upperBound, bid));

        // If no bids are possible within this bin, recursively choose another bin by the
        // following logic: if you conceded this round, concede further, etc.
        if (bidsInRange.isEmpty()) {

            Random random = new Random();
            int newBin = 0;
            int direction = -1;

            // If this is the opening action we just pick a random bin;
            // otherwise we keep moving in the direction of the last action
            if (this.actions.size() > 1) {
                direction = this.actions.get(this.actions.size() - 1);
            } else {
                newBin = random.nextInt(this.bins);
            }

            // conceded last time
            if (direction == 0) {
                newBin = determineTargetBin(targetBin - 1);
            }

            // retracted last time
            if (direction == 1) {
                newBin = determineTargetBin(targetBin + 1);
            }

            // stayed last time
            if (direction == 2) {
                int randomUpOrDown = random.nextBoolean() ? 1 : -1;
                newBin = determineTargetBin(targetBin + randomUpOrDown);
            }

            return this.pickBidInBin(newBin);
        }

        return this.maxBidForOpponent(bidsInRange);
    }

    /**
     * This is the general action function for the RL-agent. We determine a bin by either
     * moving up (retracting offer), doing nothing or moving down (conceding offer).
     * @param currentBin the bin of our previous offer
     * @return the bin to target with the next offer
     */
    protected int determineTargetBin(int currentBin) {
        int targetBin = currentBin;
        ArrayList<Double> defaultActionValues = new ArrayList<Double>(Collections.nCopies(this.state.getActionSize(), 0.0));

        List<Double> qValues = this.qTable.getOrDefault(this.state.hash(), defaultActionValues);
        int action = this.epsilonGreedy(qValues);
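        // Record the action so the Q-update in observeEnvironment can retrieve it via getLastAction()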
        this.actions.add(action);

        // Apply the action to the current bin (i.e. move up, move down or stay)
        switch (action) {
            case 0: targetBin--;
                break;
            case 1: targetBin++;
                break;
            case 2: break;
        }

        // Can't go outside of the range of relevant bins.
        targetBin = Math.min(targetBin, (int) this.minMaxBin.getUpperbound());
        targetBin = Math.max(targetBin, (int) this.minMaxBin.getLowerbound());

        return targetBin;
    }

    protected int determineOpeningBin() {
        ArrayList<Double> defaultInitialActionValues = new ArrayList<Double>(Collections.nCopies(this.state.getActionSize(), 0.0));
        List<Double> qValues = this.qTable.getOrDefault(this.state.hash(), defaultInitialActionValues);
        int action = this.epsilonGreedy(qValues);
        this.actions.add(action);
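        // In the initial state the action space is the set of bins itself,
        // so the chosen action index is directly the opening bin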

        return action;
    }

    /**
     * @param list List of doubles
     * @return The index of the highest value in the list, with ties broken at random
     */
    protected int indifferentArgMax(List<Double> list) {
        double maximum = Collections.max(list);

        List<Integer> maximaIdxs = new ArrayList<Integer>();

        // collect the indices of all occurrences of the maximum
        for (int i = 0; i < list.size(); i++) {
            if (list.get(i) == maximum) {
                maximaIdxs.add(i);
            }
        }

        // pick a random index from the list (this is the indifferent part)
        Random rnd = new Random();
        int choice = rnd.nextInt(maximaIdxs.size());

        return maximaIdxs.get(choice);
    }

    protected int epsilonGreedy(List<Double> qValues) {
        int action;

        // With probability epsilon, pick a random action (epsilon greedy)
        if (Math.random() < this.eps && this.isTraining()) {
            Random random = new Random();
            action = random.nextInt(qValues.size());
        }
        else {
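            // Exploit: pick the action with the highest Q-value, ties broken at random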
            action = this.indifferentArgMax(qValues);
        }

        return action;
    }

    /**
     * @return The number of bins into which the utility axis is divided
     */
    int getNBins() {
        return this.bins;
    }

    /**
     * @return The width of the bins into which the utility axis is divided
     */
    protected double getBinSize() {
        return 1.0 / this.getNBins();
    }

    /**
     * Setter for the state property
     * @param state new {@link State}
     */
    protected void setState(State state) {
        this.state = state;
    }

    /**
     * Getter for the state property
     * @return the current {@link AbstractState}
     */
    protected AbstractState getState() {
        return this.state;
    }

    /**
     * Determine the bid with the highest expected utility for the opponent from a list of bids
     * @param bids list of candidate bids
     * @return BidDetails representing that maximum bid
     */
    protected BidDetails maxBidForOpponent(List<BidDetails> bids) {
        BidDetails maxBid = null;

        for (BidDetails bid : bids) {
            if (maxBid == null) {
                maxBid = bid;
            }
            else if (this.opponentModel.getBidEvaluation(bid.getBid()) > this.opponentModel.getBidEvaluation(maxBid.getBid())) {
                maxBid = bid;
            }
        }

        return maxBid;
    }

    /**
     * Gets called by the Negotiator when a relevant negotiation event occurs
     * @param reward reward signal for the last action
     * @param newState the state the negotiation has moved into
     */
    public void observeEnvironment(double reward, AbstractState newState) {

        // Only start updating after an action has been performed,
        // and only if training is enabled
        if (this.actions.size() > 0 && this.isTraining()) {
            this.updateQFunction(this.state, this.getLastAction(), reward, newState);
        }
        this.state = newState;
    }

    public HashMap<Integer, ArrayList<Double>> getQTable() {
        return this.qTable;
    }

    protected void updateQFunction(AbstractState state, int action, double reward, AbstractState newState) {
        // Initialize the states if they are new. If the agent hasn't made an opening bid
        // yet, the action values are initialized to the number of bins; otherwise to
        // just 3 values (up/down/stay).
        ArrayList<Double> stateDefaultActionValues = new ArrayList<Double>(Collections.nCopies(state.getActionSize(), 0.0));
        ArrayList<Double> newStateDefaultActionValues = new ArrayList<Double>(Collections.nCopies(newState.getActionSize(), 0.0));

        // Make entries in the qTable if they don't exist yet
        this.qTable.putIfAbsent(state.hash(), stateDefaultActionValues);
        this.qTable.putIfAbsent(newState.hash(), newStateDefaultActionValues);

        // gamma is fixed to 1.0 here: discounting already reaches the agent through the
        // reward, which is discounted by the domain/preference profile
        Double gamma = 1.0;
        // Perform the Q-learning update:
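        // Q(s,a) <- Q(s,a) + alpha * (reward + gamma * max_a' Q(s',a') - Q(s,a))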
        Double Qnext = this.maxActionValue(newState);
        Double newActionValue = this.qFunction(state, action) + this.alpha * (reward + gamma * Qnext - this.qFunction(state, action));
        this.qTable.get(state.hash()).set(action, newActionValue);
    }

    /**
     * Determine max_a Q(s,a)
     * @param state the state for which to retrieve the maximum action value
     * @return value of the best action that can be taken in the provided state (0 if the state is unknown)
     */
    protected Double maxActionValue(AbstractState state) {
        ArrayList<Double> actionValues = this.qTable.get(state.hash());
        // Guard against unknown states so the documented contract (return 0) holds
        return actionValues == null ? 0.0 : Collections.max(actionValues);
    }

    /**
     * Get the Q value associated with the provided (state, action) pair.
     * @param state the state s
     * @param action index of the action a
     * @return Q(s, a)
     */
    protected Double qFunction(AbstractState state, int action) {
        ArrayList<Double> actionValues = this.qTable.get(state.hash());
        return actionValues.get(action);
    }

    public void setHyperparameters(StrategyParameters properties) {
        this.eps = properties.getValueAsDouble("epsilon");
        this.alpha = properties.getValueAsDouble("alpha");
        this.bins = (int) properties.getValueAsDouble("bins");
        this.mode = properties.getValueAsString("_mode");
        this.timeBins = (int) properties.getValueAsDouble("time_bins");
    }

    protected boolean isTraining() {
        return this.mode.equals("train");
    }
}