source: src/main/java/agents/rlboa/QlearningStrategy.java

Last change on this file was 153, checked in by Aron Hammond, 6 years ago

Added function to calculate opposition to MultiLateralAnalysis.java

Moved code that adds RLBOA listeners to RLBOAUtils in the misc package

!! close SessionInfo after tournament; leaving it open caused /tmp/ to fill up with GeniusData files

This commit finalizes the RLBOA project; it is now ready for use

Our own package (uva.project.):

  • Moved to agents.rlboa
  • Added opponents and strategies that are mentioned in the report
  • Changed the class hierarchy; agents can now extend RLBOAagentBilateral to inherit RL functionality.
  • States extend from AbstractState
package agents.rlboa;

import genius.core.StrategyParameters;
import genius.core.bidding.BidDetails;
import genius.core.boaframework.NegotiationSession;
import genius.core.boaframework.OfferingStrategy;
import genius.core.boaframework.OpponentModel;
import genius.core.boaframework.OutcomeSpace;
import genius.core.boaframework.SortedOutcomeSpace;
import genius.core.misc.Range;

import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Random;

public class QlearningStrategy extends OfferingStrategy {

    protected HashMap<Integer, ArrayList<Double>> qTable; // state hash -> list of action values
    protected ArrayList<Integer> actions = new ArrayList<Integer>(); // history of actions taken this session
    protected int bins; // number of utility bins
    protected Double eps; // exploration rate for epsilon-greedy
    protected Double alpha; // learning rate
    protected Double gamma; // discount factor (not read in this class; the update uses a local gamma of 1.0)
    protected AbstractState state;
    protected String mode; // "train" enables learning (see isTraining)
    protected int timeBins; // number of time bins (set from strategy parameters)
    protected Range minMaxBin; // allowed range of bin indices

    public QlearningStrategy(NegotiationSession negotiationSession, OpponentModel opponentModel) {
        super.init(negotiationSession, null);
        this.opponentModel = opponentModel;
        this.endNegotiation = false;
        this.state = State.INITIAL;

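        // Wrap the utility space in a SortedOutcomeSpace so that bids can later
        // be retrieved by utility range (see pickBidInBin)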
        OutcomeSpace outcomeSpace = new SortedOutcomeSpace(negotiationSession.getUtilitySpace());
        this.negotiationSession.setOutcomeSpace(outcomeSpace);
    }

    public ArrayList<Integer> getActions() {
        return this.actions;
    }

    /**
     * @return int representing the last action taken by the strategy
     * @throws IndexOutOfBoundsException if called before any action has been performed
     */
    public int getLastAction() throws IndexOutOfBoundsException {
        return this.actions.get(this.actions.size() - 1);
    }

    public void setMinMaxBin(Range minMaxBin) {
        this.minMaxBin = minMaxBin;
    }

    public Range getMinMaxBin() {
        return this.minMaxBin;
    }

    protected void initQTable() {
        this.qTable = new HashMap<Integer, ArrayList<Double>>();

        // The initial state has a different action space (one action per bin)
        this.qTable.putIfAbsent(this.state.hash(), new ArrayList<Double>(Collections.nCopies(this.state.getActionSize(), 0.0)));
    }

    /**
     * Initialize the Q-table from a previously learned table, or create a fresh one.
     */
    public void initQtable(HashMap<Integer, ArrayList<Double>> qTable) {
        if (qTable != null) {
            this.qTable = qTable;
        } else {
            this.initQTable();
        }
    }

    @Override
    public BidDetails determineOpeningBid() {
        // Open the negotiation with a free bid (one of N bins)
        int targetBin = this.determineOpeningBin();
        return this.pickBidInBin(targetBin);
    }

    @Override
    public BidDetails determineNextBid() {
        // HACK(?): this QlearningStrategy works for all states that represent the world
        // in bins, so we needed a way to recognize these; hence the BinnedRepresentation interface
        int targetBin = this.determineTargetBin(((BinnedRepresentation) this.state).getMyBin());
        return this.pickBidInBin(targetBin);
    }

    @Override
    public String getName() {
        return "Q-Offering";
    }

    /**
     * Check if the bid falls inside the lower and upper bounds
     * @param lower lower bound of utility (inclusive)
     * @param upper upper bound of utility (exclusive)
     * @param bidDetails bid to check (has util and time)
     * @return true if the bid's utility lies in [lower, upper)
     */
    private boolean isInBin(double lower, double upper, BidDetails bidDetails) {
        double myUtil = bidDetails.getMyUndiscountedUtil();
        return myUtil < upper && myUtil >= lower;
    }

    /**
     * Make the opponent model select a bid that is in the provided target bin
     * @param targetBin index of the bin in which to pick a bid
     * @return BidDetails of the selected bid
     */
    protected BidDetails pickBidInBin(int targetBin) {

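        // Bin i covers the utility interval [i * binSize, (i + 1) * binSize)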
        double lowerBound = targetBin * this.getBinSize();
        double upperBound = lowerBound + this.getBinSize();

        // getBidsinRange behaves weirdly and returns bids that are outside of the range (false positives)
        List<BidDetails> bidsInRange = this.negotiationSession.getOutcomeSpace().getBidsinRange(new Range(lowerBound, upperBound));
        bidsInRange.removeIf(bid -> !this.isInBin(lowerBound, upperBound, bid));

        // If no bids are possible within this bin, recursively choose another bin by the
        // following logic: if you conceded this round, concede further, etc.
        if (bidsInRange.isEmpty()) {

            Random random = new Random();
            int newBin = 0;
            int direction = -1;

            // If this is not the opening action, repeat the direction of the last action;
            // for the opening action we just pick a bin at random
            if (this.actions.size() > 1) {
                direction = this.actions.get(this.actions.size() - 1);
            } else {
                newBin = random.nextInt(this.bins);
            }

            // conceded last time
            if (direction == 0) {
                newBin = determineTargetBin(targetBin - 1);
            }

            // retracted last time
            if (direction == 1) {
                newBin = determineTargetBin(targetBin + 1);
            }

            // stayed last time
            if (direction == 2) {
                int randomUpOrDown = random.nextBoolean() ? 1 : -1;
                newBin = determineTargetBin(targetBin + randomUpOrDown);
            }

            return this.pickBidInBin(newBin);
        }

        return this.maxBidForOpponent(bidsInRange);
    }

    /**
     * This is the general action function for the RL-agent. We determine a bin by either
     * moving up (retracting the offer), staying, or moving down (conceding the offer).
     * @param currentBin the bin around which the chosen action is applied
     * @return the target bin after applying the action, clamped to the range of relevant bins
     */
    protected int determineTargetBin(int currentBin) {
        int targetBin = currentBin;
        ArrayList<Double> defaultActionValues = new ArrayList<Double>(Collections.nCopies(this.state.getActionSize(), 0.0));

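        // States that have not been visited yet fall back to all-zero action values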
        List<Double> qValues = this.qTable.getOrDefault(this.state.hash(), defaultActionValues);
        int action = this.epsilonGreedy(qValues);
        this.actions.add(action);

        // Apply the action to the current bin (i.e. move down, move up, or stay)
        switch (action) {
            case 0: targetBin--;
                break;
            case 1: targetBin++;
                break;
            case 2: break;
        }

        // Can't go outside of the range of relevant bins
        targetBin = Math.min(targetBin, (int) this.minMaxBin.getUpperbound());
        targetBin = Math.max(targetBin, (int) this.minMaxBin.getLowerbound());

        return targetBin;
    }

    protected int determineOpeningBin() {
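        // In the initial state there is one action per bin, so the sampled
        // action index directly serves as the opening bin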
        ArrayList<Double> defaultInitialActionValues = new ArrayList<Double>(Collections.nCopies(this.state.getActionSize(), 0.0));
        List<Double> qValues = this.qTable.getOrDefault(this.state.hash(), defaultInitialActionValues);
        int action = this.epsilonGreedy(qValues);
        this.actions.add(action);

        return action;
    }

    /**
     * @param list List of doubles
     * @return The index of the highest value in the list; ties are broken uniformly at random
     */
    protected int indifferentArgMax(List<Double> list) {
        double maximum = Collections.max(list);

        List<Integer> maximaIdxs = new ArrayList<Integer>();

        // collect the indices of all occurrences of the maximum
        for (int i = 0; i < list.size(); i++) {
            if (list.get(i) == maximum) {
                maximaIdxs.add(i);
            }
        }

        // pick a random index from the list (this is the indifferent part)
        Random rnd = new Random();
        int choice = rnd.nextInt(maximaIdxs.size());

        return maximaIdxs.get(choice);
    }

    protected int epsilonGreedy(List<Double> qValues) {
        int action;

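        // Exploration only happens while training; in evaluation mode the greedy action is always taken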
        // With probability epsilon, pick a random action (epsilon-greedy)
        if (Math.random() < this.eps && this.isTraining()) {
            Random random = new Random();
            action = random.nextInt(qValues.size());
        } else {
            action = this.indifferentArgMax(qValues);
        }

        return action;
    }

    /**
     * @return The number of bins into which each utility axis is divided
     */
    int getNBins() {
        return this.bins;
    }

    /**
     * @return The width of the bins into which each utility axis is divided
     */
    protected double getBinSize() {
        return 1.0 / this.getNBins();
    }

    /**
     * Setter for the state property
     * @param state new {@link State}
     */
    protected void setState(State state) {
        this.state = state;
    }

    /**
     * Getter for the state property
     * @return the current {@link AbstractState}
     */
    protected AbstractState getState() {
        return this.state;
    }

    /**
     * Determine the bid with the highest expected utility for the opponent from a list of bids
     * @param bids candidate bids to evaluate with the opponent model
     * @return BidDetails representing the maximum bid (null if the list is empty)
     */
    protected BidDetails maxBidForOpponent(List<BidDetails> bids) {
        BidDetails maxBid = null;

        for (BidDetails bid : bids) {
            if (maxBid == null) {
                maxBid = bid;
            } else if (this.opponentModel.getBidEvaluation(bid.getBid()) > this.opponentModel.getBidEvaluation(maxBid.getBid())) {
                maxBid = bid;
            }
        }

        return maxBid;
    }

    /**
     * Gets called by the Negotiator when a relevant negotiation event occurs
     * @param reward the reward observed since the last action
     * @param newState the state the environment transitioned to
     */
    public void observeEnvironment(double reward, AbstractState newState) {

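        // Update step of the RL loop: learn from (previous state, last action,
        // reward, new state), then transition to the new state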
        // Only start updating after an action has been performed,
        // and only update while training is enabled
        if (this.actions.size() > 0 && this.isTraining()) {
            this.updateQFuction(this.state, this.getLastAction(), reward, newState);
        }
        this.state = newState;
    }

    public HashMap<Integer, ArrayList<Double>> getQTable() {
        return this.qTable;
    }

    protected void updateQFuction(AbstractState state, int action, double reward, AbstractState newState) {
        // Initialize states if they are new. If the agent hasn't made an opening bid yet,
        // the action values are initialized to one value per bin; otherwise to just
        // 3 values (down/up/stay).
        ArrayList<Double> stateDefaultActionValues = new ArrayList<Double>(Collections.nCopies(state.getActionSize(), 0.0));
        ArrayList<Double> newStateDefaultActionValues = new ArrayList<Double>(Collections.nCopies(newState.getActionSize(), 0.0));

        // Make entries in the qTable if they don't exist yet
        this.qTable.putIfAbsent(state.hash(), stateDefaultActionValues);
        this.qTable.putIfAbsent(newState.hash(), newStateDefaultActionValues);

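        // Standard Q-learning update:
        //   Q(s,a) <- Q(s,a) + alpha * (reward + gamma * max_a' Q(s',a') - Q(s,a))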
        // Gamma is fixed to 1.0 here because the discounting already comes from the
        // domain/preference profile: the reward itself is (time-)discounted
        Double gamma = 1.0;
        // Perform the update
        Double qNext = this.maxActionValue(newState);
        Double newActionValue = this.qFunction(state, action) + this.alpha * (reward + gamma * qNext - this.qFunction(state, action));
        this.qTable.get(state.hash()).set(action, newActionValue);
    }

    /**
     * Determine max_a Q(s,a)
     * @param state the state for which to retrieve the maximum action value
     * @return the value of the best action that can be taken in the provided state;
     *         the state must already have an entry in the Q-table
     */
    protected Double maxActionValue(AbstractState state) {
        return Collections.max(this.qTable.get(state.hash()));
    }

    /**
     * Get the Q-value associated with the provided (state, action) pair.
     * @param state the state s
     * @param action the action a
     * @return Q(s,a)
     */
    protected Double qFunction(AbstractState state, int action) {
        ArrayList<Double> actionValues = this.qTable.get(state.hash());
        return actionValues.get(action);
    }

    public void setHyperparameters(StrategyParameters properties) {
        this.eps = properties.getValueAsDouble("epsilon");
        this.alpha = properties.getValueAsDouble("alpha");
        this.bins = (int) properties.getValueAsDouble("bins");
        this.mode = properties.getValueAsString("_mode");
        this.timeBins = (int) properties.getValueAsDouble("time_bins");
    }

    protected boolean isTraining() {
        return this.mode.equals("train");
    }
}