package agents.rlboa;

import genius.core.StrategyParameters;
import genius.core.bidding.BidDetails;
import genius.core.boaframework.NegotiationSession;
import genius.core.boaframework.OfferingStrategy;
import genius.core.boaframework.OpponentModel;
import genius.core.boaframework.OutcomeSpace;
import genius.core.boaframework.SortedOutcomeSpace;
import genius.core.misc.Range;

import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Random;

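/**
 * Offering strategy based on tabular Q-learning. The agent's own utility axis
 * is discretized into {@code bins} equally wide bins; after the opening bid the
 * available actions are "concede" (move one bin down), "retract" (move one bin
 * up) and "stay". Actions are selected epsilon-greedily from the Q-table while
 * training, greedily otherwise.
 */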
public class QlearningStrategy extends OfferingStrategy {

    protected HashMap<Integer, ArrayList<Double>> qTable; // state hash -> list of action values
    protected ArrayList<Integer> actions = new ArrayList<Integer>(); // history of actions taken so far
    protected int bins; // number of bins on the utility axis
    protected Double eps; // exploration rate for epsilon-greedy action selection
    protected Double alpha; // learning rate
    protected Double gamma; // discount factor (the update itself uses a local gamma of 1.0, see updateQFunction)
    protected AbstractState state;
    protected String mode; // "train" enables exploration and Q-updates
    protected int timeBins;
    protected Range minMaxBin; // range of bins the strategy is allowed to target

    public QlearningStrategy(NegotiationSession negotiationSession, OpponentModel opponentModel) {
        super.init(negotiationSession, null);
        this.opponentModel = opponentModel;
        this.endNegotiation = false;
        this.state = State.INITIAL;

        // A sorted outcome space is needed for the range queries in pickBidInBin
        OutcomeSpace outcomeSpace = new SortedOutcomeSpace(negotiationSession.getUtilitySpace());
        this.negotiationSession.setOutcomeSpace(outcomeSpace);
    }

    public ArrayList<Integer> getActions() {
        return this.actions;
    }

    /**
     * @return int representing the last action taken by the strategy
     * @throws IndexOutOfBoundsException if called before any action has been performed
     */
    public int getLastAction() throws IndexOutOfBoundsException {
        return this.actions.get(this.actions.size() - 1);
    }

    public void setMinMaxBin(Range minMaxBin) {
        this.minMaxBin = minMaxBin;
    }

    public Range getMinMaxBin() {
        return this.minMaxBin;
    }

    protected void initQTable() {
        this.qTable = new HashMap<Integer, ArrayList<Double>>();

        // The initial state has a different action space (one action per bin)
        this.qTable.putIfAbsent(this.state.hash(), new ArrayList<Double>(Collections.nCopies(this.state.getActionSize(), 0.0)));
    }

    /**
     * Initialize the Q-table, falling back to a fresh table seeded with the
     * initial state if none is provided.
     * @param qTable a previously learned Q-table, or null
     */
    public void initQtable(HashMap<Integer, ArrayList<Double>> qTable) {
        if (qTable != null) {
            this.qTable = qTable;
        }
        else {
            this.initQTable();
        }
    }

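    // A minimal (hypothetical) wiring sketch: a caller that persists the
    // Q-table between negotiation sessions could do
    //
    //   QlearningStrategy strategy = new QlearningStrategy(session, model);
    //   strategy.setHyperparameters(properties);
    //   strategy.initQtable(savedTable); // pass null for a fresh table
    //
    // where `savedTable` is an assumed variable, not part of this class.
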
    @Override
    public BidDetails determineOpeningBid() {
        // Open the negotiation with a free bid (one of N bins)
        int targetBin = this.determineOpeningBin();
        return this.pickBidInBin(targetBin);
    }

    @Override
    public BidDetails determineNextBid() {
        // HACK(?): this QlearningStrategy works for all states that represent the world
        // in bins, so we needed a way to recognize these; hence the BinnedRepresentation interface.
        int targetBin = this.determineTargetBin(((BinnedRepresentation) this.state).getMyBin());
        return this.pickBidInBin(targetBin);
    }

    @Override
    public String getName() {
        return "Q-Offering";
    }

    /**
     * Check if the bid falls inside the lower and upper bounds
     * @param lower lower bound of utility (inclusive)
     * @param upper upper bound of utility (exclusive)
     * @param bidDetails bid to check (has util and time)
     * @return boolean
     */
    private boolean isInBin(double lower, double upper, BidDetails bidDetails) {
        double myUtil = bidDetails.getMyUndiscountedUtil();
        return myUtil < upper && myUtil >= lower;
    }

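    // For example, with 10 bins, isInBin(0.7, 0.8, bid) is true for a bid with
    // undiscounted utility 0.73 but false for exactly 0.8: the exclusive upper
    // bound keeps adjacent bins from overlapping.
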
    /**
     * Make the opponent model select a bid that is in the provided target bin
     * @param targetBin index of the bin in which to pick a bid
     * @return BidDetails of the selected bid
     */
    protected BidDetails pickBidInBin(int targetBin) {

        double lowerBound = targetBin * this.getBinSize();
        double upperBound = lowerBound + this.getBinSize();

        // getBidsinRange behaves weirdly and returns bids that are outside of the range
        // (false positives), so filter those out explicitly
        List<BidDetails> bidsInRange = this.negotiationSession.getOutcomeSpace().getBidsinRange(new Range(lowerBound, upperBound));
        bidsInRange.removeIf(bid -> !this.isInBin(lowerBound, upperBound, bid));

        // If no bids are possible within this bin, recursively choose another bin by the
        // following logic: if you conceded this round, concede further, etc.
        if (bidsInRange.isEmpty()) {

            Random random = new Random();
            int newBin = 0;
            int direction = -1;

            // Check if this is the opening action or not; if it is, just pick a bin at random
            if (this.actions.size() > 1) {
                direction = this.actions.get(this.actions.size() - 1);
            } else {
                newBin = random.nextInt(this.bins);
            }

            // conceded last time
            if (direction == 0) {
                newBin = determineTargetBin(targetBin - 1);
            }

            // retracted last time
            if (direction == 1) {
                newBin = determineTargetBin(targetBin + 1);
            }

            // stayed last time
            if (direction == 2) {
                int randomUpOrDown = random.nextBoolean() ? 1 : -1;
                newBin = determineTargetBin(targetBin + randomUpOrDown);
            }

            return this.pickBidInBin(newBin);
        }

        return this.maxBidForOpponent(bidsInRange);
    }

    /**
     * This is the general action function for the RL-agent. We determine a bin by either
     * moving up (retracting offer), doing nothing, or moving down (conceding offer).
     * @param currentBin the bin targeted by the previous offer
     * @return index of the bin to target next
     */
    protected int determineTargetBin(int currentBin) {
        int targetBin = currentBin;
        ArrayList<Double> defaultActionValues = new ArrayList<Double>(Collections.nCopies(this.state.getActionSize(), 0.0));

        List<Double> qValues = this.qTable.getOrDefault(this.state.hash(), defaultActionValues);
        int action = this.epsilonGreedy(qValues);
        this.actions.add(action);

        // Apply the action to the current bin (i.e. move down, move up, or stay)
        switch (action) {
            case 0: targetBin--;
                break;
            case 1: targetBin++;
                break;
            case 2: break;
        }

        // Can't go outside of the range of relevant bins.
        targetBin = Math.min(targetBin, (int) this.minMaxBin.getUpperbound());
        targetBin = Math.max(targetBin, (int) this.minMaxBin.getLowerbound());

        return targetBin;
    }

    /**
     * In the initial state the action space has one action per bin, so the
     * chosen action is itself the index of the opening bin.
     */
    protected int determineOpeningBin() {
        ArrayList<Double> defaultInitialActionValues = new ArrayList<Double>(Collections.nCopies(this.state.getActionSize(), 0.0));
        List<Double> qValues = this.qTable.getOrDefault(this.state.hash(), defaultInitialActionValues);
        int action = this.epsilonGreedy(qValues);
        this.actions.add(action);

        return action;
    }

    /**
     * @param list List of doubles
     * @return The index of the highest value in the list, with ties broken uniformly at random
     */
    protected int indifferentArgMax(List<Double> list) {
        double maximum = Collections.max(list);

        List<Integer> maximaIdxs = new ArrayList<Integer>();

        // collect indices of all occurrences of maximum
        for (int i = 0; i < list.size(); i++) {
            if (list.get(i) == maximum) {
                maximaIdxs.add(i);
            }
        }

        // pick a random index from the list (this is the indifferent part)
        Random rnd = new Random();
        int choice = rnd.nextInt(maximaIdxs.size());

        return maximaIdxs.get(choice);
    }

    protected int epsilonGreedy(List<Double> qValues) {
        int action;

        // With probability epsilon, pick a random action (epsilon-greedy);
        // only explore while training
        if (Math.random() < this.eps && this.isTraining()) {
            Random random = new Random();
            action = random.nextInt(qValues.size());
        }
        else {
            action = this.indifferentArgMax(qValues);
        }

        return action;
    }

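    // Worked example: with eps = 0.1 and qValues = [0.0, 0.4, 0.2], training
    // mode picks a uniformly random action 10% of the time and otherwise
    // action 1, the argmax; outside training it always exploits.
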
    /**
     * @return The number of bins into which each utility axis is divided
     */
    int getNBins() {
        return this.bins;
    }

    /**
     * @return The width of the bins into which each utility axis is divided
     */
    protected double getBinSize() {
        return 1.0 / this.getNBins();
    }

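    // For example, bins = 10 gives a bin size of 0.1, so bin i covers
    // utilities in [i * 0.1, (i + 1) * 0.1); see pickBidInBin and isInBin.
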
    /**
     * Setter for the state property
     * @param state new {@link State}
     */
    protected void setState(State state) {
        this.state = state;
    }

    /**
     * Getter for the state property
     * @return the current {@link AbstractState}
     */
    protected AbstractState getState() {
        return this.state;
    }

    /**
     * Determine the bid with the highest expected utility for the opponent from a list of bids
     * @param bids candidate bids
     * @return BidDetails representing the maximum bid, or null if the list is empty
     */
    protected BidDetails maxBidForOpponent(List<BidDetails> bids) {
        BidDetails maxBid = null;

        for (BidDetails bid : bids) {
            if (maxBid == null) {
                maxBid = bid;
            }
            else if (this.opponentModel.getBidEvaluation(bid.getBid()) > this.opponentModel.getBidEvaluation(maxBid.getBid())) {
                maxBid = bid;
            }
        }

        return maxBid;
    }

    /**
     * Gets called by the Negotiator when a relevant negotiation event occurs
     * @param reward reward signal for the last action taken
     * @param newState the state the environment transitioned into
     */
    public void observeEnvironment(double reward, AbstractState newState) {

        // Only start updating after an action has been performed,
        // and only if training is enabled
        if (this.actions.size() > 0 && this.isTraining()) {
            this.updateQFunction(this.state, this.getLastAction(), reward, newState);
        }
        this.state = newState;
    }

    public HashMap<Integer, ArrayList<Double>> getQTable() {
        return this.qTable;
    }

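    // Standard tabular Q-learning update:
    //   Q(s,a) <- Q(s,a) + alpha * (reward + gamma * max_a' Q(s',a') - Q(s,a))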
    protected void updateQFunction(AbstractState state, int action, double reward, AbstractState newState) {
        // Initialize any state we haven't seen before. If the agent hasn't made an
        // opening bid yet, the action space has one value per bin; otherwise it has
        // just 3 values (up/down/nothing).
        ArrayList<Double> stateDefaultActionValues = new ArrayList<Double>(Collections.nCopies(state.getActionSize(), 0.0));
        ArrayList<Double> newStateDefaultActionValues = new ArrayList<Double>(Collections.nCopies(newState.getActionSize(), 0.0));

        // Make entries in the qTable if they don't exist yet
        this.qTable.putIfAbsent(state.hash(), stateDefaultActionValues);
        this.qTable.putIfAbsent(newState.hash(), newStateDefaultActionValues);

        // Reminder that the update below is correct: discounting already enters
        // through the reward, which the domain/preference profile discounts, so
        // we use gamma = 1.0 here rather than this.gamma.
        Double gamma = 1.0;

        // Perform the update
        Double Qnext = this.maxActionValue(newState);
        Double newActionValue = this.qFunction(state, action) + this.alpha * (reward + gamma * Qnext - this.qFunction(state, action));
        this.qTable.get(state.hash()).set(action, newActionValue);
    }

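    // Worked example (assumed numbers): with alpha = 0.1, Q(s,a) = 0.5,
    // reward = 0.0 and max_a' Q(s',a') = 0.8, the new value is
    // 0.5 + 0.1 * (0.0 + 1.0 * 0.8 - 0.5) = 0.53.
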
    /**
     * Determine max_a Q(s,a)
     * @param state The state for which to retrieve the maximum action value
     * @return Value of the best action that can be taken in the provided state;
     *         assumes the state already has an entry in the Q-table
     */
    protected Double maxActionValue(AbstractState state) {
        return Collections.max(this.qTable.get(state.hash()));
    }

    /**
     * Get the Q-value associated with the provided (state, action) pair.
     * @param state the state s
     * @param action index of the action a
     * @return Q(s,a)
     */
    protected Double qFunction(AbstractState state, int action) {
        ArrayList<Double> actionValues = this.qTable.get(state.hash());
        return actionValues.get(action);
    }

    public void setHyperparameters(StrategyParameters properties) {
        this.eps = properties.getValueAsDouble("epsilon");
        this.alpha = properties.getValueAsDouble("alpha");
        this.bins = (int) properties.getValueAsDouble("bins");
        this.mode = properties.getValueAsString("_mode");
        this.timeBins = (int) properties.getValueAsDouble("time_bins");
    }

    protected boolean isTraining() {
        return this.mode.equals("train");
    }
}