source: src/main/java/uva/projectai/y2018/jasparon/QlearningStrategy.java@89

Last change on this file since 89 was 67, checked in by Aron Hammond, 6 years ago

Added support for agents that learn via ReinforcementLearning, including an implementation of an agent that uses tabular Q-learning

File size: 8.5 KB
package uva.projectai.y2018.jasparon;

import genius.core.StrategyParameters;
import genius.core.bidding.BidDetails;
import genius.core.boaframework.NegotiationSession;
import genius.core.boaframework.OfferingStrategy;
import genius.core.boaframework.OpponentModel;
import genius.core.boaframework.OutcomeSpace;
import genius.core.boaframework.SortedOutcomeSpace;
import genius.core.misc.Range;

import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Random;
public class QlearningStrategy extends OfferingStrategy {

    protected HashMap<Integer, ArrayList<Double>> qTable;
    protected ArrayList<Integer> actions = new ArrayList<Integer>();
    protected int bins;
    protected Double eps;
    protected Double alpha;
    protected Double gamma;
    protected State state;
    protected boolean optimistic;

    public QlearningStrategy(NegotiationSession negotiationSession, OpponentModel opponentModel) {
        super.init(negotiationSession, null);
        this.opponentModel = opponentModel;
        this.endNegotiation = false;
        this.state = State.INITIAL;

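        // Use a sorted outcome space so that bids can later be looked up by utility range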
        OutcomeSpace outcomeSpace = new SortedOutcomeSpace(negotiationSession.getUtilitySpace());
        this.negotiationSession.setOutcomeSpace(outcomeSpace);
    }

    public ArrayList<Integer> getActions() {
        return this.actions;
    }

    /**
     * @return int representing the last action taken by the strategy
     * @throws IndexOutOfBoundsException if called before any action has been performed
     */
    public int getLastAction() throws IndexOutOfBoundsException {
        return this.actions.get(this.actions.size() - 1);
    }

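    /**
     * Initialize an empty Q-table that maps a state hash to a list of action
     * values, one entry per available action in that state.
     */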
    protected void initQTable() {
        this.qTable = new HashMap<Integer, ArrayList<Double>>();

        // The initial state has a different action space
        this.qTable.putIfAbsent(this.state.hash(), new ArrayList<Double>(Collections.nCopies(this.state.getActionSize(), 0.0)));
    }

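    /**
     * Initialize the Q-table from a previously learned table, or create an
     * empty one if the provided table is null.
     */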
    public void initQtable(HashMap<Integer, ArrayList<Double>> qTable) {
        if (qTable != null) {
            this.qTable = qTable;
        }
        else {
            this.initQTable();
        }
    }

    @Override
    public BidDetails determineOpeningBid() {
        // Open the negotiation with a freely chosen bid (one of N bins)
        int targetBin = this.determineOpeningBin();
        return this.pickBidInBin(targetBin);
    }

    @Override
    public BidDetails determineNextBid() {
        int targetBin = this.determineTargetBin(this.state.getMyBin());
        return this.pickBidInBin(targetBin);
    }

    @Override
    public String getName() {
        return "Q-Offering";
    }

    /**
     * Make the opponent model select a bid that is in the provided target bin
     * @param targetBin index of the bin in which to pick a bid
     * @return BidDetails of the selected bid
     */
    protected BidDetails pickBidInBin(int targetBin) {
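        // Bin i covers the utility range [i * binSize, (i + 1) * binSize); the small
        // offset keeps the upper bound just below the start of the next bin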
        double lowerBound = targetBin * this.getBinSize();
        double upperBound = lowerBound + this.getBinSize() - 0.01;

        List<BidDetails> bidsInRange = this.negotiationSession.getOutcomeSpace().getBidsinRange(new Range(lowerBound, upperBound));

        return this.maxBidForOpponent(bidsInRange);
    }

    /**
     * This is the general action function for the RL-agent. We determine a bin by either
     * moving up (retracting offer), doing nothing or moving down (conceding offer).
     * @param currentBin the bin of our most recent offer
     * @return index of the target bin for the next offer
     */
    protected int determineTargetBin(int currentBin) {
        int targetBin = currentBin;
        ArrayList<Double> defaultActionValues = new ArrayList<Double>(Collections.nCopies(this.state.getActionSize(), 0.0));

        List<Double> qValues = this.qTable.getOrDefault(this.state.hash(), defaultActionValues);
        int action = this.epsilonGreedy(qValues);
        this.actions.add(action);

        // Apply the action to the current bin (i.e. move up, move down or stay)
        switch (action) {
            case 0: targetBin--; break; // concede: move to a lower-utility bin
            case 1: targetBin++; break; // retract: move to a higher-utility bin
            case 2: break;              // stay in the current bin
        }

        // Can't go out of bounds
        // TODO: Discuss impact on learning algorithm
        targetBin = Math.min(targetBin, this.getNBins() - 1);
        targetBin = Math.max(targetBin, 0);

        return targetBin;
    }

    protected int determineOpeningBin() {
        ArrayList<Double> defaultInitialActionValues = new ArrayList<Double>(Collections.nCopies(this.state.getActionSize(), 0.0));
        List<Double> qValues = this.qTable.getOrDefault(this.state.hash(), defaultInitialActionValues);
        int action = this.epsilonGreedy(qValues);
        this.actions.add(action);

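        // In the initial state each action corresponds directly to a bin,
        // so the chosen action index is the opening bin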
        return action;
    }

    /**
     * @param list List of doubles
     * @return The index of the highest value in the list; ties are broken uniformly at random
     */
    protected int indifferentArgMax(List<Double> list) {
        double maximum = Collections.max(list);

        List<Integer> maximaIdxs = new ArrayList<Integer>();

        // collect indices of all occurrences of maximum
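        // (the == comparison unboxes to an exact primitive comparison, which is
        // safe here because maximum was taken from this same list)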
        for (int i = 0; i < list.size(); i++) {
            if (list.get(i) == maximum) {
                maximaIdxs.add(i);
            }
        }

        // pick a random index from the list (this is the indifferent part)
        Random rnd = new Random();
        int choice = rnd.nextInt(maximaIdxs.size());

        return maximaIdxs.get(choice);
    }

    protected int epsilonGreedy(List<Double> qValues) {
        int action;

        // With probability epsilon, pick a random action (epsilon-greedy exploration)
        if (Math.random() < this.eps) {
            Random random = new Random();
            action = random.nextInt(qValues.size());
        }
        else {
            action = this.indifferentArgMax(qValues);
        }

        return action;
    }

    /**
     * @return The number of bins into which each utility axis is divided
     */
    int getNBins() {
        return this.bins;
    }

    /**
     * @return The width of the bins into which each utility axis is divided
     */
    protected double getBinSize() {
        return 1.0 / this.getNBins();
    }

    /**
     * Setter for the state property
     * @param state new {@link State}
     */
    protected void setState(State state) {
        this.state = state;
    }

    /**
     * Getter for the state property
     * @return the current {@link State}
     */
    protected State getState() {
        return this.state;
    }

    /**
     * Determine the bid with the highest expected utility for the opponent from a list of bids
     * @param bids list of candidate bids
     * @return BidDetails representing the maximum bid
     */
    protected BidDetails maxBidForOpponent(List<BidDetails> bids) {
        BidDetails maxBid = null;

        for (BidDetails bid : bids) {
            if (maxBid == null) {
                maxBid = bid;
            }
            else if (this.opponentModel.getBidEvaluation(bid.getBid()) > this.opponentModel.getBidEvaluation(maxBid.getBid())) {
                maxBid = bid;
            }
        }

        return maxBid;
    }

    /**
     * Gets called by Negotiator when a relevant negotiation event occurs
     * @param reward the observed reward
     * @param newState the new {@link State} after the event
     */
    public void observeEnvironment(double reward, State newState) {
        // Only start updating after an action is performed
        if (this.actions.size() > 0) {
            this.updateQFunction(this.state, this.getLastAction(), reward, newState);
        }
        this.state = newState;
    }

    public HashMap<Integer, ArrayList<Double>> getQTable() {
        return this.qTable;
    }

    protected void updateQFunction(State state, int action, double reward, State newState) {
        // Initialize states if they are new.
        // If the agent hasn't made an opening bid, the action space is one value per bin;
        // otherwise there are just 3 actions (up/down/stay).
        ArrayList<Double> stateDefaultActionValues = new ArrayList<Double>(Collections.nCopies(state.getActionSize(), 0.0));
        ArrayList<Double> newStateDefaultActionValues = new ArrayList<Double>(Collections.nCopies(newState.getActionSize(), 0.0));

        // Make entries in the qTable if they don't exist yet
        this.qTable.putIfAbsent(state.hash(), stateDefaultActionValues);
        this.qTable.putIfAbsent(newState.hash(), newStateDefaultActionValues);

        // Perform update
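        // Standard tabular Q-learning update:
        //   Q(s,a) <- Q(s,a) + alpha * (reward + gamma * max_a' Q(s',a') - Q(s,a))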
        Double qNext = this.maxActionValue(newState);
        Double newActionValue = this.qFunction(state, action) + this.alpha * (reward + this.gamma * qNext - this.qFunction(state, action));
        this.qTable.get(state.hash()).set(action, newActionValue);
    }

    /**
     * Determine max_a Q(s,a)
     * @param state The state for which to retrieve the maximum action value
     * @return Value of the best action available in the provided state; the state
     *         must already have an entry in the Q-table
     */
    protected Double maxActionValue(State state) {
        return Collections.max(this.qTable.get(state.hash()));
    }

    /**
     * Get the Q value associated with the provided (state, action) pair.
     * @param state the state s
     * @param action the action a
     * @return Q(s,a)
     */
    protected Double qFunction(State state, int action) {
        ArrayList<Double> actionValues = this.qTable.get(state.hash());
        return actionValues.get(action);
    }

    public void setHyperparameters(StrategyParameters properties) {
        this.eps = properties.getValueAsDouble("epsilon");
        this.gamma = properties.getValueAsDouble("gamma");
        this.alpha = properties.getValueAsDouble("alpha");
        this.bins = (int) properties.getValueAsDouble("bins");
        this.optimistic = properties.getValueAsDouble("optimistic") == 1.0;
    }
}