package uva.projectai.y2018.jasparon;

import genius.core.StrategyParameters;
import genius.core.bidding.BidDetails;
import genius.core.boaframework.NegotiationSession;
import genius.core.boaframework.OfferingStrategy;
import genius.core.boaframework.OpponentModel;
import genius.core.boaframework.OutcomeSpace;
import genius.core.boaframework.SortedOutcomeSpace;
import genius.core.misc.Range;

import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Random;

18 | public class QlearningStrategy extends OfferingStrategy {
|
---|
19 |
|
---|
20 | protected HashMap<Integer, ArrayList<Double>> qTable;
|
---|
21 | protected ArrayList<Integer> actions = new ArrayList<Integer>();
|
---|
22 | protected int bins;
|
---|
23 | protected Double eps;
|
---|
24 | protected Double alpha;
|
---|
25 | protected Double gamma;
|
---|
26 | protected State state;
|
---|
27 | protected boolean optimistic;
|
---|
28 |
|
---|
29 | public QlearningStrategy(NegotiationSession negotiationSession, OpponentModel opponentModel) {
|
---|
30 | super.init(negotiationSession, null);
|
---|
31 | this.opponentModel = opponentModel;
|
---|
32 | this.endNegotiation = false;
|
---|
33 | this.state = State.INITIAL;
|
---|
34 |
|
---|
35 | OutcomeSpace outcomeSpace = new SortedOutcomeSpace(negotiationSession.getUtilitySpace());
|
---|
36 | this.negotiationSession.setOutcomeSpace(outcomeSpace);
|
---|
37 | }
|
---|
38 |
|
---|
39 | public ArrayList<Integer> getActions() {
|
---|
40 | return this.actions;
|
---|
41 | }
|
---|
42 |
|
---|
43 | /**
|
---|
44 | * @return int representing the last action taken by the strategy
|
---|
45 | * @throws IndexOutOfBoundsException if called before any action has been performed
|
---|
46 | */
|
---|
47 | public int getLastAction() throws IndexOutOfBoundsException {
|
---|
48 | return this.actions.get(this.actions.size() - 1);
|
---|
49 | }
|
---|
50 |
|
---|
51 | protected void initQTable() {
|
---|
52 | this.qTable = new HashMap<Integer, ArrayList<Double>>();
|
---|
53 |
|
---|
54 | // Initial state has different action space
|
---|
55 | this.qTable.putIfAbsent(this.state.hash(), new ArrayList<Double>(Collections.nCopies(this.state.getActionSize(), 0.0)));
|
---|
56 | }
|
---|
57 |
|
---|
58 | public void initQtable(HashMap<Integer, ArrayList<Double>> qTable) {
|
---|
59 | if (qTable != null) {
|
---|
60 | this.qTable = qTable;
|
---|
61 | }
|
---|
62 | else {
|
---|
63 | this.initQTable();
|
---|
64 | }
|
---|
65 | }
|
---|
66 |
|
---|
67 | @Override
|
---|
68 | public BidDetails determineOpeningBid() {
|
---|
69 | // Open the negotiation with a free bid (one of N bins)
|
---|
70 | int targetBin = this.determineOpeningBin();
|
---|
71 | return this.pickBidInBin(targetBin);
|
---|
72 | }
|
---|
73 |
|
---|
74 | @Override
|
---|
75 | public BidDetails determineNextBid() {
|
---|
76 | int targetBin = this.determineTargetBin(this.state.getMyBin());
|
---|
77 | return this.pickBidInBin(targetBin);
|
---|
78 | }
|
---|
79 |
|
---|
80 | @Override
|
---|
81 | public String getName() {
|
---|
82 | return "Q-Offering";
|
---|
83 | }
|
---|
84 |
|
---|
85 | /**
|
---|
86 | * Make the opponent model select a bid that is in the provided target bin
|
---|
87 | * @param targetBin index of the bin in which to pick a bid
|
---|
88 | * @return BidDetails of the selected bid
|
---|
89 | */
|
---|
90 | protected BidDetails pickBidInBin(int targetBin) {
|
---|
91 |
|
---|
92 | double lowerBound = 0.0 + targetBin * this.getBinSize();
|
---|
93 | double upperBound = lowerBound + this.getBinSize() - 0.01;
|
---|
94 |
|
---|
95 | List<BidDetails> bidsInRange = this.negotiationSession.getOutcomeSpace().getBidsinRange(new Range(lowerBound, upperBound));
|
---|
96 |
|
---|
97 | return this.maxBidForOpponent(bidsInRange);
|
---|
98 | }
|
---|
99 |
|
---|
100 | /**
|
---|
101 | * This is the general action function for the RL-agent. We determine a bin by either
|
---|
102 | * moving up (retracting offer), doing nothing or moving down (conceding offer).
|
---|
103 | * @param currentBin
|
---|
104 | * @return
|
---|
105 | */
|
---|
106 | protected int determineTargetBin(int currentBin) {
|
---|
107 | int targetBin = currentBin;
|
---|
108 | ArrayList<Double> defaultActionValues = new ArrayList<Double>(Collections.nCopies(this.state.getActionSize(), 0.0));
|
---|
109 |
|
---|
110 | List<Double> qValues = this.qTable.getOrDefault(this.state.hash(), defaultActionValues);
|
---|
111 | int action = this.epsilonGreedy(qValues);
|
---|
112 | this.actions.add(action);
|
---|
113 |
|
---|
114 | // Apply action current bin (ie. move up, down or stay)
|
---|
115 | switch (action) {
|
---|
116 | case 0: targetBin--;
|
---|
117 | break;
|
---|
118 | case 1: targetBin++;
|
---|
119 | break;
|
---|
120 | case 2: break;
|
---|
121 | }
|
---|
122 |
|
---|
123 | // Can't go out of bounds
|
---|
124 | // TODO: Discuss impact on learning algorithm
|
---|
125 | targetBin = Math.min(targetBin, this.getNBins() - 1);
|
---|
126 | targetBin = Math.max(targetBin, 0);
|
---|
127 |
|
---|
128 | return targetBin;
|
---|
129 |
|
---|
130 |
|
---|
131 | }
|
---|
132 |
|
---|
133 | protected int determineOpeningBin() {
|
---|
134 | ArrayList<Double> defaultInitialActionValues = new ArrayList<Double>(Collections.nCopies(this.state.getActionSize(), 0.0));
|
---|
135 | List<Double> qValues = this.qTable.getOrDefault(this.state.hash(), defaultInitialActionValues);
|
---|
136 | int action = this.epsilonGreedy(qValues);
|
---|
137 | this.actions.add(action);
|
---|
138 |
|
---|
139 | return action;
|
---|
140 | }
|
---|
141 |
|
---|
142 | /**
|
---|
143 | * @param list List of doubles
|
---|
144 | * @return The index of the highest value in the list
|
---|
145 | */
|
---|
146 | protected int indifferentArgMax(List<Double> list) {
|
---|
147 | double maximum = Collections.max(list);
|
---|
148 |
|
---|
149 | List<Integer> maximaIdxs = new ArrayList<Integer>();
|
---|
150 |
|
---|
151 | // collect indices of all occurences of maximum
|
---|
152 | for (int i = 0; i < list.size(); i++) {
|
---|
153 | if (list.get(i) == maximum) {
|
---|
154 | maximaIdxs.add(i);
|
---|
155 | }
|
---|
156 | }
|
---|
157 |
|
---|
158 | // pick a random index from the list (this is the indifferent part)
|
---|
159 | Random rnd = new Random();
|
---|
160 | int choice = rnd.nextInt(maximaIdxs.size());
|
---|
161 |
|
---|
162 | return maximaIdxs.get(choice);
|
---|
163 | }
|
---|
164 |
|
---|
165 | protected int epsilonGreedy(List<Double> qValues) {
|
---|
166 | int action;
|
---|
167 |
|
---|
168 | // With probability epsilon, pick a random action (espilon greedy)
|
---|
169 | if (Math.random() < this.eps) {
|
---|
170 | Random random = new Random();
|
---|
171 | action = random.nextInt(qValues.size());
|
---|
172 | }
|
---|
173 | else {
|
---|
174 | action = this.indifferentArgMax(qValues);
|
---|
175 | }
|
---|
176 |
|
---|
177 | return action;
|
---|
178 | }
|
---|
179 |
|
---|
180 | /**
|
---|
181 | * @return The number of bins in which the each utility axis is divided
|
---|
182 | */
|
---|
183 | int getNBins() {
|
---|
184 | return this.bins;
|
---|
185 | }
|
---|
186 |
|
---|
187 | /**
|
---|
188 | * @return The width of the bins in which the each utility axis is divided
|
---|
189 | */
|
---|
190 | protected double getBinSize() {
|
---|
191 | return 1.0 / this.getNBins();
|
---|
192 | }
|
---|
193 |
|
---|
194 | /**
|
---|
195 | * Setter for the state property
|
---|
196 | * @param state new {@link State}
|
---|
197 | *
|
---|
198 | */
|
---|
199 | protected void setState(State state) {
|
---|
200 | this.state = state;
|
---|
201 | }
|
---|
202 |
|
---|
203 | /**
|
---|
204 | * Getter for the state property
|
---|
205 | * @return
|
---|
206 | */
|
---|
207 | protected State getState() {
|
---|
208 | return this.state;
|
---|
209 | }
|
---|
210 |
|
---|
211 | /**
|
---|
212 | * Determine the bid with the highest expected utility for the opponent from a list of bids
|
---|
213 | * @param bids
|
---|
214 | * @return BidDetails with representing the maximum bid
|
---|
215 | */
|
---|
216 | protected BidDetails maxBidForOpponent(List<BidDetails> bids) {
|
---|
217 | BidDetails maxBid = null;
|
---|
218 |
|
---|
219 | for (BidDetails bid : bids) {
|
---|
220 | if (maxBid == null) {
|
---|
221 | maxBid = bid;
|
---|
222 | }
|
---|
223 | else if (this.opponentModel.getBidEvaluation(bid.getBid()) > this.opponentModel.getBidEvaluation(maxBid.getBid())) {
|
---|
224 | maxBid = bid;
|
---|
225 | }
|
---|
226 | }
|
---|
227 |
|
---|
228 | return maxBid;
|
---|
229 | }
|
---|
230 |
|
---|
231 | /**
|
---|
232 | * Gets called by Negotiator when a relevant negotiation event occurs
|
---|
233 | * @param reward
|
---|
234 | * @param newState
|
---|
235 | */
|
---|
236 | public void observeEnvironment(double reward, State newState) {
|
---|
237 |
|
---|
238 | // Only start updating after an action is performed
|
---|
239 | if (this.actions.size() > 0) {
|
---|
240 | this.updateQFuction(this.state, this.getLastAction(), reward, newState);
|
---|
241 | }
|
---|
242 | this.state = newState;
|
---|
243 | }
|
---|
244 |
|
---|
245 | public HashMap<Integer, ArrayList<Double>> getQTable() {
|
---|
246 | return this.qTable;
|
---|
247 | }
|
---|
248 |
|
---|
249 | protected void updateQFuction(State state, int action, double reward, State newState) {
|
---|
250 | // initialize state if it is new
|
---|
251 |
|
---|
252 | // If agent hasn't done a opening bid, initialize action values to number of bins, otherwise
|
---|
253 | // just 3 values (up/down/nothing).
|
---|
254 | ArrayList<Double> stateDefaultActionValues = new ArrayList<Double>(Collections.nCopies(state.getActionSize(), 0.0));
|
---|
255 | ArrayList<Double> newStateDefaultActionValues = new ArrayList<Double>(Collections.nCopies(newState.getActionSize(), 0.0));
|
---|
256 |
|
---|
257 | // Make entries in qTable if they don't exist yet
|
---|
258 | this.qTable.putIfAbsent(state.hash(), stateDefaultActionValues);
|
---|
259 | this.qTable.putIfAbsent(newState.hash(), newStateDefaultActionValues);
|
---|
260 |
|
---|
261 | // Perform update
|
---|
262 | Double Qnext = this.maxActionValue(newState);
|
---|
263 | Double newActionValue = this.qFunction(state, action) + this.alpha * (reward + this.gamma * Qnext - this.qFunction(state, action));
|
---|
264 | this.qTable.get(state.hash()).set(action, newActionValue);
|
---|
265 | }
|
---|
266 |
|
---|
267 | /**
|
---|
268 | * Determine max_a Q(s,a)
|
---|
269 | * @param state The hash of the state for which to retrieve the max action value
|
---|
270 | * @return Value of optimal action given that can be taken in the provided state (0 if state is unknown)
|
---|
271 | */
|
---|
272 | protected Double maxActionValue(State state) {
|
---|
273 | return Collections.max(this.qTable.get(state.hash()));
|
---|
274 | }
|
---|
275 |
|
---|
276 | /**
|
---|
277 | * Get the Q value associated with the provided (state, action) pair.
|
---|
278 | * @param state
|
---|
279 | * @param action
|
---|
280 | * @return
|
---|
281 | */
|
---|
282 | protected Double qFunction(State state, int action) {
|
---|
283 | ArrayList<Double> actionValues = this.qTable.get(state.hash());
|
---|
284 | return actionValues.get(action);
|
---|
285 | }
|
---|
286 |
|
---|
287 | public void setHyperparameters(StrategyParameters properties) {
|
---|
288 | this.eps = properties.getValueAsDouble("epsilon");
|
---|
289 | this.gamma = properties.getValueAsDouble("gamma");
|
---|
290 | this.alpha = properties.getValueAsDouble("alpha");
|
---|
291 | this.bins = (int) properties.getValueAsDouble("bins");
|
---|
292 | this.optimistic = properties.getValueAsDouble("optimistic") == 1.0;
|
---|
293 | }
|
---|
294 | }
|
---|