source: src/main/java/agents/rlboa/QlearningStrategy.java

Last change on this file was 153, checked in by Aron Hammond, 6 years ago

Added function to calculate opposition to MultiLateralAnalysis.java

Moved code that adds RLBOA listeners to RLBOAUtils in the misc package

!! close SessionInfo after tournament; leaving it open caused /tmp/ to fill up with GeniusData files

This commit finalizes the RLBOA project; it is now ready for use

Our own package (uva.project.):

  • Moved to agents.rlboa
  • Added opponents and strategies that are mentioned in the report
  • Changed the class hierarchy; agents can now extend RLBOAagentBilateral to inherit RL functionality.
  • States extend from AbstractState
package agents.rlboa;

import genius.core.StrategyParameters;
import genius.core.bidding.BidDetails;
import genius.core.boaframework.NegotiationSession;
import genius.core.boaframework.OfferingStrategy;
import genius.core.boaframework.OpponentModel;
import genius.core.boaframework.OutcomeSpace;
import genius.core.boaframework.SortedOutcomeSpace;
import genius.core.misc.Range;

import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Random;

public class QlearningStrategy extends OfferingStrategy {

    protected HashMap<Integer, ArrayList<Double>> qTable; // state hash -> list of action values
    protected ArrayList<Integer> actions = new ArrayList<Integer>(); // history of actions taken this session
    protected int bins; // number of utility bins
    protected Double eps; // exploration rate for epsilon-greedy
    protected Double alpha; // learning rate
    protected Double gamma; // discount factor (not read in this class; the update uses a local gamma of 1.0)
    protected AbstractState state;
    protected String mode; // "train" enables learning (see isTraining)
    protected int timeBins; // number of time bins (set from strategy parameters)
    protected Range minMaxBin; // allowed range of bin indices

    public QlearningStrategy(NegotiationSession negotiationSession, OpponentModel opponentModel) {
        super.init(negotiationSession, null);
        this.opponentModel = opponentModel;
        this.endNegotiation = false;
        this.state = State.INITIAL;

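        // Wrap the utility space in a SortedOutcomeSpace so that bids can later
        // be retrieved by utility range (see pickBidInBin)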
        OutcomeSpace outcomeSpace = new SortedOutcomeSpace(negotiationSession.getUtilitySpace());
        this.negotiationSession.setOutcomeSpace(outcomeSpace);
    }

    public ArrayList<Integer> getActions() {
        return this.actions;
    }

    /**
     * @return int representing the last action taken by the strategy
     * @throws IndexOutOfBoundsException if called before any action has been performed
     */
    public int getLastAction() throws IndexOutOfBoundsException {
        return this.actions.get(this.actions.size() - 1);
    }

    public void setMinMaxBin(Range minMaxBin) {
        this.minMaxBin = minMaxBin;
    }

    public Range getMinMaxBin() {
        return this.minMaxBin;
    }

    protected void initQTable() {
        this.qTable = new HashMap<Integer, ArrayList<Double>>();

        // The initial state has a different action space (one action per bin)
        this.qTable.putIfAbsent(this.state.hash(), new ArrayList<Double>(Collections.nCopies(this.state.getActionSize(), 0.0)));
    }

    /**
     * Initialize the Q-table from a previously learned table, or create a fresh one.
     */
    public void initQtable(HashMap<Integer, ArrayList<Double>> qTable) {
        if (qTable != null) {
            this.qTable = qTable;
        } else {
            this.initQTable();
        }
    }

    @Override
    public BidDetails determineOpeningBid() {
        // Open the negotiation with a free bid (one of N bins)
        int targetBin = this.determineOpeningBin();
        return this.pickBidInBin(targetBin);
    }

    @Override
    public BidDetails determineNextBid() {
        // HACK(?): this QlearningStrategy works for all states that represent the world
        // in bins, so we needed a way to recognize these; hence the BinnedRepresentation interface
        int targetBin = this.determineTargetBin(((BinnedRepresentation) this.state).getMyBin());
        return this.pickBidInBin(targetBin);
    }

    @Override
    public String getName() {
        return "Q-Offering";
    }

    /**
     * Check if the bid falls inside the lower and upper bounds
     * @param lower lower bound of utility (inclusive)
     * @param upper upper bound of utility (exclusive)
     * @param bidDetails bid to check (has util and time)
     * @return true if the bid's utility lies in [lower, upper)
     */
    private boolean isInBin(double lower, double upper, BidDetails bidDetails) {
        double myUtil = bidDetails.getMyUndiscountedUtil();
        return myUtil < upper && myUtil >= lower;
    }

    /**
     * Make the opponent model select a bid that is in the provided target bin
     * @param targetBin index of the bin in which to pick a bid
     * @return BidDetails of the selected bid
     */
    protected BidDetails pickBidInBin(int targetBin) {

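        // Bin i covers the utility interval [i * binSize, (i + 1) * binSize)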
        double lowerBound = targetBin * this.getBinSize();
        double upperBound = lowerBound + this.getBinSize();

        // getBidsinRange behaves weirdly and returns bids that are outside of the range (false positives)
        List<BidDetails> bidsInRange = this.negotiationSession.getOutcomeSpace().getBidsinRange(new Range(lowerBound, upperBound));
        bidsInRange.removeIf(bid -> !this.isInBin(lowerBound, upperBound, bid));

        // If no bids are possible within this bin, recursively choose another bin by the
        // following logic: if you conceded this round, concede further, etc.
        if (bidsInRange.isEmpty()) {

            Random random = new Random();
            int newBin = 0;
            int direction = -1;

            // If this is not the opening action, repeat the direction of the last action;
            // for the opening action we just pick a bin at random
            if (this.actions.size() > 1) {
                direction = this.actions.get(this.actions.size() - 1);
            } else {
                newBin = random.nextInt(this.bins);
            }

            // conceded last time
            if (direction == 0) {
                newBin = determineTargetBin(targetBin - 1);
            }

            // retracted last time
            if (direction == 1) {
                newBin = determineTargetBin(targetBin + 1);
            }

            // stayed last time
            if (direction == 2) {
                int randomUpOrDown = random.nextBoolean() ? 1 : -1;
                newBin = determineTargetBin(targetBin + randomUpOrDown);
            }

            return this.pickBidInBin(newBin);
        }

        return this.maxBidForOpponent(bidsInRange);
    }

    /**
     * This is the general action function for the RL-agent. We determine a bin by either
     * moving up (retracting the offer), staying, or moving down (conceding the offer).
     * @param currentBin the bin around which the chosen action is applied
     * @return the target bin after applying the action, clamped to the range of relevant bins
     */
    protected int determineTargetBin(int currentBin) {
        int targetBin = currentBin;
        ArrayList<Double> defaultActionValues = new ArrayList<Double>(Collections.nCopies(this.state.getActionSize(), 0.0));

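        // States that have not been visited yet fall back to all-zero action values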
        List<Double> qValues = this.qTable.getOrDefault(this.state.hash(), defaultActionValues);
        int action = this.epsilonGreedy(qValues);
        this.actions.add(action);

        // Apply the action to the current bin (i.e. move down, move up, or stay)
        switch (action) {
            case 0: targetBin--;
                break;
            case 1: targetBin++;
                break;
            case 2: break;
        }

        // Can't go outside of the range of relevant bins
        targetBin = Math.min(targetBin, (int) this.minMaxBin.getUpperbound());
        targetBin = Math.max(targetBin, (int) this.minMaxBin.getLowerbound());

        return targetBin;
    }

    protected int determineOpeningBin() {
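        // In the initial state there is one action per bin, so the sampled
        // action index directly serves as the opening bin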
        ArrayList<Double> defaultInitialActionValues = new ArrayList<Double>(Collections.nCopies(this.state.getActionSize(), 0.0));
        List<Double> qValues = this.qTable.getOrDefault(this.state.hash(), defaultInitialActionValues);
        int action = this.epsilonGreedy(qValues);
        this.actions.add(action);

        return action;
    }

    /**
     * @param list List of doubles
     * @return The index of the highest value in the list; ties are broken uniformly at random
     */
    protected int indifferentArgMax(List<Double> list) {
        double maximum = Collections.max(list);

        List<Integer> maximaIdxs = new ArrayList<Integer>();

        // collect the indices of all occurrences of the maximum
        for (int i = 0; i < list.size(); i++) {
            if (list.get(i) == maximum) {
                maximaIdxs.add(i);
            }
        }

        // pick a random index from the list (this is the indifferent part)
        Random rnd = new Random();
        int choice = rnd.nextInt(maximaIdxs.size());

        return maximaIdxs.get(choice);
    }

    protected int epsilonGreedy(List<Double> qValues) {
        int action;

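        // Exploration only happens while training; in evaluation mode the greedy action is always taken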
        // With probability epsilon, pick a random action (epsilon-greedy)
        if (Math.random() < this.eps && this.isTraining()) {
            Random random = new Random();
            action = random.nextInt(qValues.size());
        } else {
            action = this.indifferentArgMax(qValues);
        }

        return action;
    }

    /**
     * @return The number of bins into which each utility axis is divided
     */
    int getNBins() {
        return this.bins;
    }

    /**
     * @return The width of the bins into which each utility axis is divided
     */
    protected double getBinSize() {
        return 1.0 / this.getNBins();
    }

    /**
     * Setter for the state property
     * @param state new {@link State}
     */
    protected void setState(State state) {
        this.state = state;
    }

    /**
     * Getter for the state property
     * @return the current {@link AbstractState}
     */
    protected AbstractState getState() {
        return this.state;
    }

    /**
     * Determine the bid with the highest expected utility for the opponent from a list of bids
     * @param bids candidate bids to evaluate with the opponent model
     * @return BidDetails representing the maximum bid (null if the list is empty)
     */
    protected BidDetails maxBidForOpponent(List<BidDetails> bids) {
        BidDetails maxBid = null;

        for (BidDetails bid : bids) {
            if (maxBid == null) {
                maxBid = bid;
            } else if (this.opponentModel.getBidEvaluation(bid.getBid()) > this.opponentModel.getBidEvaluation(maxBid.getBid())) {
                maxBid = bid;
            }
        }

        return maxBid;
    }

    /**
     * Gets called by the Negotiator when a relevant negotiation event occurs
     * @param reward the reward observed since the last action
     * @param newState the state the environment transitioned to
     */
    public void observeEnvironment(double reward, AbstractState newState) {

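        // Update step of the RL loop: learn from (previous state, last action,
        // reward, new state), then transition to the new state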
        // Only start updating after an action has been performed,
        // and only update while training is enabled
        if (this.actions.size() > 0 && this.isTraining()) {
            this.updateQFuction(this.state, this.getLastAction(), reward, newState);
        }
        this.state = newState;
    }

    public HashMap<Integer, ArrayList<Double>> getQTable() {
        return this.qTable;
    }

    protected void updateQFuction(AbstractState state, int action, double reward, AbstractState newState) {
        // Initialize states if they are new. If the agent hasn't made an opening bid yet,
        // the action values are initialized to one value per bin; otherwise to just
        // 3 values (down/up/stay).
        ArrayList<Double> stateDefaultActionValues = new ArrayList<Double>(Collections.nCopies(state.getActionSize(), 0.0));
        ArrayList<Double> newStateDefaultActionValues = new ArrayList<Double>(Collections.nCopies(newState.getActionSize(), 0.0));

        // Make entries in the qTable if they don't exist yet
        this.qTable.putIfAbsent(state.hash(), stateDefaultActionValues);
        this.qTable.putIfAbsent(newState.hash(), newStateDefaultActionValues);

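        // Standard Q-learning update:
        //   Q(s,a) <- Q(s,a) + alpha * (reward + gamma * max_a' Q(s',a') - Q(s,a))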
        // Gamma is fixed to 1.0 here because the discounting already comes from the
        // domain/preference profile: the reward itself is (time-)discounted
        Double gamma = 1.0;
        // Perform the update
        Double qNext = this.maxActionValue(newState);
        Double newActionValue = this.qFunction(state, action) + this.alpha * (reward + gamma * qNext - this.qFunction(state, action));
        this.qTable.get(state.hash()).set(action, newActionValue);
    }

    /**
     * Determine max_a Q(s,a)
     * @param state the state for which to retrieve the maximum action value
     * @return the value of the best action that can be taken in the provided state;
     *         the state must already have an entry in the Q-table
     */
    protected Double maxActionValue(AbstractState state) {
        return Collections.max(this.qTable.get(state.hash()));
    }

    /**
     * Get the Q-value associated with the provided (state, action) pair.
     * @param state the state s
     * @param action the action a
     * @return Q(s,a)
     */
    protected Double qFunction(AbstractState state, int action) {
        ArrayList<Double> actionValues = this.qTable.get(state.hash());
        return actionValues.get(action);
    }

    public void setHyperparameters(StrategyParameters properties) {
        this.eps = properties.getValueAsDouble("epsilon");
        this.alpha = properties.getValueAsDouble("alpha");
        this.bins = (int) properties.getValueAsDouble("bins");
        this.mode = properties.getValueAsString("_mode");
        this.timeBins = (int) properties.getValueAsDouble("time_bins");
    }

    protected boolean isTraining() {
        return this.mode.equals("train");
    }
}