source: src/main/java/uva/projectai/y2018/jasparon/QlearningStrategy.java@89

Last change on this file since 89 was 67, checked in by Aron Hammond, 6 years ago

Added support for agents that learn via ReinforcementLearning, including an implementation of an agent that uses tabular Q-learning

File size: 8.5 KB
package uva.projectai.y2018.jasparon;

import genius.core.StrategyParameters;
import genius.core.bidding.BidDetails;
import genius.core.boaframework.NegotiationSession;
import genius.core.boaframework.OfferingStrategy;
import genius.core.boaframework.OpponentModel;
import genius.core.boaframework.OutcomeSpace;
import genius.core.boaframework.SortedOutcomeSpace;
import genius.core.misc.Range;

import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Random;
public class QlearningStrategy extends OfferingStrategy {

    protected HashMap<Integer, ArrayList<Double>> qTable;
    protected ArrayList<Integer> actions = new ArrayList<Integer>();
    protected int bins;
    protected Double eps;
    protected Double alpha;
    protected Double gamma;
    protected State state;
    protected boolean optimistic;

    public QlearningStrategy(NegotiationSession negotiationSession, OpponentModel opponentModel) {
        super.init(negotiationSession, null);
        this.opponentModel = opponentModel;
        this.endNegotiation = false;
        this.state = State.INITIAL;

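        // Use a sorted outcome space so that bids can later be looked up by utility range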
        OutcomeSpace outcomeSpace = new SortedOutcomeSpace(negotiationSession.getUtilitySpace());
        this.negotiationSession.setOutcomeSpace(outcomeSpace);
    }

    public ArrayList<Integer> getActions() {
        return this.actions;
    }

    /**
     * @return int representing the last action taken by the strategy
     * @throws IndexOutOfBoundsException if called before any action has been performed
     */
    public int getLastAction() throws IndexOutOfBoundsException {
        return this.actions.get(this.actions.size() - 1);
    }

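    /**
     * Initialize an empty Q-table that maps a state hash to a list of action
     * values, one entry per available action in that state.
     */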
    protected void initQTable() {
        this.qTable = new HashMap<Integer, ArrayList<Double>>();

        // The initial state has a different action space
        this.qTable.putIfAbsent(this.state.hash(), new ArrayList<Double>(Collections.nCopies(this.state.getActionSize(), 0.0)));
    }

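    /**
     * Initialize the Q-table from a previously learned table, or create an
     * empty one if the provided table is null.
     */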
    public void initQtable(HashMap<Integer, ArrayList<Double>> qTable) {
        if (qTable != null) {
            this.qTable = qTable;
        }
        else {
            this.initQTable();
        }
    }

    @Override
    public BidDetails determineOpeningBid() {
        // Open the negotiation with a freely chosen bid (one of N bins)
        int targetBin = this.determineOpeningBin();
        return this.pickBidInBin(targetBin);
    }

    @Override
    public BidDetails determineNextBid() {
        int targetBin = this.determineTargetBin(this.state.getMyBin());
        return this.pickBidInBin(targetBin);
    }

    @Override
    public String getName() {
        return "Q-Offering";
    }

    /**
     * Make the opponent model select a bid that is in the provided target bin
     * @param targetBin index of the bin in which to pick a bid
     * @return BidDetails of the selected bid
     */
    protected BidDetails pickBidInBin(int targetBin) {
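        // Bin i covers the utility range [i * binSize, (i + 1) * binSize); the small
        // offset keeps the upper bound just below the start of the next bin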
        double lowerBound = targetBin * this.getBinSize();
        double upperBound = lowerBound + this.getBinSize() - 0.01;

        List<BidDetails> bidsInRange = this.negotiationSession.getOutcomeSpace().getBidsinRange(new Range(lowerBound, upperBound));

        return this.maxBidForOpponent(bidsInRange);
    }

    /**
     * This is the general action function for the RL-agent. We determine a bin by either
     * moving up (retracting offer), doing nothing or moving down (conceding offer).
     * @param currentBin the bin of our most recent offer
     * @return index of the target bin for the next offer
     */
    protected int determineTargetBin(int currentBin) {
        int targetBin = currentBin;
        ArrayList<Double> defaultActionValues = new ArrayList<Double>(Collections.nCopies(this.state.getActionSize(), 0.0));

        List<Double> qValues = this.qTable.getOrDefault(this.state.hash(), defaultActionValues);
        int action = this.epsilonGreedy(qValues);
        this.actions.add(action);

        // Apply the action to the current bin (i.e. move up, move down or stay)
        switch (action) {
            case 0: targetBin--; break; // concede: move to a lower-utility bin
            case 1: targetBin++; break; // retract: move to a higher-utility bin
            case 2: break;              // stay in the current bin
        }

        // Can't go out of bounds
        // TODO: Discuss impact on learning algorithm
        targetBin = Math.min(targetBin, this.getNBins() - 1);
        targetBin = Math.max(targetBin, 0);

        return targetBin;
    }

    protected int determineOpeningBin() {
        ArrayList<Double> defaultInitialActionValues = new ArrayList<Double>(Collections.nCopies(this.state.getActionSize(), 0.0));
        List<Double> qValues = this.qTable.getOrDefault(this.state.hash(), defaultInitialActionValues);
        int action = this.epsilonGreedy(qValues);
        this.actions.add(action);

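        // In the initial state each action corresponds directly to a bin,
        // so the chosen action index is the opening bin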
        return action;
    }

    /**
     * @param list List of doubles
     * @return The index of the highest value in the list; ties are broken uniformly at random
     */
    protected int indifferentArgMax(List<Double> list) {
        double maximum = Collections.max(list);

        List<Integer> maximaIdxs = new ArrayList<Integer>();

        // collect indices of all occurrences of maximum
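        // (the == comparison unboxes to an exact primitive comparison, which is
        // safe here because maximum was taken from this same list)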
        for (int i = 0; i < list.size(); i++) {
            if (list.get(i) == maximum) {
                maximaIdxs.add(i);
            }
        }

        // pick a random index from the list (this is the indifferent part)
        Random rnd = new Random();
        int choice = rnd.nextInt(maximaIdxs.size());

        return maximaIdxs.get(choice);
    }

    protected int epsilonGreedy(List<Double> qValues) {
        int action;

        // With probability epsilon, pick a random action (epsilon-greedy exploration)
        if (Math.random() < this.eps) {
            Random random = new Random();
            action = random.nextInt(qValues.size());
        }
        else {
            action = this.indifferentArgMax(qValues);
        }

        return action;
    }

    /**
     * @return The number of bins into which each utility axis is divided
     */
    int getNBins() {
        return this.bins;
    }

    /**
     * @return The width of the bins into which each utility axis is divided
     */
    protected double getBinSize() {
        return 1.0 / this.getNBins();
    }

    /**
     * Setter for the state property
     * @param state new {@link State}
     */
    protected void setState(State state) {
        this.state = state;
    }

    /**
     * Getter for the state property
     * @return the current {@link State}
     */
    protected State getState() {
        return this.state;
    }

    /**
     * Determine the bid with the highest expected utility for the opponent from a list of bids
     * @param bids list of candidate bids
     * @return BidDetails representing the maximum bid
     */
    protected BidDetails maxBidForOpponent(List<BidDetails> bids) {
        BidDetails maxBid = null;

        for (BidDetails bid : bids) {
            if (maxBid == null) {
                maxBid = bid;
            }
            else if (this.opponentModel.getBidEvaluation(bid.getBid()) > this.opponentModel.getBidEvaluation(maxBid.getBid())) {
                maxBid = bid;
            }
        }

        return maxBid;
    }

    /**
     * Gets called by Negotiator when a relevant negotiation event occurs
     * @param reward the observed reward
     * @param newState the new {@link State} after the event
     */
    public void observeEnvironment(double reward, State newState) {
        // Only start updating after an action is performed
        if (this.actions.size() > 0) {
            this.updateQFunction(this.state, this.getLastAction(), reward, newState);
        }
        this.state = newState;
    }

    public HashMap<Integer, ArrayList<Double>> getQTable() {
        return this.qTable;
    }

    protected void updateQFunction(State state, int action, double reward, State newState) {
        // Initialize states if they are new.
        // If the agent hasn't made an opening bid, the action space is one value per bin;
        // otherwise there are just 3 actions (up/down/stay).
        ArrayList<Double> stateDefaultActionValues = new ArrayList<Double>(Collections.nCopies(state.getActionSize(), 0.0));
        ArrayList<Double> newStateDefaultActionValues = new ArrayList<Double>(Collections.nCopies(newState.getActionSize(), 0.0));

        // Make entries in the qTable if they don't exist yet
        this.qTable.putIfAbsent(state.hash(), stateDefaultActionValues);
        this.qTable.putIfAbsent(newState.hash(), newStateDefaultActionValues);

        // Perform update
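        // Standard tabular Q-learning update:
        //   Q(s,a) <- Q(s,a) + alpha * (reward + gamma * max_a' Q(s',a') - Q(s,a))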
        Double qNext = this.maxActionValue(newState);
        Double newActionValue = this.qFunction(state, action) + this.alpha * (reward + this.gamma * qNext - this.qFunction(state, action));
        this.qTable.get(state.hash()).set(action, newActionValue);
    }

    /**
     * Determine max_a Q(s,a)
     * @param state The state for which to retrieve the maximum action value
     * @return Value of the best action available in the provided state; the state
     *         must already have an entry in the Q-table
     */
    protected Double maxActionValue(State state) {
        return Collections.max(this.qTable.get(state.hash()));
    }

    /**
     * Get the Q value associated with the provided (state, action) pair.
     * @param state the state s
     * @param action the action a
     * @return Q(s,a)
     */
    protected Double qFunction(State state, int action) {
        ArrayList<Double> actionValues = this.qTable.get(state.hash());
        return actionValues.get(action);
    }

    public void setHyperparameters(StrategyParameters properties) {
        this.eps = properties.getValueAsDouble("epsilon");
        this.gamma = properties.getValueAsDouble("gamma");
        this.alpha = properties.getValueAsDouble("alpha");
        this.bins = (int) properties.getValueAsDouble("bins");
        this.optimistic = properties.getValueAsDouble("optimistic") == 1.0;
    }
}