Context Navigation

← Previous Revision
Next Revision →
Blame
Revision Log

ZipfDistribution.java

Last change on this file was 204, checked in by Katsuhide Fujita, 6 years ago
Fixed errors of ANAC2019 agents
Property svn:executable set to ``*
File size: 17.8 KB

Line
1	/*
2	* Licensed to the Apache Software Foundation (ASF) under one or more
3	* contributor license agreements. See the NOTICE file distributed with
4	* this work for additional information regarding copyright ownership.
5	* The ASF licenses this file to You under the Apache License, Version 2.0
6	* (the "License"); you may not use this file except in compliance with
7	* the License. You may obtain a copy of the License at
8	*
9	* http://www.apache.org/licenses/LICENSE-2.0
10	*
11	* Unless required by applicable law or agreed to in writing, software
12	* distributed under the License is distributed on an "AS IS" BASIS,
13	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14	* See the License for the specific language governing permissions and
15	* limitations under the License.
16	*/
17
18	package agents.anac.y2019.harddealer.math3.distribution;
19
20	import agents.anac.y2019.harddealer.math3.exception.NotStrictlyPositiveException;
21	import agents.anac.y2019.harddealer.math3.exception.util.LocalizedFormats;
22	import agents.anac.y2019.harddealer.math3.random.RandomGenerator;
23	import agents.anac.y2019.harddealer.math3.random.Well19937c;
24	import agents.anac.y2019.harddealer.math3.util.FastMath;
25
26	/**
27	* Implementation of the Zipf distribution.
28	* <p>
29	* <strong>Parameters:</strong>
30	* For a random variable {@code X} whose values are distributed according to this
31	* distribution, the probability mass function is given by
32	* <pre>
33	* P(X = k) = H(N,s) * 1 / k^s for {@code k = 1,2,...,N}.
34	* </pre>
35	* {@code H(N,s)} is the normalizing constant
36	* which corresponds to the generalized harmonic number of order N of s.
37	* <p>
38	* <ul>
39	* <li>{@code N} is the number of elements</li>
40	* <li>{@code s} is the exponent</li>
41	* </ul>
42	* @see <a href="https://en.wikipedia.org/wiki/Zipf's_law">Zipf's law (Wikipedia)</a>
43	* @see <a href="https://en.wikipedia.org/wiki/Harmonic_number#Generalized_harmonic_numbers">Generalized harmonic numbers</a>
44	*/
45	public class ZipfDistribution extends AbstractIntegerDistribution {
46	/** Serializable version identifier. */
47	private static final long serialVersionUID = -140627372283420404L;
48	/** Number of elements. */
49	private final int numberOfElements;
50	/** Exponent parameter of the distribution. */
51	private final double exponent;
52	/** Cached numerical mean */
53	private double numericalMean = Double.NaN;
54	/** Whether or not the numerical mean has been calculated */
55	private boolean numericalMeanIsCalculated = false;
56	/** Cached numerical variance */
57	private double numericalVariance = Double.NaN;
58	/** Whether or not the numerical variance has been calculated */
59	private boolean numericalVarianceIsCalculated = false;
60	/** The sampler to be used for the sample() method */
61	private transient ZipfRejectionInversionSampler sampler;
62
63	/**
64	* Create a new Zipf distribution with the given number of elements and
65	* exponent.
66	* <p>
67	* <b>Note:</b> this constructor will implicitly create an instance of
68	* {@link Well19937c} as random generator to be used for sampling only (see
69	* {@link #sample()} and {@link #sample(int)}). In case no sampling is
70	* needed for the created distribution, it is advised to pass {@code null}
71	* as random generator via the appropriate constructors to avoid the
72	* additional initialisation overhead.
73	*
74	* @param numberOfElements Number of elements.
75	* @param exponent Exponent.
76	* @exception NotStrictlyPositiveException if {@code numberOfElements <= 0}
77	* or {@code exponent <= 0}.
78	*/
79	public ZipfDistribution(final int numberOfElements, final double exponent) {
80	this(new Well19937c(), numberOfElements, exponent);
81	}
82
83	/**
84	* Creates a Zipf distribution.
85	*
86	* @param rng Random number generator.
87	* @param numberOfElements Number of elements.
88	* @param exponent Exponent.
89	* @exception NotStrictlyPositiveException if {@code numberOfElements <= 0}
90	* or {@code exponent <= 0}.
91	* @since 3.1
92	*/
93	public ZipfDistribution(RandomGenerator rng,
94	int numberOfElements,
95	double exponent)
96	throws NotStrictlyPositiveException {
97	super(rng);
98
99	if (numberOfElements <= 0) {
100	throw new NotStrictlyPositiveException(LocalizedFormats.DIMENSION,
101	numberOfElements);
102	}
103	if (exponent <= 0) {
104	throw new NotStrictlyPositiveException(LocalizedFormats.EXPONENT,
105	exponent);
106	}
107
108	this.numberOfElements = numberOfElements;
109	this.exponent = exponent;
110	}
111
112	/**
113	* Get the number of elements (e.g. corpus size) for the distribution.
114	*
115	* @return the number of elements
116	*/
117	public int getNumberOfElements() {
118	return numberOfElements;
119	}
120
121	/**
122	* Get the exponent characterizing the distribution.
123	*
124	* @return the exponent
125	*/
126	public double getExponent() {
127	return exponent;
128	}
129
130	/** {@inheritDoc} */
131	public double probability(final int x) {
132	if (x <= 0 \|\| x > numberOfElements) {
133	return 0.0;
134	}
135
136	return (1.0 / FastMath.pow(x, exponent)) / generalizedHarmonic(numberOfElements, exponent);
137	}
138
139	/** {@inheritDoc} */
140	@Override
141	public double logProbability(int x) {
142	if (x <= 0 \|\| x > numberOfElements) {
143	return Double.NEGATIVE_INFINITY;
144	}
145
146	return -FastMath.log(x) * exponent - FastMath.log(generalizedHarmonic(numberOfElements, exponent));
147	}
148
149	/** {@inheritDoc} */
150	public double cumulativeProbability(final int x) {
151	if (x <= 0) {
152	return 0.0;
153	} else if (x >= numberOfElements) {
154	return 1.0;
155	}
156
157	return generalizedHarmonic(x, exponent) / generalizedHarmonic(numberOfElements, exponent);
158	}
159
160	/**
161	* {@inheritDoc}
162	*
163	* For number of elements {@code N} and exponent {@code s}, the mean is
164	* {@code Hs1 / Hs}, where
165	* <ul>
166	* <li>{@code Hs1 = generalizedHarmonic(N, s - 1)},</li>
167	* <li>{@code Hs = generalizedHarmonic(N, s)}.</li>
168	* </ul>
169	*/
170	public double getNumericalMean() {
171	if (!numericalMeanIsCalculated) {
172	numericalMean = calculateNumericalMean();
173	numericalMeanIsCalculated = true;
174	}
175	return numericalMean;
176	}
177
178	/**
179	* Used by {@link #getNumericalMean()}.
180	*
181	* @return the mean of this distribution
182	*/
183	protected double calculateNumericalMean() {
184	final int N = getNumberOfElements();
185	final double s = getExponent();
186
187	final double Hs1 = generalizedHarmonic(N, s - 1);
188	final double Hs = generalizedHarmonic(N, s);
189
190	return Hs1 / Hs;
191	}
192
193	/**
194	* {@inheritDoc}
195	*
196	* For number of elements {@code N} and exponent {@code s}, the mean is
197	* {@code (Hs2 / Hs) - (Hs1^2 / Hs^2)}, where
198	* <ul>
199	* <li>{@code Hs2 = generalizedHarmonic(N, s - 2)},</li>
200	* <li>{@code Hs1 = generalizedHarmonic(N, s - 1)},</li>
201	* <li>{@code Hs = generalizedHarmonic(N, s)}.</li>
202	* </ul>
203	*/
204	public double getNumericalVariance() {
205	if (!numericalVarianceIsCalculated) {
206	numericalVariance = calculateNumericalVariance();
207	numericalVarianceIsCalculated = true;
208	}
209	return numericalVariance;
210	}
211
212	/**
213	* Used by {@link #getNumericalVariance()}.
214	*
215	* @return the variance of this distribution
216	*/
217	protected double calculateNumericalVariance() {
218	final int N = getNumberOfElements();
219	final double s = getExponent();
220
221	final double Hs2 = generalizedHarmonic(N, s - 2);
222	final double Hs1 = generalizedHarmonic(N, s - 1);
223	final double Hs = generalizedHarmonic(N, s);
224
225	return (Hs2 / Hs) - ((Hs1 * Hs1) / (Hs * Hs));
226	}
227
228	/**
229	* Calculates the Nth generalized harmonic number. See
230	* <a href="http://mathworld.wolfram.com/HarmonicSeries.html">Harmonic
231	* Series</a>.
232	*
233	* @param n Term in the series to calculate (must be larger than 1)
234	* @param m Exponent (special case {@code m = 1} is the harmonic series).
235	* @return the n<sup>th</sup> generalized harmonic number.
236	*/
237	private double generalizedHarmonic(final int n, final double m) {
238	double value = 0;
239	for (int k = n; k > 0; --k) {
240	value += 1.0 / FastMath.pow(k, m);
241	}
242	return value;
243	}
244
245	/**
246	* {@inheritDoc}
247	*
248	* The lower bound of the support is always 1 no matter the parameters.
249	*
250	* @return lower bound of the support (always 1)
251	*/
252	public int getSupportLowerBound() {
253	return 1;
254	}
255
256	/**
257	* {@inheritDoc}
258	*
259	* The upper bound of the support is the number of elements.
260	*
261	* @return upper bound of the support
262	*/
263	public int getSupportUpperBound() {
264	return getNumberOfElements();
265	}
266
267	/**
268	* {@inheritDoc}
269	*
270	* The support of this distribution is connected.
271	*
272	* @return {@code true}
273	*/
274	public boolean isSupportConnected() {
275	return true;
276	}
277
278	/**
279	* {@inheritDoc}
280	*/
281	@Override
282	public int sample() {
283	if (sampler == null) {
284	sampler = new ZipfRejectionInversionSampler(numberOfElements, exponent);
285	}
286	return sampler.sample(random);
287	}
288
289	/**
290	* Utility class implementing a rejection inversion sampling method for a discrete,
291	* bounded Zipf distribution that is based on the method described in
292	* <p>
293	* Wolfgang Hörmann and Gerhard Derflinger
294	* "Rejection-inversion to generate variates from monotone discrete distributions."
295	* ACM Transactions on Modeling and Computer Simulation (TOMACS) 6.3 (1996): 169-184.
296	* <p>
297	* The paper describes an algorithm for exponents larger than 1 (Algorithm ZRI).
298	* The original method uses {@code H(x) := (v + x)^(1 - q) / (1 - q)}
299	* as the integral of the hat function. This function is undefined for
300	* q = 1, which is the reason for the limitation of the exponent.
301	* If instead the integral function
302	* {@code H(x) := ((v + x)^(1 - q) - 1) / (1 - q)} is used,
303	* for which a meaningful limit exists for q = 1,
304	* the method works for all positive exponents.
305	* <p>
306	* The following implementation uses v := 0 and generates integral numbers
307	* in the range [1, numberOfElements]. This is different to the original method
308	* where v is defined to be positive and numbers are taken from [0, i_max].
309	* This explains why the implementation looks slightly different.
310	*
311	* @since 3.6
312	*/
313	static final class ZipfRejectionInversionSampler {
314
315	/** Exponent parameter of the distribution. */
316	private final double exponent;
317	/** Number of elements. */
318	private final int numberOfElements;
319	/** Constant equal to {@code hIntegral(1.5) - 1}. */
320	private final double hIntegralX1;
321	/** Constant equal to {@code hIntegral(numberOfElements + 0.5)}. */
322	private final double hIntegralNumberOfElements;
323	/** Constant equal to {@code 2 - hIntegralInverse(hIntegral(2.5) - h(2)}. */
324	private final double s;
325
326	/** Simple constructor.
327	* @param numberOfElements number of elements
328	* @param exponent exponent parameter of the distribution
329	*/
330	ZipfRejectionInversionSampler(final int numberOfElements, final double exponent) {
331	this.exponent = exponent;
332	this.numberOfElements = numberOfElements;
333	this.hIntegralX1 = hIntegral(1.5) - 1d;
334	this.hIntegralNumberOfElements = hIntegral(numberOfElements + 0.5);
335	this.s = 2d - hIntegralInverse(hIntegral(2.5) - h(2));
336	}
337
338	/** Generate one integral number in the range [1, numberOfElements].
339	* @param random random generator to use
340	* @return generated integral number in the range [1, numberOfElements]
341	*/
342	int sample(final RandomGenerator random) {
343	while(true) {
344
345	final double u = hIntegralNumberOfElements + random.nextDouble() * (hIntegralX1 - hIntegralNumberOfElements);
346	// u is uniformly distributed in (hIntegralX1, hIntegralNumberOfElements]
347
348	double x = hIntegralInverse(u);
349
350	int k = (int)(x + 0.5);
351
352	// Limit k to the range [1, numberOfElements]
353	// (k could be outside due to numerical inaccuracies)
354	if (k < 1) {
355	k = 1;
356	}
357	else if (k > numberOfElements) {
358	k = numberOfElements;
359	}
360
361	// Here, the distribution of k is given by:
362	//
363	// P(k = 1) = C * (hIntegral(1.5) - hIntegralX1) = C
364	// P(k = m) = C * (hIntegral(m + 1/2) - hIntegral(m - 1/2)) for m >= 2
365	//
366	// where C := 1 / (hIntegralNumberOfElements - hIntegralX1)
367
368	if (k - x <= s \|\| u >= hIntegral(k + 0.5) - h(k)) {
369
370	// Case k = 1:
371	//
372	// The right inequality is always true, because replacing k by 1 gives
373	// u >= hIntegral(1.5) - h(1) = hIntegralX1 and u is taken from
374	// (hIntegralX1, hIntegralNumberOfElements].
375	//
376	// Therefore, the acceptance rate for k = 1 is P(accepted \| k = 1) = 1
377	// and the probability that 1 is returned as random value is
378	// P(k = 1 and accepted) = P(accepted \| k = 1) * P(k = 1) = C = C / 1^exponent
379	//
380	// Case k >= 2:
381	//
382	// The left inequality (k - x <= s) is just a short cut
383	// to avoid the more expensive evaluation of the right inequality
384	// (u >= hIntegral(k + 0.5) - h(k)) in many cases.
385	//
386	// If the left inequality is true, the right inequality is also true:
387	// Theorem 2 in the paper is valid for all positive exponents, because
388	// the requirements h'(x) = -exponent/x^(exponent + 1) < 0 and
389	// (-1/hInverse'(x))'' = (1+1/exponent) * x^(1/exponent-1) >= 0
390	// are both fulfilled.
391	// Therefore, f(x) := x - hIntegralInverse(hIntegral(x + 0.5) - h(x))
392	// is a non-decreasing function. If k - x <= s holds,
393	// k - x <= s + f(k) - f(2) is obviously also true which is equivalent to
394	// -x <= -hIntegralInverse(hIntegral(k + 0.5) - h(k)),
395	// -hIntegralInverse(u) <= -hIntegralInverse(hIntegral(k + 0.5) - h(k)),
396	// and finally u >= hIntegral(k + 0.5) - h(k).
397	//
398	// Hence, the right inequality determines the acceptance rate:
399	// P(accepted \| k = m) = h(m) / (hIntegrated(m+1/2) - hIntegrated(m-1/2))
400	// The probability that m is returned is given by
401	// P(k = m and accepted) = P(accepted \| k = m) * P(k = m) = C * h(m) = C / m^exponent.
402	//
403	// In both cases the probabilities are proportional to the probability mass function
404	// of the Zipf distribution.
405
406	return k;
407	}
408	}
409	}
410
411	/**
412	* {@code H(x) :=}
413	* <ul>
414	* <li>{@code (x^(1-exponent) - 1)/(1 - exponent)}, if {@code exponent != 1}</li>
415	* <li>{@code log(x)}, if {@code exponent == 1}</li>
416	* </ul>
417	* H(x) is an integral function of h(x),
418	* the derivative of H(x) is h(x).
419	*
420	* @param x free parameter
421	* @return {@code H(x)}
422	*/
423	private double hIntegral(final double x) {
424	final double logX = FastMath.log(x);
425	return helper2((1d-exponent)logX)logX;
426	}
427
428	/**
429	* {@code h(x) := 1/x^exponent}
430	*
431	* @param x free parameter
432	* @return h(x)
433	*/
434	private double h(final double x) {
435	return FastMath.exp(-exponent * FastMath.log(x));
436	}
437
438	/**
439	* The inverse function of H(x).
440	*
441	* @param x free parameter
442	* @return y for which {@code H(y) = x}
443	*/
444	private double hIntegralInverse(final double x) {
445	double t = x*(1d-exponent);
446	if (t < -1d) {
447	// Limit value to the range [-1, +inf).
448	// t could be smaller than -1 in some rare cases due to numerical errors.
449	t = -1;
450	}
451	return FastMath.exp(helper1(t)*x);
452	}
453
454	/**
455	* Helper function that calculates {@code log(1+x)/x}.
456	* <p>
457	* A Taylor series expansion is used, if x is close to 0.
458	*
459	* @param x a value larger than or equal to -1
460	* @return {@code log(1+x)/x}
461	*/
462	static double helper1(final double x) {
463	if (FastMath.abs(x)>1e-8) {
464	return FastMath.log1p(x)/x;
465	}
466	else {
467	return 1.-x((1./2.)-x((1./3.)-x*(1./4.)));
468	}
469	}
470
471	/**
472	* Helper function to calculate {@code (exp(x)-1)/x}.
473	* <p>
474	* A Taylor series expansion is used, if x is close to 0.
475	*
476	* @param x free parameter
477	* @return {@code (exp(x)-1)/x} if x is non-zero, or 1 if x=0
478	*/
479	static double helper2(final double x) {
480	if (FastMath.abs(x)>1e-8) {
481	return FastMath.expm1(x)/x;
482	}
483	else {
484	return 1.+x(1./2.)(1.+x(1./3.)(1.+x*(1./4.)));
485	}
486	}
487	}
488	}

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: src/main/java/agents/anac/y2019/harddealer/math3/distribution/ZipfDistribution.java

Download in other formats: