001    /*
002     * Licensed to the Apache Software Foundation (ASF) under one or more
003     * contributor license agreements.  See the NOTICE file distributed with
004     * this work for additional information regarding copyright ownership.
005     * The ASF licenses this file to You under the Apache License, Version 2.0
006     * (the "License"); you may not use this file except in compliance with
007     * the License.  You may obtain a copy of the License at
008     *
009     *      http://www.apache.org/licenses/LICENSE-2.0
010     *
011     * Unless required by applicable law or agreed to in writing, software
012     * distributed under the License is distributed on an "AS IS" BASIS,
013     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014     * See the License for the specific language governing permissions and
015     * limitations under the License.
016     */
017    package org.apache.commons.math.stat.descriptive.rank;
018    
019    import java.io.Serializable;
020    import java.util.Arrays;
021    
022    import org.apache.commons.math.MathRuntimeException;
023    import org.apache.commons.math.exception.util.LocalizedFormats;
024    import org.apache.commons.math.stat.descriptive.AbstractUnivariateStatistic;
025    import org.apache.commons.math.util.FastMath;
026    
027    /**
028     * Provides percentile computation.
029     * <p>
030     * There are several commonly used methods for estimating percentiles (a.k.a.
031     * quantiles) based on sample data.  For large samples, the different methods
032     * agree closely, but when sample sizes are small, different methods will give
033     * significantly different results.  The algorithm implemented here works as follows:
034     * <ol>
035     * <li>Let <code>n</code> be the length of the (sorted) array and
036     * <code>0 < p <= 100</code> be the desired percentile.</li>
037     * <li>If <code> n = 1 </code> return the unique array element (regardless of
038     * the value of <code>p</code>); otherwise </li>
039     * <li>Compute the estimated percentile position
040     * <code> pos = p * (n + 1) / 100</code> and the difference, <code>d</code>
041     * between <code>pos</code> and <code>floor(pos)</code> (i.e. the fractional
042     * part of <code>pos</code>).  If <code>pos >= n</code> return the largest
043     * element in the array; otherwise</li>
044     * <li>Let <code>lower</code> be the element in position
045     * <code>floor(pos)</code> in the array and let <code>upper</code> be the
046     * next element in the array.  Return <code>lower + d * (upper - lower)</code>
047     * </li>
048     * </ol></p>
049     * <p>
050     * To compute percentiles, the data must be at least partially ordered.  Input
051     * arrays are copied and recursively partitioned using an ordering definition.
052     * The ordering used by <code>Arrays.sort(double[])</code> is the one determined
053     * by {@link java.lang.Double#compareTo(Double)}.  This ordering makes
054     * <code>Double.NaN</code> larger than any other value (including
055     * <code>Double.POSITIVE_INFINITY</code>).  Therefore, for example, the median
056     * (50th percentile) of
057     * <code>{0, 1, 2, 3, 4, Double.NaN}</code> evaluates to <code>2.5.</code></p>
058     * <p>
059     * Since percentile estimation usually involves interpolation between array
060     * elements, arrays containing  <code>NaN</code> or infinite values will often
061     * result in <code>NaN<code> or infinite values returned.</p>
062     * <p>
063     * Since 2.2, Percentile implementation uses only selection instead of complete
064     * sorting and caches selection algorithm state between calls to the various
065     * {@code evaluate} methods when several percentiles are to be computed on the same data.
066     * This greatly improves efficiency, both for single percentile and multiple
067     * percentiles computations. However, it also induces a need to be sure the data
068     * at one call to {@code evaluate} is the same as the data with the cached algorithm
069     * state from the previous calls. Percentile does this by checking the array reference
070     * itself and a checksum of its content by default. If the user already knows he calls
071     * {@code evaluate} on an immutable array, he can save the checking time by calling the
072     * {@code evaluate} methods that do <em>not</em>
073     * </p>
074     * <p>
075     * <strong>Note that this implementation is not synchronized.</strong> If
076     * multiple threads access an instance of this class concurrently, and at least
077     * one of the threads invokes the <code>increment()</code> or
078     * <code>clear()</code> method, it must be synchronized externally.</p>
079     *
080     * @version $Revision: 1006299 $ $Date: 2010-10-10 16:47:17 +0200 (dim. 10 oct. 2010) $
081     */
082    public class Percentile extends AbstractUnivariateStatistic implements Serializable {
083    
084        /** Serializable version identifier */
085        private static final long serialVersionUID = -8091216485095130416L;
086    
087        /** Minimum size under which we use a simple insertion sort rather than Hoare's select. */
088        private static final int MIN_SELECT_SIZE = 15;
089    
090        /** Maximum number of partitioning pivots cached (each level double the number of pivots). */
091        private static final int MAX_CACHED_LEVELS = 10;
092    
093        /** Determines what percentile is computed when evaluate() is activated
094         * with no quantile argument */
095        private double quantile = 0.0;
096    
097        /** Cached pivots. */
098        private int[] cachedPivots;
099    
100        /**
101         * Constructs a Percentile with a default quantile
102         * value of 50.0.
103         */
104        public Percentile() {
105            this(50.0);
106        }
107    
108        /**
109         * Constructs a Percentile with the specific quantile value.
110         * @param p the quantile
111         * @throws IllegalArgumentException  if p is not greater than 0 and less
112         * than or equal to 100
113         */
114        public Percentile(final double p) {
115            setQuantile(p);
116            cachedPivots = null;
117        }
118    
119        /**
120         * Copy constructor, creates a new {@code Percentile} identical
121         * to the {@code original}
122         *
123         * @param original the {@code Percentile} instance to copy
124         */
125        public Percentile(Percentile original) {
126            copy(original, this);
127        }
128    
129        /** {@inheritDoc} */
130        @Override
131        public void setData(final double[] values) {
132            if (values == null) {
133                cachedPivots = null;
134            } else {
135                cachedPivots = new int[(0x1 << MAX_CACHED_LEVELS) - 1];
136                Arrays.fill(cachedPivots, -1);
137            }
138            super.setData(values);
139        }
140    
141        /** {@inheritDoc} */
142        @Override
143        public void setData(final double[] values, final int begin, final int length) {
144            if (values == null) {
145                cachedPivots = null;
146            } else {
147                cachedPivots = new int[(0x1 << MAX_CACHED_LEVELS) - 1];
148                Arrays.fill(cachedPivots, -1);
149            }
150            super.setData(values, begin, length);
151        }
152    
153        /**
154         * Returns the result of evaluating the statistic over the stored data.
155         * <p>
156         * The stored array is the one which was set by previous calls to
157         * </p>
158         * @param p the percentile value to compute
159         * @return the value of the statistic applied to the stored data
160         */
161        public double evaluate(final double p) {
162            return evaluate(getDataRef(), p);
163        }
164    
165        /**
166         * Returns an estimate of the <code>p</code>th percentile of the values
167         * in the <code>values</code> array.
168         * <p>
169         * Calls to this method do not modify the internal <code>quantile</code>
170         * state of this statistic.</p>
171         * <p>
172         * <ul>
173         * <li>Returns <code>Double.NaN</code> if <code>values</code> has length
174         * <code>0</code></li>
175         * <li>Returns (for any value of <code>p</code>) <code>values[0]</code>
176         *  if <code>values</code> has length <code>1</code></li>
177         * <li>Throws <code>IllegalArgumentException</code> if <code>values</code>
178         * is null or p is not a valid quantile value (p must be greater than 0
179         * and less than or equal to 100) </li>
180         * </ul></p>
181         * <p>
182         * See {@link Percentile} for a description of the percentile estimation
183         * algorithm used.</p>
184         *
185         * @param values input array of values
186         * @param p the percentile value to compute
187         * @return the percentile value or Double.NaN if the array is empty
188         * @throws IllegalArgumentException if <code>values</code> is null
189         *     or p is invalid
190         */
191        public double evaluate(final double[] values, final double p) {
192            test(values, 0, 0);
193            return evaluate(values, 0, values.length, p);
194        }
195    
196        /**
197         * Returns an estimate of the <code>quantile</code>th percentile of the
198         * designated values in the <code>values</code> array.  The quantile
199         * estimated is determined by the <code>quantile</code> property.
200         * <p>
201         * <ul>
202         * <li>Returns <code>Double.NaN</code> if <code>length = 0</code></li>
203         * <li>Returns (for any value of <code>quantile</code>)
204         * <code>values[begin]</code> if <code>length = 1 </code></li>
205         * <li>Throws <code>IllegalArgumentException</code> if <code>values</code>
206         * is null,  or <code>start</code> or <code>length</code>
207         * is invalid</li>
208         * </ul></p>
209         * <p>
210         * See {@link Percentile} for a description of the percentile estimation
211         * algorithm used.</p>
212         *
213         * @param values the input array
214         * @param start index of the first array element to include
215         * @param length the number of elements to include
216         * @return the percentile value
217         * @throws IllegalArgumentException if the parameters are not valid
218         *
219         */
220        @Override
221        public double evaluate( final double[] values, final int start, final int length) {
222            return evaluate(values, start, length, quantile);
223        }
224    
225         /**
226         * Returns an estimate of the <code>p</code>th percentile of the values
227         * in the <code>values</code> array, starting with the element in (0-based)
228         * position <code>begin</code> in the array and including <code>length</code>
229         * values.
230         * <p>
231         * Calls to this method do not modify the internal <code>quantile</code>
232         * state of this statistic.</p>
233         * <p>
234         * <ul>
235         * <li>Returns <code>Double.NaN</code> if <code>length = 0</code></li>
236         * <li>Returns (for any value of <code>p</code>) <code>values[begin]</code>
237         *  if <code>length = 1 </code></li>
238         * <li>Throws <code>IllegalArgumentException</code> if <code>values</code>
239         *  is null , <code>begin</code> or <code>length</code> is invalid, or
240         * <code>p</code> is not a valid quantile value (p must be greater than 0
241         * and less than or equal to 100)</li>
242         * </ul></p>
243         * <p>
244         * See {@link Percentile} for a description of the percentile estimation
245         * algorithm used.</p>
246         *
247         * @param values array of input values
248         * @param p  the percentile to compute
249         * @param begin  the first (0-based) element to include in the computation
250         * @param length  the number of array elements to include
251         * @return  the percentile value
252         * @throws IllegalArgumentException if the parameters are not valid or the
253         * input array is null
254         */
255        public double evaluate(final double[] values, final int begin,
256                final int length, final double p) {
257    
258            test(values, begin, length);
259    
260            if ((p > 100) || (p <= 0)) {
261                throw MathRuntimeException.createIllegalArgumentException(
262                      LocalizedFormats.OUT_OF_BOUNDS_QUANTILE_VALUE, p);
263            }
264            if (length == 0) {
265                return Double.NaN;
266            }
267            if (length == 1) {
268                return values[begin]; // always return single value for n = 1
269            }
270            double n = length;
271            double pos = p * (n + 1) / 100;
272            double fpos = FastMath.floor(pos);
273            int intPos = (int) fpos;
274            double dif = pos - fpos;
275            double[] work;
276            int[] pivotsHeap;
277            if (values == getDataRef()) {
278                work = getDataRef();
279                pivotsHeap = cachedPivots;
280            } else {
281                work = new double[length];
282                System.arraycopy(values, begin, work, 0, length);
283                pivotsHeap = new int[(0x1 << MAX_CACHED_LEVELS) - 1];
284                Arrays.fill(pivotsHeap, -1);
285            }
286    
287            if (pos < 1) {
288                return select(work, pivotsHeap, 0);
289            }
290            if (pos >= n) {
291                return select(work, pivotsHeap, length - 1);
292            }
293            double lower = select(work, pivotsHeap, intPos - 1);
294            double upper = select(work, pivotsHeap, intPos);
295            return lower + dif * (upper - lower);
296        }
297    
298        /**
299         * Select the k<sup>th</sup> smallest element from work array
300         * @param work work array (will be reorganized during the call)
301         * @param pivotsHeap set of pivot index corresponding to elements that
302         * are already at their sorted location, stored as an implicit heap
303         * (i.e. a sorted binary tree stored in a flat array, where the
304         * children of a node at index n are at indices 2n+1 for the left
305         * child and 2n+2 for the right child, with 0-based indices)
306         * @param k index of the desired element
307         * @return k<sup>th</sup> smallest element
308         */
309        private double select(final double[] work, final int[] pivotsHeap, final int k) {
310    
311            int begin = 0;
312            int end   = work.length;
313            int node  = 0;
314    
315            while (end - begin > MIN_SELECT_SIZE) {
316    
317                final int pivot;
318                if ((node < pivotsHeap.length) && (pivotsHeap[node] >= 0)) {
319                    // the pivot has already been found in a previous call
320                    // and the array has already been partitioned around it
321                    pivot = pivotsHeap[node];
322                } else {
323                    // select a pivot and partition work array around it
324                    pivot = partition(work, begin, end, medianOf3(work, begin, end));
325                    if (node < pivotsHeap.length) {
326                        pivotsHeap[node] =  pivot;
327                    }
328                }
329    
330                if (k == pivot) {
331                    // the pivot was exactly the element we wanted
332                    return work[k];
333                } else if (k < pivot) {
334                    // the element is in the left partition
335                    end  = pivot;
336                    node = Math.min(2 * node + 1, pivotsHeap.length); // the min is here to avoid integer overflow
337                } else {
338                    // the element is in the right partition
339                    begin = pivot + 1;
340                    node  = Math.min(2 * node + 2, pivotsHeap.length); // the min is here to avoid integer overflow
341                }
342    
343            }
344    
345            // the element is somewhere in the small sub-array
346            // sort the sub-array using insertion sort
347            insertionSort(work, begin, end);
348            return work[k];
349    
350        }
351    
352        /** Select a pivot index as the median of three
353         * @param work data array
354         * @param begin index of the first element of the slice
355         * @param end index after the last element of the slice
356         * @return the index of the median element chosen between the
357         * first, the middle and the last element of the array slice
358         */
359        int medianOf3(final double[] work, final int begin, final int end) {
360    
361            final int inclusiveEnd = end - 1;
362            final int    middle    = begin + (inclusiveEnd - begin) / 2;
363            final double wBegin    = work[begin];
364            final double wMiddle   = work[middle];
365            final double wEnd      = work[inclusiveEnd];
366    
367            if (wBegin < wMiddle) {
368                if (wMiddle < wEnd) {
369                    return middle;
370                } else {
371                    return (wBegin < wEnd) ? inclusiveEnd : begin;
372                }
373            } else {
374                if (wBegin < wEnd) {
375                    return begin;
376                } else {
377                    return (wMiddle < wEnd) ? inclusiveEnd : middle;
378                }
379            }
380    
381        }
382    
383        /**
384         * Partition an array slice around a pivot
385         * <p>
386         * Partitioning exchanges array elements such that all elements
387         * smaller than pivot are before it and all elements larger than
388         * pivot are after it
389         * </p>
390         * @param work data array
391         * @param begin index of the first element of the slice
392         * @param end index after the last element of the slice
393         * @param pivot initial index of the pivot
394         * @return index of the pivot after partition
395         */
396        private int partition(final double[] work, final int begin, final int end, final int pivot) {
397    
398            final double value = work[pivot];
399            work[pivot] = work[begin];
400    
401            int i = begin + 1;
402            int j = end - 1;
403            while (i < j) {
404                while ((i < j) && (work[j] >= value)) {
405                    --j;
406                }
407                while ((i < j) && (work[i] <= value)) {
408                    ++i;
409                }
410    
411                if (i < j) {
412                    final double tmp = work[i];
413                    work[i++] = work[j];
414                    work[j--] = tmp;
415                }
416            }
417    
418            if ((i >= end) || (work[i] > value)) {
419                --i;
420            }
421            work[begin] = work[i];
422            work[i]     = value;
423            return i;
424    
425        }
426    
427        /**
428         * Sort in place a (small) array slice using insertion sort
429         * @param work array to sort
430         * @param begin index of the first element of the slice to sort
431         * @param end index after the last element of the slice to sort
432         */
433        private void insertionSort(final double[] work, final int begin, final int end) {
434            for (int j = begin + 1; j < end; j++) {
435                final double saved = work[j];
436                int i = j - 1;
437                while ((i >= begin) && (saved < work[i])) {
438                    work[i + 1] = work[i];
439                    i--;
440                }
441                work[i + 1] = saved;
442            }
443        }
444    
445        /**
446         * Returns the value of the quantile field (determines what percentile is
447         * computed when evaluate() is called with no quantile argument).
448         *
449         * @return quantile
450         */
451        public double getQuantile() {
452            return quantile;
453        }
454    
455        /**
456         * Sets the value of the quantile field (determines what percentile is
457         * computed when evaluate() is called with no quantile argument).
458         *
459         * @param p a value between 0 < p <= 100
460         * @throws IllegalArgumentException  if p is not greater than 0 and less
461         * than or equal to 100
462         */
463        public void setQuantile(final double p) {
464            if (p <= 0 || p > 100) {
465                throw MathRuntimeException.createIllegalArgumentException(
466                      LocalizedFormats.OUT_OF_BOUNDS_QUANTILE_VALUE, p);
467            }
468            quantile = p;
469        }
470    
471        /**
472         * {@inheritDoc}
473         */
474        @Override
475        public Percentile copy() {
476            Percentile result = new Percentile();
477            copy(this, result);
478            return result;
479        }
480    
481        /**
482         * Copies source to dest.
483         * <p>Neither source nor dest can be null.</p>
484         *
485         * @param source Percentile to copy
486         * @param dest Percentile to copy to
487         * @throws NullPointerException if either source or dest is null
488         */
489        public static void copy(Percentile source, Percentile dest) {
490            dest.setData(source.getDataRef());
491            if (source.cachedPivots != null) {
492                System.arraycopy(source.cachedPivots, 0, dest.cachedPivots, 0, source.cachedPivots.length);
493            }
494            dest.quantile = source.quantile;
495        }
496    
497    }