001    /*
002     * Licensed to the Apache Software Foundation (ASF) under one or more
003     * contributor license agreements.  See the NOTICE file distributed with
004     * this work for additional information regarding copyright ownership.
005     * The ASF licenses this file to You under the Apache License, Version 2.0
006     * (the "License"); you may not use this file except in compliance with
007     * the License.  You may obtain a copy of the License at
008     *
009     *      http://www.apache.org/licenses/LICENSE-2.0
010     *
011     * Unless required by applicable law or agreed to in writing, software
012     * distributed under the License is distributed on an "AS IS" BASIS,
013     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014     * See the License for the specific language governing permissions and
015     * limitations under the License.
016     */
017    package org.apache.commons.math.stat.inference;
018    
019    import java.util.Collection;
020    
021    import org.apache.commons.math.MathException;
022    import org.apache.commons.math.MathRuntimeException;
023    import org.apache.commons.math.distribution.FDistribution;
024    import org.apache.commons.math.distribution.FDistributionImpl;
025    import org.apache.commons.math.exception.util.LocalizedFormats;
026    import org.apache.commons.math.stat.descriptive.summary.Sum;
027    import org.apache.commons.math.stat.descriptive.summary.SumOfSquares;
028    
029    
030    /**
031     * Implements one-way ANOVA statistics defined in the {@link OneWayAnovaImpl}
032     * interface.
033     *
034     * <p>Uses the
035     * {@link org.apache.commons.math.distribution.FDistribution
036     *  commons-math F Distribution implementation} to estimate exact p-values.</p>
037     *
038     * <p>This implementation is based on a description at
039     * http://faculty.vassar.edu/lowry/ch13pt1.html</p>
040     * <pre>
041     * Abbreviations: bg = between groups,
042     *                wg = within groups,
043     *                ss = sum squared deviations
044     * </pre>
045     *
046     * @since 1.2
047     * @version $Revision: 983921 $ $Date: 2010-08-10 12:46:06 +0200 (mar. 10 ao??t 2010) $
048     */
049    public class OneWayAnovaImpl implements OneWayAnova  {
050    
051        /**
052         * Default constructor.
053         */
054        public OneWayAnovaImpl() {
055        }
056    
057        /**
058         * {@inheritDoc}<p>
059         * This implementation computes the F statistic using the definitional
060         * formula<pre>
061         *   F = msbg/mswg</pre>
062         * where<pre>
063         *  msbg = between group mean square
064         *  mswg = within group mean square</pre>
065         * are as defined <a href="http://faculty.vassar.edu/lowry/ch13pt1.html">
066         * here</a></p>
067         */
068        public double anovaFValue(Collection<double[]> categoryData)
069            throws IllegalArgumentException, MathException {
070            AnovaStats a = anovaStats(categoryData);
071            return a.F;
072        }
073    
074        /**
075         * {@inheritDoc}<p>
076         * This implementation uses the
077         * {@link org.apache.commons.math.distribution.FDistribution
078         * commons-math F Distribution implementation} to estimate the exact
079         * p-value, using the formula<pre>
080         *   p = 1 - cumulativeProbability(F)</pre>
081         * where <code>F</code> is the F value and <code>cumulativeProbability</code>
082         * is the commons-math implementation of the F distribution.</p>
083         */
084        public double anovaPValue(Collection<double[]> categoryData)
085            throws IllegalArgumentException, MathException {
086            AnovaStats a = anovaStats(categoryData);
087            FDistribution fdist = new FDistributionImpl(a.dfbg, a.dfwg);
088            return 1.0 - fdist.cumulativeProbability(a.F);
089        }
090    
091        /**
092         * {@inheritDoc}<p>
093         * This implementation uses the
094         * {@link org.apache.commons.math.distribution.FDistribution
095         * commons-math F Distribution implementation} to estimate the exact
096         * p-value, using the formula<pre>
097         *   p = 1 - cumulativeProbability(F)</pre>
098         * where <code>F</code> is the F value and <code>cumulativeProbability</code>
099         * is the commons-math implementation of the F distribution.</p>
100         * <p>True is returned iff the estimated p-value is less than alpha.</p>
101         */
102        public boolean anovaTest(Collection<double[]> categoryData, double alpha)
103            throws IllegalArgumentException, MathException {
104            if ((alpha <= 0) || (alpha > 0.5)) {
105                throw MathRuntimeException.createIllegalArgumentException(
106                      LocalizedFormats.OUT_OF_BOUND_SIGNIFICANCE_LEVEL,
107                      alpha, 0, 0.5);
108            }
109            return anovaPValue(categoryData) < alpha;
110        }
111    
112    
113        /**
114         * This method actually does the calculations (except P-value).
115         *
116         * @param categoryData <code>Collection</code> of <code>double[]</code>
117         * arrays each containing data for one category
118         * @return computed AnovaStats
119         * @throws IllegalArgumentException if categoryData does not meet
120         * preconditions specified in the interface definition
121         * @throws MathException if an error occurs computing the Anova stats
122         */
123        private AnovaStats anovaStats(Collection<double[]> categoryData)
124            throws IllegalArgumentException, MathException {
125    
126            // check if we have enough categories
127            if (categoryData.size() < 2) {
128                throw MathRuntimeException.createIllegalArgumentException(
129                      LocalizedFormats.TWO_OR_MORE_CATEGORIES_REQUIRED,
130                      categoryData.size());
131            }
132    
133            // check if each category has enough data and all is double[]
134            for (double[] array : categoryData) {
135                if (array.length <= 1) {
136                    throw MathRuntimeException.createIllegalArgumentException(
137                          LocalizedFormats.TWO_OR_MORE_VALUES_IN_CATEGORY_REQUIRED,
138                          array.length);
139                }
140            }
141    
142            int dfwg = 0;
143            double sswg = 0;
144            Sum totsum = new Sum();
145            SumOfSquares totsumsq = new SumOfSquares();
146            int totnum = 0;
147    
148            for (double[] data : categoryData) {
149    
150                Sum sum = new Sum();
151                SumOfSquares sumsq = new SumOfSquares();
152                int num = 0;
153    
154                for (int i = 0; i < data.length; i++) {
155                    double val = data[i];
156    
157                    // within category
158                    num++;
159                    sum.increment(val);
160                    sumsq.increment(val);
161    
162                    // for all categories
163                    totnum++;
164                    totsum.increment(val);
165                    totsumsq.increment(val);
166                }
167                dfwg += num - 1;
168                double ss = sumsq.getResult() - sum.getResult() * sum.getResult() / num;
169                sswg += ss;
170            }
171            double sst = totsumsq.getResult() - totsum.getResult() *
172                totsum.getResult()/totnum;
173            double ssbg = sst - sswg;
174            int dfbg = categoryData.size() - 1;
175            double msbg = ssbg/dfbg;
176            double mswg = sswg/dfwg;
177            double F = msbg/mswg;
178    
179            return new AnovaStats(dfbg, dfwg, F);
180        }
181    
182        /**
183            Convenience class to pass dfbg,dfwg,F values around within AnovaImpl.
184            No get/set methods provided.
185        */
186        private static class AnovaStats {
187    
188            /** Degrees of freedom in numerator (between groups). */
189            private int dfbg;
190    
191            /** Degrees of freedom in denominator (within groups). */
192            private int dfwg;
193    
194            /** Statistic. */
195            private double F;
196    
197            /**
198             * Constructor
199             * @param dfbg degrees of freedom in numerator (between groups)
200             * @param dfwg degrees of freedom in denominator (within groups)
201             * @param F statistic
202             */
203            private AnovaStats(int dfbg, int dfwg, double F) {
204                this.dfbg = dfbg;
205                this.dfwg = dfwg;
206                this.F = F;
207            }
208        }
209    
210    }