11/*
22 * The baseCode project
3- *
4- * Copyright (c) 2006 University of British Columbia
5- *
3+ *
4+ * Copyright (c) 2006-2021 University of British Columbia
5+ *
66 * Licensed under the Apache License, Version 2.0 (the "License");
77 * you may not use this file except in compliance with the License.
88 * You may obtain a copy of the License at
2323
2424import cern .colt .list .DoubleArrayList ;
2525import cern .jet .stat .Descriptive ;
26+ import org .apache .commons .math3 .util .DoubleArray ;
2627
2728/**
2829 * Miscellaneous functions used for statistical analysis. Some are optimized or specialized versions of methods that can
2930 * be found elsewhere.
30- *
31+ *
32+ * @author Paul Pavlidis
3133 * @see <a href="http://hoschek.home.cern.ch/hoschek/colt/V1.0.3/doc/cern/jet/math/package-summary.html">cern.jet.math
32- * </a>
34+ * </a>
3335 * @see <a href="http://hoschek.home.cern.ch/hoschek/colt/V1.0.3/doc/cern/jet/stat/package-summary.html">cern.jet.stat
34- * </a>
35- * @author Paul Pavlidis
36+ * </a>
3637 */
3738public class Stats {
3839
3940
4041 /**
4142 * Convert an array into a cumulative density function (CDF). This assumes that the input contains counts
4243 * representing the distribution in question.
43- *
44+ *
4445 * @param x The input of counts (i.e. a histogram).
4546 * @return DoubleArrayList the CDF.
4647 */
47- public static DoubleArrayList cdf ( DoubleArrayList x ) {
48- return cumulateRight ( normalize ( x ) );
48+ public static DoubleArrayList cdf (DoubleArrayList x ) {
49+ return cumulateRight (normalize (x ) );
4950 }
5051
5152 /**
5253 * Convert an array into a cumulative array. Summing is from the left hand side. Use this to make CDFs where the
5354 * concern is the left tail.
54- *
55+ *
5556 * @param x DoubleArrayList
5657 * @return cern.colt.list.DoubleArrayList
5758 */
58- public static DoubleArrayList cumulate ( DoubleArrayList x ) {
59- if ( x .size () == 0 ) {
60- return new DoubleArrayList ( 0 );
59+ public static DoubleArrayList cumulate (DoubleArrayList x ) {
60+ if (x .size () == 0 ) {
61+ return new DoubleArrayList (0 );
6162 }
6263
6364 DoubleArrayList r = new DoubleArrayList ();
6465
6566 double sum = 0.0 ;
66- for ( int i = 0 ; i < x .size (); i ++ ) {
67- sum += x .get ( i );
68- r .add ( sum );
67+ for (int i = 0 ; i < x .size (); i ++) {
68+ sum += x .get (i );
69+ r .add (sum );
6970 }
7071 return r ;
7172 }
@@ -74,21 +75,21 @@ public static DoubleArrayList cumulate( DoubleArrayList x ) {
7475 * Convert an array into a cumulative array. Summing is from the right hand side. This is useful for creating
7576 * upper-tail cumulative density histograms from count histograms, where the upper tail is expected to have very
7677 * small numbers that could be lost to rounding.
77- *
78+ *
7879 * @param x the array of data to be cumulated.
7980 * @return cern.colt.list.DoubleArrayList
8081 */
81- public static DoubleArrayList cumulateRight ( DoubleArrayList x ) {
82- if ( x .size () == 0 ) {
83- return new DoubleArrayList ( 0 );
82+ public static DoubleArrayList cumulateRight (DoubleArrayList x ) {
83+ if (x .size () == 0 ) {
84+ return new DoubleArrayList (0 );
8485 }
8586
86- DoubleArrayList r = new DoubleArrayList ( new double [x .size ()] );
87+ DoubleArrayList r = new DoubleArrayList (new double [x .size ()]);
8788
8889 double sum = 0.0 ;
89- for ( int i = x .size () - 1 ; i >= 0 ; i -- ) {
90- sum += x .get ( i );
91- r .set ( i , sum );
90+ for (int i = x .size () - 1 ; i >= 0 ; i --) {
91+ sum += x .get (i );
92+ r .set (i , sum );
9293 }
9394 return r ;
9495 }
@@ -97,33 +98,33 @@ public static DoubleArrayList cumulateRight( DoubleArrayList x ) {
9798 * Compute the coefficient of variation of an array (standard deviation / mean). If the variance is zero, this
9899 * returns zero. If the mean is zero, NaN is returned. If the mean is negative, the CV is computed relative to the
99100 * absolute value of the mean; that is, negative values are treated as magnitudes.
100- *
101+ *
101102 * @param data DoubleArrayList
102103 * @return the cv
103104 * @todo offer a regularized version of this function.
104105 */
105- public static double cv ( DoubleArrayList data ) {
106- double mean = DescriptiveWithMissing .mean ( data );
106+ public static double cv (DoubleArrayList data ) {
107+ double mean = DescriptiveWithMissing .mean (data );
107108
108- double sampleVariance = DescriptiveWithMissing .sampleVariance ( data , mean );
109+ double sampleVariance = DescriptiveWithMissing .sampleVariance (data , mean );
109110
110- if ( sampleVariance == 0.0 ) return 0.0 ;
111+ if (sampleVariance == 0.0 ) return 0.0 ;
111112
112- if ( mean == 0.0 ) {
113+ if (mean == 0.0 ) {
113114 return 0.0 ;
114115 }
115116
116- return Math .sqrt ( sampleVariance ) / Math .abs ( mean );
117+ return Math .sqrt (sampleVariance ) / Math .abs (mean );
117118 }
118119
119120 /**
120121 * Test whether a value is a valid fractional or probability value.
121- *
122+ *
122123 * @param value
123124 * @return true if the value is in the interval 0 to 1.
124125 */
125- public static boolean isValidFraction ( double value ) {
126- if ( value > 1.0 || value < 0.0 ) {
126+ public static boolean isValidFraction (double value ) {
127+ if (value > 1.0 || value < 0.0 ) {
127128 return false ;
128129 }
129130 return true ;
@@ -132,25 +133,25 @@ public static boolean isValidFraction( double value ) {
132133 /**
133134 * calculate the mean of the values above (NOT greater or equal to) a particular index rank of an array. Quantile
134135 * must be a value from 0 to 100.
135- *
136- * @see DescriptiveWithMissing#meanAboveQuantile
137- * @param index the rank of the value we wish to average above.
138- * @param array Array for which we want to get the quantile.
136+ *
137+ * @param index the rank of the value we wish to average above.
138+ * @param array Array for which we want to get the quantile.
139139 * @param effectiveSize The size of the array, not including NaNs.
140140 * @return double
141+ * @see DescriptiveWithMissing#meanAboveQuantile
141142 */
142- public static double meanAboveQuantile ( int index , double [] array , int effectiveSize ) {
143+ public static double meanAboveQuantile (int index , double [] array , int effectiveSize ) {
143144
144145 double [] temp = new double [effectiveSize ];
145146 double median ;
146147 double returnvalue = 0.0 ;
147148 int k = 0 ;
148149
149150 temp = array ;
150- median = quantile ( index , array , effectiveSize );
151+ median = quantile (index , array , effectiveSize );
151152
152- for ( int i = 0 ; i < effectiveSize ; i ++ ) {
153- if ( temp [i ] > median ) {
153+ for (int i = 0 ; i < effectiveSize ; i ++) {
154+ if (temp [i ] > median ) {
154155 returnvalue += temp [i ];
155156 k ++;
156157 }
@@ -160,77 +161,128 @@ public static double meanAboveQuantile( int index, double[] array, int effective
160161
161162 /**
162163 * Adjust the elements of an array so they total to 1.0.
163- *
164+ *
164165 * @param x Input array.
165166 * @return Normalized array.
166167 */
167- public static DoubleArrayList normalize ( DoubleArrayList x ) {
168- return normalize ( x , Descriptive .sum ( x ) );
168+ public static DoubleArrayList normalize (DoubleArrayList x ) {
169+ return normalize (x , Descriptive .sum (x ) );
169170 }
170171
171172 /**
172173 * Divide the elements of an array by a given factor.
173- *
174- * @param x Input array.
174+ *
175+ * @param x Input array.
175176 * @param normfactor double
176177 * @return Normalized array.
177178 */
178- public static DoubleArrayList normalize ( DoubleArrayList x , double normfactor ) {
179- if ( x .size () == 0 ) {
180- return new DoubleArrayList ( 0 );
179+ public static DoubleArrayList normalize (DoubleArrayList x , double normfactor ) {
180+ if (x .size () == 0 ) {
181+ return new DoubleArrayList (0 );
181182 }
182183
183184 DoubleArrayList r = new DoubleArrayList ();
184185
185- for ( int i = 0 ; i < x .size (); i ++ ) {
186- r .add ( x .get ( i ) / normfactor );
186+ for (int i = 0 ; i < x .size (); i ++) {
187+ r .add (x .get (i ) / normfactor );
187188 }
188189 return r ;
189190
190191 }
191192
192193 /**
193- * @param array
194+ * @param array input data
194195 * @param tolerance a small constant
195196 * @return number of distinct values in the array, within tolerance. Double.NaN is counted as a distinct
196- * value.
197+ * value.
197198 */
198- public static Integer numberofDistinctValues ( DoubleArrayList array , double tolerance ) {
199+ public static Integer numberofDistinctValues (DoubleArrayList array , double tolerance ) {
199200
200201 Set <Double > distinct = new HashSet <>();
201202 int r = 1 ;
202- if ( tolerance > 0.0 ) {
203- r = ( int ) Math .ceil ( 1.0 / tolerance );
203+ if (tolerance > 0.0 ) {
204+ r = (int ) Math .ceil (1.0 / tolerance );
204205 }
205- for ( int i = 0 ; i < array .size (); i ++ ) {
206- double v = array .get ( i );
207- if ( tolerance > 0 ) {
206+ for (int i = 0 ; i < array .size (); i ++) {
207+ double v = array .get (i );
208+ if (tolerance > 0 ) {
208209 // this might not be foolproof
209- distinct .add ( ( double ) Math .round ( v * r ) / r );
210+ distinct .add (( double ) Math .round (v * r ) / r );
210211 } else {
211- distinct .add ( v );
212+ distinct .add (v );
212213 }
213214 }
214- return Math .max ( 0 , distinct .size () );
215+ return Math .max (0 , distinct .size ());
215216
216217 }
217218
219+
218220 /**
219- * Given a double array, calculate the quantile requested. Note that no interpolation is done.
220- *
221- * @see DescriptiveWithMissing#quantile
222- * @param index - the rank of the value we wish to get. Thus if we have 200 items in the array, and want the median,
223- * we should enter 100.
224- * @param values double[] - array of data we want quantile of
221+ * @param tolerance a small constant
222+ * @return number of distinct values in the array, within tolerance. Double.NaN is ignored entirely
223+ */
224+ public static Integer numberofDistinctValuesNonNA (DoubleArrayList array , double tolerance ) {
225+
226+ Set <Double > distinct = new HashSet <>();
227+ int r = 1 ;
228+ if (tolerance > 0.0 ) {
229+ r = (int ) Math .ceil (1.0 / tolerance );
230+ }
231+ for (int i = 0 ; i < array .size (); i ++) {
232+ double v = array .get (i );
233+ if (Double .isNaN (v )) {
234+ continue ;
235+ }
236+ if (tolerance > 0 ) {
237+ // this might not be foolproof
238+ distinct .add ((double ) Math .round (v * r ) / r );
239+ } else {
240+ distinct .add (v );
241+ }
242+ }
243+ return Math .max (0 , distinct .size ());
244+
245+ }
246+
247+ /**
248+ * Compute the fraction of values which are distinct. NaNs are ignored entirely. If the data are all NaN, 0.0 is returned.
249+ *
250+ * @param array input data
251+ * @param tolerance a small constant to define the difference that is "distinct"
252+ * @return
253+ */
254+ public static Double fractionDistinctValuesNonNA (DoubleArrayList array , double tolerance ) {
255+ double numNonNA = (double ) numNonMissing (array );
256+ if (numNonNA == 0 ) return 0.0 ;
257+ return (double ) numberofDistinctValuesNonNA (array , tolerance ) / numNonNA ;
258+ }
259+
260+ private static Integer numNonMissing (DoubleArrayList array ) {
261+ int nm = 0 ;
262+ for (int i = 0 ; i < array .size (); i ++) {
263+ if (Double .isNaN (array .get (i ))) continue ;
264+ nm ++;
265+ }
266+ return nm ;
267+ }
268+
269+
270+ /**
271+ * Given a double array, calculate the quantile requested. Note that no interpolation is done and missing values are ignored.
272+ *
273+ * @param index - the rank of the value we wish to get. Thus if we have 200 items in the array, and want the median,
274+ * we should enter 100.
275+ * @param values double[] - array of data we want quantile of
225276 * @param effectiveSize int the effective size of the array
226277 * @return double the value at the requested quantile
278+ * @see DescriptiveWithMissing#quantile
227279 */
228- public static double quantile ( int index , double [] values , int effectiveSize ) {
280+ public static double quantile (int index , double [] values , int effectiveSize ) {
229281 double pivot = -1.0 ;
230- if ( index == 0 ) {
282+ if (index == 0 ) {
231283 double ans = values [0 ];
232- for ( int i = 1 ; i < effectiveSize ; i ++ ) {
233- if ( ans > values [i ] ) {
284+ for (int i = 1 ; i < effectiveSize ; i ++) {
285+ if (ans > values [i ]) {
234286 ans = values [i ];
235287 }
236288 }
@@ -239,7 +291,7 @@ public static double quantile( int index, double[] values, int effectiveSize ) {
239291
240292 double [] temp = new double [effectiveSize ];
241293
242- for ( int i = 0 ; i < effectiveSize ; i ++ ) {
294+ for (int i = 0 ; i < effectiveSize ; i ++) {
243295 temp [i ] = values [i ];
244296 }
245297
@@ -249,33 +301,33 @@ public static double quantile( int index, double[] values, int effectiveSize ) {
249301 double [] bigger = new double [effectiveSize ];
250302 int itrSm = 0 ;
251303 int itrBg = 0 ;
252- for ( int i = 1 ; i < effectiveSize ; i ++ ) {
253- if ( temp [i ] <= pivot ) {
304+ for (int i = 1 ; i < effectiveSize ; i ++) {
305+ if (temp [i ] <= pivot ) {
254306 smaller [itrSm ] = temp [i ];
255307 itrSm ++;
256- } else if ( temp [i ] > pivot ) {
308+ } else if (temp [i ] > pivot ) {
257309 bigger [itrBg ] = temp [i ];
258310 itrBg ++;
259311 }
260312 }
261- if ( itrSm > index ) { // quantile must be in the 'smaller' array
262- return quantile ( index , smaller , itrSm );
263- } else if ( itrSm < index ) { // quantile is in the 'bigger' array
264- return quantile ( index - itrSm - 1 , bigger , itrBg );
313+ if (itrSm > index ) { // quantile must be in the 'smaller' array
314+ return quantile (index , smaller , itrSm );
315+ } else if (itrSm < index ) { // quantile is in the 'bigger' array
316+ return quantile (index - itrSm - 1 , bigger , itrBg );
265317 } else {
266318 return pivot ;
267319 }
268320
269321 }
270322
271323 /**
272- * Compute the range of an array.
273- *
324+ * Compute the range of an array. Missing values are ignored.
325+ *
274326 * @param data DoubleArrayList
275327 * @return double
276328 */
277- public static double range ( DoubleArrayList data ) {
278- return DescriptiveWithMissing .max ( data ) - DescriptiveWithMissing .min ( data );
329+ public static double range (DoubleArrayList data ) {
330+ return DescriptiveWithMissing .max (data ) - DescriptiveWithMissing .min (data );
279331 }
280332
281333 private Stats () { /* block instantiation */
0 commit comments