casacore
ClassicalStatistics.h
Go to the documentation of this file.
1 //# Copyright (C) 2000,2001
2 //# Associated Universities, Inc. Washington DC, USA.
3 //#
4 //# This library is free software; you can redistribute it and/or modify it
5 //# under the terms of the GNU Library General Public License as published by
6 //# the Free Software Foundation; either version 2 of the License, or (at your
7 //# option) any later version.
8 //#
9 //# This library is distributed in the hope that it will be useful, but WITHOUT
10 //# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 //# FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public
12 //# License for more details.
13 //#
14 //# You should have received a copy of the GNU Library General Public License
15 //# along with this library; if not, write to the Free Software Foundation,
16 //# Inc., 675 Massachusetts Ave, Cambridge, MA 02139, USA.
17 //#
18 //# Correspondence concerning AIPS++ should be addressed as follows:
19 //# Internet email: aips2-request@nrao.edu.
20 //# Postal address: AIPS++ Project Office
21 //# National Radio Astronomy Observatory
22 //# 520 Edgemont Road
23 //# Charlottesville, VA 22903-2475 USA
24 //#
25 //# $Id: Array.h 21545 2015-01-22 19:36:35Z gervandiepen $
26 
27 #ifndef SCIMATH_CLASSICALSTATS_H
28 #define SCIMATH_CLASSICALSTATS_H
29 
30 #include <casacore/casa/aips.h>
31 
32 #include <casacore/scimath/Mathematics/StatisticsAlgorithm.h>
33 
34 #include <casacore/scimath/Mathematics/StatisticsTypes.h>
35 #include <casacore/scimath/Mathematics/StatisticsUtilities.h>
36 
37 #include <set>
38 #include <vector>
39 #include <utility>
40 
41 namespace casacore {
42 
43 // Class to calculate statistics in a "classical" sense, ie using accumulators with no
44 // special filtering beyond optional range filtering etc.
45 //
46 // setCalculateAsAdded() allows one to specify if statistics should be calculated and updated
47 // on upon each call to set/addData(). If False, statistics will be calculated only when
48 // getStatistic(), getStatistics(), or similar methods are called. Setting this value to True
49 // allows the caller to not have to keep all the data accessible at once. Note however, that all
50 // data must be simultaneously accessible if quantile (eg median) calculations are desired.
51 
52 // I attempted to write this class using the Composite design pattern, with eg the
53 // _unweightedStats() and _weightedStats() methods in their own class, but for reasons I
54 // don't understand, that impacted performance significantly. So I'm using the current
55 // architecture, which I know is a bit a maintenance nightmare.
56 
57 template <class AccumType, class DataIterator, class MaskIterator=const Bool*, class WeightsIterator=DataIterator>
59  : public StatisticsAlgorithm<AccumType, DataIterator, MaskIterator, WeightsIterator> {
60 public:
61 
63 
64  // copy semantics
66 
67  virtual ~ClassicalStatistics();
68 
69  // copy semantics
72  );
73 
74  // get the algorithm that this object uses for computing stats
77  };
78 
79  // <group>
80  // In the following group of methods, if the size of the composite dataset
81  // is smaller than
82  // <src>binningThreshholdSizeBytes</src>, the composite dataset
83  // will be (perhaps partially) sorted and persisted in memory during the
84  // call. In that case, and if <src>persistSortedArray</src> is True, this
85  // sorted array will remain in memory after the call and will be used on
86  // subsequent calls of this method when <src>binningThreshholdSizeBytes</src>
87  // is greater than the size of the composite dataset. If
88  // <src>persistSortedArray</src> is False, the sorted array will not be
89  // stored after this call completes and so any subsequent calls for which the
90  // dataset size is less than <src>binningThreshholdSizeBytes</src>, the
91  // dataset will be sorted from scratch. Values which are not included due to
92  // non-unity strides, are not included in any specified ranges, are masked,
93  // or have associated weights of zero are not considered as dataset members
94  // for quantile computations.
95  // If one has a priori information regarding the number of points (npts) and/or
96  // the minimum and maximum values of the data set, these can be supplied to
97  // improve performance. Note however, that if these values are not correct, the
98  // resulting median
99  // and/or quantile values will also not be correct (although see the following notes regarding
100  // max/min). Note that if this object has already had getStatistics()
101  // called, and the min and max were calculated, there is no need to pass these values in
102  // as they have been stored internally and used (although passing them in shouldn't hurt
103  // anything). If provided, npts, the number of points falling in the specified ranges which are
104  // not masked and have weights > 0, should be exactly correct. <src>min</src> can be less than
105  // the true minimum, and <src>max</src> can be greater than the True maximum, but for best
106  // performance, these should be as close to the actual min and max as possible.
107  // In order for quantile computations to occur over multiple datasets, all datasets
108  // must be available. This means that if setCalculateAsAdded()
109  // was previously called by passing in a value of True, these methods will throw
110  // an exception as the previous call indicates that there is no guarantee that
111  // all datasets will be available. If one uses a data provider (by having called
112  // setDataProvider()), then this should not be an issue.
113 
114  // get the median of the distribution.
115  // For a dataset with an odd number of good points, the median is just the value
116  // at index int(N/2) in the equivalent sorted dataset, where N is the number of points.
117  // For a dataset with an even number of points, the median is the mean of the values at
118  // indices int(N/2)-1 and int(N/2) in the sorted dataset.
119  // <src>nBins</src> is the number of bins, per histogram, to use to bin the data. More
120  // bins decrease the likelihood that multiple passes of the data set will be necessary, but
121  // also increase the amount of memory used. If nBins is set to less than 1,000, it is
122  // automatically increased to 1,000; there should be no reason to ever set nBins to be
123  // this small.
124  virtual AccumType getMedian(
125  CountedPtr<uInt64> knownNpts=NULL, CountedPtr<AccumType> knownMin=NULL,
126  CountedPtr<AccumType> knownMax=NULL, uInt binningThreshholdSizeBytes=4096*4096,
127  Bool persistSortedArray=False, uInt64 nBins=10000
128  );
129 
130  // If one needs to compute both the median and quantile values, it is better to call
131  // getMedianAndQuantiles() rather than getMedian() and getQuantiles() separately, as the
132  // first will scan large data sets fewer times than calling the separate methods.
133  // The return value is the median; the quantiles are returned in the <src>quantiles</src> map.
134  // Values in the <src>fractions</src> set represent the locations in the CDF and should be
135  // between 0 and 1, exclusive.
136  virtual AccumType getMedianAndQuantiles(
137  std::map<Double, AccumType>& quantiles, const std::set<Double>& fractions,
138  CountedPtr<uInt64> knownNpts=NULL, CountedPtr<AccumType> knownMin=NULL,
139  CountedPtr<AccumType> knownMax=NULL,
140  uInt binningThreshholdSizeBytes=4096*4096, Bool persistSortedArray=False,
141  uInt64 nBins=10000
142  );
143 
144  // get the median of the absolute deviation about the median of the data.
145  virtual AccumType getMedianAbsDevMed(
146  CountedPtr<uInt64> knownNpts=NULL,
147  CountedPtr<AccumType> knownMin=NULL, CountedPtr<AccumType> knownMax=NULL,
148  uInt binningThreshholdSizeBytes=4096*4096, Bool persistSortedArray=False,
149  uInt64 nBins=10000
150  );
151 
152  // Get the specified quantiles. <src>fractions</src> must be between 0 and 1,
153  // noninclusive.
154  virtual std::map<Double, AccumType> getQuantiles(
155  const std::set<Double>& fractions, CountedPtr<uInt64> knownNpts=NULL,
156  CountedPtr<AccumType> knownMin=NULL, CountedPtr<AccumType> knownMax=NULL,
157  uInt binningThreshholdSizeBytes=4096*4096, Bool persistSortedArray=False,
158  uInt64 nBins=10000
159  );
160 
161  // </group>
162 
163  // scan the dataset(s) that have been added, and find the min and max.
164  // This method may be called even if setStatsToCaclulate has been called and
165  // MAX and MIN has been excluded. If setCalculateAsAdded(True) has previously been
166  // called after this object has been (re)initialized, an exception will be thrown.
167  virtual void getMinMax(AccumType& mymin, AccumType& mymax);
168 
169  // scan the dataset(s) that have been added, and find the number of good points.
170  // This method may be called even if setStatsToCaclulate has been called and
171  // NPTS has been excluded. If setCalculateAsAdded(True) has previously been
172  // called after this object has been (re)initialized, an exception will be thrown.
173  virtual uInt64 getNPts();
174 
175  // see base class description
176  virtual std::pair<Int64, Int64> getStatisticIndex(StatisticsData::STATS stat);
177 
178  // reset object to initial state. Clears all private fields including data,
179  // accumulators, etc.
180  virtual void reset();
181 
182  // Should statistics be updated with calls to addData or should they only be calculated
183  // upon calls to getStatistics etc? Beware that calling this will automatically reinitialize
184  // the object, so that it will contain no references to data et al. after this method has
185  // been called.
186  virtual void setCalculateAsAdded(Bool c);
187 
188  // An exception will be thrown if setCalculateAsAdded(True) has been called.
190 
191  void setStatsToCalculate(std::set<StatisticsData::STATS>& stats);
192 
193 protected:
194 
195  // <group>
196  // scan through the data set to determine the number of good (unmasked, weight > 0,
197  // within range) points. The first with no mask, no
198  // ranges, and no weights is trivial with npts = nr in this class, but is implemented here
199  // so that derived classes may override it.
200  inline virtual void _accumNpts(
201  uInt64& npts,
202  const DataIterator& dataBegin, Int64 nr, uInt dataStride
203  ) const;
204 
205  virtual void _accumNpts(
206  uInt64& npts,
207  const DataIterator& dataBegin, Int64 nr, uInt dataStride,
208  const DataRanges& ranges, Bool isInclude
209  ) const;
210 
211  virtual void _accumNpts(
212  uInt64& npts,
213  const DataIterator& dataBegin, Int64 nr, uInt dataStride,
214  const MaskIterator& maskBegin, uInt maskStride
215  ) const;
216 
217  virtual void _accumNpts(
218  uInt64& npts,
219  const DataIterator& dataBegin, Int64 nr, uInt dataStride,
220  const MaskIterator& maskBegin, uInt maskStride, const DataRanges& ranges,
221  Bool isInclude
222  ) const;
223 
224  virtual void _accumNpts(
225  uInt64& npts,
226  const DataIterator& dataBegin, const WeightsIterator& weightsBegin,
227  Int64 nr, uInt dataStride
228  ) const;
229 
230  virtual void _accumNpts(
231  uInt64& npts,
232  const DataIterator& dataBegin, const WeightsIterator& weightsBegin,
233  Int64 nr, uInt dataStride, const DataRanges& ranges, Bool isInclude
234  ) const;
235 
236  virtual void _accumNpts(
237  uInt64& npts,
238  const DataIterator& dataBegin, const WeightsIterator& weightsBegin,
239  Int64 nr, uInt dataStride, const MaskIterator& maskBegin, uInt maskStride,
240  const DataRanges& ranges, Bool isInclude
241  ) const;
242 
243  virtual void _accumNpts(
244  uInt64& npts,
245  const DataIterator& dataBegin, const WeightsIterator& weightBegin,
246  Int64 nr, uInt dataStride, const MaskIterator& maskBegin, uInt maskStride
247  ) const;
248  // </group>
249 
250  // <group>
251  inline void _accumulate(
252  AccumType& mymin, AccumType& mymax, Int64& minpos, Int64& maxpos,
253  const AccumType& datum , Int64 count
254  );
255 
256  inline void _accumulate(
257  AccumType& mymin, AccumType& mymax, Int64& minpos, Int64& maxpos, const AccumType& datum,
258  const AccumType& weight, Int64 count
259  );
260  // </group>
261 
262  void _addData();
263 
264  void _clearData();
265 
266  void _clearStats();
267 
268  // scan dataset(s) to find min and max
269  void _doMinMax(AccumType& vmin, AccumType& vmax);
270 
271  // <group>
272  // Get the counts of data within the specified histogram bins. The number of
273  // arrays within binCounts will be equal to the number of histograms in <src>binDesc</src>.
274  // Each array within <src>binCounts</src> will have the same number of elements as the
275  // number of bins in its corresponding histogram in <src>binDesc</src>.
276  virtual void _findBins(
277  vector<vector<uInt64> >& binCounts,
278  vector<CountedPtr<AccumType> >& sameVal, vector<Bool>& allSame,
279  const DataIterator& dataBegin, Int64 nr, uInt dataStride,
280  const vector<typename StatisticsUtilities<AccumType>::BinDesc>& binDesc,
281  const vector<AccumType>& maxLimit
282  ) const;
283 
284  virtual void _findBins(
285  vector<vector<uInt64> >& binCounts,
286  vector<CountedPtr<AccumType> >& sameVal, vector<Bool>& allSame,
287  const DataIterator& dataBegin, Int64 nr, uInt dataStride,
288  const DataRanges& ranges, Bool isInclude,
289  const vector<typename StatisticsUtilities<AccumType>::BinDesc>& binDesc, const vector<AccumType>& maxLimit
290  ) const;
291 
292  virtual void _findBins(
293  vector<vector<uInt64> >& binCounts,
294  vector<CountedPtr<AccumType> >& sameVal, vector<Bool>& allSame,
295  const DataIterator& dataBegin, Int64 nr, uInt dataStride,
296  const MaskIterator& maskBegin, uInt maskStride,
297  const vector<typename StatisticsUtilities<AccumType>::BinDesc>& binDesc, const vector<AccumType>& maxLimit
298  ) const;
299 
300  virtual void _findBins(
301  vector<vector<uInt64> >& binCounts,
302  vector<CountedPtr<AccumType> >& sameVal, vector<Bool>& allSame,
303  const DataIterator& dataBegin, Int64 nr, uInt dataStride,
304  const MaskIterator& maskBegin, uInt maskStride, const DataRanges& ranges,
305  Bool isInclude,
306  const vector<typename StatisticsUtilities<AccumType>::BinDesc>& binDesc, const vector<AccumType>& maxLimit
307  ) const;
308 
309  virtual void _findBins(
310  vector<vector<uInt64> >& binCounts,
311  vector<CountedPtr<AccumType> >& sameVal, vector<Bool>& allSame,
312  const DataIterator& dataBegin, const WeightsIterator& weightsBegin,
313  Int64 nr, uInt dataStride,
314  const vector<typename StatisticsUtilities<AccumType>::BinDesc>& binDesc, const vector<AccumType>& maxLimit
315  ) const ;
316 
317  virtual void _findBins(
318  vector<vector<uInt64> >& binCounts,
319  vector<CountedPtr<AccumType> >& sameVal, vector<Bool>& allSame,
320  const DataIterator& dataBegin, const WeightsIterator& weightsBegin,
321  Int64 nr, uInt dataStride, const DataRanges& ranges, Bool isInclude,
322  const vector<typename StatisticsUtilities<AccumType>::BinDesc>& binDesc, const vector<AccumType>& maxLimit
323  ) const;
324 
325  virtual void _findBins(
326  vector<vector<uInt64> >& binCounts,
327  vector<CountedPtr<AccumType> >& sameVal, vector<Bool>& allSame,
328  const DataIterator& dataBegin, const WeightsIterator& weightsBegin,
329  Int64 nr, uInt dataStride, const MaskIterator& maskBegin, uInt maskStride,
330  const DataRanges& ranges, Bool isInclude,
331  const vector<typename StatisticsUtilities<AccumType>::BinDesc>& binDesc, const vector<AccumType>& maxLimit
332  ) const;
333 
334  virtual void _findBins(
335  vector<vector<uInt64> >& binCounts,
336  vector<CountedPtr<AccumType> >& sameVal, vector<Bool>& allSame,
337  const DataIterator& dataBegin, const WeightsIterator& weightBegin,
338  Int64 nr, uInt dataStride, const MaskIterator& maskBegin, uInt maskStride,
339  const vector<typename StatisticsUtilities<AccumType>::BinDesc>& binDesc, const vector<AccumType>& maxLimit
340  ) const;
341  // </group>
342 
343  AccumType _getStatistic(StatisticsData::STATS stat);
344 
346 
347  // retreive stats structure. Allows derived classes to maintain their own
348  // StatsData structs.
349  inline virtual StatsData<AccumType>& _getStatsData() { return _statsData; }
350 
351  inline virtual const StatsData<AccumType>& _getStatsData() const { return _statsData; }
352 
353  // <group>
354  virtual void _minMax(
356  const DataIterator& dataBegin, Int64 nr, uInt dataStride
357  ) const;
358 
359  virtual void _minMax(
361  const DataIterator& dataBegin, Int64 nr, uInt dataStride,
362  const DataRanges& ranges, Bool isInclude
363  ) const;
364 
365  virtual void _minMax(
367  const DataIterator& dataBegin, Int64 nr, uInt dataStride,
368  const MaskIterator& maskBegin, uInt maskStride
369  ) const;
370 
371  virtual void _minMax(
373  const DataIterator& dataBegin, Int64 nr, uInt dataStride,
374  const MaskIterator& maskBegin, uInt maskStride, const DataRanges& ranges,
375  Bool isInclude
376  ) const;
377 
378  virtual void _minMax(
380  const DataIterator& dataBegin, const WeightsIterator& weightsBegin,
381  Int64 nr, uInt dataStride
382  ) const;
383 
384  virtual void _minMax(
386  const DataIterator& dataBegin, const WeightsIterator& weightsBegin,
387  Int64 nr, uInt dataStride, const DataRanges& ranges, Bool isInclude
388  ) const;
389 
390  virtual void _minMax(
392  const DataIterator& dataBegin, const WeightsIterator& weightsBegin,
393  Int64 nr, uInt dataStride, const MaskIterator& maskBegin, uInt maskStride,
394  const DataRanges& ranges, Bool isInclude
395  ) const;
396 
397  virtual void _minMax(
399  const DataIterator& dataBegin, const WeightsIterator& weightBegin,
400  Int64 nr, uInt dataStride, const MaskIterator& maskBegin, uInt maskStride
401  ) const;
402  // </group>
403 
404  //<group>
405  // populate an unsorted array with valid data.
406  // no weights, no mask, no ranges
407  virtual void _populateArray(
408  vector<AccumType>& ary, const DataIterator& dataBegin, Int64 nr, uInt dataStride
409  ) const;
410 
411  // ranges
412  virtual void _populateArray(
413  vector<AccumType>& ary, const DataIterator& dataBegin, Int64 nr,
414  uInt dataStride, const DataRanges& ranges, Bool isInclude
415  ) const;
416 
417  virtual void _populateArray(
418  vector<AccumType>& ary, const DataIterator& dataBegin,
419  Int64 nr, uInt dataStride, const MaskIterator& maskBegin,
420  uInt maskStride
421  ) const;
422 
423  // mask and ranges
424  virtual void _populateArray(
425  vector<AccumType>& ary, const DataIterator& dataBegin, Int64 nr,
426  uInt dataStride, const MaskIterator& maskBegin, uInt maskStride,
427  const DataRanges& ranges, Bool isInclude
428  ) const;
429 
430  // weights
431  virtual void _populateArray(
432  vector<AccumType>& ary, const DataIterator& dataBegin,
433  const WeightsIterator& weightsBegin, Int64 nr, uInt dataStride
434  ) const;
435 
436  // weights and ranges
437  virtual void _populateArray(
438  vector<AccumType>& ary, const DataIterator& dataBegin,
439  const WeightsIterator& weightsBegin, Int64 nr, uInt dataStride,
440  const DataRanges& ranges, Bool isInclude
441  ) const;
442 
443  // weights and mask
444  virtual void _populateArray(
445  vector<AccumType>& ary, const DataIterator& dataBegin,
446  const WeightsIterator& weightBegin, Int64 nr, uInt dataStride,
447  const MaskIterator& maskBegin, uInt maskStride
448  ) const;
449 
450  // weights, mask, ranges
451  virtual void _populateArray(
452  vector<AccumType>& ary, const DataIterator& dataBegin, const WeightsIterator& weightBegin,
453  Int64 nr, uInt dataStride, const MaskIterator& maskBegin, uInt maskStride,
454  const DataRanges& ranges, Bool isInclude
455  ) const;
456  // </group>
457 
458  // <group>
459  // Create a vector of unsorted arrays, one array for each bin defined by <src>includeLimits</src>.
460  // <src>includeLimits</src> should be non-overlapping and should be given in ascending order (the
461  // algorithm used assumes this). Once the sum of the lengths of all arrays equals <src>maxCount</src>
462  // the method will return with no further processing.
463  // no weights, no mask, no ranges
464  virtual void _populateArrays(
465  vector<vector<AccumType> >& arys, uInt& currentCount, const DataIterator& dataBegin, Int64 nr, uInt dataStride,
466  const vector<std::pair<AccumType, AccumType> > &includeLimits, uInt maxCount
467  ) const;
468 
469  // ranges
470  virtual void _populateArrays(
471  vector<vector<AccumType> >& arys, uInt& currentCount, const DataIterator& dataBegin, Int64 nr,
472  uInt dataStride, const DataRanges& ranges, Bool isInclude,
473  const vector<std::pair<AccumType, AccumType> > &includeLimits, uInt maxCount
474  ) const;
475 
476  virtual void _populateArrays(
477  vector<vector<AccumType> >& arys, uInt& currentCount, const DataIterator& dataBegin,
478  Int64 nr, uInt dataStride, const MaskIterator& maskBegin,
479  uInt maskStride,
480  const vector<std::pair<AccumType, AccumType> > &includeLimits, uInt maxCount
481  ) const;
482 
483  // mask and ranges
484  virtual void _populateArrays(
485  vector<vector<AccumType> >& arys, uInt& currentCount, const DataIterator& dataBegin, Int64 nr,
486  uInt dataStride, const MaskIterator& maskBegin, uInt maskStride,
487  const DataRanges& ranges, Bool isInclude,
488  const vector<std::pair<AccumType, AccumType> > &includeLimits, uInt maxCount
489  ) const;
490 
491  // weights
492  virtual void _populateArrays(
493  vector<vector<AccumType> >& arys, uInt& currentCount, const DataIterator& dataBegin,
494  const WeightsIterator& weightsBegin, Int64 nr, uInt dataStride,
495  const vector<std::pair<AccumType, AccumType> > &includeLimits, uInt maxCount
496  ) const;
497 
498  // weights and ranges
499  virtual void _populateArrays(
500  vector<vector<AccumType> >& arys, uInt& currentCount, const DataIterator& dataBegin,
501  const WeightsIterator& weightsBegin, Int64 nr, uInt dataStride,
502  const DataRanges& ranges, Bool isInclude,
503  const vector<std::pair<AccumType, AccumType> > &includeLimits, uInt maxCount
504  ) const;
505 
506  // weights and mask
507  virtual void _populateArrays(
508  vector<vector<AccumType> >& arys, uInt& currentCount, const DataIterator& dataBegin,
509  const WeightsIterator& weightBegin, Int64 nr, uInt dataStride,
510  const MaskIterator& maskBegin, uInt maskStride,
511  const vector<std::pair<AccumType, AccumType> > &includeLimits, uInt maxCount
512  ) const;
513 
514  // weights, mask, ranges
515  virtual void _populateArrays(
516  vector<vector<AccumType> >& arys, uInt& currentCount, const DataIterator& dataBegin, const WeightsIterator& weightBegin,
517  Int64 nr, uInt dataStride, const MaskIterator& maskBegin, uInt maskStride,
518  const DataRanges& ranges, Bool isInclude,
519  const vector<std::pair<AccumType, AccumType> > &includeLimits, uInt maxCount
520  ) const;
521  // </group>
522 
523  // <group>
524  // no weights, no mask, no ranges
525  virtual Bool _populateTestArray(
526  vector<AccumType>& ary, const DataIterator& dataBegin,
527  Int64 nr, uInt dataStride, uInt maxElements
528  ) const;
529 
530  // ranges
531  virtual Bool _populateTestArray(
532  vector<AccumType>& ary, const DataIterator& dataBegin, Int64 nr,
533  uInt dataStride, const DataRanges& ranges, Bool isInclude,
534  uInt maxElements
535  ) const;
536 
537  // mask
538  virtual Bool _populateTestArray(
539  vector<AccumType>& ary, const DataIterator& dataBegin,
540  Int64 nr, uInt dataStride, const MaskIterator& maskBegin,
541  uInt maskStride, uInt maxElements
542  ) const;
543 
544  // mask and ranges
545  virtual Bool _populateTestArray(
546  vector<AccumType>& ary, const DataIterator& dataBegin, Int64 nr,
547  uInt dataStride, const MaskIterator& maskBegin, uInt maskStride,
548  const DataRanges& ranges, Bool isInclude, uInt maxElements
549  ) const;
550 
551  // weights
552  virtual Bool _populateTestArray(
553  vector<AccumType>& ary, const DataIterator& dataBegin,
554  const WeightsIterator& weightBegin, Int64 nr, uInt dataStride,
555  uInt maxElements
556  ) const;
557 
558  // weights and ranges
559  virtual Bool _populateTestArray(
560  vector<AccumType>& ary, const DataIterator& dataBegin,
561  const WeightsIterator& weightsBegin, Int64 nr, uInt dataStride,
562  const DataRanges& ranges, Bool isInclude, uInt maxElements
563  ) const;
564 
565  // weights and mask
566  virtual Bool _populateTestArray(
567  vector<AccumType>& ary, const DataIterator& dataBegin,
568  const WeightsIterator& weightBegin, Int64 nr,
569  uInt dataStride, const MaskIterator& maskBegin,
570  uInt maskStride, uInt maxElements
571  ) const;
572 
573  // weights, mask, ranges
574  virtual Bool _populateTestArray(
575  vector<AccumType>& ary, const DataIterator& dataBegin, const WeightsIterator& weightBegin,
576  Int64 nr, uInt dataStride, const MaskIterator& maskBegin, uInt maskStride,
577  const DataRanges& ranges, Bool isInclude,
578  uInt maxElements
579  ) const;
580  // </group>
581 
582  // <group>
583  // no weights, no mask, no ranges
584  virtual void _unweightedStats(
585  uInt64& ngood, AccumType& mymin, AccumType& mymax,
586  Int64& minpos, Int64& maxpos,
587  const DataIterator& dataBegin, Int64 nr, uInt dataStride
588  );
589 
590  // no weights, no mask
591  virtual void _unweightedStats(
592  uInt64& ngood, AccumType& mymin, AccumType& mymax,
593  Int64& minpos, Int64& maxpos,
594  const DataIterator& dataBegin, Int64 nr, uInt dataStride,
595  const DataRanges& ranges, Bool isInclude
596 
597  );
598 
599  virtual void _unweightedStats(
600  uInt64& ngood, AccumType& mymin, AccumType& mymax,
601  Int64& minpos, Int64& maxpos,
602  const DataIterator& dataBegin, Int64 nr, uInt dataStride,
603  const MaskIterator& maskBegin, uInt maskStride
604 
605  );
606 
607  virtual void _unweightedStats(
608  uInt64& ngood, AccumType& mymin, AccumType& mymax,
609  Int64& minpos, Int64& maxpos,
610  const DataIterator& dataBegin, Int64 nr, uInt dataStride,
611  const MaskIterator& maskBegin, uInt maskStride,
612  const DataRanges& ranges, Bool isInclude
613  );
614  // </group>
615 
616  // <group>
617  // has weights, but no mask, no ranges
618  virtual void _weightedStats(
619  AccumType& mymin, AccumType& mymax,
620  Int64& minpos, Int64& maxpos,
621  const DataIterator& dataBegin, const WeightsIterator& weightsBegin,
622  Int64 nr, uInt dataStride
623  );
624 
625  virtual void _weightedStats(
626  AccumType& mymin, AccumType& mymax,
627  Int64& minpos, Int64& maxpos,
628  const DataIterator& dataBegin, const WeightsIterator& weightsBegin,
629  Int64 nr, uInt dataStride, const DataRanges& ranges, Bool isInclude
630  );
631 
632  virtual void _weightedStats(
633  AccumType& mymin, AccumType& mymax,
634  Int64& minpos, Int64& maxpos,
635  const DataIterator& dataBegin, const WeightsIterator& weightBegin,
636  Int64 nr, uInt dataStride, const MaskIterator& maskBegin, uInt maskStride
637  );
638 
639  virtual void _weightedStats(
640  AccumType& mymin, AccumType& mymax,
641  Int64& minpos, Int64& maxpos,
642  const DataIterator& dataBegin, const WeightsIterator& weightBegin,
643  Int64 nr, uInt dataStride, const MaskIterator& maskBegin, uInt maskStride,
644  const DataRanges& ranges, Bool isInclude
645  );
646  // </group>
647 
648 
649 private:
653 
654  // mutables, used to mitigate repeated code
655  mutable typename vector<DataIterator>::const_iterator _dend, _diter;
656  mutable vector<Int64>::const_iterator _citer;
657  mutable vector<uInt>::const_iterator _dsiter;
658  mutable std::map<uInt, MaskIterator> _masks;
659  mutable uInt _maskStride;
660  mutable std::map<uInt, WeightsIterator> _weights;
661  mutable std::map<uInt, DataRanges> _ranges;
662  mutable std::map<uInt, Bool> _isIncludeRanges;
665  mutable MaskIterator _myMask;
666  mutable DataIterator _myData;
667  mutable WeightsIterator _myWeights;
669  mutable uInt64 _myCount;
670 
671  // tally the number of data points that fall into each bin provided by <src>binDesc</src>
672  // Any points that are less than binDesc.minLimit or greater than
673  // binDesc.minLimit + binDesc.nBins*binDesc.binWidth are not included in the counts. A data
674  // point that falls exactly on a bin boundary is considered to be in the higher index bin.
675  // <src>sameVal</src> will be non-null if all the good values in the histogram range are the
676  // same. In that case, the value held will be the value of each of those data points.
677  vector<vector<uInt64> > _binCounts(
678  vector<CountedPtr<AccumType> >& sameVal,
679  const vector<typename StatisticsUtilities<AccumType>::BinDesc>& binDesc
680  );
681 
682  // convert in place by taking the absolute value of the difference of the vector and the median
683  static void _convertToAbsDevMedArray(vector<AccumType>& myArray, AccumType median);
684 
685  // Create an unsorted array of the complete data set. If <src>includeLimits</src> is specified,
686  // only points within those limits (including min but excluding max, as per definition of bins),
687  // are included.
688  void _createDataArray(
689  vector<AccumType>& array
690  );
691 
692  void _createDataArrays(
693  vector<vector<AccumType> >& arrays,
694  const vector<std::pair<AccumType, AccumType> > &includeLimits,
695  uInt maxCount
696  );
697  // extract data from multiple histograms given by <src>binDesc</src>.
698  // <src>dataIndices</src> represent the indices of the sorted arrays of values to
699  // extract. There should be exactly one set of data indices to extract for each
700  // supplied histogram. The data indices are relative to the minimum value of the minimum
701  // bin in their repsective histograms. The ordering of the maps in the returned vector represent
702  // the ordering of histograms in <src>binDesc</src>. <src>binDesc</src> should contain
703  // non-overlapping histograms and the histograms should be specified in ascending order.
704  vector<std::map<uInt64, AccumType> > _dataFromMultipleBins(
705  const vector<typename StatisticsUtilities<AccumType>::BinDesc>& binDesc, uInt maxArraySize,
706  const vector<std::set<uInt64> >& dataIndices, uInt64 nBins
707  );
708 
709  vector<std::map<uInt64, AccumType> > _dataFromSingleBins(
710  const vector<uInt64>& binNpts, uInt maxArraySize,
711  const vector<std::pair<AccumType, AccumType> >& binLimits,
712  const vector<std::set<uInt64> >& dataIndices, uInt64 nBins
713  );
714 
715  Int64 _doNpts();
716 
717  // get the values for the specified indices in the sorted array of all good data
718  std::map<uInt64, AccumType> _indicesToValues(
719  CountedPtr<uInt64> knownNpts, CountedPtr<AccumType> knownMin,
720  CountedPtr<AccumType> knownMax, uInt maxArraySize,
721  const std::set<uInt64>& dataIndices, Bool persistSortedArray,
722  uInt64 nBins
723  );
724 
725  void _initIterators();
726 
727  void _initLoopVars();
728 
729  // Determine by scanning the dataset if the number of good points is smaller than
730  // <src>maxArraySize</src>. If so, <src>arrayToSort</src> will contain the unsorted
731  // data values. If not, this vector will be empty.
732  Bool _isNptsSmallerThan(vector<AccumType>& arrayToSort, uInt maxArraySize);
733 
734  // If <src>allowPad</src> is True, then pad the lower side of the lowest bin and the
735  // higher side of the highest bin so that minData and maxData do not fall on the edge
736  // of their respective bins. If false, no padding so that minData and maxData are also
737  // exactly the histogram abscissa limits.
738  static void _makeBins(
739  typename StatisticsUtilities<AccumType>::BinDesc& bins, AccumType minData, AccumType maxData, uInt maxBins,
740  Bool allowPad
741  );
742 
743  // If input set has one value, that is the median, if it has two, the median is the average
744  // of those.
745  // static AccumType _medianFromSet(const std::map<uInt64, AccumType>& values);
746 
747  // get the index (for odd npts) or indices (for even npts) of the median of the sorted array.
748  // If knownNpts is not null, it will be used and must be correct. If it is null, the value of
749  // _npts will be used if it has been previously calculated. If not, the data sets will
750  // be scanned to determine npts.
751  std::set<uInt64> _medianIndices(CountedPtr<uInt64> knownNpts);
752 
753  // update min and max if necessary
754  virtual void _updateMaxMin(
755  AccumType mymin, AccumType mymax, Int64 minpos,
756  Int64 maxpos, uInt dataStride, const Int64& currentDataset
757  );
758 
759  // get values from sorted array if the array is small enough to be held in
760  // memory. Note that this is the array containing all good data, not data in
761  // just a single bin representing a subset of good data.
762  // Returns True if the data were successfully retrieved.
763  // If True is returned, the values map will contain a map of index to value.
765  std::map<uInt64, AccumType>& values, CountedPtr<uInt64> knownNpts,
766  const std::set<uInt64>& indices, uInt maxArraySize,
767  Bool persistSortedArray
768  );
769 };
770 
771 }
772 
773 #ifndef CASACORE_NO_AUTO_TEMPLATES
774 #include <casacore/scimath/Mathematics/ClassicalStatistics.tcc>
775 #endif //# CASACORE_NO_AUTO_TEMPLATES
776 
777 #endif
void _doMinMax(AccumType &vmin, AccumType &vmax)
scan dataset(s) to find min and max
vector< DataIterator >::const_iterator _dend
mutables, used to mitigate repeated code
long long Int64
Define the extra non-standard types used by Casacore (like proposed uSize, Size)
Definition: aipsxtype.h:38
virtual void _updateMaxMin(AccumType mymin, AccumType mymax, Int64 minpos, Int64 maxpos, uInt dataStride, const Int64 &currentDataset)
update min and max if necessary
LatticeExprNode median(const LatticeExprNode &expr)
AccumType _getStatistic(StatisticsData::STATS stat)
ClassicalStatistics< AccumType, DataIterator, MaskIterator, WeightsIterator > & operator=(const ClassicalStatistics< AccumType, DataIterator, MaskIterator, WeightsIterator > &other)
copy semantics
vector< DataIterator >::const_iterator _diter
virtual void _minMax(CountedPtr< AccumType > &mymin, CountedPtr< AccumType > &mymax, const DataIterator &dataBegin, Int64 nr, uInt dataStride) const
StatsData< AccumType > _getStatistics()
virtual StatsData< AccumType > & _getStatsData()
retreive stats structure.
TableExprNode array(const TableExprNode &values, const TableExprNodeSet &shape)
Create an array of the given shape and fill it with the values.
Definition: ExprNode.h:1986
void _createDataArray(vector< AccumType > &array)
Create an unsorted array of the complete data set.
virtual void _weightedStats(AccumType &mymin, AccumType &mymax, Int64 &minpos, Int64 &maxpos, const DataIterator &dataBegin, const WeightsIterator &weightsBegin, Int64 nr, uInt dataStride)
has weights, but no mask, no ranges
unsigned long long uInt64
Definition: aipsxtype.h:39
std::set< uInt64 > _medianIndices(CountedPtr< uInt64 > knownNpts)
If input set has one value, that is the median, if it has two, the median is the average of those...
virtual std::pair< Int64, Int64 > getStatisticIndex(StatisticsData::STATS stat)
see base class description
std::map< uInt, DataRanges > _ranges
void setStatsToCalculate(std::set< StatisticsData::STATS > &stats)
Provide guidance to algorithms by specifying a priori which statistics the caller would like calculat...
Abstract base class which defines interface for providing "datasets" to the statistics framework when...
Class to calculate statistics in a "classical" sense, ie using accumulators with no special filtering...
virtual uInt64 getNPts()
scan the dataset(s) that have been added, and find the number of good points.
virtual void getMinMax(AccumType &mymin, AccumType &mymax)
scan the dataset(s) that have been added, and find the min and max.
vector< vector< uInt64 > > _binCounts(vector< CountedPtr< AccumType > > &sameVal, const vector< typename StatisticsUtilities< AccumType >::BinDesc > &binDesc)
tally the number of data points that fall into each bin provided by binDesc Any points that are less ...
std::map< uInt, WeightsIterator > _weights
virtual void _unweightedStats(uInt64 &ngood, AccumType &mymin, AccumType &mymax, Int64 &minpos, Int64 &maxpos, const DataIterator &dataBegin, Int64 nr, uInt dataStride)
no weights, no mask, no ranges
ALGORITHM
implemented algorithms
virtual const StatsData< AccumType > & _getStatsData() const
void _addData()
Allows derived classes to do things after data is set or added.
Referenced counted pointer for constant data.
Definition: CountedPtr.h:86
virtual void _accumNpts(uInt64 &npts, const DataIterator &dataBegin, Int64 nr, uInt dataStride) const
scan through the data set to determine the number of good (unmasked, weight > 0, within range) points...
virtual AccumType getMedianAbsDevMed(CountedPtr< uInt64 > knownNpts=NULL, CountedPtr< AccumType > knownMin=NULL, CountedPtr< AccumType > knownMax=NULL, uInt binningThreshholdSizeBytes=4096 *4096, Bool persistSortedArray=False, uInt64 nBins=10000)
get the median of the absolute deviation about the median of the data.
std::map< uInt, Bool > _isIncludeRanges
virtual void _findBins(vector< vector< uInt64 > > &binCounts, vector< CountedPtr< AccumType > > &sameVal, vector< Bool > &allSame, const DataIterator &dataBegin, Int64 nr, uInt dataStride, const vector< typename StatisticsUtilities< AccumType >::BinDesc > &binDesc, const vector< AccumType > &maxLimit) const
Get the counts of data within the specified histogram bins.
static void _convertToAbsDevMedArray(vector< AccumType > &myArray, AccumType median)
convert in place by taking the absolute value of the difference of the vector and the median ...
void _createDataArrays(vector< vector< AccumType > > &arrays, const vector< std::pair< AccumType, AccumType > > &includeLimits, uInt maxCount)
#define DataRanges
Commonly used types in statistics framework.
bool Bool
Define the standard types used by Casacore.
Definition: aipstype.h:39
virtual Bool _populateTestArray(vector< AccumType > &ary, const DataIterator &dataBegin, Int64 nr, uInt dataStride, uInt maxElements) const
no weights, no mask, no ranges
virtual void setCalculateAsAdded(Bool c)
Should statistics be updated with calls to addData or should they only be calculated upon calls to ge...
void setDataProvider(StatsDataProvider< AccumType, DataIterator, MaskIterator, WeightsIterator > *dataProvider)
An exception will be thrown if setCalculateAsAdded(True) has been called.
virtual AccumType getMedianAndQuantiles(std::map< Double, AccumType > &quantiles, const std::set< Double > &fractions, CountedPtr< uInt64 > knownNpts=NULL, CountedPtr< AccumType > knownMin=NULL, CountedPtr< AccumType > knownMax=NULL, uInt binningThreshholdSizeBytes=4096 *4096, Bool persistSortedArray=False, uInt64 nBins=10000)
If one needs to compute both the median and quantile values, it is better to call getMedianAndQuantil...
virtual std::map< Double, AccumType > getQuantiles(const std::set< Double > &fractions, CountedPtr< uInt64 > knownNpts=NULL, CountedPtr< AccumType > knownMin=NULL, CountedPtr< AccumType > knownMax=NULL, uInt binningThreshholdSizeBytes=4096 *4096, Bool persistSortedArray=False, uInt64 nBins=10000)
Get the specified quantiles.
std::map< uInt, MaskIterator > _masks
static void _makeBins(typename StatisticsUtilities< AccumType >::BinDesc &bins, AccumType minData, AccumType maxData, uInt maxBins, Bool allowPad)
If allowPad is True, then pad the lower side of the lowest bin and the higher side of the highest bin...
vector< uInt >::const_iterator _dsiter
const Bool False
Definition: aipstype.h:41
void _accumulate(AccumType &mymin, AccumType &mymax, Int64 &minpos, Int64 &maxpos, const AccumType &datum, Int64 count)
vector< std::map< uInt64, AccumType > > _dataFromMultipleBins(const vector< typename StatisticsUtilities< AccumType >::BinDesc > &binDesc, uInt maxArraySize, const vector< std::set< uInt64 > > &dataIndices, uInt64 nBins)
extract data from multiple histograms given by binDesc.
StatsData< AccumType > _statsData
virtual void reset()
reset object to initial state.
virtual StatisticsData::ALGORITHM algorithm() const
get the algorithm that this object uses for computing stats
vector< Int64 >::const_iterator _citer
const Double c
Fundamental physical constants (SI units):
virtual AccumType getMedian(CountedPtr< uInt64 > knownNpts=NULL, CountedPtr< AccumType > knownMin=NULL, CountedPtr< AccumType > knownMax=NULL, uInt binningThreshholdSizeBytes=4096 *4096, Bool persistSortedArray=False, uInt64 nBins=10000)
In the following group of methods, if the size of the composite dataset is smaller than binningThresh...
std::map< uInt64, AccumType > _indicesToValues(CountedPtr< uInt64 > knownNpts, CountedPtr< AccumType > knownMin, CountedPtr< AccumType > knownMax, uInt maxArraySize, const std::set< uInt64 > &dataIndices, Bool persistSortedArray, uInt64 nBins)
get the values for the specified indices in the sorted array of all good data
Bool _isNptsSmallerThan(vector< AccumType > &arrayToSort, uInt maxArraySize)
Determine by scanning the dataset if the number of good points is smaller than maxArraySize.
Bool _valuesFromSortedArray(std::map< uInt64, AccumType > &values, CountedPtr< uInt64 > knownNpts, const std::set< uInt64 > &indices, uInt maxArraySize, Bool persistSortedArray)
get values from sorted array if the array is small enough to be held in memory.
virtual void _populateArray(vector< AccumType > &ary, const DataIterator &dataBegin, Int64 nr, uInt dataStride) const
populate an unsorted array with valid data.
vector< std::map< uInt64, AccumType > > _dataFromSingleBins(const vector< uInt64 > &binNpts, uInt maxArraySize, const vector< std::pair< AccumType, AccumType > > &binLimits, const vector< std::set< uInt64 > > &dataIndices, uInt64 nBins)
Base class of statistics algorithm class hierarchy.
this file contains all the compiler specific defines
Definition: mainpage.dox:28
virtual void _populateArrays(vector< vector< AccumType > > &arys, uInt &currentCount, const DataIterator &dataBegin, Int64 nr, uInt dataStride, const vector< std::pair< AccumType, AccumType > > &includeLimits, uInt maxCount) const
Create a vector of unsorted arrays, one array for each bin defined by includeLimits.
unsigned int uInt
Definition: aipstype.h:48
description of a regularly spaced bins with the first bin having lower limit of minLimit and having n...