1 """module for performing statistical calculations.
2
3 (c) 2007-2012 Matt Hilton
4
5 (c) 2013-2014 Matt Hilton & Steven Boada
6
7 U{http://astlib.sourceforge.net}
8
9 This module (as you may notice) provides very few statistical routines. It
10 does, however, provide biweight (robust) estimators of location and scale, as
11 described in Beers et al. 1990 (AJ, 100, 32), in addition to a robust least
12 squares fitting routine that uses the biweight transform.
13
14 Some routines may fail if they are passed lists with few items and encounter a
15 `divide by zero' error. Where this occurs, the function will return None. An
16 error message will be printed to the console when this happens if
17 astStats.REPORT_ERRORS=True (the default). Testing if an astStats function
18 returns None can be used to handle errors in scripts.
19
20 For extensive statistics modules, the Python bindings for GNU R
21 (U{http://rpy.sourceforge.net}), or SciPy (U{http://www.scipy.org}) are
22 suggested.
23
24 """
25
26 import math
27 import numpy
28 import collections
29
30 REPORT_ERRORS = True
31
32
33
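# Illustrative example (not part of the original module): the error-handling
# convention described in the module docstring is to test whether a routine
# returned None. 'someShortList' below is a hypothetical variable name.
#
#   scale = biweightScale(someShortList, tuningConstant=9.0)
#   if scale is None:
#       scale = stdev(someShortList)    # fall back to a non-robust estimate
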
35 """Calculates the mean average of a list of numbers.
36
37 @type dataList: list or numpy array
38 @param dataList: input data, must be a one dimensional list
39 @rtype: float
40 @return: mean average
41
42 """
43 return numpy.mean(dataList)
44
45
46
48 """Calculates the weighted mean average of a two dimensional list (value,
49 weight) of numbers.
50
51 @type dataList: list
52 @param dataList: input data, must be a two dimensional list in format
53 [value, weight]
54 @rtype: float
55 @return: weighted mean average
56
57 """
58
59 dataList = numpy.asarray(dataList)
60 mean = numpy.average(dataList[:, 0], weights=dataList[:, 1])
61
62 return mean
63
64
65
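# Illustrative example (not part of the original module): weightedMean()
# expects [value, weight] rows, e.g.
#
#   >>> weightedMean([[1.0, 1.0], [2.0, 1.0], [3.0, 2.0]])
#   2.25
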
67 """Calculates the (sample) standard deviation of a list of numbers.
68
69 @type dataList: list or numpy array
70 @param dataList: input data, must be a one dimensional list
71 @rtype: float
72 @return: standard deviation
73
74 """
75 return numpy.std(dataList)
76
77
78
80 """Calculates the root mean square of a list of numbers.
81
82 @type dataList: list
83 @param dataList: input data, must be a one dimensional list
84 @rtype: float
85 @return: root mean square
86
87 """
88
89
90
91 dataListSq = [math.pow(item, 2) for item in dataList]
92 listMeanSq = mean(dataListSq)
93 rms = math.sqrt(listMeanSq)
94
95 return rms
96
97
98
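# Illustrative example (not part of the original module): stdev() wraps
# numpy.std(), while rms() is the quadratic mean, so for the same input
#
#   >>> stdev([1.0, 2.0, 3.0, 4.0])    # approximately 1.118
#   >>> rms([1.0, 2.0, 3.0, 4.0])      # sqrt(7.5), approximately 2.739
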
100 """Calculates the weighted (sample) standard deviation of a list of
101 numbers.
102
103 @type dataList: list
104 @param dataList: input data, must be a two dimensional list in format
105 [value, weight]
106 @rtype: float
107 @return: weighted standard deviation
108
109 @note: Returns None if an error occurs.
110
111 """
112
113 dataList = numpy.asarray(dataList)
114 if dataList.shape[0] < 1:
115 if REPORT_ERRORS:
116 print("ERROR: astStats.weightedStdev() : "
117 "dataList contains < 2 items. ")
118 return None
119
120 listMean = weightedMean(dataList[:, 0], weights=dataList[:, 1])
121 variance = numpy.average((dataList[:, 0] - listMean)**2,
122 weights=dataList[:, 1])
123
124 return numpy.sqrt(variance)
125
126
127
138
139
140
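# NOTE: a median() helper is called by modeEstimate(), biweightScale() and
# other routines below, but its original definition was lost from this
# listing. The following minimal stand-in (a thin wrapper around numpy.median,
# mirroring mean() and stdev() above) is an assumption, not the author's
# original code.
def median(dataList):
    """Calculates the median of a list of numbers.

    @type dataList: list or numpy array
    @param dataList: input data, must be a one dimensional list
    @rtype: float
    @return: median

    """
    return numpy.median(dataList)
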
142 """Returns an estimate of the mode of a set of values by
143 mode=(3*median)-(2*mean).
144
145 @type dataList: list
146 @param dataList: input data, must be a one dimensional list
147 @rtype: float
148 @return: estimate of mode average
149
150 """
151 mode = (3 * median(dataList)) - (2 * mean(dataList))
152
153 return mode
154
155
156
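# Illustrative example (not part of the original module): for a list with
# median 2.0 and mean 2.0 the estimator gives 3*2.0 - 2*2.0 = 2.0, e.g.
#
#   >>> modeEstimate([1.0, 2.0, 2.0, 2.0, 3.0])
#   2.0
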
158 """Calculates the Median Absolute Deviation of a list of numbers.
159
160 @type dataList: list
161 @param dataList: input data, must be a one dimensional list
162 @rtype: float
163 @return: median absolute deviation
164
165 """
166
167
168
169
170
171
172
173
174
175
176
177 med = numpy.median(dataList)
178 try:
179 return numpy.median(numpy.fabs(dataList - med))
180 except TypeError:
181 dataList = numpy.array(dataList)
182 return numpy.median(numpy.fabs(dataList - med))
183
184
185
187 """ Calculates the normalized Median Absolute Deviation of a list of
188 numbers which, for a Gaussian distribution, is related to the standard
189 deviation by 1.4826.
190
191 @type dataList: list
192 @param dataList: input data, must be a one dimensional list
193 @rtype: float
194 @return: normalized median absolute deviation
195
196 """
197
198 return 1.4826 * MAD(dataList)
199
200
201
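# Illustrative example (not part of the original module): MAD() is insensitive
# to the outlier below, and normMAD() rescales it to be comparable to a
# Gaussian standard deviation.
#
#   >>> MAD([1.0, 2.0, 3.0, 4.0, 100.0])
#   1.0
#   >>> normMAD([1.0, 2.0, 3.0, 4.0, 100.0])
#   1.4826
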
203 """Calculates the biweight location estimator (like a robust average) of a
204 list of numbers.
205
206 @type dataList: list
207 @param dataList: input data, must be a one dimensional list
208 @type tuningConstant: float
209 @param tuningConstant: 6.0 is recommended.
210 @rtype: float
211 @return: biweight location
212
213 @note: Returns None if an error occurs.
214
215 """
216
217 if not len(dataList):
218 return numpy.nan
219
220 C = tuningConstant
221
222 listMedian = numpy.median(dataList)
223 listMAD = MAD(dataList)
224 if listMAD != 0:
225 uValues = []
226 for item in dataList:
227 uValues.append((item - listMedian) / (C * listMAD))
228
229 top = 0
230 bottom = 0
231 for i in range(len(uValues)):
232 if math.fabs(uValues[i]) <= 1.0:
233 top = top + ((dataList[i] - listMedian) *
234 (1.0 - (uValues[i] * uValues[i])) *
235 (1.0 - (uValues[i] * uValues[i])))
236
237 bottom = bottom + ((1.0 - (uValues[i] * uValues[i])) *
238 (1.0 - (uValues[i] * uValues[i])))
239
240 CBI = listMedian + (top / bottom)
241
242 else:
243 if REPORT_ERRORS:
244 print("ERROR: astStats: biweightLocation() : MAD() returned 0.")
245 return None
246
247 return CBI
248
249
250
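# Illustrative example (not part of the original module): the biweight
# location down-weights the outlier, so the result stays close to 10 rather
# than being dragged upwards as the plain mean would be.
#
#   >>> data = [10.1, 9.9, 10.0, 10.2, 9.8, 50.0]
#   >>> biweightLocation(data, tuningConstant=6.0)    # approximately 10.0
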
252 """Calculates the biweight scale estimator (like a robust standard
253 deviation) of a list of numbers.
254
255 @type dataList: list
256 @param dataList: input data, must be a one dimensional list
257 @type tuningConstant: float
258 @param tuningConstant: 9.0 is recommended.
259 @rtype: float
260 @return: biweight scale
261
262 @note: Returns None if an error occurs.
263
264 """
265 C = tuningConstant
266
267
268 listMedian = median(dataList)
269 listMAD = MAD(dataList)
270 diffModuli = []
271 for item in dataList:
272 diffModuli.append(math.fabs(item - listMedian))
273 uValues = []
274 for item in dataList:
275 try:
276 uValues.append((item - listMedian) / (C * listMAD))
277 except ZeroDivisionError:
278 if REPORT_ERRORS:
279 print("ERROR: astStats.biweightScale(): divide by zero error.")
280 return None
281
282 top = 0
283 bottom = 0
284 valCount = 0
285
286 for i in range(len(uValues)):
287
288 if math.fabs(uValues[i]) <= 1.0:
289 u2Term = 1.0 - math.pow(uValues[i], 2)
290 u4Term = math.pow(u2Term, 4)
291 top += math.pow(diffModuli[i], 2) * u4Term
292 bottom += (u2Term * (1.0 - (5.0 * math.pow(uValues[i], 2))))
293 valCount += 1
294
295 top = math.sqrt(top)
296 bottom = math.fabs(bottom)
297
298 SBI = math.pow(float(valCount), 0.5) * (top / bottom)
299 return SBI
300
301
302
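# Illustrative example (not part of the original module): used together with
# biweightLocation() to get a robust centre and spread for the same sample.
#
#   >>> data = [10.1, 9.9, 10.0, 10.2, 9.8, 50.0]
#   >>> biweightScale(data, tuningConstant=9.0)   # roughly 0.15, outlier ignored
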
304 """Calculates the biweight scale estimator (like a robust standard
305 deviation) of a list of numbers.
306
307 @type dataList: list
308 @param dataList: input data, must be a one dimensional list
309 @type tuningConstant: float
310 @param tuningConstant: 9.0 is recommended.
311 @rtype: float
312 @return: biweight scale
313
314 @note: Returns None if an error occurs.
315
316 """
317 C = tuningConstant
318
319
320 listMedian = median(dataList)
321 listMAD = MAD(dataList)
322 diffModuli = []
323 for item in dataList:
324 diffModuli.append(math.fabs(item - listMedian))
325 uValues = []
326 for item in dataList:
327 try:
328 uValues.append((item - listMedian) / (C * listMAD))
329 except ZeroDivisionError:
330 if REPORT_ERRORS:
331 print("ERROR: astStats.biweightScale(): divide by zero error.")
332 return None
333
334 top = 0
335 bottom = 0
336 valCount = 0
337
338 for i in range(len(uValues)):
339
340 if math.fabs(uValues[i]) <= 1.0:
341 u2Term = 1.0 - math.pow(uValues[i], 2)
342 u4Term = math.pow(u2Term, 4)
343 top += math.pow(diffModuli[i], 2) * u4Term
344 bottom += (u2Term * (1.0 - (5.0 * math.pow(uValues[i], 2))))
345 valCount += 1
346
347 top = math.sqrt(top)
348
349 bottom = math.sqrt(bottom * (bottom - 1.))
350
351 SBI = math.pow(float(valCount), 0.5) * (top / bottom)
352 return SBI
353
354
355
357 """Iteratively calculates biweight location and scale, using sigma
358 clipping, for a list of values. The calculation is performed on the first
359 column of a multi-dimensional list; other columns are ignored.
360
361 @type dataList: list
362 @param dataList: input data
363 @type tuningConstant: float
364 @param tuningConstant: 6.0 is recommended for location estimates, 9.0 is
365 recommended for scale estimates
366 @type sigmaCut: float
367 @param sigmaCut: sigma clipping to apply
368 @rtype: dictionary
369 @return: estimate of biweight location, scale, and list of non-clipped
370 data, in the format {'biweightLocation', 'biweightScale', 'dataList'}
371
372 @note: Returns None if an error occurs.
373
374 """
375
376 iterations = 0
377 clippedValues = []
378 for row in dataList:
379 if type(row) == list:
380 clippedValues.append(row[0])
381 else:
382 clippedValues.append(row)
383
384 while iterations < 11 and len(clippedValues) > 5:
385
386 cbi = biweightLocation(clippedValues, tuningConstant)
387 sbi = biweightScale(clippedValues, tuningConstant)
388
389
390
391
392 if cbi is None or sbi is None:
393
394 if REPORT_ERRORS:
395 print("ERROR: astStats.biweightClipped(): divide by zero error.")
396
397 return None
398
399 else:
400
401 clippedValues = []
402 clippedData = []
403 for row in dataList:
404 if type(row) == list:
405 if row[0] > cbi - (sigmaCut * sbi) and \
406 row[0] < cbi + (sigmaCut * sbi):
407 clippedValues.append(row[0])
408 clippedData.append(row)
409 else:
410 if row > cbi - (sigmaCut * sbi) and \
411 row < cbi + (sigmaCut * sbi):
412 clippedValues.append(row)
413 clippedData.append(row)
414
415 iterations = iterations + 1
416
417 return {'biweightLocation': cbi,
418 'biweightScale': sbi,
419 'dataList': clippedData}
420
421
422
451
452
453
454
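# NOTE: biweightTransform() is called by biweightLSFit() below, but its
# original definition was lost from this listing. The sketch below is an
# assumption, not the author's code: it computes standard Tukey biweight
# weights (as in Beers et al. 1990) from the median and MAD of the input
# values, and returns [value, weight] pairs, which is the format
# biweightLSFit() expects (it reads the weight from index 1).
def biweightTransform(dataList, tuningConstant):
    """Calculates the biweight transform of a list of numbers, returning a
    list of [value, weight] pairs (minimal reconstruction; see note above).

    @type dataList: list
    @param dataList: input data, must be a one dimensional list
    @type tuningConstant: float
    @param tuningConstant: 6.0 is recommended
    @rtype: list
    @return: list of [value, weight] pairs

    """
    C = tuningConstant
    listMedian = numpy.median(dataList)
    listMAD = MAD(dataList)
    pairs = []
    for item in dataList:
        if listMAD != 0:
            u = (item - listMedian) / (C * listMAD)
        else:
            u = 0.0
        if math.fabs(u) <= 1.0:
            weight = (1.0 - u * u) * (1.0 - u * u)
        else:
            weight = 0.0
        pairs.append([item, weight])
    return pairs
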
456 """ Calculates the Gapper Estimator (like a robust standard deviation) on a
457 list of numbers. Beers et al. 1990 recommends this for small number
458 statistics as it is insensitive to outliers and more accurately reproduces
459 the true dispersion of the system when compared to the canonical rms
460 standard deviation. See Hou et al. 2009 for a comparison.
461
462 @type dataList: list
463 @param dataList: input data, must be a one dimensional list
464 @rtype: float
465 @return: The dispersion of the dataList
466
467 """
468
469
470 dataList = numpy.copy(dataList)
471
472
473 dataList.sort()
474
475
476 diff = numpy.diff(dataList)
477
478
479 weightedDiff = [i * (len(dataList) - i) * g for i, g in enumerate(diff)]
480
481 return math.sqrt(math.pi) / (len(dataList) * (len(dataList) - 1)) *\
482 sum(weightedDiff)
483
484
485
486
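# Illustrative example (not part of the original module): a typical use is
# estimating the velocity dispersion of a small galaxy group, where the
# sample is too small for the ordinary standard deviation to be reliable.
#
#   >>> velocities = [1500., 1620., 1580., 1470., 1550.]   # km/s
#   >>> gapperEstimator(velocities)   # dispersion estimate for a small sample
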
488 """Performs an ordinary least squares fit on a two dimensional list of
489 numbers. Minimum number of data points is 5.
490
491 @type dataList: list
492 @param dataList: input data, must be a two dimensional list in format [x,y]
493 @rtype: dictionary
494 @return: slope and intercept on y-axis, with associated errors, in the
495 format {'slope', 'intercept', 'slopeError', 'interceptError'}
496
497 @note: Returns None if an error occurs.
498
499 """
500 sumX = 0
501 sumY = 0
502 sumXY = 0
503 sumXX = 0
504 n = float(len(dataList))
505 if n > 2:
506 for item in dataList:
507 sumX = sumX + item[0]
508 sumY = sumY + item[1]
509 sumXY = sumXY + (item[0] * item[1])
510 sumXX = sumXX + (item[0] * item[0])
511 m = ((n * sumXY) - (sumX * sumY)) / ((n * sumXX) - (sumX * sumX))
512 c = ((sumXX * sumY) - (sumX * sumXY)) / ((n * sumXX) - (sumX * sumX))
513
514 sumRes = 0
515 for item in dataList:
516
517 sumRes = sumRes + ((item[1] - (m * item[0]) - c) *
518 (item[1] - (m * item[0]) - c))
519
520 sigma = math.sqrt((1.0 / (n - 2)) * sumRes)
521
522 try:
523 mSigma = (sigma *
524 math.sqrt(n)) / math.sqrt((n * sumXX) - (sumX * sumX))
525 except:
526 mSigma = numpy.nan
527 try:
528 cSigma = (
529 sigma *
530 math.sqrt(sumXX)) / math.sqrt((n * sumXX) - (sumX * sumX))
531 except:
532 cSigma = numpy.nan
533 else:
534 if REPORT_ERRORS:
535 print("ERROR: astStats.OLSFit(): dataList contains < 3 items.")
536
537 return None
538
539 return {'slope': m,
540 'intercept': c,
541 'slopeError': mSigma,
542 'interceptError': cSigma}
543
544
545
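# Illustrative example (not part of the original module): for data lying
# exactly on y = 2x + 1 the fit recovers the slope and intercept (with zero
# formal errors, since the residuals vanish).
#
#   >>> fit = OLSFit([[0., 1.], [1., 3.], [2., 5.], [3., 7.]])
#   >>> fit['slope'], fit['intercept']
#   (2.0, 1.0)
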
547 """Calculates the clipped mean and stdev of a list of numbers.
548
549 @type dataList: list
550 @param dataList: input data, one dimensional list of numbers
551 @type sigmaCut: float
552 @param sigmaCut: clipping in Gaussian sigma to apply
553 @type maxIterations: int
554 @param maxIterations: maximum number of iterations
555 @rtype: dictionary
556 @return: format {'clippedMean', 'clippedStdev', 'numPoints'}
557
558 """
559
560 listCopy = []
561 for d in dataList:
562 listCopy.append(d)
563 listCopy = numpy.array(listCopy)
564
565 iterations = 0
566 while iterations < maxIterations and len(listCopy) > 4:
567
568 m = listCopy.mean()
569 s = listCopy.std()
570
571 listCopy = listCopy[numpy.less(abs(listCopy), abs(m + sigmaCut * s))]
572
573 iterations = iterations + 1
574
575 return {'clippedMean': m,
576 'clippedStdev': s,
577 'numPoints':
578 listCopy.shape[0]}
579
580
581
613
614
615
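# Illustrative example (not part of the original module): points whose
# absolute value exceeds |mean + sigmaCut*stdev| are discarded on each
# iteration, and the mean/stdev of the surviving points are returned.
#
#   >>> result = clippedMeanStdev([10.1, 9.9, 10.0, 10.2, 9.8, 200.0],
#   ...                           sigmaCut=2.0, maxIterations=10)
#   >>> result['clippedMean'], result['numPoints']    # roughly (10.0, 5)
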
617 """Performs a weighted least squares fit on a list of numbers with sigma
618 clipping. Minimum number of data points is 5.
619
620 @type dataList: list
621 @param dataList: input data, must be a three dimensional list in format [x,
622 y, y weight]
623 @rtype: dictionary
624 @return: slope and intercept on y-axis, with associated errors, in the
625 format {'slope', 'intercept', 'slopeError', 'interceptError'}
626
627 @note: Returns None if an error occurs.
628
629 """
630
631 iterations = 0
632 clippedValues = []
633 for row in dataList:
634 clippedValues.append(row)
635
636 while iterations < 11 and len(clippedValues) > 4:
637
638 fitResults = weightedLSFit(clippedValues, "errors")
639
640 if fitResults['slope'] is None:
641
642 if REPORT_ERRORS:
643 print("ERROR: astStats.clippedWeightedLSFit(): "
644 "divide by zero error.")
645
646 return None
647
648 else:
649
650 clippedValues = []
651 for row in dataList:
652
653
654 fit = fitResults['slope'] * row[0] + fitResults['intercept']
655 res = row[1] - fit
656 if abs(res) / row[2] < sigmaCut:
657 clippedValues.append(row)
658
659 iterations = iterations + 1
660
661
662 fitResults['numDataPoints'] = len(clippedValues)
663
664 return fitResults
665
666
667
669 """Performs a weighted least squares fit on a three dimensional list of
670 numbers [x, y, y error].
671
672 @type dataList: list
673 @param dataList: input data, must be a three dimensional list in format [x,
674 y, y error]
675 @type weightType: string
676 @param weightType: if "errors", weights are calculated assuming the input
677 data is in the format [x, y, error on y]; if "weights", the weights are
678 assumed to be already calculated and stored in a fourth column [x, y, error
679 on y, weight] (as used by e.g. L{astStats.biweightLSFit})
680 @rtype: dictionary
681 @return: slope and intercept on y-axis, with associated errors, in the
682 format {'slope', 'intercept', 'slopeError', 'interceptError'}
683
684 @note: Returns None if an error occurs.
685
686 """
687 if weightType == "weights":
688 sumW = 0
689 sumWX = 0
690 sumWY = 0
691 sumWXY = 0
692 sumWXX = 0
693 n = float(len(dataList))
694 if n > 4:
695 for item in dataList:
696 W = item[3]
697 sumWX = sumWX + (W * item[0])
698 sumWY = sumWY + (W * item[1])
699 sumWXY = sumWXY + (W * item[0] * item[1])
700 sumWXX = sumWXX + (W * item[0] * item[0])
701 sumW = sumW + W
702
703
704 try:
705 m = ((sumW * sumWXY) - (sumWX * sumWY)) / ((sumW * sumWXX) -
706 (sumWX * sumWX))
707 except ZeroDivisionError:
708 if REPORT_ERRORS:
709 print("ERROR: astStats.weightedLSFit(): divide by zero error.")
710 return None
711
712 try:
713 c = ((sumWXX * sumWY) - (sumWX * sumWXY)) / ((sumW * sumWXX) -
714 (sumWX * sumWX))
715 except ZeroDivisionError:
716 if REPORT_ERRORS:
717 print("ERROR: astStats.weightedLSFit(): divide by zero error.")
718 return None
719
720 sumRes = 0
721 for item in dataList:
722
723 sumRes = sumRes + ((item[1] - (m * item[0]) - c) * (item[1] -
724 (m * item[0]) - c))
725
726 sigma = math.sqrt((1.0 / (n - 2)) * sumRes)
727
728
729
730 if (n * sumWXX) - (sumWX * sumWX) > 0.0:
731
732 mSigma = (sigma * math.sqrt(n)) / math.sqrt((n * sumWXX) -
733 (sumWX * sumWX))
734
735 cSigma = (sigma * math.sqrt(sumWXX)) / math.sqrt((n * sumWXX) -
736 (sumWX * sumWX))
737
738 else:
739
740 if REPORT_ERRORS:
741 print("ERROR: astStats.weightedLSFit(): "
742 "divide by zero error.")
743 return None
744
745 else:
746 if REPORT_ERRORS:
747 print("ERROR: astStats.weightedLSFit(): "
748 "dataList contains < 5 items.")
749 return None
750
751 elif weightType == "errors":
752 sumX = 0
753 sumY = 0
754 sumXY = 0
755 sumXX = 0
756 sumSigma = 0
757 n = float(len(dataList))
758 for item in dataList:
759 sumX = sumX + (item[0] / (item[2] * item[2]))
760 sumY = sumY + (item[1] / (item[2] * item[2]))
761 sumXY = sumXY + ((item[0] * item[1]) / (item[2] * item[2]))
762 sumXX = sumXX + ((item[0] * item[0]) / (item[2] * item[2]))
763 sumSigma = sumSigma + (1.0 / (item[2] * item[2]))
764 delta = (sumSigma * sumXX) - (sumX * sumX)
765 m = ((sumSigma * sumXY) - (sumX * sumY)) / delta
766 c = ((sumXX * sumY) - (sumX * sumXY)) / delta
767 mSigma = math.sqrt(sumSigma / delta)
768 cSigma = math.sqrt(sumXX / delta)
769
770 return {'slope': m,
771 'intercept': c,
772 'slopeError': mSigma,
773 'interceptError': cSigma}
774
775
776
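# Illustrative example (not part of the original module): with weightType
# "errors" each row is [x, y, error on y], and the fit is inverse-variance
# weighted. clippedWeightedLSFit() wraps the same call with sigma clipping.
#
#   >>> points = [[0., 1.0, 0.1], [1., 3.1, 0.1], [2., 4.9, 0.1],
#   ...           [3., 7.0, 0.1], [4., 9.1, 0.1]]
#   >>> fit = weightedLSFit(points, "errors")
#   >>> fit['slope'], fit['intercept']    # close to (2.0, 1.0)
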
778 """Performs a weighted least squares fit, where the weights used are the
779 biweight transforms of the residuals to the previous best fit .i.e. the
780 procedure is iterative, and converges very quickly (iterations is set to 10
781 by default). Minimum number of data points is 10.
782
783 This seems to give slightly different results to the equivalent R routine,
784 so use at your own risk!
785
786 @type dataList: list
787 @param dataList: input data, must be a three dimensional list in format [x,
788 y, y weight]
789 @type tuningConstant: float
790 @param tuningConstant: 6.0 is recommended for location estimates, 9.0 is
791 recommended for
792 scale estimates
793 @type sigmaCut: float
794 @param sigmaCut: sigma clipping to apply (set to None if not required)
795 @rtype: dictionary
796 @return: slope and intercept on y-axis, with associated errors, in the
797 format {'slope', 'intercept', 'slopeError', 'interceptError'}
798
799 @note: Returns None if an error occurs.
800
801 """
802
803 dataCopy = []
804 for row in dataList:
805 dataCopy.append(row)
806
807
808 results = OLSFit(dataCopy)
809 for k in range(10):
810 m = results['slope']
811 c = results['intercept']
812 res = []
813 for item in dataCopy:
814 res.append((m * item[0] + c) - item[1])
815
816 if len(res) > 5:
817
818
819 if sigmaCut is not None:
820 absRes = []
821 for item in res:
822 absRes.append(abs(item))
823 sigma = stdev(absRes)
824 count = 0
825 for item in absRes:
826 if item > (sigmaCut * sigma) and len(dataCopy) > 2:
827 del dataCopy[count]
828 del res[count]
829
830
831
832
833 count -= 1
834
835 count += 1
836
837
838 weights = biweightTransform(res, tuningConstant)
839
840
841
842 wData = []
843 for i in range(len(dataCopy)):
844 wData.append([dataCopy[i][0], dataCopy[i][1], dataCopy[i][2],
845 weights[i][1]])
846
847 results = weightedLSFit(wData, "weights")
848
849 return results
850
851
852
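# Illustrative example (not part of the original module): the input rows are
# [x, y, y error]; the biweight weighting progressively ignores the one
# discrepant point.
#
#   >>> points = [[float(x), 2.0 * x + 1.0, 0.1] for x in range(10)]
#   >>> points[5][1] += 30.0                       # corrupt one point
#   >>> fit = biweightLSFit(points, tuningConstant=6.0, sigmaCut=None)
#   >>> fit['slope'], fit['intercept']             # close to (2.0, 1.0)
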
854 """Bins the input data cumulatively.
855
856 @param data: input data, must be a one dimensional list
857 @type binMin: float
858 @param binMin: minimum value from which to bin data
859 @type binMax: float
860 @param binMax: maximum value from which to bin data
861 @type binTotal: int
862 @param binTotal: number of bins
863 @rtype: list
864 @return: binned data, in format [bin centre, frequency]
865
866 """
867
868 binStep = float(binMax - binMin) / binTotal
869 bins = []
870 totalItems = len(data)
871 for i in range(binTotal):
872 bins.append(0)
873 for item in data:
874 if item > (binMin + (i * binStep)):
875 bins[i] = bins[i] + 1.0 / totalItems
876
877
878 coords = []
879 for i in range(binTotal):
880 coords.append([binMin + (float(i + 0.5) * binStep), bins[i]])
881
882 return coords
883
884
885
def binner(data, binMin, binMax, binTotal):
    """Bins the input data.

    @param data: input data, must be a one dimensional list
    @type binMin: float
    @param binMin: minimum value from which to bin data
    @type binMax: float
    @param binMax: maximum value from which to bin data
    @type binTotal: int
    @param binTotal: number of bins
    @rtype: list
    @return: binned data, in format [bin centre, frequency]

    """
    binStep = float(binMax - binMin) / binTotal
    bins = []
    for i in range(binTotal):
        bins.append(0)
        for item in data:
            if item > (binMin + (i * binStep)) and \
                    item <= (binMin + ((i + 1) * binStep)):
                bins[i] += 1

    # Store the bin centres and frequencies
    coords = []
    for i in range(binTotal):
        coords.append([binMin + (float(i + 0.5) * binStep), bins[i]])

    return coords


920 """Bins the input data, recorded frequency is sum of weights in bin.
921
922 @param data: input data, must be a one dimensional list
923 @type binMin: float
924 @param binMin: minimum value from which to bin data
925 @type binMax: float
926 @param binMax: maximum value from which to bin data
927 @type binTotal: int
928 @param binTotal: number of bins
929 @rtype: list
930 @return: binned data, in format [bin centre, frequency]
931
932 """
933
934 binStep = float(binMax - binMin) / binTotal
935 bins = []
936 for i in range(binTotal):
937 bins.append(0.0)
938 for item, weight in zip(data, weights):
939 if item > (binMin + (i * binStep)) and item <= (binMin + ((i + 1) *
940 binStep)):
941 bins[i] += weight
942
943
944 coords = []
945 for i in range(binTotal):
946 coords.append([binMin + (float(i + 0.5) * binStep), bins[i]])
947
948 return coords
949
950
951
952
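# Illustrative example (not part of the original module): binner() counts
# items per bin, cumulativeBinner() returns the fraction of items above each
# bin's lower edge, and weightedBinner() sums per-item weights instead of
# counts. Each returns [bin centre, frequency] pairs.
#
#   >>> binner([1.0, 2.0, 3.0, 7.0], binMin=0.0, binMax=10.0, binTotal=5)
#   [[1.0, 2], [3.0, 1], [5.0, 0], [7.0, 1], [9.0, 0]]
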
def bootstrap(data,
              statistic,
              resamples=1000,
              alpha=0.05,
              output='ci',
              **kwargs):
    """Returns the bootstrap estimate of the confidence interval for the given
    statistic. The confidence interval is given by 100*(1-alpha). Passes a 1d
    array to the function, statistic. Any arguments needed by statistic are
    passed by **kwargs.

    @type data: list
    @param data: The data on which the given statistic is calculated
    @type statistic: function
    @param statistic: The statistic desired
    @type resamples: int
    @param resamples: The number of bootstrap resamplings
    @type alpha: float
    @param alpha: The confidence interval given by 100*(1-alpha), 95% default
    @type output: string
    @param output: The format of the output. 'ci' gives the confidence
    interval, and 'errorbar' gives the length of the errorbar suitable for
    plotting with matplotlib.
    @type kwargs: Keywords
    @param kwargs: Arguments needed by the statistic function
    @rtype: tuple
    @return: (Lower Interval, Upper Interval)

    """
    samples = numpy.random.choice(data,
                                  size=(resamples, len(data)),
                                  replace=True)
    stat = numpy.sort([statistic(row, **kwargs) for row in samples])
    if output == 'ci':
        return (stat[int((alpha / 2.0) * resamples)],
                stat[int((1 - alpha / 2.0) * resamples)])
    elif output == 'errorbar':
        return (numpy.abs(statistic(data, **kwargs) -
                numpy.array([stat[int((alpha / 2.0) * resamples)],
                             stat[int((1 - alpha / 2.0) * resamples)]])))
    else:
        raise ValueError("Output option {0} is not supported.".format(output))


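# Illustrative example (not part of the original module): a 95% bootstrap
# confidence interval on the mean of a sample.
#
#   >>> data = numpy.random.normal(10.0, 2.0, 50)
#   >>> bootstrap(data, numpy.mean, resamples=1000, alpha=0.05)
#   # returns (lower, upper), bracketing roughly 10
#   >>> bootstrap(data, numpy.mean, output='errorbar')
#   # returns the +/- errorbar lengths for matplotlib's errorbar()
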
999 """ Calculates the value given by statistic in bins of x. Useful for
1000 plotting a running mean value for a scatter plot, for example. This
1001 function allows the computation of the sum, mean, median, std, or other
1002 statistic of the values within each bin.
1003
1004 NOTE: if the statistic is a callable function and there are empty data bins
1005 those bins will be skipped to keep the function from falling over.
1006
1007 @type x: numpy array
1008 @param x: data over which the bins are calculated
1009 @type y: numpy array
1010 @param y: values for corresponding x values
1011 @type statistic: string or function
1012 @param statistic: The statistic to compute (default is 'mean'). Acceptable
1013 values are 'mean', 'median', 'sum', 'std', and callable function. Extra
1014 arguements are passed as kwargs.
1015 @type binNumber: int
1016 @param binNumber: The desired number of bins for the x data.
1017 @rtype: tuple
1018 @return: A tuple of two lists containing the left bin edges and the value
1019 of the statistic in each of the bins.
1020
1021 """
1022
1023 if type(statistic) == str:
1024 if statistic not in ['mean', 'median', 'sum', 'std']:
1025 raise ValueError('unrecognized statistic "%s"' % statistic)
1026 elif isinstance(statistic, collections.Callable):
1027 pass
1028 else:
1029 raise ValueError("statistic not understood")
1030
1031 if not isinstance(x, numpy.ndarray):
1032 x = numpy.asarray(x)
1033 if not isinstance(y, numpy.ndarray):
1034 y = numpy.asarray(y)
1035
1036 try:
1037 bins = numpy.linspace(x.min(), x.max(), binNumber)
1038 centers = (bins[:-1] + bins[1:]) / 2.
1039 index = numpy.digitize(x, bins)
1040 except TypeError:
1041 bins = binNumber
1042 centers = (bins[:-1] + bins[1:]) / 2.
1043 index = numpy.digitize(x, binNumber)
1044 binNumber = len(binNumber)
1045
1046 if statistic == 'mean':
1047 running = [numpy.mean(y[index == k]) for k in range(1, binNumber)]
1048 elif statistic == 'median':
1049 running = [numpy.median(y[index == k]) for k in range(1, binNumber)]
1050 elif statistic == 'sum':
1051 running = [numpy.sum(y[index == k]) for k in range(1, binNumber)]
1052 elif statistic == 'std':
1053 running = [numpy.std(y[index == k]) for k in range(1, binNumber)]
1054 elif isinstance(statistic, collections.Callable):
1055 running = [statistic(y[index == k], **kwargs)
1056 for k in range(1, binNumber) if not len(y[index == k]) == 0]
1057 return centers, running
1058
1059
1060
1061
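# Illustrative example (not part of the original module): a running median of
# y in bins of x, e.g. for over-plotting on a scatter plot.
#
#   >>> x = numpy.random.uniform(0., 100., 500)
#   >>> y = 0.5 * x + numpy.random.normal(0., 5., 500)
#   >>> centers, medians = runningStatistic(x, y, statistic='median',
#   ...                                     binNumber=10)
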
1063 """ Provides N samples from a user-defined discreet distribution.
1064
1065 >>> slice_sampler(px, N=1, x=None)
1066
1067 If x=None (default) or if len(x) != len(px), it will return an rray of
1068 intergers between 0 and len(px)-1. If x is given, it will return the
1069 samples from x according to the distribution px.
1070
1071 Originally written by Adam Laiacano, 2011
1072
1073 @type px: numpy array or list
1074 @param px: A discreet probability distribution
1075 @type N: int
1076 @param N: The number of samples to return, default is 1
1077 @type x: numpy array or list
1078 @param x: Optional array/list of observation values to return, where
1079 prob(x) = px
1080 @rtype: numpy.array
1081 @return: The desired number of samples drawn from the distribution
1082
1083 """
1084
1085 from random import sample
1086
1087 values = numpy.zeros(N, dtype=numpy.int)
1088 samples = numpy.arange(len(px))
1089 px = numpy.array(px) / (1. * sum(px))
1090 u = numpy.random.uniform(0, max(px))
1091 for n in range(N):
1092 included = px >= u
1093 choice = sample(list(range(numpy.sum(included))), 1)[0]
1094 values[n] = samples[included][choice]
1095 u = numpy.random.uniform(0, px[included][choice])
1096
1097 if len(x) == len(px):
1098 x = numpy.array(x)
1099 values = x[values]
1100 else:
1101 print(("px and x are different lengths. ",
1102 "Returning index locations for px."))
1103 if N == 1:
1104 return values[0]
1105 return values
1106
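# Illustrative example (not part of the original module): draw five samples
# from a discrete distribution over three outcomes.
#
#   >>> slice_sampler([0.1, 0.2, 0.7], N=5)
#   # array of indices 0-2, with 2 appearing most often
#   >>> slice_sampler([0.1, 0.2, 0.7], N=5, x=['a', 'b', 'c'])
#   # array of 'a'/'b'/'c' drawn with the same probabilities
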