1  """module for performing statistical calculations. 
  2   
  3  (c) 2007-2012 Matt Hilton  
  4   
  5  (c) 2013-2014 Matt Hilton & Steven Boada 
  6   
  7  U{http://astlib.sourceforge.net} 
  8   
  9  This module (as you may notice) provides very few statistical routines. It does, however, provide 
 10  biweight (robust) estimators of location and scale, as described in Beers et al. 1990 (AJ, 100, 
 11  32), in addition to a robust least squares fitting routine that uses the biweight transform. 
 12   
 13  Some routines may fail if they are passed lists with few items and encounter a `divide by zero' 
 14  error. Where this occurs, the function will return None. An error message will be printed to the 
 15  console when this happens if astStats.REPORT_ERRORS=True (the default). Testing if an 
 16  astStats function returns None can be used to handle errors in scripts.  
 17   
 18  For extensive statistics modules, the Python bindings for GNU R (U{http://rpy.sourceforge.net}), or 
 19  SciPy (U{http://www.scipy.org}) are suggested. 
 20   
 21  """ 
 22   
 23  import math 
 24  import numpy 
 25  import sys 
 26   
 27  REPORT_ERRORS=True 
 28   
 29   
 31      """Calculates the mean average of a list of numbers. 
 32       
 33      @type dataList: list or numpy array 
 34      @param dataList: input data, must be a one dimensional list 
 35      @rtype: float 
 36      @return: mean average 
 37       
 38      """ 
 39      return numpy.mean(dataList) 
  40       
 41   
 43      """Calculates the weighted mean average of a two dimensional list (value, weight) of 
 44      numbers. 
 45       
 46      @type dataList: list 
 47      @param dataList: input data, must be a two dimensional list in format [value, weight] 
 48      @rtype: float 
 49      @return: weighted mean average 
 50       
 51      """ 
 52      sum=0 
 53      weightSum=0 
 54      for item in dataList: 
 55          sum=sum+float(item[0]*item[1]) 
 56          weightSum=weightSum+item[1] 
 57      if len(dataList)>0: 
 58          mean=sum/weightSum 
 59      else: 
 60          mean=0 
 61      return mean 
  62   
 63   
 65      """Calculates the (population) standard deviation of a list of numbers, i.e. numpy.std with ddof=0.
 66       
 67      @type dataList: list or numpy array 
 68      @param dataList: input data, must be a one dimensional list 
 69      @rtype: float 
 70      @return: standard deviation 
 71       
 72      """ 
 73      return numpy.std(dataList) 
  74       
 75   
 77      """Calculates the root mean square of a list of numbers. 
 78       
 79      @type dataList: list 
 80      @param dataList: input data, must be a one dimensional list 
 81      @rtype: float 
 82      @return: root mean square 
 83       
 84      """ 
 85      dataListSq=[] 
 86      for item in dataList: 
 87          dataListSq.append(item*item) 
 88      listMeanSq=mean(dataListSq) 
 89      rms=math.sqrt(listMeanSq) 
 90   
 91      return rms 
  92           
 93   
 95      """Calculates the weighted (sample) standard deviation of a list of numbers.  
 96       
 97      @type dataList: list 
 98      @param dataList: input data, must be a two dimensional list in format [value, weight] 
 99      @rtype: float 
100      @return: weighted standard deviation 
101       
102      @note: Returns None if an error occurs. 
103       
104      """ 
105      listMean=weightedMean(dataList) 
106      sum=0 
107      wSum=0 
108      wNonZero=0 
109      for item in dataList: 
110          if item[1]>0.0: 
111              sum=sum+float((item[0]-listMean)/item[1])*float((item[0]-listMean)/item[1]) 
112              wSum=wSum+float(1.0/item[1])*float(1.0/item[1]) 
113               
114      if len(dataList)>1: 
115          nFactor=float(len(dataList))/float(len(dataList)-1) 
116          stdev=math.sqrt(nFactor*(sum/wSum)) 
117      else: 
118          if REPORT_ERRORS==True: 
119              print("""ERROR: astStats.weightedStdev() : dataList contains < 2 items.""") 
120          stdev=None 
121      return stdev 
 122           
123   
134       
135   
137      """Returns an estimate of the mode of a set of values by mode=(3*median)-(2*mean). 
138       
139      @type dataList: list 
140      @param dataList: input data, must be a one dimensional list 
141      @rtype: float 
142      @return: estimate of mode average 
143       
144      """ 
145      mode=(3*median(dataList))-(2*mean(dataList)) 
146   
147      return mode 
 148   
149   
151      """Calculates the Median Absolute Deviation of a list of numbers. 
152       
153      @type dataList: list 
154      @param dataList: input data, must be a one dimensional list 
155      @rtype: float 
156      @return: median absolute deviation 
157       
158      """ 
159      listMedian=median(dataList) 
160       
161       
162      diffModuli=[] 
163      for item in dataList: 
164          diffModuli.append(math.fabs(item-listMedian)) 
165       
166      MAD=median(diffModuli) 
167           
168      return MAD 
 169   
170   
172      """Calculates the biweight location estimator (like a robust average) of a list of 
173      numbers. 
174       
175      @type dataList: list 
176      @param dataList: input data, must be a one dimensional list 
177      @type tuningConstant: float 
178      @param tuningConstant: 6.0 is recommended. 
179      @rtype: float 
180      @return: biweight location 
181       
182      @note: Returns None if an error occurs.      
183       
184      """  
185      C=tuningConstant 
186      listMedian=median(dataList) 
187      listMAD=MAD(dataList) 
188      if listMAD!=0: 
189          uValues=[] 
190          for item in dataList: 
191              uValues.append((item-listMedian)/(C*listMAD)) 
192                   
193          top=0            
194          bottom=0         
195          for i in range(len(uValues)): 
196              if math.fabs(uValues[i])<=1.0: 
197                  top=top+((dataList[i]-listMedian) \ 
198                      *(1.0-(uValues[i]*uValues[i])) \ 
199                      *(1.0-(uValues[i]*uValues[i]))) 
200               
201                  bottom=bottom+((1.0-(uValues[i]*uValues[i])) \ 
202                      *(1.0-(uValues[i]*uValues[i]))) 
203       
204          CBI=listMedian+(top/bottom) 
205           
206      else: 
207          if REPORT_ERRORS==True: 
208              print("""ERROR: astStats: biweightLocation() : MAD() returned 0.""") 
209          return None 
210       
211      return CBI 
 212   
213   
215      """Calculates the biweight scale estimator (like a robust standard deviation) of a list 
216      of numbers.  
217       
218      @type dataList: list 
219      @param dataList: input data, must be a one dimensional list 
220      @type tuningConstant: float 
221      @param tuningConstant: 9.0 is recommended. 
222      @rtype: float 
223      @return: biweight scale 
224       
225      @note: Returns None if an error occurs. 
226           
227      """  
228      C=tuningConstant 
229       
230       
231      listMedian=median(dataList) 
232      listMAD=MAD(dataList) 
233      diffModuli=[] 
234      for item in dataList: 
235          diffModuli.append(math.fabs(item-listMedian)) 
236      uValues=[] 
237      for item in dataList: 
238          try: 
239              uValues.append((item-listMedian)/(C*listMAD)) 
240          except ZeroDivisionError: 
241              if REPORT_ERRORS==True: 
242                  print("""ERROR: astStats.biweightScale() : divide by zero error.""") 
243              return None 
244           
245      top=0                
246      bottom=0 
247      valCount=0   
248       
249      for i in range(len(uValues)): 
250           
251          if math.fabs(uValues[i])<=1.0: 
252              u2Term=1.0-(uValues[i]*uValues[i]) 
253              u4Term=math.pow(u2Term, 4) 
254              top=top+((diffModuli[i]*diffModuli[i])*u4Term) 
255              bottom=bottom+(u2Term*(1.0-(5.0*(uValues[i]*uValues[i])))) 
256              valCount=valCount+1 
257       
258      top=math.sqrt(top) 
259      bottom=math.fabs(bottom) 
260   
261      SBI=math.pow(float(valCount), 0.5)*(top/bottom) 
262      return SBI 
 263   
264   
266      """Iteratively calculates biweight location and scale, using sigma clipping, for a list 
267      of values.  The calculation is performed on the first column of a multi-dimensional 
268      list; other columns are ignored. 
269       
270      @type dataList: list 
271      @param dataList: input data 
272      @type tuningConstant: float 
273      @param tuningConstant: 6.0 is recommended for location estimates, 9.0 is recommended for 
274      scale estimates      
275      @type sigmaCut: float 
276      @param sigmaCut: sigma clipping to apply 
277      @rtype:     dictionary  
278      @return: estimate of biweight location, scale, and list of non-clipped data, in the format 
279      {'biweightLocation', 'biweightScale', 'dataList'} 
280       
281      @note: Returns None if an error occurs. 
282   
283      """          
284       
285      iterations=0 
286      clippedValues=[] 
287      for row in dataList: 
288          if type(row)==list: 
289              clippedValues.append(row[0]) 
290          else: 
291              clippedValues.append(row) 
292           
293      while iterations<11 and len(clippedValues)>5: 
294           
295          cbi=biweightLocation(clippedValues, tuningConstant)      
296          sbi=biweightScale(clippedValues, tuningConstant) 
297           
298           
299           
300           
301          if cbi==None or sbi==None: 
302               
303              if REPORT_ERRORS==True: 
304                  print("""ERROR: astStats : biweightClipped() : 
305                  divide by zero error.""") 
306               
307              return None 
308               
309          else: 
310               
311              clippedValues=[] 
312              clippedData=[] 
313              for row in dataList: 
314                  if type(row)==list: 
315                      if row[0]>cbi-(sigmaCut*sbi) \ 
316                      and row[0]<cbi+(sigmaCut*sbi): 
317                          clippedValues.append(row[0]) 
318                          clippedData.append(row) 
319                  else: 
320                      if row>cbi-(sigmaCut*sbi) \ 
321                      and row<cbi+(sigmaCut*sbi): 
322                          clippedValues.append(row) 
323                          clippedData.append(row) 
324               
325          iterations=iterations+1 
326               
327      return {'biweightLocation':cbi, 'biweightScale':sbi, 'dataList':clippedData} 
 328   
329   
358       
359   
361      """Performs an ordinary least squares fit on a two dimensional list of numbers. 
362      Minimum number of data points is 3.
363       
364      @type dataList: list 
365      @param dataList: input data, must be a two dimensional list in format [x, y] 
366      @rtype: dictionary 
367      @return: slope and intercept on y-axis, with associated errors, in the format 
368      {'slope', 'intercept', 'slopeError', 'interceptError'} 
369       
370      @note: Returns None if an error occurs.      
371           
372      """ 
373      sumX=0 
374      sumY=0 
375      sumXY=0 
376      sumXX=0 
377      n=float(len(dataList)) 
378      if n > 2: 
379          for item in dataList: 
380              sumX=sumX+item[0] 
381              sumY=sumY+item[1] 
382              sumXY=sumXY+(item[0]*item[1]) 
383              sumXX=sumXX+(item[0]*item[0])        
384          m=((n*sumXY)-(sumX*sumY))/((n*sumXX)-(sumX*sumX)) 
385          c=((sumXX*sumY)-(sumX*sumXY))/((n*sumXX)-(sumX*sumX)) 
386           
387          sumRes=0 
388          for item in dataList: 
389           
390              sumRes=sumRes+((item[1]-(m*item[0])-c) \ 
391              *(item[1]-(m*item[0])-c)) 
392               
393          sigma=math.sqrt((1.0/(n-2))*sumRes) 
394           
395          try: 
396              mSigma=(sigma*math.sqrt(n))/math.sqrt((n*sumXX)-(sumX*sumX)) 
397          except: 
398              mSigma=numpy.nan 
399          try: 
400              cSigma=(sigma*math.sqrt(sumXX))/math.sqrt((n*sumXX)-(sumX*sumX)) 
401          except: 
402              cSigma=numpy.nan 
403      else: 
404          if REPORT_ERRORS==True: 
405              print("""ERROR: astStats.OLSFit() : dataList contains < 3 items.""") 
406               
407          return None 
408           
409      return {'slope':m, 
410              'intercept':c, 
411              'slopeError':mSigma, 
412              'interceptError':cSigma} 
 413   
414   
416      """Calculates the clipped mean and stdev of a list of numbers. 
417       
418      @type dataList: list 
419      @param dataList: input data, one dimensional list of numbers 
420      @type sigmaCut: float 
421      @param sigmaCut: clipping in Gaussian sigma to apply 
422      @type maxIterations: int 
423      @param maxIterations: maximum number of iterations 
424      @rtype: dictionary 
425      @return: format {'clippedMean', 'clippedStdev', 'numPoints'} 
426       
427      """ 
428       
429      listCopy=[] 
430      for d in dataList: 
431          listCopy.append(d) 
432      listCopy=numpy.array(listCopy) 
433       
434      iterations=0 
435      while iterations < maxIterations and len(listCopy) > 4: 
436           
437          m=listCopy.mean() 
438          s=listCopy.std() 
439           
440          listCopy=listCopy[numpy.less(abs(listCopy), abs(m+sigmaCut*s))] 
441           
442          iterations=iterations+1 
443       
444      return {'clippedMean': m, 'clippedStdev': s, 'numPoints': listCopy.shape[0]} 
 445       
446   
448      """Performs a weighted least squares fit on a list of numbers with sigma clipping. Minimum number of data 
449      points is 5. 
450       
451      @type dataList: list 
452      @param dataList: input data, must be a two dimensional list with rows in the format [x, y, y error]
453      @rtype: dictionary 
454      @return: slope and intercept on y-axis, with associated errors, in the format 
455      {'slope', 'intercept', 'slopeError', 'interceptError'} 
456       
457      @note: Returns None if an error occurs.      
458       
459      """ 
460       
461      iterations=0 
462      clippedValues=[] 
463      for row in dataList: 
464          clippedValues.append(row) 
465           
466      while iterations<11 and len(clippedValues)>4: 
467           
468          fitResults=weightedLSFit(clippedValues, "errors") 
469           
470          if fitResults['slope'] == None: 
471               
472              if REPORT_ERRORS==True: 
473                  print("""ERROR: astStats : clippedWeightedLSFit() : 
474                  divide by zero error.""") 
475               
476              return None 
477               
478          else: 
479               
480              clippedValues=[] 
481              for row in dataList: 
482                   
483                   
484                  fit=fitResults['slope']*row[0]+fitResults['intercept'] 
485                  res=row[1]-fit 
486                  if abs(res)/row[2] < sigmaCut: 
487                      clippedValues.append(row) 
488               
489          iterations=iterations+1 
490       
491       
492      fitResults['numDataPoints']=len(clippedValues) 
493       
494      return fitResults 
 495       
496   
498      """Performs a weighted least squares fit on a list of rows in the format [x, y, y error].
499      
500      @type dataList: list
501      @param dataList: input data, must be a two dimensional list with rows in the format [x, y, y error]
502      @type weightType: string 
503      @param weightType: if "errors", weights are calculated assuming the input data is in the 
504      format [x, y, error on y]; if "weights", the weights are assumed to be already calculated and 
505      stored in a fourth column [x, y, error on y, weight] (as used by e.g. L{astStats.biweightLSFit}) 
506      @rtype: dictionary 
507      @return: slope and intercept on y-axis, with associated errors, in the format 
508      {'slope', 'intercept', 'slopeError', 'interceptError'} 
509       
510      @note: Returns None if an error occurs.      
511               
512      """ 
513      if weightType == "weights": 
514          sumW=0 
515          sumWX=0 
516          sumWY=0 
517          sumWXY=0 
518          sumWXX=0 
519          n=float(len(dataList)) 
520          if n > 4: 
521              for item in dataList: 
522                  W=item[3] 
523                  sumWX=sumWX+(W*item[0]) 
524                  sumWY=sumWY+(W*item[1]) 
525                  sumWXY=sumWXY+(W*item[0]*item[1]) 
526                  sumWXX=sumWXX+(W*item[0]*item[0]) 
527                  sumW=sumW+W 
528                   
529           
530              try: 
531                  m=((sumW*sumWXY)-(sumWX*sumWY)) \ 
532                  /((sumW*sumWXX)-(sumWX*sumWX)) 
533              except ZeroDivisionError: 
534                  if REPORT_ERRORS == True: 
535                      print("ERROR: astStats.weightedLSFit() : divide by zero error.") 
536                  return None 
537           
538              try: 
539                  c=((sumWXX*sumWY)-(sumWX*sumWXY)) \ 
540                  /((sumW*sumWXX)-(sumWX*sumWX)) 
541              except ZeroDivisionError: 
542                  if REPORT_ERRORS == True: 
543                      print("ERROR: astStats.weightedLSFit() : divide by zero error.") 
544                  return None 
545               
546              sumRes=0 
547              for item in dataList: 
548               
549                  sumRes=sumRes+((item[1]-(m*item[0])-c) \ 
550                  *(item[1]-(m*item[0])-c)) 
551                   
552              sigma=math.sqrt((1.0/(n-2))*sumRes) 
553               
554               
555               
556              if (n*sumWXX)-(sumWX*sumWX)>0.0:  
557               
558                  mSigma=(sigma*math.sqrt(n)) \ 
559                      /math.sqrt((n*sumWXX)-(sumWX*sumWX)) 
560           
561                  cSigma=(sigma*math.sqrt(sumWXX)) \ 
562                      /math.sqrt((n*sumWXX)-(sumWX*sumWX)) 
563                   
564              else: 
565                   
566                  if REPORT_ERRORS==True: 
567                      print("""ERROR: astStats.weightedLSFit() 
568                      : divide by zero error.""") 
569                  return None 
570                   
571          else: 
572              if REPORT_ERRORS==True: 
573                  print("""ERROR: astStats.weightedLSFit() : 
574                  dataList contains < 5 items.""") 
575              return None 
576               
577      elif weightType == "errors": 
578          sumX=0 
579          sumY=0 
580          sumXY=0 
581          sumXX=0 
582          sumSigma=0 
583          n=float(len(dataList)) 
584          for item in dataList: 
585              sumX=sumX+(item[0]/(item[2]*item[2])) 
586              sumY=sumY+(item[1]/(item[2]*item[2])) 
587              sumXY=sumXY+((item[0]*item[1])/(item[2]*item[2])) 
588              sumXX=sumXX+((item[0]*item[0])/(item[2]*item[2])) 
589              sumSigma=sumSigma+(1.0/(item[2]*item[2])) 
590          delta=(sumSigma*sumXX)-(sumX*sumX)       
591          m=((sumSigma*sumXY)-(sumX*sumY))/delta 
592          c=((sumXX*sumY)-(sumX*sumXY))/delta 
593          mSigma=math.sqrt(sumSigma/delta) 
594          cSigma=math.sqrt(sumXX/delta) 
595           
596      return {'slope':m, 
597              'intercept':c, 
598              'slopeError':mSigma, 
599              'interceptError':cSigma} 
 600       
601   
603      """Performs a weighted least squares fit, where the weights used are the biweight 
604      transforms of the residuals to the previous best fit, i.e. the procedure is iterative,
605      and converges very quickly (iterations is set to 10 by default). Minimum number of data 
606      points is 10. 
607       
608      This seems to give slightly different results to the equivalent R routine, so use at your 
609      own risk! 
610       
611      @type dataList: list 
612      @param dataList: input data, must be a three dimensional list in format [x, y, y weight] 
613      @type tuningConstant: float 
614      @param tuningConstant: 6.0 is recommended for location estimates, 9.0 is recommended for 
615      scale estimates 
616      @type sigmaCut: float 
617      @param sigmaCut: sigma clipping to apply (set to None if not required)       
618      @rtype: dictionary 
619      @return: slope and intercept on y-axis, with associated errors, in the format 
620      {'slope', 'intercept', 'slopeError', 'interceptError'} 
621       
622      @note: Returns None if an error occurs. 
623           
624      """ 
625   
626      dataCopy=[] 
627      for row in dataList: 
628          dataCopy.append(row) 
629           
630       
631      results=OLSFit(dataCopy) 
632      origLen=len(dataCopy) 
633      for k in range(10): 
634          m=results['slope'] 
635          c=results['intercept'] 
636          res=[] 
637          for item in dataCopy: 
638              res.append((m*item[0]+c)-item[1]) 
639               
640          if len(res)>5: 
641               
642               
643              if sigmaCut != None: 
644                  absRes=[] 
645                  for item in res: 
646                      absRes.append(abs(item)) 
647                  sigma=stdev(absRes) 
648                  count=0 
649                  for item in absRes: 
650                      if item>(sigmaCut*sigma) \ 
651                      and len(dataCopy)>2: 
652                          del dataCopy[count] 
653                          del res[count] 
654                           
655                           
656                           
657                           
658                          count=count-1  
659                           
660                      count=count+1 
661                           
662               
663              weights=biweightTransform(res, tuningConstant) 
664                           
665               
666               
667              wData=[] 
668              for i in range(len(dataCopy)): 
669                  wData.append([dataCopy[i][0], dataCopy[i][1], dataCopy[i][2], weights[i][1]]) 
670               
671              results=weightedLSFit(wData, "weights") 
672   
673      return results 
 674       
675   
677      """Bins the input data cumulatively. 
678       
679      @param data: input data, must be a one dimensional list 
680      @type binMin: float 
681      @param binMin: minimum value from which to bin data 
682      @type binMax: float 
683      @param binMax: maximum value from which to bin data  
684      @type binTotal: int 
685      @param binTotal: number of bins  
686      @rtype: list 
687      @return: binned data, in format [bin centre, frequency] 
688           
689      """ 
690       
691      binStep=float(binMax-binMin)/binTotal 
692      bins=[] 
693      totalItems=len(data) 
694      for i in range(binTotal): 
695          bins.append(0) 
696          for item in data: 
697              if item>(binMin+(i*binStep)): 
698                  bins[i]=bins[i]+1.0/totalItems 
699                   
700       
701      coords=[] 
702      for i in range(binTotal): 
703          coords.append([binMin+(float(i+0.5)*binStep), bins[i]]) 
704       
705      return coords 
 706   
707   
def binner(data, binMin, binMax, binTotal):
    """Bins the input data, counting the number of items that fall in each bin.

    The range binMin to binMax is split into binTotal bins of equal width (binStep). Bin i
    covers the interval (binMin+i*binStep, binMin+(i+1)*binStep], i.e. the lower edge is
    exclusive and the upper edge is inclusive. An item exactly equal to binMin is therefore
    not counted, while an item exactly equal to binMax falls in the last bin. Items outside
    the range are ignored. If binMax <= binMin every bin is empty.

    @type data: list
    @param data: input data, must be a one dimensional list
    @type binMin: float
    @param binMin: minimum value from which to bin data
    @type binMax: float
    @param binMax: maximum value from which to bin data
    @type binTotal: int
    @param binTotal: number of bins
    @rtype: list
    @return: binned data, in format [bin centre, frequency]

    @note: Raises ZeroDivisionError if binTotal is 0.

    """
    # Imported locally so the function does not rely on a module-level import.
    from bisect import bisect_left

    binStep=float(binMax-binMin)/binTotal
    bins=[0]*binTotal

    # A non-positive bin width describes empty intervals, so nothing can be counted.
    if binStep > 0:
        # Bin edges are built with the same expression as the bin limits, so that items
        # lying exactly on an edge are assigned to the same bin as a direct comparison would.
        edges=[binMin+(i*binStep) for i in range(binTotal+1)]
        for item in data:
            # Index of the first edge >= item: the item lies in (edges[index-1], edges[index]].
            index=bisect_left(edges, item)
            if 1 <= index <= binTotal:
                bins[index-1]=bins[index-1]+1

    return [[binMin+(float(i+0.5)*binStep), bins[i]] for i in range(binTotal)]
 738   
739   
741      """Bins the input data, recorded frequency is sum of weights in bin. 
742       
743      @param data: input data, must be a one dimensional list 
744      @type binMin: float 
745      @param binMin: minimum value from which to bin data 
746      @type binMax: float 
747      @param binMax: maximum value from which to bin data  
748      @type binTotal: int 
749      @param binTotal: number of bins  
750      @rtype: list 
751      @return: binned data, in format [bin centre, frequency] 
752           
753      """ 
754       
755      binStep=float(binMax-binMin)/binTotal 
756      bins=[] 
757      for i in range(binTotal): 
758          bins.append(0.0) 
759          for item, weight in zip(data, weights): 
760              if item>(binMin+(i*binStep)) \ 
761              and item<=(binMin+((i+1)*binStep)): 
762                  bins[i]=bins[i]+weight 
763                   
764       
765      coords=[] 
766      for i in range(binTotal): 
767          coords.append([binMin+(float(i+0.5)*binStep), bins[i]]) 
768       
769      return coords 
 770       
771   
772