python Library
dictionaryStatistics
Sometimes, you have data sets and you are interested in the dependency of some signal y to some variable x. Let us assume, you have a dictionary, wherein multiple such data sets are stored {"x":[ ], "y1":[ ], "y2":[ ]}. Then you may utilize this script to sort the data yi into intervals of x, and compute statistical quantities like mean, median, standard deviation, quantile per interval. Finally you may want to fit some analytical curve through these typical values and visualize your results:
import numpy as np import math from scipy.optimize import curve_fit class dictionaryStatistics: """ Concept of the data handling is, to have each data/signal as array in a dictionary with meaningfull names. To look for dependencies within this data, one element is interpreted as abscissa, some other as ordinate. The ordinate values are first sorted into intervals on the abscissa, then statistical quantities per interval are computed. Finally you may like to fit some curve and have a graphical result... Set debug=True to gain a bit more insight. Dr. Arne Jachens 2020-12-06 """ def __init__(self,dict,debug=False): self.debug = debug self.dict = dict self.keys = list(dict.keys()) self.interval = {} self.distribution = {} self.mean = {} self.std = {} self.median = {} self.quantile = {} self.vals = {} def sortIntoBins(self,xName,yName,NoBins=100,xMin=999,xMax=999): """ One element of the dictionary is taken as abscissa, some other as ordinate. Each of these data tuples {x,y} are sorted into abscissa intervals and the y data is prepared for later statistical analysis per interval. If your intend is a histogram only, set yName = xName. """ self.NoBins = NoBins try: NoPoints = len(self.dict[xName]) x = self.dict[xName] y = self.dict[yName] except: print("sortIntoBins: index out of range!") print(self.keys) #if range not set, analyse data: if xMax-xMin<1E-6: xMin = np.min(x) xMax = np.max(x) self.xMin = xMin self.xMax = xMax #create bins binCounter = {} self.vals[yName] = {} xLower = [] deltaX = (xMax-xMin)/float(NoBins) for i in range(NoBins): xLower.append(xMin + float(i)*deltaX) binCounter[i] = 0 self.vals[yName][i] = [] #sort into bins NoOutliners = 0 for j,thisX in enumerate(x): found = False for i in range(NoBins): if thisX<=xLower[i]+deltaX and thisX>self.xMin : found = True binCounter[i] = binCounter[i]+1 self.vals[yName][i].append(y[j]) break if not found: NoOutliners = NoOutliners+1 #normalize counters total=0 for i in range(NoBins): total = total + binCounter[i] #utilize numpy statistics self.interval[yName] = [] self.distribution[yName] = [] for i in range(NoBins): self.distribution[yName].append(binCounter[i]/float(max(1,total))) self.interval[yName].append(xLower[i]+0.5*deltaX) if self.debug: print("sortIntoBins:") print("There were ",total," data points of ",yName," in the interval,") print("\t",NoOutliners," data points were outside the interval") print("x\tdistribution") for i in range(NoBins): print('bin: {0:1.3E}'.format(self.interval[yName][i]),"\t",'{0:1.3E}'.format(self.distribution[yName][i])) return(self.interval[yName], self.distribution[yName]) def DictMean(self, yName): """ Compute mean and standard deviation of data 'yName' per interval. """ self.mean[yName] = [] self.std[yName] = [] for i in range(self.NoBins): if len(self.vals[yName][i])>0: tmp = np.array(self.vals[yName][i]) self.mean[yName].append(np.mean(tmp)) self.std[yName].append(np.std(tmp)) else: self.mean[yName].append(np.nan) self.std[yName].append(np.nan) if self.debug: print("DictMean:") print("x\tmean") for i in range(self.NoBins): print('{0:1.3E}'.format(self.interval[yName][i]),"\t",'{0:1.3E}'.format(self.mean[yName][i]) ) return (self.interval[yName],self.mean[yName],self.std[yName]) def DictMedian(self, yName): """ Compute median and quantiles of data 'yName' per interval. """ self.median[yName] = [] self.quantile[yName] = {} self.quantile[yName][0] = [] self.quantile[yName][1] = [] self.quantile[yName][2] = [] self.quantile[yName][3] = [] for i in range(self.NoBins): if len(self.vals[yName][i])>0: tmp = np.array(self.vals[yName][i]) self.median[yName].append(np.median(tmp)) self.quantile[yName][0].append(np.min(tmp)) self.quantile[yName][1].append(np.quantile(tmp,0.25)) self.quantile[yName][2].append(np.quantile(tmp,0.75)) self.quantile[yName][3].append(np.max(tmp)) else: self.median[yName].append(np.nan) self.quantile[yName][0].append(np.nan) self.quantile[yName][1].append(np.nan) self.quantile[yName][2].append(np.nan) self.quantile[yName][3].append(np.nan) if self.debug: print("DictMedian:") print("x\tmedian") for i in range(self.NoBins): print('{0:1.3E}'.format(self.interval[yName][i]),"\t",'{0:1.3E}'.format(self.median[yName][i]) ) return (self.interval[yName],self.median[yName],self.quantile[yName]) def fitCurve(self, x, y, p, funcName="myFunc", debug=False): """ Use any function like 'myFunc' to fit the data. If there should be 'NaN' in the data, these are dropped. """ #drop NaN values goodX=[] goodY=[] for i,thisX in enumerate(x): if abs(y[i])>=0: goodX.append(thisX) goodY.append(y[i]) #fit curve to the data, p0 are initial parameters funcDict = {"Gauss": Gauss, "Laplace": Laplace} params, cons = curve_fit( funcDict[funcName], goodX, goodY, p0=np.array(p) ) if debug: print(params) print(cons) return params def calculateCurve(self, p, xMin, xMax, funcName="Gauss", dim=1000): """ To plot the curve, you might want to calculate a value table. """ fineX=[] myCurve=[] for i in range(dim): myX = xMin +float(i)*(xMax-xMin)/float(dim) fineX.append(myX) funcDict = {"Gauss": Gauss, "Laplace": Laplace} myCurve.append(funcDict[funcName](myX,p[0],p[1])) return (fineX,myCurve) #=======================================# def Gauss(x, a, b): """ Wrapper to compute some user defined function y(x) with parameters a,b x may either be a numpy array or a single value. """ if isinstance(x,np.ndarray): y=[] for i in range(x.shape[0]): y.append( a*math.exp(-x[i]**2/b**2) ) else: y = a*math.exp(-x**2/b**2) return y #=======================================# def Laplace(x, a, b): """ Wrapper to compute some user defined function y(x) with parameters a,b x may either be a numpy array or a single value. """ if isinstance(x,np.ndarray): y=[] for i in range(x.shape[0]): y.append( a*math.exp(-abs(x[i])/b ) ) else: y = a*math.exp(-abs(x)/b ) return y #=======================================# if __name__ == "__main__": """ To test this class, we generate normally distributed x-data. Fore these we evaluate any function (a gaussian) to have y-data. These sets are put to a dictionary and with this, we check how many values are within which interval, and then we compute mean and median of the data within each such interval. Finally a user defined curve may be fitted to the statistical data and plotted. """ import matplotlib.pyplot as plt No=60 nd = np.random.normal(loc=4.0, scale=2.0, size=No) ud = np.random.uniform(low=1.0, high=9.0, size=No) xData=nd yData=myFunc(xData,1,3) myDict = {"x":xData, "a":yData, "b":yData} #initialte dictionaryStatistics DS = dictionaryStatistics(myDict, True) #sort sub-set in intervals NoIntervals=10 (x,dist) = DS.sortIntoBins("x", "a", NoIntervals) #mean and standart deviation of data 'a' per interval 'x' (x,mean,std) = DS.DictMean("a") #median and quantile of data 'a' per interval 'x' (x,median,quantile) = DS.DictMedian("a") #do some curve fitting to the sorted data p=[10,10] #amplitude, std p = DS.fitCurve(x, mean, p, funcName="Gauss", debug=False) print("fit parameters:",p) #calculate value table of function (fineX,myCurve) = DS.calculateCurve(p, np.min(x), np.max(x), funcName="Gauss") #visualize the results fig,ax = plt.subplots() barWidth = (np.max(x)-np.min(x))/(NoIntervals-1) ax.bar(x, dist, width=barWidth,color='#c0c0c0') ax.set_ylabel("probability density",color="silver",fontsize=14) ax2=ax.twinx() legend = [] legend.append("raw data") legend.append('fitted curve') legend.append('distribution') legend.append('mean') ax2.bar(x, dist, width=0.8,color='#c0c0c0') ax2.plot(xData,yData,marker='o', linestyle='none',c='#00ff00') ax2.errorbar(x,mean,std,marker='x', linestyle='none',c='#ff0066') ax2.plot(fineX,myCurve,c='#000000') ax2.legend(legend) #plt.title(title) plt.grid(color='k', linestyle='dotted') plt.savefig("./dictStat.svg", bbox_inches='tight') plt.show()
Index of Library
Der gesamte Sourcecode darf gemäß GNU General Public License weiterverbreitet werden.