Dr. Arne JachensDr. Arne Jachens

python Library

dictionaryStatistics

Sometimes, you have data sets and you are interested in the dependency of some signal y to some variable x.
Let us assume, you have a dictionary, wherein multiple such data sets are stored {"x":[ ], "y1":[ ], "y2":[ ]}.
Then you may utilize this script to sort the data yi into intervals of x,
and compute statistical quantities like mean, median, standard deviation, quantile per interval.
Finally you may want to fit some analytical curve through these typical values and visualize your results:

data analysis diagram

import   numpy as   np
import   math
from scipy.optimize import   curve_fit

class dictionaryStatistics:
    """
    Concept of the data handling is, to have each data/signal as  array in a dictionary with meaningfull names.
    To look for  dependencies within this data, one element is interpreted as  abscissa, some other as  ordinate.
    The ordinate values are first sorted into intervals on the abscissa, then statistical quantities per interval are computed.
    Finally you may like to fit some curve and have a graphical result...
    Set debug=True to gain a bit more insight.
    Dr. Arne Jachens
    2020-12-06
    """
    def  __init__(self,dict,debug=False):
        self.debug = debug
        self.dict = dict
        self.keys = list(dict.keys())
        self.interval     = {}
        self.distribution = {}
        self.mean   = {}
        self.std    = {}
        self.median = {}
        self.quantile = {}
        self.vals = {}

    def  sortIntoBins(self,xName,yName,NoBins=100,xMin=999,xMax=999):
        """
        One element of the dictionary is taken as  abscissa, some other as  ordinate.
        Each of these data tuples {x,y} are sorted into abscissa intervals 
        and the y data is prepared for  later statistical analysis per interval.
        If your intend is a histogram only, set yName = xName.
        """
        self.NoBins = NoBins
        try:
            NoPoints = len(self.dict[xName])
            x = self.dict[xName]
            y = self.dict[yName]
        except:
            print("sortIntoBins: index out of range!")
            print(self.keys)
        
        #if range not set, analyse data:
        if xMax-xMin<1E-6:
            xMin = np.min(x)
            xMax = np.max(x)
        self.xMin = xMin
        self.xMax = xMax

        #create bins
        binCounter = {}
        self.vals[yName] = {}
        xLower     = []
        deltaX = (xMax-xMin)/float(NoBins)
        for  i in range(NoBins):
            xLower.append(xMin + float(i)*deltaX)
            binCounter[i]       = 0
            self.vals[yName][i] = []

        #sort into bins
        NoOutliners = 0
        for  j,thisX in enumerate(x):
            found = False
            for   i in range(NoBins):
                if thisX<=xLower[i]+deltaX and thisX>self.xMin :
                    found = True
                    binCounter[i] = binCounter[i]+1
                    self.vals[yName][i].append(y[j])
                    break

            if not found:
                NoOutliners = NoOutliners+1

        #normalize counters
        total=0
        for  i in range(NoBins):
            total = total + binCounter[i]

        #utilize numpy statistics
        self.interval[yName]     = []
        self.distribution[yName] = []
        for   i in range(NoBins):
            self.distribution[yName].append(binCounter[i]/float(max(1,total)))
            self.interval[yName].append(xLower[i]+0.5*deltaX)
 
        if self.debug:
            print("sortIntoBins:")
            print("There were ",total," data points of ",yName," in the interval,")
            print("\t",NoOutliners," data points were outside the interval")
            print("x\tdistribution")
            for   i in range(NoBins):
                print('bin: {0:1.3E}'.format(self.interval[yName][i]),"\t",'{0:1.3E}'.format(self.distribution[yName][i]))

        return(self.interval[yName], self.distribution[yName])
    
    def  DictMean(self, yName):
        """
        Compute mean and standard deviation of data 'yName' per interval.
        """
        self.mean[yName]   = []
        self.std[yName]    = []
        for   i in range(self.NoBins):
            if len(self.vals[yName][i])>0:
                tmp = np.array(self.vals[yName][i])
                self.mean[yName].append(np.mean(tmp))
                self.std[yName].append(np.std(tmp))
            else:
                self.mean[yName].append(np.nan)
                self.std[yName].append(np.nan)
                
        if self.debug:
            print("DictMean:")
            print("x\tmean")
            for   i in range(self.NoBins):
                print('{0:1.3E}'.format(self.interval[yName][i]),"\t",'{0:1.3E}'.format(self.mean[yName][i]) )
                
        return (self.interval[yName],self.mean[yName],self.std[yName])
    
    def  DictMedian(self, yName):
        """
        Compute median and quantiles of data 'yName' per interval.
        """
        self.median[yName]      = []
        self.quantile[yName]    = {}
        self.quantile[yName][0] = []
        self.quantile[yName][1] = []
        self.quantile[yName][2] = []
        self.quantile[yName][3] = []
        for   i in range(self.NoBins):
            if len(self.vals[yName][i])>0:
                tmp = np.array(self.vals[yName][i])
                self.median[yName].append(np.median(tmp))
                self.quantile[yName][0].append(np.min(tmp))
                self.quantile[yName][1].append(np.quantile(tmp,0.25))
                self.quantile[yName][2].append(np.quantile(tmp,0.75))
                self.quantile[yName][3].append(np.max(tmp))
            else:
                self.median[yName].append(np.nan)
                self.quantile[yName][0].append(np.nan)
                self.quantile[yName][1].append(np.nan)
                self.quantile[yName][2].append(np.nan)
                self.quantile[yName][3].append(np.nan)

        if self.debug:
            print("DictMedian:")
            print("x\tmedian")
            for   i in range(self.NoBins):
                print('{0:1.3E}'.format(self.interval[yName][i]),"\t",'{0:1.3E}'.format(self.median[yName][i]) )
                
        return (self.interval[yName],self.median[yName],self.quantile[yName])

    
    def  fitCurve(self, x, y, p, funcName="myFunc", debug=False):
        """
        Use any function like 'myFunc' to fit the data.
        If there should be 'NaN' in the data, these are dropped.
        """
        #drop NaN values
        goodX=[]
        goodY=[]
        for  i,thisX in enumerate(x):
            if abs(y[i])>=0:
                goodX.append(thisX)
                goodY.append(y[i])

        #fit curve to the data, p0 are initial parameters
        funcDict = {"Gauss": Gauss, "Laplace": Laplace}
        params, cons = curve_fit( funcDict[funcName], goodX, goodY, p0=np.array(p) )
        if debug:
            print(params)
            print(cons)

        return params

    
    def  calculateCurve(self, p, xMin, xMax, funcName="Gauss", dim=1000):
        """
        To plot the curve, you might want to calculate a value table.
        """
        fineX=[]
        myCurve=[]
        for  i in range(dim):
            myX = xMin +float(i)*(xMax-xMin)/float(dim)
            fineX.append(myX)
            funcDict = {"Gauss": Gauss, "Laplace": Laplace}
            myCurve.append(funcDict[funcName](myX,p[0],p[1]))

        return (fineX,myCurve)

        
#=======================================#
def  Gauss(x, a, b):
    """
    Wrapper to compute some user defined function y(x) with parameters a,b
    x may either be a numpy array or a single value.
    """
    if isinstance(x,np.ndarray):
        y=[]
        for   i in range(x.shape[0]):
            y.append( a*math.exp(-x[i]**2/b**2) )
    else:
        y = a*math.exp(-x**2/b**2)
        
    return y
 
#=======================================#
def  Laplace(x, a, b):
    """
    Wrapper to compute some user defined function y(x) with parameters a,b
    x may either be a numpy array or a single value.
    """
    if isinstance(x,np.ndarray):
        y=[]
        for   i in range(x.shape[0]):
            y.append( a*math.exp(-abs(x[i])/b ) )
    else:
        y = a*math.exp(-abs(x)/b )
        
    return y


#=======================================#
if __name__ == "__main__":
    """
    To test this class, we generate normally distributed x-data.
    Fore these we evaluate any function (a gaussian) to have y-data.
    These sets are put to a dictionary and with this, we check 
    how many values are within which interval, and then we compute 
    mean and median of the data within each such interval.
    Finally a user defined curve may be fitted to the statistical data 
    and plotted.
    """
    import  matplotlib.pyplot as  plt
    No=60
    nd = np.random.normal(loc=4.0, scale=2.0, size=No)
    ud = np.random.uniform(low=1.0, high=9.0, size=No)
    xData=nd
    yData=myFunc(xData,1,3)

    myDict = {"x":xData, "a":yData, "b":yData}
    
    #initialte dictionaryStatistics
    DS = dictionaryStatistics(myDict, True)
    
    #sort sub-set in intervals
    NoIntervals=10
    (x,dist) = DS.sortIntoBins("x", "a", NoIntervals)
    
    #mean and standart deviation of data 'a' per interval 'x'
    (x,mean,std) = DS.DictMean("a")
    
    #median and quantile of data 'a' per interval 'x'
    (x,median,quantile) = DS.DictMedian("a")
    
    #do some curve fitting to the sorted data
    p=[10,10] #amplitude, std
    p = DS.fitCurve(x, mean, p, funcName="Gauss", debug=False)
    print("fit parameters:",p)
    
    #calculate value table of function
    (fineX,myCurve) = DS.calculateCurve(p, np.min(x), np.max(x), funcName="Gauss")

    
    #visualize the results
    fig,ax = plt.subplots()
    barWidth = (np.max(x)-np.min(x))/(NoIntervals-1)
    ax.bar(x, dist, width=barWidth,color='#c0c0c0')
    ax.set_ylabel("probability density",color="silver",fontsize=14)
    
    ax2=ax.twinx()
    legend = []
    legend.append("raw data")
    legend.append('fitted curve')

    legend.append('distribution')
    legend.append('mean')
    ax2.bar(x, dist, width=0.8,color='#c0c0c0')
    ax2.plot(xData,yData,marker='o', linestyle='none',c='#00ff00')
    ax2.errorbar(x,mean,std,marker='x', linestyle='none',c='#ff0066')
    ax2.plot(fineX,myCurve,c='#000000')

        
    ax2.legend(legend)
    #plt.title(title)
    plt.grid(color='k', linestyle='dotted')
    plt.savefig("./dictStat.svg", bbox_inches='tight')
    plt.show()

Index of Library

1CIEcolorCoordinates_Spectrum.py
2MatlabStructures.py
3ModelicaExecution.py
4ModelicaMatFilter.py
5OperateAllFiles.py
6dictionaryStatistics.py
7familienKonto.py
8makeDoc.py
9plotTimeSeries.py
10readData2DictOfArrays.py
11replaceInFile.py
12showPointOnCIExy.py
13testNumpyMatrix.py
14writeDictOfArrays.py
15writeTSV.py

Der gesamte Sourcecode darf gemäß GNU General Public License weiterverbreitet werden.