暂无图片
暂无图片
暂无图片
暂无图片
暂无图片

关于数据质量管理之正态分布验证

追梦IT人 2020-03-24
1042

数据质量管理中很重要的一个部分就是数据的离散程度。通常而言,连续型数据的录入是遵循正态分布的,从直方图上容易看出,但如何自动化验证数据满足正态分布呢?本文尝试了 kstest、normaltest、shapiro 等方法,最终结论是建议以 normaltest 作为正态分布验证标准(p 值>0.05 即视为满足正态分布),此外也尝试拓展 dataframe.describe,为以后的数据质量收集做好准备。



代码示例

  1. import numpy as np

  2. import pandas as pd

  3. import matplotlib.pyplot as plt

  4. from scipy import stats


  5. # numpy.random.rand(d0, d1, …, dn)的随机样本位于[0, 1)中

  6. # dataset = pd.DataFrame(np.random.rand(500),columns = ['value'])

  7. # numpy.random.randn(d0, d1, …, dn)是从标准正态分布中返回一个或多个样本值。

  8. dataset = pd.DataFrame(np.random.randn(500,2)+10,columns = ['value1','value2'])

  9. print(dataset.head())

  10. # value1 value2

  11. # 0 9.343089 10.632460

  12. # 1 12.594335 7.722195

  13. # 2 9.364273 10.625419

  14. # 3 7.974014 9.241017

  15. # 4 9.200326 10.263768

  16. print(dataset.describe())

  17. # dataset.describe()缺省只包括计数、均值、方差、最大值、最小值、25%、50%、75%等信息

  18. # value1 value2

  19. # count 500.000000 500.000000

  20. # mean 10.009236 10.003829

  21. # std 1.018261 1.005165

  22. # min 7.345624 7.215147

  23. # 25% 9.335039 9.366773

  24. # 50% 9.921288 10.052754

  25. # 75% 10.652195 10.665082

  26. # max 13.409198 13.481374

  27. dtdesc = dataset.describe()

  28. # 可人工追加均值+3西格玛,均值-3西格玛,上四分位+1.5倍的四分位间距,下四分位-1.5倍的四分位间距

  29. dtdesc.loc['mean+3std'] = dtdesc.loc['mean'] + 3 * dtdesc.loc['std']

  30. #计算平均值-3倍标准差

  31. dtdesc.loc['mean-3std'] = dtdesc.loc['mean'] - 3 * dtdesc.loc['std']

  32. #计算上四分位+1.5倍的四分位间距

  33. dtdesc.loc['75%+1.5dist'] = dtdesc.loc['75%'] + 1.5 * (dtdesc.loc['75%'] - dtdesc.loc['25%'])

  34. #计算下四分位-1.5倍的四分位间距

  35. dtdesc.loc['25%-1.5dist'] = dtdesc.loc['25%'] - 1.5 * (dtdesc.loc['75%'] - dtdesc.loc['25%'])

  36. # value1 value2

  37. # mean+3std 13.064018 13.019324

  38. # mean-3std 6.954454 6.988333

  39. # 75%+1.5dist 12.627930 12.612547

  40. # 25%-1.5dist 7.359305 7.419308


  41. # 可再追加各列是否满足正态分布

  42. normaldistribution=[]

  43. for col in dtdesc.columns:

  44. x = dataset[col]

  45. u = dataset[col].mean() # 计算均值

  46. std = dataset[col].std() # 计算标准差

  47. statistic,pvalue = stats.kstest(x, 'norm', (u, std))

  48. normaldistribution.append(True if pvalue>0.05 else False)

  49. dtdesc.loc['normaldistribution']=normaldistribution

  50. # value1 value2

  51. # normaldistribution True True


  52. # 构建正态分布数据

  53. # 参数loc(float):正态分布的均值,对应着这个分布的中心。loc=0说明这一个以Y轴为对称轴的正态分布,

  54. # 参数scale(float):正态分布的标准差,对应分布的宽度,scale越大,正态分布的曲线越矮胖,scale越小,曲线越高瘦。

  55. # 参数size(int 或者整数元组):输出的值赋在shape里,默认为None

  56. x = np.random.normal(0,0.963586,500)+9.910642

  57. test_stat = stats.normaltest(x)

  58. # NormaltestResult(statistic=0.008816546859359073, pvalue=0.995601428745786)

  59. x = np.random.normal(0,0.963586,500)

  60. test_stat = stats.normaltest(x)

  61. # NormaltestResult(statistic=0.18462531546355884, pvalue=0.9118200173945978)

  62. x = np.random.normal(0,1,500)

  63. test_stat = stats.normaltest(x)

  64. # NormaltestResult(statistic=1.3984337844262325, pvalue=0.4969743359168226)


  65. # 构建平均值为9.990269,标准差为0.987808,参见上面dataset

  66. x = stats.norm.rvs(loc=9.990269, scale=0.987808, size=(500,))

  67. test_stat = stats.normaltest(x)

  68. # NormaltestResult(statistic=0.6771164970693714, pvalue=0.7127972587837901)


  69. # 创建原始数据图

  70. fig = plt.figure(figsize = (10,6))

  71. ax1 = fig.add_subplot(3,1,1) # 创建子图,value1value2的散点图

  72. ax1.scatter(dataset.index, dataset['value1'])

  73. ax1.scatter(dataset.index, dataset['value2'])

  74. plt.grid()

  75. # 绘制数据分布图

  76. ax2 = fig.add_subplot(3,1,2) # 创建子图,value1的直方图

  77. dataset.hist('value1',bins=50,alpha = 0.5,ax = ax2)

  78. dataset.plot('value1',kind = 'kde', secondary_y=True,ax = ax2)

  79. plt.grid()

  80. ax3 = fig.add_subplot(3,1,3) # 创建子图,value2的直方图

  81. dataset.hist('value2',bins=50,alpha = 0.5,ax = ax3)

  82. dataset.plot('value2',kind = 'kde', secondary_y=True,ax = ax3)

  83. plt.grid()

  84. plt.show()


  85. def retpddesc(dataset):

  86. dtdesc = dataset.describe()

  87. dtdesc.loc['mean+3std'] = dtdesc.loc['mean'] + 3 * dtdesc.loc['std']

  88. dtdesc.loc['mean-3std'] = dtdesc.loc['mean'] - 3 * dtdesc.loc['std']

  89. dtdesc.loc['75%+1.5dist'] = dtdesc.loc['75%'] + 1.5 * (dtdesc.loc['75%'] - dtdesc.loc['25%'])

  90. dtdesc.loc['25%-1.5dist'] = dtdesc.loc['25%'] - 1.5 * (dtdesc.loc['75%'] - dtdesc.loc['25%'])

  91. kstestvalues=[]

  92. kstestustdvalues=[]

  93. normaltestvalues=[]

  94. shapirovalues=[]

  95. kstestvaluesflag = []

  96. kstestustdvaluesflag = []

  97. normaltestvaluesflag = []

  98. shapirovaluesflag = []

  99. for col in dtdesc.columns:

  100. x = dataset[col]

  101. statistic, pvalue =stats.normaltest(x)

  102. kstestvalues.append(pvalue)

  103. kstestvaluesflag.append(True if pvalue > 0.05 else False)

  104. statistic, pvalue =stats.kstest(x, 'norm', (u, std))

  105. kstestustdvalues.append(pvalue)

  106. kstestustdvaluesflag.append(True if pvalue > 0.05 else False)

  107. statistic, pvalue =stats.normaltest(x)

  108. normaltestvalues.append(pvalue)

  109. normaltestvaluesflag.append(True if pvalue > 0.05 else False)

  110. statistic, pvalue =stats.shapiro(x)

  111. shapirovalues.append(pvalue)

  112. shapirovaluesflag.append(True if pvalue > 0.05 else False)

  113. dtdesc.loc['kstestvalues'] = kstestvalues

  114. dtdesc.loc['kstestustdvalues'] = kstestustdvalues

  115. dtdesc.loc['normaltestvalues'] = normaltestvalues

  116. dtdesc.loc['shapirovalues'] = shapirovalues

  117. dtdesc.loc['kstestvaluesflag'] = kstestvaluesflag

  118. dtdesc.loc['kstestustdvaluesflag'] = kstestustdvaluesflag

  119. dtdesc.loc['normaltestvaluesflag'] = normaltestvaluesflag

  120. dtdesc.loc['shapirovaluesflag'] = shapirovaluesflag

  121. return dtdesc

  122. dataset=pd.read_csv("testcsv.csv",header=0)

  123. dataset.describe()

  124. # int1 float int2 int3 int4

  125. # count 500.000000 500.000000 500.000000 500.000000 500.000000

  126. # mean 250.500000 0.511262 50.854000 7.700000 20.247487

  127. # std 144.481833 0.286973 27.798168 3.135229 1.949810

  128. # min 1.000000 0.000216 0.000000 1.000000 15.017961

  129. # 25% 125.750000 0.264791 27.000000 5.000000 19.215537

  130. # 50% 250.500000 0.524347 51.000000 8.000000 20.239853

  131. # 75% 375.250000 0.766143 73.000000 12.000000 21.374594

  132. # max 500.000000 0.999791 100.000000 12.000000 24.571841

  133. pddescribe=retpddesc(dataset)

  134. # int1 float int2 int3 int4

  135. # 75%+1.5dist 7.495000e+02 1.518171e+00 ... 2.250000e+01 24.613179

  136. # 25%-1.5dist -2.485000e+02 -4.872369e-01 ... -5.500000e+00 15.976952

  137. # kstestvalues 1.169302e-69 5.109596e-112 ... 1.614540e-05 0.175321

  138. # kstestustdvalues 0.000000e+00 0.000000e+00 ... 2.580937e-264 0.000000

  139. # normaltestvalues 1.169302e-69 5.109596e-112 ... 1.614540e-05 0.175321

  140. # shapirovalues 2.945411e-11 4.248346e-12 ... 3.732934e-18 0.000060

  141. # kstestvaluesflag 0.000000e+00 0.000000e+00 ... 0.000000e+00 1.000000

  142. # kstestustdvaluesflag 0.000000e+00 0.000000e+00 ... 0.000000e+00 0.000000

  143. # normaltestvaluesflag 0.000000e+00 0.000000e+00 ... 0.000000e+00 1.000000

  144. # shapirovaluesflag 0.000000e+00 0.000000e+00 ... 0.000000e+00 0.000000


  145. dataset.hist(bins=10,alpha = 0.4)

  146. plt.show()



长按二维码关注“追梦IT人”


最后修改时间:2020-03-25 21:08:32
文章转载自追梦IT人,如果涉嫌侵权,请发送邮件至:contact@modb.pro进行举报,并提供相关证据,一经查实,墨天轮将立刻删除相关内容。

评论