Stats Practical WJC

In [1]:

%matplotlib inline
import numpy as np
import scipy.stats as ss
import matplotlib.pyplot as plt
import pandas as pd
import csv
from astropy.table import Table
import scipy

In [2]:

data = [1.3, 10.5, 23.5, 0.34, 100.5, 0.023, 89.1, 45., 5.45, 65.1]

In [3]:

mean_a = 0. # initialise arithmetic mean
mean_g = 1. # initialise geometric mean
N = len(data) # no. datapoints
for d in data:
    mean_a += d # short for mean_a = mean_a + d
    mean_g *= d # short for mean_g = mean_g * d
mean_a /= N
mean_g = mean_g**(1. / N)

In [4]:

print('arithmetic: ', mean_a, np.mean(data))
print('geometric: ', mean_g, ss.gmean(data))
# alternatively, with f-strings
print(f'geometric: {mean_g} {ss.gmean(data)}')
# another alternative, displaying the floats more compactly
print(f'geometric: {mean_g:0.3f} {ss.gmean(data):0.3f}')

arithmetic: 34.0813 34.0813
geometric: 7.169091175408984 7.169091175408983
geometric: 7.169091175408984 7.169091175408983
geometric: 7.169 7.169
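
As an aside, both means can be computed without an explicit loop: the arithmetic mean is just the sum over N, and the geometric mean is the exponential of the mean of the logs. A minimal vectorised sketch using the variables above:

# vectorised equivalents of the loop in In [3]
print('arithmetic, vectorised: ', np.sum(data) / N)
print('geometric, vectorised: ', np.exp(np.mean(np.log(data)))) # the same identity ss.gmean uses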

In [5]:

# an aside:
# another method of checking whether two values are equal:
meandiff = abs(mean_a - np.mean(data))
# never compare two floats with ==, only integers and strings
print(meandiff)
# meandiff = 1 # uncomment this to make the checks below fail
# and if you want to turn this into a test, there are two methods
assert meandiff < 1e-3, 'meandiff too large' # or whatever tolerance you allow
# asserts are used in development code
if meandiff > 1e-3:
    raise ValueError('meandiff too large') # this method is used in release code
# try increasing meandiff past 1e-3 and see what happens

0.0
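
As a further aside on comparing floats, the standard library and numpy both provide ready-made tolerance checks, so you rarely need to hand-roll one:

import math
print(math.isclose(mean_a, np.mean(data))) # relative tolerance by default
print(np.isclose(mean_a, np.mean(data))) # combines relative and absolute tolerances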

In [6]:

sorted_data = data.copy() # copy data first...
sorted_data.sort() # ...because sort() actually destroys the original order
ihalf = int(N / 2) # find halfway...but careful with Python counting....
print('ihalf: ', ihalf)
# alternatively
ihalf = N // 2 # integer division
print('ihalf alternate: ', ihalf)
print('sorted data: ', sorted_data)
print('sorted_data[ihalf]: ', sorted_data[ihalf])
if N % 2 == 0: # even number of datapoints
    median = (sorted_data[ihalf - 1] + sorted_data[ihalf]) / 2.
else:
    median = sorted_data[ihalf]
print('median: ', median, np.median(data))
# alternatively
if not N % 2:
    median = (sorted_data[ihalf - 1] + sorted_data[ihalf]) / 2.
else:
    median = sorted_data[ihalf]
print('median alternate: ', median, np.median(data))

ihalf: 5
ihalf alternate: 5
sorted data: [0.023, 0.34, 1.3, 5.45, 10.5, 23.5, 45.0, 65.1, 89.1, 100.5]
sorted_data[ihalf]: 23.5
median: 17.0 17.0
median alternate: 17.0 17.0
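
As a quick cross-check, the standard library's statistics module implements the same even/odd median logic:

import statistics
print('statistics.median: ', statistics.median(data)) # averages the two middle values when N is even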

In [40]:

with open('example.csv') as csvfile:
    readCSV = csv.reader(csvfile, delimiter=',')
    column1 = []
    column2 = []
    next(readCSV) # skip the header row
    for row in readCSV:
        v1 = row[0]
        v2 = row[1]
        column1.append(float(v1))
        column2.append(float(v2))
print('csv reader method: ', len(column1), len(column2))

# alternative methods
# pandas
df = pd.read_csv('example.csv', names=['col1', 'col2'], header=0)
column1, column2 = df.col1, df.col2
print('pandas method: ', len(column1), len(column2))

# numpy
column1, column2 = np.loadtxt('example.csv', delimiter=',', skiprows=1, unpack=True)
print('numpy method: ', len(column1), len(column2))

# astropy
t = Table.read('example.csv', format='ascii.csv', names=['col1', 'col2'])
column1, column2 = t['col1'], t['col2']
print('astropy method: ', len(column1), len(column2))

# inbuilt
with open('example.csv', 'r') as f:
    column1 = []
    column2 = []
    next(f) # skip the header line
    for line in f:
        line = line.split(',')
        column1.append(float(line[0]))
        column2.append(float(line[1]))
print('inbuilt method: ', len(column1), len(column2))

plt.figure(dpi=144) # should always start a new planned plot with plt.figure();
# outside of notebooks python can get confused otherwise. The dpi just makes the
# figure higher resolution: files get larger but quality improves
plt.hist(column1, label="Column 1")
plt.hist(column2, label="Column 2") # the label must say Column 2, not Column 1
plt.legend()
plt.xlabel('x')
plt.ylabel('N(x)')
plt.savefig('Histograms.pdf') # plt.show() takes no filename; use savefig() to write a file
plt.show()

# an alternative to the above (simpler, time-saving) approach is to
# create figure and axis objects; these give you a lot more control and are necessary
# when making subplots or other complicated figures, e.g.
fig, ax = plt.subplots(dpi=144) # the same as plt.figure but could have done:
# fig, axes = plt.subplots(2, 2, dpi=144) # in which case axes holds 4 subplots
ax.hist(column1, label="Column 1") # same as plt.hist
ax.hist(column2, label="Column 2")
ax.legend()
ax.set_xlabel('x') # slightly different phrasing; this is what pyplot calls internally
ax.set_ylabel('N(x)')
plt.show() # or fig.show(), but jupyter throws a pointless warning for no real reason

# a slight alternative to the above:
# fig = plt.figure(dpi=144)
# ax = fig.add_axes([0.05, 0.05, 0.9, 0.9])
# which is what you would do for making axes touch or for precise positioning

csv reader method: 50 50
pandas method: 50 50
numpy method: 50 50
astropy method: 50 50
inbuilt method: 50 50

In [61]:

interquart = (np.min(column2),
              np.quantile(column2, .25),
              np.median(column2),
              np.quantile(column2, .75),
              np.max(column2))

def basic_stats(col):
    percs = np.percentile(col, [*range(0, 125, 25), ]) # i.e. [0, 25, 50, 75, 100]
    std = np.std(col)
    stderr = std / np.sqrt(len(col))
    mean = np.mean(col)
    print(f'Min = {percs[0]:.2f}, 25% = {percs[1]:.2f}, 50% (Median) = {percs[2]:.2f}'
          f', 75% = {percs[3]:.2f}, Max = {percs[4]:.2f},\n Range = {percs[4] - percs[0]:.2f}'
          f', Interquartile Range = {percs[3] - percs[1]:.2f}'
          f',\n Standard Deviation = {std:.2f}, Standard Error = {stderr:.2f}'
          f', Mean = {mean:.2f} +/- {stderr:.2f}\n')
    return

for col in (column1, column2):
    basic_stats(col)

Min = 2.42, 25% = 7.93, 50% (Median) = 9.96, 75% = 11.91, Max = 17.56,
Range = 15.13, Interquartile Range = 3.97,
Standard Deviation = 3.02, Standard Error = 0.43, Mean = 9.93 +/- 0.43

Min = 3.15, 25% = 3.92, 50% (Median) = 4.48, 75% = 5.55, Max = 6.28,
Range = 3.13, Interquartile Range = 1.63,
Standard Deviation = 0.97, Standard Error = 0.14, Mean = 4.63 +/- 0.14
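
As an aside, scipy can produce a similar one-line summary: ss.describe returns the number of observations, min/max, mean, variance, skewness and kurtosis (note that its variance uses ddof=1, unlike np.std above):

print(ss.describe(column1))
print(ss.describe(column2))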

In [8]:

sigma2 = 0.
for d in data:
    sigma2 += (d - mean_a)**2.
sigma2 /= N
print('variance: ', sigma2, np.var(data))
sigma = np.sqrt(sigma2)
print('std. dev.: ', sigma, np.std(data))

variance: 1338.0728532099997 1338.0728532099997
std. dev.: 36.5796781452489 36.5796781452489
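
Note that both the loop above and np.var divide by N, giving the population variance. For the sample variance, which divides by N - 1, numpy takes a ddof argument:

print('sample variance: ', np.var(data, ddof=1)) # divides by N - 1 instead of N
print('sample std. dev.: ', np.std(data, ddof=1))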

In [9]:

mu = 200.
sigma = 40.
xvalues = np.linspace(0, 400, 400)
pvalues = 1. / (sigma * np.sqrt(2. * np.pi)) * \
    np.exp(-(xvalues - mu)**2. / (2. * sigma**2.))
plt.figure(dpi=144)
plt.plot(xvalues, pvalues)
plt.xlabel('Time since installation (days)')
plt.ylabel('Probability of failure')
plt.show()

In [10]:

a = 30. # lower limit
b = 60. # upper limit
n = 1000 # number of integration points
xvalues = np.linspace(a, b, n) # x values between those limits
delta_x = (b - a) / n # find delta x
pvalues = 1. / (sigma * np.sqrt(2. * np.pi)) * \
    np.exp(-(xvalues - mu)**2. / (2. * sigma**2.))
integral = sum(pvalues) * delta_x # compute sum
print('Probability of failure between ',
      int(a), ' days and ', int(b), ' days: ', round(integral * 100, 3), '%')

Probability of failure between 30 days and 60 days: 0.022 %
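
As a cross-check on the numerical integration, the same probability can be read straight off the normal CDF:

exact = ss.norm.cdf(b, mu, sigma) - ss.norm.cdf(a, mu, sigma)
print('via the CDF: ', round(exact * 100, 3), '%') # should closely match the sum above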

In [11]:

print('Probability of failure after 250 days: ',
      round((1 - ss.norm.cdf(250, mu, sigma)) * 100, 3), '%')

Probability of failure after 250 days: 10.565 %

In [12]:

lambda_p = 4.5
xs = np.arange(0, 15, 1)
factorialx = [np.math.factorial(x) for x in xs] # np.math is just the stdlib math module
ps = lambda_p**xs * np.exp(-lambda_p) / factorialx
plt.figure(dpi=144)
plt.plot(xs, ps, 'o')
plt.xlabel('x')
plt.ylabel('p(x)')
plt.show()
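
As an aside, the same pmf is available ready-made from scipy.stats, which avoids the explicit factorials:

print(np.allclose(ps, ss.poisson.pmf(xs, lambda_p))) # expect True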

In [13]:

poisson_data = np.random.poisson(lambda_p, 1000)

In [14]:

plt.figure(dpi=144)
n, bins, patches = plt.hist(poisson_data)
# can just do plt.hist(poisson_data) if you don't care about the returned objects
plt.xlabel('x')
plt.ylabel('N(x)')
plt.show()

In [15]:

xbins = np.arange(0, len(xs) + 1, 1) - 0.5
print(xbins)
plt.figure(dpi=144)
n, bins, patches = plt.hist(poisson_data, bins=xbins)
plt.xlabel('x')
plt.ylabel('N(x)')
plt.show()

[-0.5 0.5 1.5 2.5 3.5 4.5 5.5 6.5 7.5 8.5 9.5 10.5 11.5 12.5
13.5 14.5]

In [16]:

hist_area = sum(n) # delta-x is always 1. in this case :)
theoretical_ns = ps * hist_area
plt.figure(dpi=144)
n, bins, patches = plt.hist(poisson_data, bins=xs - 0.5)
plt.plot(xs, theoretical_ns, 'r-')
plt.xlabel('x')
plt.ylabel('N(x)')
plt.show()

In [17]:

plt.figure(dpi=144)
n, bins, patches = plt.hist(poisson_data, bins=xbins)
errors = np.sqrt(theoretical_ns)
plt.errorbar(xs, theoretical_ns, yerr=errors, color='red')
plt.xlabel('x')
plt.ylabel('N(x)')
plt.show()

In [18]:

chi2 = 0.
for o, e in zip(n, theoretical_ns):
    chi2 += ((o - e)**2.) / e
print('chi-squared: ', chi2)

chi-squared: 6.5499882913951915

In [19]:

print(ss.chisquare(n, theoretical_ns))

Power_divergenceResult(statistic=6.5499882913951915, pvalue=0.9506717775202224)
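
As an aside, the p-value above can be reproduced by hand from the chi-squared survival function; ss.chisquare defaults to k - 1 degrees of freedom for k bins:

dof = len(n) - 1 # ss.chisquare's default degrees of freedom
print('p-value by hand: ', ss.chi2.sf(chi2, dof)) # should match the pvalue above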

In [20]:

mu = 7.0 # mean of Poisson distribution
sigma = np.sqrt(mu) # variance is the same as the mean
ninsamp = 5 # number in each sample
nsamp = 10000 # number of samples to be taken
samplemeans = []
for i in range(nsamp):
    rn = np.random.poisson(mu, ninsamp) # draw from distribution
    samplemeans.append(np.mean(rn)) # add mean to list of means
nbins = int(6 * sigma) # choose a reasonable number of bins = 6 standard deviations
print('No. bins: ', nbins)
plt.figure(dpi=144)
n, bins, patches = plt.hist(samplemeans, bins=nbins)
plt.xlabel('Sample mean')
plt.ylabel('N')
plt.show()

No. bins: 15
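
As an aside, the central limit theorem predicts the spread of these sample means: the standard error is sigma / sqrt(ninsamp). A quick check against the simulated means:

print('predicted standard error: ', sigma / np.sqrt(ninsamp))
print('measured spread of sample means: ', np.std(samplemeans)) # should be similar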

In [21]:

mu = 7.0 # mean of Poisson distribution
sigma = np.sqrt(mu) # variance is the same as the mean
ninsamp = 4 # number in each sample
nsamp = 100 # number of samples to be taken
samplemeans = []
for i in range(nsamp):
    rn = np.random.poisson(mu, ninsamp) # draw from distribution
    samplemeans.append(np.mean(rn)) # add mean to list of means
nbins = int(6 * sigma) # choose a reasonable number of bins = 6 standard deviations
print('No. bins: ', nbins)
plt.figure(dpi=144)
n, bins, patches = plt.hist(samplemeans, bins=nbins)
plt.xlabel('Sample mean')
plt.ylabel('N')
# we can find the centres of the histogram bins from the 'bins' variable
xvals = []
for i in range(len(bins) - 1): # alternatively, range(len(bins[:-1]))
    xvals.append(0.5 * (bins[i] + bins[i + 1]))
binwidth = bins[1] - bins[0]
yerrs = np.sqrt(n)
plt.errorbar(xvals, n, yerr=yerrs, fmt='o')
# create Gaussian
mean = np.mean(samplemeans)
sigma = np.std(samplemeans)
xs = np.linspace(mean - 3 * sigma, mean + 3 * sigma, 1000)
# force normalised Gaussian to have same area as data
area = sum(n) * binwidth
ys = area * 1. / (sigma * np.sqrt(2. * np.pi)) * \
    np.exp(-(xs - mean)**2. / (2. * sigma**2.))
plt.plot(xs, ys, lw=3)
plt.show()

No. bins: 15

In [22]:

body = [31.4, 31.4, 64.2, 67.5, 52.7, 29.9, 77.6, 39.4, 20., 90.3]
brain = [3.9, 3.8, 7.6, 7.8, 5.5, 4.1, 8.8, 5.2, 3.1, 5.1]
errors = [0.2, 0.4, 0.4, 0.2, 0.6, 0.4, 0.1, 0.4, 0.6, 2.4]
plt.figure(dpi=144)
plt.errorbar(body, brain, yerr=errors, fmt='o')
plt.xlabel('Body weight (kg)')
plt.ylabel('Brain weight (kg)')
plt.show()

In [23]:

A = np.vstack((np.ones_like(body), body)) # make matrix A
print(A) # the vstack function has put the two vectors on top of each other,
# we want them side by side
A = A.T # transpose
print(A)
first_part = np.linalg.inv(np.dot(A.T, A))
c, m = np.dot(first_part, np.dot(A.T, brain))
print("Brain Weight = {:.3f} + {:.3f} Body Weight".format(c, m))
# or can use f-strings, as mentioned earlier
print(f"Brain Weight = {c:.3f} + {m:.3f} Body Weight")
plt.figure(dpi=144)
plt.errorbar(body, brain, yerr=errors, fmt='o', markersize=4)
xvalues = np.linspace(10, 100, 10)
plt.plot(xvalues, m * xvalues + c, color='red')
plt.xlabel('Body weight [kg]')
plt.ylabel('Brain weight [kg]')
plt.show()

[[ 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. ]
[31.4 31.4 64.2 67.5 52.7 29.9 77.6 39.4 20. 90.3]]
[[ 1. 31.4]
[ 1. 31.4]
[ 1. 64.2]
[ 1. 67.5]
[ 1. 52.7]
[ 1. 29.9]
[ 1. 77.6]
[ 1. 39.4]
[ 1. 20. ]
[ 1. 90.3]]
Brain Weight = 2.370 + 0.062 Body Weight
Brain Weight = 2.370 + 0.062 Body Weight
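
As an aside, numpy has a canned least-squares solver that avoids forming the matrix inverse explicitly; a minimal sketch reusing the design matrix A built above:

coeffs, residuals, rank, sv = np.linalg.lstsq(A, brain, rcond=None)
print(f"Brain Weight = {coeffs[0]:.3f} + {coeffs[1]:.3f} Body Weight") # same c and m as above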

In [24]:

def fit_function(x, m, c):
    return m * x + c

print(scipy.optimize.curve_fit(fit_function, body, brain))

(array([0.06186344, 2.36960805]), array([[ 3.70585585e-04, -1.86923367e-02],
       [-1.86923367e-02, 1.12840788e+00]]))

In [25]:

parameters, other_stuff = scipy.optimize.curve_fit(fit_function, body, brain)
# fyi, if you don't care about 'other_stuff', you can just do:
parameters = scipy.optimize.curve_fit(fit_function, body, brain)[0]
m = parameters[0]
c = parameters[1]
plt.figure(dpi=144)
plt.errorbar(body, brain, yerr=errors, fmt='o', markersize=4)
xvalues = np.linspace(10, 100, 10)
plt.plot(xvalues, m * xvalues + c, color='red')
plt.xlabel('Body weight [kg]')
plt.ylabel('Brain weight [kg]')
plt.show()

In [26]:

parameters, other_stuff = scipy.optimize.curve_fit(fit_function, body, brain, sigma=errors)
m = parameters[0]
c = parameters[1]
plt.figure(dpi=144)
plt.errorbar(body, brain, yerr=errors, fmt='o', markersize=4)
xvalues = np.linspace(10, 100, 10)
plt.plot(xvalues, m * xvalues + c, color='red')
plt.xlabel('Body weight [kg]')
plt.ylabel('Brain weight [kg]')
plt.show()
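
As an aside, the 'other_stuff' returned by curve_fit is the covariance matrix of the fitted parameters, so its diagonal gives their variances. A minimal sketch, using curve_fit's absolute_sigma option so the supplied errors are treated as absolute uncertainties:

parameters, covariance = scipy.optimize.curve_fit(fit_function, body, brain,
                                                  sigma=errors, absolute_sigma=True)
perr = np.sqrt(np.diag(covariance)) # 1-sigma uncertainties on m and c
print(f"m = {parameters[0]:.3f} +/- {perr[0]:.3f}, c = {parameters[1]:.3f} +/- {perr[1]:.3f}")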

In [27]:

xs = np.random.uniform(0, 5, 1000)
xs.sort()
noise = 0.1 # we can use this to control how much correlation there is between x and y
ys = xs + np.random.normal(1, noise, 1000)
plt.figure(dpi=144)
plt.plot(xs, ys, 'o')
plt.xlabel('x')
plt.ylabel('y')
plt.show()

In [28]:

meanx = np.mean(xs)
meany = np.mean(ys)

In [29]:

numerator = np.sum((xs - meanx) * (ys - meany))


denominator = np.sqrt(np.sum((xs - meanx)**2.) * np.sum((ys - meany)**2.))

In [30]:

r = numerator / denominator
print('Pearson r: ', r)

Pearson r: 0.9975360030023925
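
As an aside, numpy bundles the same calculation; the off-diagonal entry of the correlation matrix is Pearson's r:

print('np.corrcoef r: ', np.corrcoef(xs, ys)[0, 1]) # should match the manual value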

In [31]:

xs = np.random.uniform(0, 5, 1000)
xs.sort()
noise = 0.5 # we can use this to control how much correlation there is between x and y
ys = xs + np.random.normal(1, noise, 1000)
plt.figure(dpi=144)
plt.plot(xs, ys, 'o')
plt.xlabel('x')
plt.ylabel('y')
plt.show()

In [32]:

meanx = np.mean(xs)
meany = np.mean(ys)
numerator = np.sum((xs - meanx) * (ys - meany))
denominator = np.sqrt(np.sum((xs - meanx)**2.) * np.sum((ys - meany)**2.))
r = numerator / denominator
print('Pearson r: ', r)

Pearson r: 0.9420785286064383

In [33]:

xs = np.random.uniform(0, 5, 1000)
xs.sort()
noise = 1.0 # we can use this to control how much correlation there is between x and y
ys = xs + np.random.normal(1, noise, 1000)
plt.figure(dpi=144)
plt.plot(xs, ys, 'o')
plt.xlabel('x')
plt.ylabel('y')
plt.show()

In [34]:

meanx = np.mean(xs)
meany = np.mean(ys)
numerator = np.sum((xs - meanx) * (ys - meany))
denominator = np.sqrt(np.sum((xs - meanx)**2.) * np.sum((ys - meany)**2.))
r = numerator / denominator
print('Pearson r: ', r)

Pearson r: 0.8235364616132786

In [35]:

xs = np.random.uniform(0, 5, 1000)
xs.sort()
noise = 2.0 # we can use this to control how much correlation there is between x and y
ys = xs + np.random.normal(1, noise, 1000)
plt.figure(dpi=144)
plt.plot(xs, ys, 'o')
plt.xlabel('x')
plt.ylabel('y')
plt.show()

In [36]:

meanx = np.mean(xs)
meany = np.mean(ys)
numerator = np.sum((xs - meanx) * (ys - meany))
denominator = np.sqrt(np.sum((xs - meanx)**2.) * np.sum((ys - meany)**2.))
r = numerator / denominator
print('Pearson r: ', r)

Pearson r: 0.5792264650420678

In [37]:

xs = np.random.uniform(0, 5, 1000)
xs.sort()
noise = 2.0 # we can use this to control how much correlation there is between x and y
ys = -xs + np.random.normal(1, noise, 1000)
plt.figure(dpi=144)
plt.plot(xs, ys, 'o')
plt.xlabel('x')
plt.ylabel('y')
plt.show()

In [38]:

meanx = np.mean(xs)
meany = np.mean(ys)
numerator = np.sum((xs - meanx) * (ys - meany))
denominator = np.sqrt(np.sum((xs - meanx)**2.) * np.sum((ys - meany)**2.))
r = numerator / denominator
print('Pearson r: ', r)

Pearson r: -0.5800754774903243

In [39]:

print('Pearson r:', ss.pearsonr(xs, ys))

Pearson r: (-0.5800754774903243, 5.503199068310432e-91)
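
As a final aside, scipy also provides the rank-based Spearman correlation, which is less sensitive to outliers and to nonlinear (but monotonic) relationships; like pearsonr it returns the coefficient and a p-value:

print('Spearman rho: ', ss.spearmanr(xs, ys))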
