Stats Practical WJC

In [1]:

%matplotlib inline
import numpy as np
import scipy.stats as ss
import matplotlib.pyplot as plt
import pandas as pd
import csv
from astropy.table import Table
import scipy

In [2]:

data = [1.3, 10.5, 23.5, 0.34, 100.5, 0.023, 89.1, 45., 5.45, 65.1]

In [3]:

mean_a = 0. # initialise arithmetic mean
mean_g = 1. # initialise geometric mean
N = len(data) # no. datapoints
for d in data:
    mean_a += d # short for mean_a = mean_a + d
    mean_g *= d # short for mean_g = mean_g * d
mean_a /= N
mean_g = mean_g**(1. / N)

In [4]:

print('arithmetic: ', mean_a, np.mean(data))
print('geometric: ', mean_g, ss.gmean(data))
# alternatively, with f-strings
print(f'geometric: {mean_g} {ss.gmean(data)}')
# another alternative, displaying the floats more compactly
print(f'geometric: {mean_g:0.3f} {ss.gmean(data):0.3f}')

arithmetic: 34.0813 34.0813
geometric: 7.169091175408984 7.169091175408983
geometric: 7.169091175408984 7.169091175408983
geometric: 7.169 7.169
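
As an aside, both means can be computed without an explicit loop: the arithmetic mean is just the sum over N, and the geometric mean is the exponential of the mean of the logs. A minimal vectorised sketch using the variables above:

# vectorised equivalents of the loop in In [3]
print('arithmetic, vectorised: ', np.sum(data) / N)
print('geometric, vectorised: ', np.exp(np.mean(np.log(data)))) # the same identity ss.gmean uses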

In [5]:

# an aside:
# another method of checking whether two values are equal:
meandiff = abs(mean_a - np.mean(data))
# never compare two floats with ==, only integers and strings
print(meandiff)
# meandiff = 1 # uncomment this to make the checks below fail
# and if you want to turn this into a test, there are two methods
assert meandiff < 1e-3, 'meandiff too large' # or whatever tolerance you allow
# asserts are used in development code
if meandiff > 1e-3:
    raise ValueError('meandiff too large') # this method is used in release code
# try increasing meandiff past 1e-3 and see what happens

0.0
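
As a further aside on comparing floats, the standard library and numpy both provide ready-made tolerance checks, so you rarely need to hand-roll one:

import math
print(math.isclose(mean_a, np.mean(data))) # relative tolerance by default
print(np.isclose(mean_a, np.mean(data))) # combines relative and absolute tolerances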

In [6]:

sorted_data = data.copy() # copy data first...
sorted_data.sort() # ...because sort() actually destroys the original order
ihalf = int(N / 2) # find halfway...but careful with Python counting....
print('ihalf: ', ihalf)
# alternatively
ihalf = N // 2 # integer division
print('ihalf alternate: ', ihalf)
print('sorted data: ', sorted_data)
print('sorted_data[ihalf]: ', sorted_data[ihalf])
if N % 2 == 0: # even number of datapoints
    median = (sorted_data[ihalf - 1] + sorted_data[ihalf]) / 2.
else:
    median = sorted_data[ihalf]
print('median: ', median, np.median(data))
# alternatively
if not N % 2:
    median = (sorted_data[ihalf - 1] + sorted_data[ihalf]) / 2.
else:
    median = sorted_data[ihalf]
print('median alternate: ', median, np.median(data))

ihalf: 5
ihalf alternate: 5
sorted data: [0.023, 0.34, 1.3, 5.45, 10.5, 23.5, 45.0, 65.1, 89.1, 100.5]
sorted_data[ihalf]: 23.5
median: 17.0 17.0
median alternate: 17.0 17.0
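
As a quick cross-check, the standard library's statistics module implements the same even/odd median logic:

import statistics
print('statistics.median: ', statistics.median(data)) # averages the two middle values when N is even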

In [40]:

with open('example.csv') as csvfile:
    readCSV = csv.reader(csvfile, delimiter=',')
    column1 = []
    column2 = []
    next(readCSV) # skip the header row
    for row in readCSV:
        v1 = row[0]
        v2 = row[1]
        column1.append(float(v1))
        column2.append(float(v2))
print('csv reader method: ', len(column1), len(column2))

# alternative methods
# pandas
df = pd.read_csv('example.csv', names=['col1', 'col2'], header=0)
column1, column2 = df.col1, df.col2
print('pandas method: ', len(column1), len(column2))

# numpy
column1, column2 = np.loadtxt('example.csv', delimiter=',', skiprows=1, unpack=True)
print('numpy method: ', len(column1), len(column2))

# astropy
t = Table.read('example.csv', format='ascii.csv', names=['col1', 'col2'])
column1, column2 = t['col1'], t['col2']
print('astropy method: ', len(column1), len(column2))

# inbuilt
with open('example.csv', 'r') as f:
    column1 = []
    column2 = []
    next(f) # skip the header line
    for line in f:
        line = line.split(',')
        column1.append(float(line[0]))
        column2.append(float(line[1]))
print('inbuilt method: ', len(column1), len(column2))

plt.figure(dpi=144) # should always start a new planned plot with plt.figure();
# outside of notebooks python can get confused otherwise. The dpi just makes the
# figure higher resolution: files get larger but quality improves
plt.hist(column1, label="Column 1")
plt.hist(column2, label="Column 2") # the label must say Column 2, not Column 1
plt.legend()
plt.xlabel('x')
plt.ylabel('N(x)')
plt.savefig('Histograms.pdf') # plt.show() takes no filename; use savefig() to write a file
plt.show()

# an alternative to the above (simpler, time-saving) approach is to
# create figure and axis objects; these give you a lot more control and are necessary
# when making subplots or other complicated figures, e.g.
fig, ax = plt.subplots(dpi=144) # the same as plt.figure but could have done:
# fig, axes = plt.subplots(2, 2, dpi=144) # in which case axes holds 4 subplots
ax.hist(column1, label="Column 1") # same as plt.hist
ax.hist(column2, label="Column 2")
ax.legend()
ax.set_xlabel('x') # slightly different phrasing; this is what pyplot calls internally
ax.set_ylabel('N(x)')
plt.show() # or fig.show(), but jupyter throws a pointless warning for no real reason

# a slight alternative to the above:
# fig = plt.figure(dpi=144)
# ax = fig.add_axes([0.05, 0.05, 0.9, 0.9])
# which is what you would do for making axes touch or for precise positioning

csv reader method: 50 50
pandas method: 50 50
numpy method: 50 50
astropy method: 50 50
inbuilt method: 50 50

In [61]:

interquart = (np.min(column2),
              np.quantile(column2, .25),
              np.median(column2),
              np.quantile(column2, .75),
              np.max(column2))

def basic_stats(col):
    percs = np.percentile(col, [*range(0, 125, 25), ]) # i.e. [0, 25, 50, 75, 100]
    std = np.std(col)
    stderr = std / np.sqrt(len(col))
    mean = np.mean(col)
    print(f'Min = {percs[0]:.2f}, 25% = {percs[1]:.2f}, 50% (Median) = {percs[2]:.2f}'
          f', 75% = {percs[3]:.2f}, Max = {percs[4]:.2f},\n Range = {percs[4] - percs[0]:.2f}'
          f', Interquartile Range = {percs[3] - percs[1]:.2f}'
          f',\n Standard Deviation = {std:.2f}, Standard Error = {stderr:.2f}'
          f', Mean = {mean:.2f} +/- {stderr:.2f}\n')
    return

for col in (column1, column2):
    basic_stats(col)

Min = 2.42, 25% = 7.93, 50% (Median) = 9.96, 75% = 11.91, Max = 17.56,
Range = 15.13, Interquartile Range = 3.97,
Standard Deviation = 3.02, Standard Error = 0.43, Mean = 9.93 +/- 0.43

Min = 3.15, 25% = 3.92, 50% (Median) = 4.48, 75% = 5.55, Max = 6.28,
Range = 3.13, Interquartile Range = 1.63,
Standard Deviation = 0.97, Standard Error = 0.14, Mean = 4.63 +/- 0.14
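
As an aside, scipy can produce a similar one-line summary: ss.describe returns the number of observations, min/max, mean, variance, skewness and kurtosis (note that its variance uses ddof=1, unlike np.std above):

print(ss.describe(column1))
print(ss.describe(column2))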

In [8]:

sigma2 = 0.
for d in data:
    sigma2 += (d - mean_a)**2.
sigma2 /= N
print('variance: ', sigma2, np.var(data))
sigma = np.sqrt(sigma2)
print('std. dev.: ', sigma, np.std(data))

variance: 1338.0728532099997 1338.0728532099997
std. dev.: 36.5796781452489 36.5796781452489
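
Note that both the loop above and np.var divide by N, giving the population variance. For the sample variance, which divides by N - 1, numpy takes a ddof argument:

print('sample variance: ', np.var(data, ddof=1)) # divides by N - 1 instead of N
print('sample std. dev.: ', np.std(data, ddof=1))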

In [9]:

mu = 200.
sigma = 40.
xvalues = np.linspace(0, 400, 400)
pvalues = 1. / (sigma * np.sqrt(2. * np.pi)) * \
    np.exp(-(xvalues - mu)**2. / (2. * sigma**2.))
plt.figure(dpi=144)
plt.plot(xvalues, pvalues)
plt.xlabel('Time since installation (days)')
plt.ylabel('Probability of failure')
plt.show()

In [10]:

a = 30. # lower limit
b = 60. # upper limit
n = 1000 # number of integration points
xvalues = np.linspace(a, b, n) # x values between those limits
delta_x = (b - a) / n # find delta x
pvalues = 1. / (sigma * np.sqrt(2. * np.pi)) * \
    np.exp(-(xvalues - mu)**2. / (2. * sigma**2.))
integral = sum(pvalues) * delta_x # compute sum
print('Probability of failure between ',
      int(a), ' days and ', int(b), ' days: ', round(integral * 100, 3), '%')

Probability of failure between 30 days and 60 days: 0.022 %
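
As a cross-check on the numerical integration, the same probability can be read straight off the normal CDF:

exact = ss.norm.cdf(b, mu, sigma) - ss.norm.cdf(a, mu, sigma)
print('via the CDF: ', round(exact * 100, 3), '%') # should closely match the sum above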

In [11]:

print('Probability of failure after 250 days: ',
      round((1 - ss.norm.cdf(250, mu, sigma)) * 100, 3), '%')

Probability of failure after 250 days: 10.565 %

In [12]:

lambda_p = 4.5
xs = np.arange(0, 15, 1)
factorialx = [np.math.factorial(x) for x in xs] # np.math is just the stdlib math module
ps = lambda_p**xs * np.exp(-lambda_p) / factorialx
plt.figure(dpi=144)
plt.plot(xs, ps, 'o')
plt.xlabel('x')
plt.ylabel('p(x)')
plt.show()
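
As an aside, the same pmf is available ready-made from scipy.stats, which avoids the explicit factorials:

print(np.allclose(ps, ss.poisson.pmf(xs, lambda_p))) # expect True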

In [13]:

poisson_data = np.random.poisson(lambda_p, 1000)

In [14]:

plt.figure(dpi=144)
n, bins, patches = plt.hist(poisson_data)
# can just do plt.hist(poisson_data) if you don't care about the returned objects
plt.xlabel('x')
plt.ylabel('N(x)')
plt.show()

In [15]:

xbins = np.arange(0, len(xs) + 1, 1) - 0.5
print(xbins)
plt.figure(dpi=144)
n, bins, patches = plt.hist(poisson_data, bins=xbins)
plt.xlabel('x')
plt.ylabel('N(x)')
plt.show()

[-0.5 0.5 1.5 2.5 3.5 4.5 5.5 6.5 7.5 8.5 9.5 10.5 11.5 12.5
13.5 14.5]

In [16]:

hist_area = sum(n) # delta-x is always 1. in this case :)
theoretical_ns = ps * hist_area
plt.figure(dpi=144)
n, bins, patches = plt.hist(poisson_data, bins=xs - 0.5)
plt.plot(xs, theoretical_ns, 'r-')
plt.xlabel('x')
plt.ylabel('N(x)')
plt.show()

In [17]:

plt.figure(dpi=144)
n, bins, patches = plt.hist(poisson_data, bins=xbins)
errors = np.sqrt(theoretical_ns)
plt.errorbar(xs, theoretical_ns, yerr=errors, color='red')
plt.xlabel('x')
plt.ylabel('N(x)')
plt.show()

In [18]:

chi2 = 0.
for o, e in zip(n, theoretical_ns):
    chi2 += ((o - e)**2.) / e
print('chi-squared: ', chi2)

chi-squared: 6.5499882913951915

In [19]:

print(ss.chisquare(n, theoretical_ns))

Power_divergenceResult(statistic=6.5499882913951915, pvalue=0.9506717775202224)
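
As an aside, the p-value above can be reproduced by hand from the chi-squared survival function; ss.chisquare defaults to k - 1 degrees of freedom for k bins:

dof = len(n) - 1 # ss.chisquare's default degrees of freedom
print('p-value by hand: ', ss.chi2.sf(chi2, dof)) # should match the pvalue above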

In [20]:

mu = 7.0 # mean of Poisson distribution
sigma = np.sqrt(mu) # variance is the same as the mean
ninsamp = 5 # number in each sample
nsamp = 10000 # number of samples to be taken
samplemeans = []
for i in range(nsamp):
    rn = np.random.poisson(mu, ninsamp) # draw from distribution
    samplemeans.append(np.mean(rn)) # add mean to list of means
nbins = int(6 * sigma) # choose a reasonable number of bins = 6 standard deviations
print('No. bins: ', nbins)
plt.figure(dpi=144)
n, bins, patches = plt.hist(samplemeans, bins=nbins)
plt.xlabel('Sample mean')
plt.ylabel('N')
plt.show()

No. bins: 15
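
As an aside, the central limit theorem predicts the spread of these sample means: the standard error is sigma / sqrt(ninsamp). A quick check against the simulated means:

print('predicted standard error: ', sigma / np.sqrt(ninsamp))
print('measured spread of sample means: ', np.std(samplemeans)) # should be similar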

In [21]:

mu = 7.0 # mean of Poisson distribution
sigma = np.sqrt(mu) # variance is the same as the mean
ninsamp = 4 # number in each sample
nsamp = 100 # number of samples to be taken
samplemeans = []
for i in range(nsamp):
    rn = np.random.poisson(mu, ninsamp) # draw from distribution
    samplemeans.append(np.mean(rn)) # add mean to list of means
nbins = int(6 * sigma) # choose a reasonable number of bins = 6 standard deviations
print('No. bins: ', nbins)
plt.figure(dpi=144)
n, bins, patches = plt.hist(samplemeans, bins=nbins)
plt.xlabel('Sample mean')
plt.ylabel('N')
# we can find the centres of the histogram bins from the 'bins' variable
xvals = []
for i in range(len(bins) - 1): # alternatively, range(len(bins[:-1]))
    xvals.append(0.5 * (bins[i] + bins[i + 1]))
binwidth = bins[1] - bins[0]
yerrs = np.sqrt(n)
plt.errorbar(xvals, n, yerr=yerrs, fmt='o')
# create Gaussian
mean = np.mean(samplemeans)
sigma = np.std(samplemeans)
xs = np.linspace(mean - 3 * sigma, mean + 3 * sigma, 1000)
# force normalised Gaussian to have same area as data
area = sum(n) * binwidth
ys = area * 1. / (sigma * np.sqrt(2. * np.pi)) * \
    np.exp(-(xs - mean)**2. / (2. * sigma**2.))
plt.plot(xs, ys, lw=3)
plt.show()

No. bins: 15

In [22]:

body = [31.4, 31.4, 64.2, 67.5, 52.7, 29.9, 77.6, 39.4, 20., 90.3]
brain = [3.9, 3.8, 7.6, 7.8, 5.5, 4.1, 8.8, 5.2, 3.1, 5.1]
errors = [0.2, 0.4, 0.4, 0.2, 0.6, 0.4, 0.1, 0.4, 0.6, 2.4]
plt.figure(dpi=144)
plt.errorbar(body, brain, yerr=errors, fmt='o')
plt.xlabel('Body weight (kg)')
plt.ylabel('Brain weight (kg)')
plt.show()

In [23]:

A = np.vstack((np.ones_like(body), body)) # make matrix A
print(A) # the vstack function has put the two vectors on top of each other,
# we want them side by side
A = A.T # transpose
print(A)
first_part = np.linalg.inv(np.dot(A.T, A))
c, m = np.dot(first_part, np.dot(A.T, brain))
print("Brain Weight = {:.3f} + {:.3f} Body Weight".format(c, m))
# or can use f-strings, as mentioned earlier
print(f"Brain Weight = {c:.3f} + {m:.3f} Body Weight")
plt.figure(dpi=144)
plt.errorbar(body, brain, yerr=errors, fmt='o', markersize=4)
xvalues = np.linspace(10, 100, 10)
plt.plot(xvalues, m * xvalues + c, color='red')
plt.xlabel('Body weight [kg]')
plt.ylabel('Brain weight [kg]')
plt.show()

[[ 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. ]
[31.4 31.4 64.2 67.5 52.7 29.9 77.6 39.4 20. 90.3]]
[[ 1. 31.4]
[ 1. 31.4]
[ 1. 64.2]
[ 1. 67.5]
[ 1. 52.7]
[ 1. 29.9]
[ 1. 77.6]
[ 1. 39.4]
[ 1. 20. ]
[ 1. 90.3]]
Brain Weight = 2.370 + 0.062 Body Weight
Brain Weight = 2.370 + 0.062 Body Weight
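
As an aside, numpy has a canned least-squares solver that avoids forming the matrix inverse explicitly; a minimal sketch reusing the design matrix A built above:

coeffs, residuals, rank, sv = np.linalg.lstsq(A, brain, rcond=None)
print(f"Brain Weight = {coeffs[0]:.3f} + {coeffs[1]:.3f} Body Weight") # same c and m as above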

In [24]:

def fit_function(x, m, c):
    return m * x + c

print(scipy.optimize.curve_fit(fit_function, body, brain))

(array([0.06186344, 2.36960805]), array([[ 3.70585585e-04, -1.86923367e-02],
       [-1.86923367e-02, 1.12840788e+00]]))

In [25]:

parameters, other_stuff = scipy.optimize.curve_fit(fit_function, body, brain)
# fyi, if you don't care about 'other_stuff', you can just do:
parameters = scipy.optimize.curve_fit(fit_function, body, brain)[0]
m = parameters[0]
c = parameters[1]
plt.figure(dpi=144)
plt.errorbar(body, brain, yerr=errors, fmt='o', markersize=4)
xvalues = np.linspace(10, 100, 10)
plt.plot(xvalues, m * xvalues + c, color='red')
plt.xlabel('Body weight [kg]')
plt.ylabel('Brain weight [kg]')
plt.show()

In [26]:

parameters, other_stuff = scipy.optimize.curve_fit(fit_function, body, brain, sigma=errors)
m = parameters[0]
c = parameters[1]
plt.figure(dpi=144)
plt.errorbar(body, brain, yerr=errors, fmt='o', markersize=4)
xvalues = np.linspace(10, 100, 10)
plt.plot(xvalues, m * xvalues + c, color='red')
plt.xlabel('Body weight [kg]')
plt.ylabel('Brain weight [kg]')
plt.show()
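
As an aside, the 'other_stuff' returned by curve_fit is the covariance matrix of the fitted parameters, so its diagonal gives their variances. A minimal sketch, using curve_fit's absolute_sigma option so the supplied errors are treated as absolute uncertainties:

parameters, covariance = scipy.optimize.curve_fit(fit_function, body, brain,
                                                  sigma=errors, absolute_sigma=True)
perr = np.sqrt(np.diag(covariance)) # 1-sigma uncertainties on m and c
print(f"m = {parameters[0]:.3f} +/- {perr[0]:.3f}, c = {parameters[1]:.3f} +/- {perr[1]:.3f}")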

In [27]:

xs = np.random.uniform(0, 5, 1000)
xs.sort()
noise = 0.1 # we can use this to control how much correlation there is between x and y
ys = xs + np.random.normal(1, noise, 1000)
plt.figure(dpi=144)
plt.plot(xs, ys, 'o')
plt.xlabel('x')
plt.ylabel('y')
plt.show()

In [28]:

meanx = np.mean(xs)
meany = np.mean(ys)

In [29]:

numerator = np.sum((xs - meanx) * (ys - meany))


denominator = np.sqrt(np.sum((xs - meanx)**2.) * np.sum((ys - meany)**2.))

In [30]:

r = numerator / denominator
print('Pearson r: ', r)

Pearson r: 0.9975360030023925
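
As an aside, numpy bundles the same calculation; the off-diagonal entry of the correlation matrix is Pearson's r:

print('np.corrcoef r: ', np.corrcoef(xs, ys)[0, 1]) # should match the manual value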

In [31]:

xs = np.random.uniform(0, 5, 1000)
xs.sort()
noise = 0.5 # we can use this to control how much correlation there is between x and y
ys = xs + np.random.normal(1, noise, 1000)
plt.figure(dpi=144)
plt.plot(xs, ys, 'o')
plt.xlabel('x')
plt.ylabel('y')
plt.show()

In [32]:

meanx = np.mean(xs)
meany = np.mean(ys)
numerator = np.sum((xs - meanx) * (ys - meany))
denominator = np.sqrt(np.sum((xs - meanx)**2.) * np.sum((ys - meany)**2.))
r = numerator / denominator
print('Pearson r: ', r)

Pearson r: 0.9420785286064383

In [33]:

xs = np.random.uniform(0, 5, 1000)
xs.sort()
noise = 1.0 # we can use this to control how much correlation there is between x and y
ys = xs + np.random.normal(1, noise, 1000)
plt.figure(dpi=144)
plt.plot(xs, ys, 'o')
plt.xlabel('x')
plt.ylabel('y')
plt.show()

In [34]:

meanx = np.mean(xs)
meany = np.mean(ys)
numerator = np.sum((xs - meanx) * (ys - meany))
denominator = np.sqrt(np.sum((xs - meanx)**2.) * np.sum((ys - meany)**2.))
r = numerator / denominator
print('Pearson r: ', r)

Pearson r: 0.8235364616132786

In [35]:

xs = np.random.uniform(0, 5, 1000)
xs.sort()
noise = 2.0 # we can use this to control how much correlation there is between x and y
ys = xs + np.random.normal(1, noise, 1000)
plt.figure(dpi=144)
plt.plot(xs, ys, 'o')
plt.xlabel('x')
plt.ylabel('y')
plt.show()

In [36]:

meanx = np.mean(xs)
meany = np.mean(ys)
numerator = np.sum((xs - meanx) * (ys - meany))
denominator = np.sqrt(np.sum((xs - meanx)**2.) * np.sum((ys - meany)**2.))
r = numerator / denominator
print('Pearson r: ', r)

Pearson r: 0.5792264650420678

In [37]:

xs = np.random.uniform(0, 5, 1000)
xs.sort()
noise = 2.0 # we can use this to control how much correlation there is between x and y
ys = -xs + np.random.normal(1, noise, 1000)
plt.figure(dpi=144)
plt.plot(xs, ys, 'o')
plt.xlabel('x')
plt.ylabel('y')
plt.show()

In [38]:

meanx = np.mean(xs)
meany = np.mean(ys)
numerator = np.sum((xs - meanx) * (ys - meany))
denominator = np.sqrt(np.sum((xs - meanx)**2.) * np.sum((ys - meany)**2.))
r = numerator / denominator
print('Pearson r: ', r)

Pearson r: -0.5800754774903243

In [39]:

print('Pearson r:', ss.pearsonr(xs, ys))

Pearson r: (-0.5800754774903243, 5.503199068310432e-91)
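
As a final aside, scipy also provides the rank-based Spearman correlation, which is less sensitive to outliers and to nonlinear (but monotonic) relationships; like pearsonr it returns the coefficient and a p-value:

print('Spearman rho: ', ss.spearmanr(xs, ys))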
