Stats Practical WJC
In [1]:
%matplotlib inline
# Standard library
import csv
import math

# Third-party
import numpy as np
import scipy
import scipy.stats as ss
import matplotlib.pyplot as plt
import pandas as pd
from astropy.table import Table
In [2]:
data = [1.3, 10.5, 23.5, 0.34, 100.5, 0.023, 89.1, 45., 5.45, 65.1]
In [3]:
In [4]:
In [5]:
# An aside: how to check that two floating-point values agree.
meandiff = abs(mean_a - np.mean(data))
# Never compare two floats with == -- exact equality is only safe for
# integers and strings.
print(meandiff)
# To turn this check into a test there are two common approaches:
assert meandiff < 1e-3, 'meandiff too large'  # choose whatever tolerance you allow
# asserts are used in development code
if meandiff > 1e-3:
    raise ValueError('meandiff too large')  # explicit raise, used in release code
# try increasing meandiff past 1e-3 and see what happens
0.0
In [6]:
ihalf: 5
ihalf alternate: 5
sorted data: [0.023, 0.34, 1.3, 5.45, 10.5, 23.5, 45.0, 65.1, 89.1, 100.5]
sorted_data[ihalf]: 23.5
median: 17.0 17.0
median alternate: 17.0 17.0
In [40]:
# Four alternative ways of reading the same two-column CSV file.
# pandas
df = pd.read_csv('example.csv', names=['col1', 'col2'], header=0)
column1, column2 = df.col1, df.col2
print('pandas method: ', len(column1), len(column2))
# numpy
column1, column2 = np.loadtxt('example.csv', delimiter=',', skiprows=1, unpack=True)
print('numpy method: ', len(column1), len(column2))
# astropy
t = Table.read('example.csv', format='ascii.csv', names=['col1', 'col2'])
column1, column2 = t['col1'], t['col2']
print('astropy method: ', len(column1), len(column2))
# inbuilt file handling
with open('example.csv', 'r') as f:
    next(f)  # skip the header row
    rows = [line.split(',') for line in f]
    column1 = [float(row[0]) for row in rows]
    column2 = [float(row[1]) for row in rows]
print('inbuilt method: ', len(column1), len(column2))
In [61]:
# Five-number summary of column2: (min, Q1, median, Q3, max).
# A single np.quantile call with all five probabilities is equivalent to
# separate min/quantile/median/quantile/max calls.
interquart = tuple(np.quantile(column2, [0.0, 0.25, 0.5, 0.75, 1.0]))
def basic_stats(col):
    """Print a five-number summary plus spread statistics for a sample.

    Parameters
    ----------
    col : array_like
        1-D sequence of numeric values.

    Returns
    -------
    None -- the statistics are printed, not returned.
    """
    # Explicit percentile list is clearer than the original [*range(0, 125, 25), ].
    percs = np.percentile(col, [0, 25, 50, 75, 100])
    std = np.std(col)  # population standard deviation (ddof=0)
    stderr = std / np.sqrt(len(col))
    mean = np.mean(col)
    print(f'Min = {percs[0]:.2f}, 25% = {percs[1]:.2f}, 50% (Median) = {percs[2]:.2f}'
          f', 75% = {percs[3]:.2f}, Max = {percs[4]:.2f},\n Range = {percs[4]-percs[0]:.2f}'
          f', Interquartile Range = {percs[3] - percs[1]:.2f}'
          f',\n Standard Deviation = {std:.2f}, Standard Error = {stderr:.2f}'
          f', Mean = {mean:.2f} +/- {stderr:.2f}\n')
# Report the summary statistics for both columns.
for column in (column1, column2):
    basic_stats(column)
Min = 2.42, 25% = 7.93, 50% (Median) = 9.96, 75% = 11.91, Max = 17.56,
Range = 15.13, Interquartile Range = 3.97,
Standard Deviation = 3.02, Standard Error = 0.43, Mean = 9.93 +/- 0.43
Min = 3.15, 25% = 3.92, 50% (Median) = 4.48, 75% = 5.55, Max = 6.28,
Range = 3.13, Interquartile Range = 1.63,
Standard Deviation = 0.97, Standard Error = 0.14, Mean = 4.63 +/- 0.14
In [8]:
# Hand-rolled population variance and standard deviation, checked against
# the numpy built-ins (which should print identical values).
sigma2 = sum((d - mean_a) ** 2.0 for d in data) / N
print('variance: ', sigma2, np.var(data))
sigma = np.sqrt(sigma2)
print('std. dev.: ', sigma, np.std(data))
In [9]:
# Plot a Gaussian pdf evaluated by hand; this is equivalent to
# ss.norm.pdf(xvalues, mu, sigma) but shows the formula explicitly.
mu = 200.0
sigma = 40.0
xvalues = np.linspace(0, 400, 400)
norm_const = 1.0 / (sigma * np.sqrt(2.0 * np.pi))
pvalues = norm_const * np.exp(-(xvalues - mu) ** 2.0 / (2.0 * sigma ** 2.0))
plt.figure(dpi=144)
plt.plot(xvalues, pvalues)
plt.xlabel('Time since installation (days)')
plt.ylabel('Probability of failure')
plt.show()
In [10]:
In [11]:
In [12]:
# Poisson pmf evaluated by hand (equivalent to ss.poisson.pmf(xs, lambda_p)).
lambda_p = 4.5
xs = np.arange(0, 15, 1)
# FIX: the np.math alias was removed in NumPy 2.0 -- use the stdlib
# math.factorial instead of np.math.factorial.
factorialx = [math.factorial(int(x)) for x in xs]
ps = lambda_p**xs * np.exp(-lambda_p) / factorialx
plt.figure(dpi=144)
plt.plot(xs, ps, 'o')
plt.xlabel('x')
plt.ylabel('p(x)')
plt.show()
In [13]:
In [14]:
# Histogram of the Poisson sample using matplotlib's default bin edges.
plt.figure(dpi=144)
n, bins, patches = plt.hist(poisson_data)  # counts, bin edges, bar artists
# can just do plt.hist(poisson_data) if you don't care about the returned objects
plt.xlabel('x')
plt.ylabel('N(x)')
plt.show()
In [15]:
[-0.5 0.5 1.5 2.5 3.5 4.5 5.5 6.5 7.5 8.5 9.5 10.5 11.5 12.5
13.5 14.5]
In [16]:
In [17]:
# Re-bin the Poisson sample on the integer-centred edges (xbins) and
# overlay the theoretical expectation with sqrt(N) error bars.
plt.figure(dpi=144)
n, bins, patches = plt.hist(poisson_data, bins=xbins)  # n is reused for the chi-squared below
errors = np.sqrt(theoretical_ns)  # sqrt of the expected counts
plt.errorbar(xs, theoretical_ns, yerr=errors, color='red')
plt.xlabel('x')
plt.ylabel('N(x)')
plt.show()
In [18]:
# Chi-squared statistic between the observed (n) and expected
# (theoretical_ns) bin counts, accumulated term by term.
chi2 = sum((o - e) ** 2.0 / e for o, e in zip(n, theoretical_ns))
print('chi-squared: ', chi2)
chi-squared: 6.5499882913951915
In [19]:
print(ss.chisquare(n, theoretical_ns))
Power_divergenceResult(statistic=6.5499882913951915, pvalue=0.95067177752022
24)
In [20]:
No. bins: 15
In [21]:
No. bins: 15
In [22]:
# Brain weight against body weight, with measurement uncertainties on brain weight.
body = [31.4, 31.4, 64.2, 67.5, 52.7, 29.9, 77.6, 39.4, 20.0, 90.3]
brain = [3.9, 3.8, 7.6, 7.8, 5.5, 4.1, 8.8, 5.2, 3.1, 5.1]
errors = [0.2, 0.4, 0.4, 0.2, 0.6, 0.4, 0.1, 0.4, 0.6, 2.4]
plt.figure(dpi=144)
plt.errorbar(body, brain, yerr=errors, fmt='o')
plt.xlabel('Body weight (kg)')
plt.ylabel('Brain weight (kg)')
plt.show()
In [23]:
[[ 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. ]
[31.4 31.4 64.2 67.5 52.7 29.9 77.6 39.4 20. 90.3]]
[[ 1. 31.4]
[ 1. 31.4]
[ 1. 64.2]
[ 1. 67.5]
[ 1. 52.7]
[ 1. 29.9]
[ 1. 77.6]
[ 1. 39.4]
[ 1. 20. ]
[ 1. 90.3]]
Brain Weight = 2.370 + 0.062 Body Weight
Brain Weight = 2.370 + 0.062 Body Weight
In [24]:
In [25]:
In [26]:
In [27]:
# Strongly correlated sample: y = x plus a small amount of Gaussian noise.
xs = np.sort(np.random.uniform(0, 5, 1000))
noise = 0.1  # controls how much correlation there is between x and y
ys = xs + np.random.normal(1, noise, 1000)
plt.figure(dpi=144)
plt.plot(xs, ys, 'o')
plt.xlabel('x')
plt.ylabel('y')
plt.show()
In [28]:
# Sample means, needed for the Pearson correlation coefficient.
meanx, meany = np.mean(xs), np.mean(ys)
In [29]:
In [30]:
# Pearson r from the numerator/denominator computed in the previous cell.
r = numerator / denominator
print('Pearson r: ', r)
Pearson r: 0.9975360030023925
In [31]:
# Same construction with more noise: correlation should drop.
xs = np.sort(np.random.uniform(0, 5, 1000))
noise = 0.5  # controls how much correlation there is between x and y
ys = xs + np.random.normal(1, noise, 1000)
plt.figure(dpi=144)
plt.plot(xs, ys, 'o')
plt.xlabel('x')
plt.ylabel('y')
plt.show()
In [32]:
# Pearson correlation coefficient computed from the centred deviations.
meanx = xs.mean()
meany = ys.mean()
dx = xs - meanx
dy = ys - meany
numerator = np.sum(dx * dy)
denominator = np.sqrt(np.sum(dx ** 2.0) * np.sum(dy ** 2.0))
r = numerator / denominator
print('Pearson r: ', r)
Pearson r: 0.9420785286064383
In [33]:
# Noise equal to the x-range scale: correlation weakens further.
xs = np.sort(np.random.uniform(0, 5, 1000))
noise = 1.0  # controls how much correlation there is between x and y
ys = xs + np.random.normal(1, noise, 1000)
plt.figure(dpi=144)
plt.plot(xs, ys, 'o')
plt.xlabel('x')
plt.ylabel('y')
plt.show()
In [34]:
# Pearson correlation coefficient computed from the centred deviations.
meanx = xs.mean()
meany = ys.mean()
dx = xs - meanx
dy = ys - meany
numerator = np.sum(dx * dy)
denominator = np.sqrt(np.sum(dx ** 2.0) * np.sum(dy ** 2.0))
r = numerator / denominator
print('Pearson r: ', r)
Pearson r: 0.8235364616132786
In [35]:
# Heavy noise: only a weak positive correlation should remain.
xs = np.sort(np.random.uniform(0, 5, 1000))
noise = 2.0  # controls how much correlation there is between x and y
ys = xs + np.random.normal(1, noise, 1000)
plt.figure(dpi=144)
plt.plot(xs, ys, 'o')
plt.xlabel('x')
plt.ylabel('y')
plt.show()
In [36]:
# Pearson correlation coefficient computed from the centred deviations.
meanx = xs.mean()
meany = ys.mean()
dx = xs - meanx
dy = ys - meany
numerator = np.sum(dx * dy)
denominator = np.sqrt(np.sum(dx ** 2.0) * np.sum(dy ** 2.0))
r = numerator / denominator
print('Pearson r: ', r)
Pearson r: 0.5792264650420678
In [37]:
# Anti-correlated sample: y = -x plus heavy Gaussian noise.
xs = np.sort(np.random.uniform(0, 5, 1000))
noise = 2.0  # controls how much correlation there is between x and y
ys = -xs + np.random.normal(1, noise, 1000)
plt.figure(dpi=144)
plt.plot(xs, ys, 'o')
plt.xlabel('x')
plt.ylabel('y')
plt.show()
In [38]:
# Pearson r for the anti-correlated sample; expect a negative value.
meanx = xs.mean()
meany = ys.mean()
dx = xs - meanx
dy = ys - meany
numerator = np.sum(dx * dy)
denominator = np.sqrt(np.sum(dx ** 2.0) * np.sum(dy ** 2.0))
r = numerator / denominator
print('Pearson r: ', r)
Pearson r: -0.5800754774903243
In [39]: