# Basic Calculations
1+2
5/6
# Numbers
a = 123.1
print(a)
b = 10
print(b)
a + b
c = a + b
print(c)
Boolean and None
Multiple Assignments
# Boolean
a = True
b = False
print(a, b)
# No value
a = None
print(a)
# Multiple Assignments
a, b, c = 1, 2, 3
print(a, b, c)
Strings
# Strings
data = 'hello world'
print(data)
Functions
# Sum function
def mysum(x, y):
    return x + y  # the body specifies what to do with the parameters defined above
# Be very careful with the correct indentation!
# Recommendation: use 4 spaces.
locals()
# locals() will give you a dictionary of all local variables.
Loops
# For-Loop
myrange = range(10)  # the range defines the domain/data we are looking at
for i in myrange:
    print(i)
# While-Loop
i = 0
while i < 10:  # the loop keeps running as long as this condition on i holds
    print(i)
    i += 1     # shorthand for i = i + 1; without it the loop would print 0 forever
If Statements
# If Statement (Boolean)
is_customer = True
if is_customer:
    print("Yes")
Appendix:
Arithmetic Expressions
• An arithmetic expression consists of operands
and operators combined in a manner that is
already familiar to you from learning algebra
Arithmetic Expressions (continued)
• Precedence rules:
– ** has the highest precedence and is evaluated first
– Unary negation is evaluated next
– *, /, and % are evaluated before + and -
– + and - are evaluated before =
– With two exceptions, operations of equal precedence
are left associative, so they are evaluated from left to
right
» ** and = are right associative
– You can use () to change the order of evaluation
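A few quick checks of these rules in Python (results shown as comments):
print(2 ** 3 ** 2)   # 512: ** is right associative, so this is 2 ** (3 ** 2)
print(-3 ** 2)       # -9: ** is evaluated before unary negation
print(7 + 3 * 2)     # 13: * before +
print((7 + 3) * 2)   # 20: parentheses change the order of evaluation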
Mathematical Constants
# e, pi
import math
math.e
math.pi
Lists
emptylist = []
mylist = [1, 2, 3]
print("Zeroth Value: ", mylist[0])
mylist.sort()          # sort the list in place
print(mylist)
for i in range(len(mylist)):
    print(mylist[i])
Slicing
# You can select sections of most sequence types by using
# slice notation, which in its basic form consists of
# start:stop passed to the indexing operator []:
seq = [7, 2, 3, 7, 5, 6, 0, 1]
seq[1:5]
Slicing (Cont’d)
# Slices can also be assigned to with a sequence.
seq[3:4] = [6, 3]
seq
seq[3:5] = [0, 1]
seq
# Check again!
Slicing (Cont’d)
# Either the start or stop can be omitted, in which case
# they default to the start of the sequence and the end
# of the sequence, respectively:
seq[:5]
seq[3:]
Slicing (Cont’d)
# Negative indices slice the sequence relative to the end:
seq[-4:]
seq[-6:-2]
Data Sets: Lists in Lists – Not quite!
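The code below refers to myarray (a NumPy array) and myseries (a pandas Series) that were presumably created on earlier slides; a plausible setup, where the values and the index labels other than 'Ann' are assumptions:
import numpy as np
import pandas as pd
myarray = np.array([[1, 2, 3], [3, 4, 5]])                    # 2-D array built from the nested lists
myseries = pd.Series([1, 2, 3], index=['Ann', 'Ben', 'Cat'])  # Series with string labels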
# access values
dataset = [[1, 2, 3], [3, 4, 5]]  # a nested list holding 2 rows
print(dataset)
# NumPy arrays support 2-D indexing: here row 0, column 2
print("Specific row and col: ", myarray[0, 2])
# pandas Series can be indexed by position or by label
print(myseries[0])       # gives 1
print(myseries['Ann'])
pandas Series slicing
# Try some numerical slicing, e.g.,
print(myseries[:])
print(myseries[1:3])
print(myseries[1:])
print(myseries[:1])
print(myseries[:-1])
print(myseries[-1:])
Rows and Columns – to DataFrame
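mydataframe is assumed to have been built on an earlier slide; a plausible construction consistent with the label slicing below (the 'Ann'/'Ben'/'Cat' labels and column names are assumptions):
import pandas as pd
mydataframe = pd.DataFrame({'Age': [25, 30, 35], 'Score': [88, 92, 79]},
                           index=['Ann', 'Ben', 'Cat'])
print(mydataframe)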
# rows in a DataFrame can be retrieved as a DataFrame
print(mydataframe[0:2])
print(mydataframe['Ann':'Ben'])
print(mydataframe.iloc[0:2])         # rows 0 and 1, selected by integer position
print(mydataframe.loc['Ann':'Ben'])  # rows labelled 'Ann' through 'Ben', selected by label
Reading Data from CSV file
import pandas as pd
# load CSV using pandas
from pandas import read_csv
filename = 'OnlineRetail.csv'
sales = read_csv(filename, encoding = "ISO-8859-1")
print(sales.shape)    # number of rows and columns
print(sales.head())   # without an argument, head() shows the first 5 rows
print(sales.dtypes)   # data types of the columns
print(sales.info())   # full summary, including non-null counts (and hence missing data)
print(sales.count())
Reading Data from CSV file with new column names
# Load CSV using pandas
# first attempt
mynames = ["InvNo", "SKU", "Descr", "Qty", "InvDate", "UnitP", "CusID", "Cntry"]  # new column names
sales = read_csv(filename, names=mynames, encoding="ISO-8859-1")
print(sales.shape)
print(sales.head())
print(sales.dtypes)
# second attempt
# header=0 tells read_csv that the first row of the file already contains header
# names, so they are replaced by mynames instead of being read in as a data row
sales = read_csv(filename, header=0, names=mynames, encoding="ISO-8859-1")
print(sales.shape)
print(sales.head())
Changing Column Types (Part I)
You can attempt to change the type of a column:
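A minimal sketch on the sales DataFrame (the exact column and target type from the original slide are not shown here, so these are assumptions):
# convert a column with astype()
sales['CusID'] = sales['CusID'].astype(str)
# or coerce to numeric, turning unparseable values into NaN
sales['Qty'] = pd.to_numeric(sales['Qty'], errors='coerce')
print(sales.dtypes)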
Creating/Deleting Columns
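prodsold comes from slides not included here; a plausible setup (an assumption) so the calculations below can run:
prodsold = sales[['SKU', 'Descr', 'Qty', 'UnitP']].copy()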
# Revenue?
# assign() adds Rev as a new column holding this calculation
prodsold = prodsold.assign(Rev = prodsold.UnitP * prodsold.Qty)
print(prodsold.shape)
print(prodsold.head())
# Revenue in CAD
prodsold = prodsold.assign(inCAD = prodsold.Rev)
prodsold.inCAD *= 1.5929
print(prodsold.head())
# rename columns:
prodsold.rename(columns={'Rev': 'RevInEuro', 'UnitP': 'UnitP_in_Euro'}, inplace=True)
print(prodsold)
# remove full-stops from the description ('.' is a regex metacharacter, so disable regex):
prodsold['Descr'] = prodsold['Descr'].str.replace('.', '', regex=False)
print(prodsold)
Sorting
The default order for sorting is ascending.
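For example, assuming the prodsold DataFrame from before:
sorted = prodsold.sort_values(['UnitP', 'Qty'])   # ascending by default
print(sorted.head(10))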
Sorting (cont'd)
# Now in descending order (both columns)
sorted = prodsold.sort_values(['UnitP', 'Qty'], ascending=False)  # ascending=False gives descending order
print(sorted.head(10))
Filtering Data – Single Condition
# Create a Boolean array for all products
# with price > 9.99
atleast10 = prodsold.UnitP > 9.99
print(atleast10)
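The Boolean Series can then be used to filter the rows themselves; a likely follow-up step, sketched here:
expensive = prodsold[atleast10]
print(expensive.head())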
Missing Values?!
# Several columns are missing data:
print(sales.shape)
print(sales.count())
Missing Values: Series Example
import pandas as pd
from numpy import nan as NA
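# hypothetical example Series with some missing values (the original myseries
# for this slide is assumed to be defined elsewhere):
myseries = pd.Series([1.0, NA, 3.5, NA, 7.0])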
print(myseries.isnull())
# dropping the missing values:
print(myseries.dropna())
# which is equivalent to
print(myseries[myseries.notnull()])
Missing Values: DataFrame Example
mydf = pd.DataFrame([[1., 6.5, 3.], [1., NA, NA], [NA,
NA, NA], [NA, 6.5, 3.]])
print(mydf)
cleaned = mydf.dropna()
print(cleaned)
# Passing how='all' will only drop rows that are all NA:
cleaned = mydf.dropna(how='all')
print(cleaned)
Missing Values: DataFrame Example
Dropping Columns
# modify the example: add another column that is entirely NA
mydf[3] = NA
print(mydf)
cleaned = mydf.dropna(axis=1)
print(cleaned)
Missing Values: DataFrame Example
Dropping Threshold
import numpy as np
mydf = pd.DataFrame(np.random.randn(7, 3))
mydf.iloc[:4, 1] = NA
mydf.iloc[:2, 2] = NA
print(mydf)
cleaned = mydf.dropna()
print(cleaned)
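The slide title suggests a threshold; a minimal sketch using the thresh argument (the exact value is an assumption):
# keep only rows with at least 2 non-NA values
cleaned = mydf.dropna(thresh=2)
print(cleaned)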
First, Read Data from CSV file again
import pandas as pd
# load CSV using pandas
from pandas import read_csv
filename = 'OnlineRetail.csv'
names = ["InvNo", "SKU", "Descr", "Qty", "InvDate",
"UnitP", "CusID", "Cntry"]
sales = read_csv(filename, header=0, names=names,
encoding = "ISO-8859-1")
# drop some column
sales.drop('Descr', axis=1, inplace=True)
print(sales.shape)
print(sales.head())
print(sales.dtypes)
Grouping Data by Column(s) and
Aggregate – Step by Step
# group by customer ID
by_customer = sales.groupby('CusID')
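A simple aggregate on the grouped object, for example (the chosen column and function are assumptions):
qty_by_customer = by_customer['Qty'].sum()
print(qty_by_customer)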
Grouping Data by Column(s) and
Aggregate – Step by Step (cont'd)
# group by customer ID and StockCode
by_customer_SKU = sales.groupby(['CusID','SKU'])
count_by_customer_SKU = by_customer_SKU['SKU'].count()
print(count_by_customer_SKU)
Grouping Data:
Series vs DataFrame
Important note:
♦ If you aggregate only one column, a Series will
be returned.
♦ If you aggregate multiple columns, a DataFrame
will be returned.
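A quick way to see the difference, reusing the grouped object from above:
print(type(by_customer_SKU['Qty'].count()))             # one column -> Series
print(type(by_customer_SKU[['Qty', 'UnitP']].count()))  # multiple columns -> DataFrame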
Grouping Data by Column(s) and
Aggregate – More Concisely
# group by Country and aggregate all columns
print(sales.groupby('Cntry').count())
Multiple Aggregates of Multiple Columns
# group by Country
by_country = sales.groupby('Cntry')
# select Qty and UnitP
by_country_sub = by_country[['Qty','UnitP']]
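Several aggregates can then be applied at once; the particular functions here are assumptions, and the aggregated object discussed on the next slide is presumably built this way:
aggregated = by_country_sub.agg(['sum', 'mean'])
print(aggregated.head())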
Multiple-level Column Index
Have a closer look at aggregated itself:
print(aggregated.head())
Multiple-level Row Index
You can also create a multi-level row index, even while
loading the data set:
sales = read_csv(filename, header=0, names=names,
encoding = "ISO-8859-1",
index_col=['Cntry','CusID','InvNo','SKU'])
print(sales.head(12))
Multiple-level Row Index – Flattened
If you would rather flatten it, you can reset the index:
by_country_customer = sales.groupby(level=['Cntry','CusID'])
ccc = by_country_customer.count()
ccc = ccc.reset_index()
print(ccc)
Parsing Dates
# parsing InvDate as a date, i.e. transforming the column into a datetime type
sales = read_csv(filename, header=0, names=names, encoding="ISO-8859-1",
                 parse_dates=['InvDate'])
print(sales.head())
Grouping by Weekday
# parsing InvDate as date
sales = read_csv(filename, header=0, names=names,
encoding = "ISO-8859-1", parse_dates=['InvDate'])
print(sales.head())
# grouping by weekday
by_day = sales.groupby(sales.InvDate.dt.strftime('%a'))
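Then aggregate per weekday, for example (the chosen aggregate is an assumption):
print(by_day['Qty'].sum())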
Detecting Outliers with Z-Scores
# import zscore
from scipy.stats import zscore
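A minimal sketch applying it to the whole Qty column (the column choice is an assumption):
sales['Qty_z'] = zscore(sales['Qty'])
print(sales[['Qty', 'Qty_z']].head())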
Detecting Outliers with Z-Scores (cont'd)
# But what are the Z-Scores with respect to individual customers?
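A sketch of how the standardized DataFrame used below might be computed, grouping by customer and transforming within each group (an assumption about the original slide):
by_customer = sales.groupby('CusID')
standardized = by_customer[['Qty', 'UnitP']].transform(zscore)
print(standardized.head())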
Detecting Outliers with Z-Scores (cont'd)
# construct a Boolean Series to identify outliers:
outliers = ((standardized['Qty'] < -3) |
(standardized['Qty'] > 3))
# filter by outliers:
sales_outliers = sales[outliers]
print(sales_outliers.head())
Descriptive Statistics
Numerical Summaries
First, Read Data from CSV file
# Load CSV using pandas
import numpy as np
import pandas as pd
from pandas import read_csv
filename = 'https://raw.githubusercontent.com/GerhardTrippen/DataSets/master/AirBnB.csv'
visits = read_csv(filename, parse_dates=['ts_min', 'ts_max'])
print(visits.shape)
print(visits.head())
print(visits.dtypes)
How much Time did Visitors spend?
visits['ts_diff'] = visits.ts_max - visits.ts_min
by_visitor = visits.groupby('id_visitor')
visit_mean_time = by_visitor['ts_diff'].mean()
# Hmm, NOT implemented (yet)!?
visit_mean_time = by_visitor['ts_diff'].sum() /
by_visitor['ts_diff'].count()
print(visit_mean_time)
How much Time did Visitors spend? (cont'd)
# Now in minutes (represented as float64,
# not as timedelta64 as before)
visits['ts_mins'] = visits.ts_diff /
np.timedelta64(1,'m')
print(visits.head())
visit_mean_time = by_visitor['ts_mins'].mean()
print(visit_mean_time)
Visitors' Totals
# selecting multiple columns from a groupby requires a list of column names
visitors = by_visitor[['ts_mins', 'did_search', 'sent_message', 'sent_booking_request']].sum()
print(visitors.head())
Descriptive Statistics
pd.set_option('display.width', 100)
pd.set_option('display.precision', 3)
description = visitors.describe()
print(description)
description = visitors.describe(percentiles=[.05, .25, .5, .75, .95])
print(description)
print(visitors.mode())
Descriptive Statistics (cont'd)
# counting categories
booking_request_class_counts =
visitors.groupby('sent_booking_request').size()
print(booking_request_class_counts)
# correlation coefficients
correlations = visitors.corr(method = 'pearson')
print(correlations)
# skewness
skew = visitors.skew()
print(skew)