
6. Implement clustering algorithms for unsupervised classification.

In [14]: ## Load the relevant libraries

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.cluster import KMeans
# from sklearn.metrics import silhouette_score
# import scipy.cluster.hierarchy as shc
# from sklearn.cluster import AgglomerativeClustering

import warnings
warnings.filterwarnings('ignore')

In [15]: ## Load the dataset into the environment

df = pd.read_csv('Mall_Customers.xls')

## Check a few records in the dataset that we just loaded

df.head(5)

Out[15]:
   CustomerID  Gender  Age  Annual Income (k$)  Spending Score (1-100)
0           1    Male   19                  15                      39
1           2    Male   21                  15                      81
2           3  Female   20                  16                       6
3           4  Female   23                  16                      77
4           5  Female   31                  17                      40

In [16]: ## Rename a few variables

df.rename(columns = {'Annual Income (k$)' : 'Income(k$/yr)', 'Spending Score (1-100)' : 'SpendScore(1-100)'}, inplace=True)

## Drop CustomerID variable, as it is redundant in our analysis

df = df.drop(columns=['CustomerID'])

print(df.columns) ## Check the column names

Index(['Gender', 'Age', 'Income(k$/yr)', 'SpendScore(1-100)'], dtype='object')


In [17]: ## Let's start analysis by creating a subset dataframe

X = df[['Income(k$/yr)', 'SpendScore(1-100)']].copy()
print(X.head(5))

Income(k$/yr) SpendScore(1-100)
0 15 39
1 15 81
2 16 6
3 16 77
4 17 40

In [18]: ## Scale the features

scaler = StandardScaler()
X_kmeans = scaler.fit_transform(X)
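
MinMaxScaler is imported above but not used; StandardScaler is applied instead. As a quick sanity check (a minimal sketch, not part of the original run), the standardised features should have roughly zero mean and unit standard deviation, and the already-imported MinMaxScaler would instead rescale each feature to the [0, 1] range:

## Sanity check on the standardised features (illustrative sketch)
print(X_kmeans.mean(axis=0).round(3))  ## expected to be close to [0. 0.]
print(X_kmeans.std(axis=0).round(3))   ## expected to be close to [1. 1.]

## Alternative: rescale to [0, 1] with the already-imported MinMaxScaler
## X_minmax = MinMaxScaler().fit_transform(X)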

In [19]: ## Applying the elbow method to determine the number of clusters

wcss = []

for cluster in range(1,11):
    kmeans = KMeans(n_clusters = cluster, init = 'k-means++', random_state = 42)
    kmeans.fit(X_kmeans)
    wcss.append(kmeans.inertia_)

wcss

Out[19]: [400.0,
269.69101219276394,
157.70400815035947,
108.92131661364357,
65.5684081557168,
55.05734827038599,
44.86475569922556,
37.228187677585886,
32.39226763033116,
29.981897788243693]
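
The silhouette score (imported but commented out at the top) gives a second check on the choice of k: it is highest when clusters are compact and well separated. A minimal sketch, assuming the X_kmeans array prepared above; not part of the original run:

from sklearn.metrics import silhouette_score

## Silhouette score for k = 2..10 (higher is better); illustrative sketch
for k in range(2, 11):
    labels = KMeans(n_clusters=k, init='k-means++', random_state=42).fit_predict(X_kmeans)
    print(k, round(silhouette_score(X_kmeans, labels), 3))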


In [20]: ## Plotting the Elbow Plot to determine the ideal number of clusters

plt.figure(figsize=(8,5))

plt.plot(range(1,11), wcss, 'o--')


plt.title('Elbow Plot to Determine the Number of Clusters', fontsize=14)
plt.xlabel('No of Clusters')
plt.ylabel('WCSS')

plt.show()

In [21]: ## Model fitting with 5 clusters

clusters = KMeans(n_clusters = 5, init = 'k-means++', random_state = 42)


y_kmeans = clusters.fit_predict(X_kmeans)
y_kmeans

Out[21]: array([4, 2, 4, 2, 4, 2, 4, 2, 4, 2, 4, 2, 4, 2, 4, 2, 4, 2, 4, 2, 4, 2,
4, 2, 4, 2, 4, 2, 4, 2, 4, 2, 4, 2, 4, 2, 4, 2, 4, 2, 4, 2, 4, 0,
4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 0, 1, 3, 1, 3, 1,
0, 1, 3, 1, 3, 1, 3, 1, 3, 1, 0, 1, 3, 1, 3, 1, 3, 1, 3, 1, 3, 1,
3, 1, 3, 1, 3, 1, 3, 1, 3, 1, 3, 1, 3, 1, 3, 1, 3, 1, 3, 1, 3, 1,
3, 1, 3, 1, 3, 1, 3, 1, 3, 1, 3, 1, 3, 1, 3, 1, 3, 1, 3, 1, 3, 1,
3, 1])
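
The fitted centroids are in standardised units; scaler.inverse_transform maps them back to the original scales, which makes the five segments easier to interpret. A minimal sketch, not part of the original output:

## Cluster centres expressed in the original Income/SpendScore units (illustrative sketch)
centres = scaler.inverse_transform(clusters.cluster_centers_)
print(pd.DataFrame(centres, columns=['Income(k$/yr)', 'SpendScore(1-100)']))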


In [22]: ## Creating a New Dataframe with an appended cluster column

df_Kmeans = df.copy() ## making a copy of original dataframe


df_Kmeans['Cluster'] = y_kmeans ## appending the cluster column
print(df_Kmeans.head(5))

   Gender  Age  Income(k$/yr)  SpendScore(1-100)  Cluster
0    Male   19             15                 39        4
1    Male   21             15                 81        2
2  Female   20             16                  6        4
3  Female   23             16                 77        2
4  Female   31             17                 40        4
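
A per-cluster summary helps describe each segment, for example the average age, income and spending score along with cluster sizes. A minimal sketch using the dataframe built above; not part of the original run:

## Average profile and size of each cluster (illustrative sketch)
print(df_Kmeans.groupby('Cluster')[['Age', 'Income(k$/yr)', 'SpendScore(1-100)']].mean().round(1))
print(df_Kmeans['Cluster'].value_counts().sort_index())  ## cluster sizes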

In [23]: ## Visualising the new dataframe with cluster numbers through scatterplot

plt.figure(figsize=(8,5))
plt.title('KMeans Cluster Diagram : Scatterplot', fontsize=14)
sns.scatterplot(data=df_Kmeans, x='SpendScore(1-100)', y='Income(k$/yr)', hue='Cluster')
plt.legend(bbox_to_anchor=(1.02, 1), loc='best', borderaxespad=0)
plt.show()
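
The commented-out imports at the top (scipy.cluster.hierarchy and AgglomerativeClustering) point at a second unsupervised algorithm. Below is a minimal sketch of how hierarchical clustering could be run on the same scaled features for comparison with KMeans; illustrative only, not part of the original notebook:

import scipy.cluster.hierarchy as shc
from sklearn.cluster import AgglomerativeClustering

## Dendrogram on the scaled features using Ward linkage
plt.figure(figsize=(8, 5))
plt.title('Dendrogram (Ward Linkage)', fontsize=14)
shc.dendrogram(shc.linkage(X_kmeans, method='ward'))
plt.show()

## Agglomerative clustering with 5 clusters, for comparison with the KMeans labels
agg = AgglomerativeClustering(n_clusters=5, linkage='ward')
y_agg = agg.fit_predict(X_kmeans)
print(pd.Series(y_agg).value_counts().sort_index())  ## cluster sizes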
