In [ ]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
Out[ ]:
True

K-Means Clustering¶

  • Also known as the Lloyd–Forgy algorithm

Notebook Intro

This notebook explores clustering, an unsupervised learning technique for grouping similar data points without labels. It explains K-Means, Hierarchical Clustering, DBSCAN, and Gaussian Mixture Models, along with methods for choosing the number of clusters. As part of the series, it provides a comprehensive foundation in unsupervised learning, bridging mathematical intuition with practical implementation.

Introduction to Clustering¶

Clustering is an unsupervised learning technique that groups similar data points together based on feature similarity, without using labeled outputs.

In simpler terms, clustering tries to find structure in data by organizing it into groups where members of a group are more similar to each other than to those in other groups.


Types of Clustering Algorithms¶

  1. Centroid-based Clustering
    • Example: K-Means
  2. Density-based Clustering
    • Example: DBSCAN
  3. Hierarchical Clustering
    • Agglomerative (bottom-up), Divisive (top-down)
  4. Model-based Clustering
    • Example: Gaussian Mixture Models (GMM)

K-Means Clustering¶

K-Means partitions the dataset into K clusters, where each point belongs to the cluster with the nearest centroid.

Algorithm Steps:¶

  1. Initialize K cluster centroids (random or k-means++).
  2. Assign each data point to the nearest centroid.
  3. Recalculate centroids as the mean of assigned points.
  4. Repeat steps 2 and 3 until convergence.
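The four steps can be sketched in plain NumPy. This is a minimal, illustrative implementation on made-up blob data (the `kmeans` helper and the toy data here are not part of the notebook's pipeline; the notebook walks through the same logic step by step later):

```python
import numpy as np

def kmeans(X, k, n_iter=10, seed=0):
    rng = np.random.default_rng(seed)
    # Step 1: initialize centroids by picking k distinct data points
    centroids = X[rng.choice(len(X), size=k, replace=False)]
    for _ in range(n_iter):
        # Step 2: assign each point to the nearest centroid
        dists = np.linalg.norm(X[:, None] - centroids, axis=2)
        labels = dists.argmin(axis=1)
        # Step 3: recompute each centroid as the mean of its assigned points
        new_centroids = np.array([X[labels == j].mean(axis=0) for j in range(k)])
        # Step 4: stop once the centroids no longer move
        if np.allclose(new_centroids, centroids):
            break
        centroids = new_centroids
    return centroids, labels

rng = np.random.default_rng(0)
X = np.vstack([rng.normal(0, 1, (20, 2)), rng.normal(5, 1, (20, 2))])
centroids, labels = kmeans(X, k=2)
print(centroids)
```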

Objective Function¶

We minimize the within-cluster sum of squares (WCSS):

$$ J = \sum_{i=1}^{K} \sum_{\mathbf{x} \in C_i} \|\mathbf{x} - \boldsymbol{\mu}_i\|^2 $$

Where:

  • $C_i$: Cluster $ i $
  • $\boldsymbol{\mu}_i$: Centroid of cluster $i$
  • $\| \cdot \|$: Euclidean norm
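The objective $J$ can be computed directly from a fitted model and checked against scikit-learn's `inertia_` attribute, which stores exactly this quantity (a quick sanity check on synthetic blobs):

```python
import numpy as np
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs

X, _ = make_blobs(n_samples=100, centers=3, random_state=0)
km = KMeans(n_clusters=3, n_init=10, random_state=0).fit(X)

# J = sum over clusters of squared distances from each point to its centroid
J = sum(
    np.sum((X[km.labels_ == i] - km.cluster_centers_[i]) ** 2)
    for i in range(3)
)
print(J, km.inertia_)  # the two values agree
```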

Hierarchical Clustering¶

Creates a tree of clusters using a bottom-up or top-down approach.

Linkage Criteria¶

  • Single Linkage: Minimum distance between clusters
  • Complete Linkage: Maximum distance between clusters
  • Average Linkage: Mean distance between all pairs of points across clusters
  • Ward Linkage: Minimizes total within-cluster variance
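The different linkage criteria can be compared directly with SciPy (a small sketch on made-up, well-separated blobs; only Ward is used in the worked example later):

```python
import numpy as np
from scipy.cluster.hierarchy import linkage, fcluster

rng = np.random.default_rng(42)
# Two well-separated blobs of 10 points each
X = np.vstack([rng.normal(0, 0.5, (10, 2)), rng.normal(5, 0.5, (10, 2))])

cuts = {}
for method in ['single', 'complete', 'average', 'ward']:
    Z = linkage(X, method=method)                          # build the merge tree
    cuts[method] = fcluster(Z, t=2, criterion='maxclust')  # cut into 2 clusters
    print(method, cuts[method])
```

On data this clean, every linkage criterion recovers the two blobs; the criteria differ mainly on elongated or noisy clusters.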

DBSCAN¶

DBSCAN groups points that are close together and marks others as noise.

Key Parameters¶

  • $\varepsilon$: Radius of the neighborhood
  • minPts: Minimum number of points required to form a dense region

Terms¶

  • Core point: Has ≥ minPts neighbors within $ \varepsilon $
  • Border point: Close to a core point but not dense enough itself
  • Noise: Neither a core nor a border point

Gaussian Mixture Models (GMM)¶

Each cluster is modeled as a Gaussian distribution. GMM gives a soft probability assignment to each cluster.

Probability Model¶

$$ p(\mathbf{x}) = \sum_{k=1}^{K} \pi_k \cdot \mathcal{N}(\mathbf{x} | \boldsymbol{\mu}_k, \boldsymbol{\Sigma}_k) $$

Where:

  • $\pi_k$: Weight for cluster $k$
  • $\mathcal{N}$: Multivariate Normal distribution
  • $\boldsymbol{\mu}_k, \boldsymbol{\Sigma}_k$: Mean and Covariance matrix
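The soft assignments can be inspected with `predict_proba`, which returns one probability per component for each sample (a minimal sketch on synthetic blobs):

```python
import numpy as np
from sklearn.datasets import make_blobs
from sklearn.mixture import GaussianMixture

X, _ = make_blobs(n_samples=200, centers=3, random_state=42)
gmm = GaussianMixture(n_components=3, random_state=42).fit(X)

# One row per sample, one column per Gaussian component
probs = gmm.predict_proba(X[:5])
print(probs.round(3))  # each row sums to 1
```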

How to Choose K?¶

1. Elbow Method¶

Plot WCSS (Within-Cluster Sum of Squares) for increasing $K$.
Choose the "elbow point" where adding more clusters no longer significantly reduces WCSS.

2. Silhouette Score¶

Measures how well a point fits in its cluster:

$$ s(i) = \frac{b(i) - a(i)}{\max(a(i), b(i))} $$

Where:

  • $a(i)$: Average intra-cluster distance
  • $b(i)$: Distance to nearest other cluster

Closer to 1 means better clustering.
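On synthetic blobs with a known number of clusters, the score behaves as described: it peaks at the true $K$ (a quick sanity check; the blob parameters here are made up):

```python
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs
from sklearn.metrics import silhouette_score

# Three well-separated blobs
X, _ = make_blobs(n_samples=300, centers=3, cluster_std=0.5, random_state=0)

scores = {}
for k in (2, 3, 5):
    labels = KMeans(n_clusters=k, n_init=10, random_state=0).fit_predict(X)
    scores[k] = silhouette_score(X, labels)
    print(k, round(scores[k], 3))  # k=3 scores highest
```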

3. BIC / AIC (GMM Only)¶

Lower BIC (Bayesian Information Criterion) or AIC indicates a better model fit.
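`GaussianMixture` exposes `bic()` and `aic()` directly, so model selection is a short loop (a sketch on synthetic data with 3 true clusters; the blob parameters are made up):

```python
from sklearn.datasets import make_blobs
from sklearn.mixture import GaussianMixture

X, _ = make_blobs(n_samples=300, centers=3, cluster_std=0.7, random_state=42)

bics = {}
for k in range(1, 7):
    gmm = GaussianMixture(n_components=k, random_state=42).fit(X)
    bics[k] = gmm.bic(X)  # lower is better

best_k = min(bics, key=bics.get)
print(best_k, {k: round(v, 1) for k, v in bics.items()})
```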


Summary Table¶

| Algorithm    | Shape Support | Requires K? | Handles Noise? |
|--------------|---------------|-------------|----------------|
| K-Means      | Spherical     | Yes         | No             |
| Hierarchical | Varies        | Optional    | No             |
| DBSCAN       | Arbitrary     | No          | Yes            |
| GMM          | Elliptical    | Yes         | No             |

Up next: we'll demonstrate these algorithms on toy datasets (blobs, moons) and real examples.

Parameters of KMeans¶

from sklearn.cluster import KMeans
KMeans(
    n_clusters=8,
    init='k-means++',
    n_init='auto',
    max_iter=300,
    tol=1e-4,
    verbose=0,
    random_state=None,
    algorithm='lloyd'
)
| Parameter | Description |
|---|---|
| n_clusters | Number of clusters (K). |
| init | Method to initialize centroids: 'k-means++', 'random', or an ndarray of centers. |
| n_init | Number of times the algorithm is run with different centroid seeds. |
| max_iter | Maximum iterations for a single run. |
| tol | Convergence threshold for centroid changes. |
| verbose | Output verbosity level: 0 (silent), 1 (info), etc. |
| random_state | Fixes the seed for reproducibility. |
| algorithm | Variant to use: 'lloyd' (default) or 'elkan'. |

Parameters of make_blobs¶

from sklearn.datasets import make_blobs
make_blobs(
    n_samples=100,
    n_features=2,
    centers=None,
    cluster_std=1.0,
    center_box=(-10.0, 10.0),
    shuffle=True,
    random_state=None,
    return_centers=False
)
| Parameter | Description |
|---|---|
| n_samples | Total number of data points to generate (int or list). |
| n_features | Number of features (dimensions). Default is 2. |
| centers | Number of clusters, or the actual coordinates of the centers. |
| cluster_std | Standard deviation of clusters (float or list); controls spread. |
| center_box | Tuple (min, max): bounds for cluster center generation. |
| shuffle | Whether to shuffle samples after generation. |
| random_state | Controls randomness; set an integer for reproducibility. |
| return_centers | If True, returns both the data and the center locations. |

These tools are especially useful for creating synthetic datasets and benchmarking clustering performance.

Step-by-Step K-Means with Code and Explanation¶

In [ ]:
from sklearn.datasets import make_blobs
import numpy as np
import matplotlib.pyplot as plt

X, y_true = make_blobs(n_samples=6, n_features=2, centers=2, cluster_std=0.60, random_state=0)
print(X,y_true)
plt.scatter(X[:,0],X[:,1])
[[2.51189016 0.97066867]
 [2.32158546 1.09786826]
 [1.54632313 4.212973  ]
 [2.09680487 3.7174206 ]
 [2.14169366 1.77022776]
 [0.91433877 4.55014643]] [1 1 0 0 1 0]
Out[ ]:
<matplotlib.collections.PathCollection at 0x7f6ac9e90fd0>
No description has been provided for this image
In [ ]:
#manual initialization with centroids
initial_centroids = np.array([X[0],X[2]])
In [ ]:
#Assign Each Point to Nearest Centroid
def assign_clusters(X, centroids):
    distances = np.linalg.norm(X[:, np.newaxis] - centroids, axis=2)

    return np.argmin(distances, axis=1)
In [ ]:
def recalculate_centroids(X, labels, k):
    return np.array([X[labels == i].mean(axis=0) for i in range(k)])
In [ ]:
steps = []
centroids = initial_centroids.copy()

for _ in range(3):  # Run for 3 steps manually
    labels = assign_clusters(X, centroids)

    steps.append((centroids.copy(), labels.copy()))
    centroids = recalculate_centroids(X, labels, k=2)
    print(centroids)
[[2.32505643 1.27958823]
 [1.51915559 4.16018001]]
[[2.32505643 1.27958823]
 [1.51915559 4.16018001]]
[[2.32505643 1.27958823]
 [1.51915559 4.16018001]]
In [ ]:
fig, axs = plt.subplots(1, 3, figsize=(15, 4))

for i, (centroids, labels) in enumerate(steps):
    axs[i].scatter(X[:, 0], X[:, 1], c=labels, cmap='Set1', s=100)
    axs[i].scatter(centroids[:, 0], centroids[:, 1], s=200, c='black', marker='X')
    axs[i].set_title(f"Step {i + 1}")
No description has been provided for this image
In [ ]:
from sklearn.datasets import fetch_20newsgroups
import pandas as pd
newsgroup = fetch_20newsgroups(subset='all')

df = pd.DataFrame({
    'text': newsgroup.data,
    'label_name': [newsgroup.target_names[i] for i in newsgroup.target]
})

df.head()
Out[ ]:
text label_name
0 From: Mamatha Devineni Ratnam <mr47+@andrew.cm... rec.sport.hockey
1 From: mblawson@midway.ecn.uoknor.edu (Matthew ... comp.sys.ibm.pc.hardware
2 From: hilmi-er@dsv.su.se (Hilmi Eren)\nSubject... talk.politics.mideast
3 From: guyd@austin.ibm.com (Guy Dawson)\nSubjec... comp.sys.ibm.pc.hardware
4 From: Alexander Samuel McDiarmid <am2o+@andrew... comp.sys.mac.hardware
In [ ]:
import matplotlib.pyplot as plt
In [ ]:
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import  PCA
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
In [ ]:
categories = ['sci.space', 'comp.graphics', 'rec.sport.baseball']
data = fetch_20newsgroups(subset='train',categories=categories,shuffle=True, random_state=42)
In [ ]:
#custom tokenizer with stemming and stopword removal
def tokenizer(text):
  tokens = word_tokenize(text) # Use standard word_tokenize
  stop_words = set(stopwords.words('english'))
  stemmer = PorterStemmer()
  return [stemmer.stem(w.lower()) for w in tokens if w.lower() not in stop_words and w.isalpha()]
| Component | Used for | Input Type | Output Type |
|---|---|---|---|
| TfidfVectorizer | All-in-one (tokenize + TF + IDF) | Raw text | TF-IDF sparse matrix |
| TfidfTransformer | Converts word counts → TF-IDF | Count matrix | TF-IDF sparse matrix |
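The table's claim can be checked directly: with default settings, the two routes produce the same matrix (a minimal sketch on made-up toy sentences):

```python
import numpy as np
from sklearn.feature_extraction.text import (
    CountVectorizer, TfidfTransformer, TfidfVectorizer)

docs = ["the cat sat", "the dog sat", "the cat ran"]

# Route 1: all-in-one
A = TfidfVectorizer().fit_transform(docs)

# Route 2: raw counts first, then TF-IDF weighting
counts = CountVectorizer().fit_transform(docs)
B = TfidfTransformer().fit_transform(counts)

print(np.allclose(A.toarray(), B.toarray()))  # the matrices match
```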
In [ ]:
import nltk
nltk.download('punkt_tab')  # required by word_tokenize in newer NLTK releases
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
Out[ ]:
True
In [ ]:
vectorizer = TfidfVectorizer(tokenizer=tokenizer, token_pattern=None)  # token_pattern=None avoids the unused-pattern warning
X_tfidf = vectorizer.fit_transform(data.data)

print(X_tfidf)
<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 152147 stored elements and shape (1774, 15860)>
  Coords	Values
  (0, 7571)	0.2513694474224105
  (0, 7255)	0.2513694474224105
  (0, 13454)	0.018289081701086417
  (0, 5664)	0.1033029948584861
  (0, 7885)	0.4471501010254352
  (0, 12300)	0.3419980156026723
  (0, 9918)	0.018843497233258066
  (0, 13747)	0.1350272552792699
  (0, 14672)	0.03339797285995738
  (0, 13829)	0.0626457725497478
  (0, 7987)	0.018361350252307326
  (0, 3708)	0.04478025836436145
  (0, 6681)	0.14244283976839994
  (0, 773)	0.033658741890354456
  (0, 12778)	0.23906191097098523
  (0, 7374)	0.10441176351998434
  (0, 15600)	0.03005171376110715
  (0, 2984)	0.0749766220466511
  (0, 6537)	0.17863727195945153
  (0, 3684)	0.07715936464775287
  (0, 14107)	0.09145058397373186
  (0, 12764)	0.11953095548549261
  (0, 5701)	0.11300769829506672
  (0, 8598)	0.06500491997471924
  (0, 6985)	0.09353511663818827
  :	:
  (1773, 6255)	0.08668605103131068
  (1773, 8018)	0.09296545282811268
  (1773, 13175)	0.0871736891964625
  (1773, 1183)	0.08485904528330765
  (1773, 2463)	0.12230775088728253
  (1773, 1503)	0.09676657624529794
  (1773, 1643)	0.09296545282811268
  (1773, 14232)	0.1099713394221031
  (1773, 10352)	0.11394277686283492
  (1773, 10723)	0.1099713394221031
  (1773, 15207)	0.1016063653976555
  (1773, 15724)	0.12627918832801435
  (1773, 7047)	0.31194874303768333
  (1773, 13775)	0.11394277686283492
  (1773, 9553)	0.11631932581107389
  (1773, 3910)	0.11906285022741785
  (1773, 15775)	0.11906285022741785
  (1773, 20)	0.24461550177456506
  (1773, 15287)	0.11184651212682134
  (1773, 4541)	0.10672643876223842
  (1773, 12059)	0.11906285022741785
  (1773, 8641)	0.12230775088728253
  (1773, 1515)	0.13139926169259727
  (1773, 12455)	0.1386155997931938
  (1773, 6097)	0.1386155997931938
In [ ]:
k = 3
kmeans = KMeans(n_clusters=k,random_state=42)
| Method | Purpose | Output |
|---|---|---|
| fit_predict(X) | Fit the model and return cluster labels | Array of cluster labels, one per sample |
| fit_transform(X) | Fit the model and return transformed data (not labels!) | Distance of each sample to every centroid |
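The distinction is easy to verify on a small blob dataset: `transform` returns per-centroid distances, and the assigned label is simply the nearest column (a quick sketch):

```python
import numpy as np
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs

X, _ = make_blobs(n_samples=50, centers=3, random_state=0)
km = KMeans(n_clusters=3, n_init=10, random_state=0)

labels = km.fit_predict(X)   # shape (50,): one cluster index per sample
dists = km.transform(X)      # shape (50, 3): distance to each centroid

# The assigned label is the column with the smallest distance
print((dists.argmin(axis=1) == labels).all())
```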
In [ ]:
labels = kmeans.fit_predict(X_tfidf)
print (labels)
[2 2 0 ... 1 0 0]
In [ ]:
# Silhouette score
score = silhouette_score(X_tfidf, labels)
print(f"Silhouette Score: {score:.2f}")
Silhouette Score: 0.01
| Score Range | Meaning |
|---|---|
| +1 | Perfect clustering (tight, well-separated) |
| ~0 | Clusters are overlapping or ambiguous |
| < 0 | Bad clustering (samples likely in the wrong cluster) |
In [ ]:
# PCA for visualization
pca = PCA(n_components=2)
X_reduced = pca.fit_transform(X_tfidf.toarray())
print(X_reduced.shape)
(1774, 2)
In [ ]:
# Plot clusters
plt.figure(figsize=(8, 6))
for i in range(k):
    plt.scatter(X_reduced[labels == i, 0], X_reduced[labels == i, 1], label=f'Cluster {i}', alpha=0.6)
plt.title(f'KMeans Clustering with Stemming & Stopwords Removal\nSilhouette Score: {score:.2f}')
plt.xlabel('PCA 1')
plt.ylabel('PCA 2')
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()
No description has been provided for this image

inertia_ is the sum of squared distances between each sample and its closest centroid — the WCSS plotted by the elbow method.

In [ ]:
new_sentences = [
    "NASA is planning a new mission to Mars in 2027.",
    "The 3D rendering in this graphics software is mind-blowing.",
    "The Yankees won the baseball game in extra innings.",
    "Astrophysicists discovered a new black hole near the Milky Way.",
    "Photoshop is the most powerful image editing tool I’ve used.",
    "The pitcher threw a perfect game in last night’s match.",
    "SpaceX successfully launched another satellite.",
    "OpenGL is essential for real-time rendering in game engines.",
    "The outfielder made an incredible diving catch!",
    "Astronauts train for months before heading to the International Space Station."
]
In [ ]:
X_new_tfidf = vectorizer.transform(new_sentences)
new_labels = kmeans.predict(X_new_tfidf)
In [ ]:
print(X_new_tfidf,new_labels)
<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 55 stored elements and shape (10, 15860)>
  Coords	Values
  (0, 8385)	0.5109977249080925
  (0, 8894)	0.485571061109194
  (0, 9278)	0.4088928091742114
  (0, 9419)	0.32955019295137306
  (0, 10593)	0.4767655720407878
  (1, 5644)	0.4425646909777223
  (1, 11558)	0.7476830495436253
  (1, 12882)	0.4950823686266135
  (2, 1118)	0.35035753107350825
  (2, 4607)	0.5438945169808079
  (2, 5279)	0.32334150055135846
  (2, 6759)	0.5023282449705079
  (2, 15722)	0.4738617553087975
  (3, 853)	0.5277569199288241
  (3, 1415)	0.3069822919642613
  (3, 3649)	0.36842339756469267
  (3, 6237)	0.3183021847179204
  (3, 8822)	0.4807879891046514
  (3, 9330)	0.29728551498190403
  (3, 9419)	0.19029100696746076
  (3, 15260)	0.1855693670848088
  (4, 4084)	0.45548404184039276
  (4, 6537)	0.3061474773342483
  (4, 10486)	0.563871825364051
  (4, 10799)	0.3481439700265573
  :	:
  (5, 9501)	0.3603913463404528
  (5, 10353)	0.4156151458698583
  (5, 10560)	0.34178734553312423
  (5, 14052)	0.44571367451779403
  (6, 564)	0.43302155344916404
  (6, 7751)	0.4609564825768136
  (6, 12148)	0.5428992452541547
  (6, 13495)	0.5525141309698365
  (7, 4285)	0.31490143130065223
  (7, 4433)	0.4494977962241765
  (7, 5279)	0.26722330780691167
  (7, 9869)	0.6270547299745431
  (7, 11558)	0.48392446635368047
  (8, 2063)	0.42107404752469385
  (8, 3718)	0.561948553310125
  (8, 6642)	0.4618776793169979
  (8, 8272)	0.2832816418889835
  (8, 9995)	0.4618776793169979
  (9, 846)	0.44352515663784514
  (9, 5990)	0.38918660969657093
  (9, 6855)	0.36578164787583006
  (9, 9024)	0.37734029119808743
  (9, 12975)	0.24570069490880947
  (9, 13235)	0.35651285177117714
  (9, 14271)	0.43378227720984586 [1 2 0 2 2 0 1 0 2 1]
In [ ]:
for i,(sent,label) in enumerate(zip(new_sentences,new_labels),1):
  print(f"{i:02d}. Cluster {label}: {sent}")
01. Cluster 1: NASA is planning a new mission to Mars in 2027.
02. Cluster 2: The 3D rendering in this graphics software is mind-blowing.
03. Cluster 0: The Yankees won the baseball game in extra innings.
04. Cluster 2: Astrophysicists discovered a new black hole near the Milky Way.
05. Cluster 2: Photoshop is the most powerful image editing tool I’ve used.
06. Cluster 0: The pitcher threw a perfect game in last night’s match.
07. Cluster 1: SpaceX successfully launched another satellite.
08. Cluster 0: OpenGL is essential for real-time rendering in game engines.
09. Cluster 2: The outfielder made an incredible diving catch!
10. Cluster 1: Astronauts train for months before heading to the International Space Station.
In [ ]:
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans

inertias = []
K = range(1, 11)

for k in K:
    model = KMeans(n_clusters=k, random_state=42)
    model.fit(X_tfidf)
    inertias.append(model.inertia_)

plt.plot(K, inertias, marker='o')
plt.title("Elbow Method for Optimal k")
plt.xlabel("Number of Clusters (k)")
plt.ylabel("Inertia (WCSS)")
plt.grid(True)
plt.show()
No description has been provided for this image
In [ ]:
for k in range(2, 11):
    kmeans = KMeans(n_clusters=k, random_state=42)
    labels = kmeans.fit_predict(X_tfidf)
    score = silhouette_score(X_tfidf, labels)
    print(f"k = {k}, Silhouette Score = {score:.3f}")
k = 2, Silhouette Score = 0.007
k = 3, Silhouette Score = 0.008
k = 4, Silhouette Score = 0.008
k = 5, Silhouette Score = 0.008
k = 6, Silhouette Score = 0.008
k = 7, Silhouette Score = 0.007
k = 8, Silhouette Score = 0.007
k = 9, Silhouette Score = 0.007
k = 10, Silhouette Score = 0.008

Parameters of DBSCAN¶

| Parameter | Type | Default | What it does |
|---|---|---|---|
| eps | float | 0.5 | Maximum distance between two samples for one to be considered a neighbor of the other. |
| min_samples | int | 5 | Minimum number of points (including the point itself) required to form a core point. |
| metric | str | 'euclidean' | Distance metric ('manhattan', 'cosine', 'haversine', etc.). |
| metric_params | dict or None | None | Additional keyword arguments for the metric function. |
| algorithm | str | 'auto' | Neighbor-search algorithm: 'auto', 'ball_tree', 'kd_tree', or 'brute'. |
| leaf_size | int | 30 | Leaf size for BallTree or KDTree (only relevant if one is used). |
| n_jobs | int or None | None | Parallel jobs; -1 uses all processors (only applies to some metrics). |

Parameters of make_moons¶

| Parameter | Type | Default | Description |
|---|---|---|---|
| n_samples | int or tuple | 100 | Total number of samples (or a tuple like (n1, n2) for uneven moons). |
| noise | float | 0.0 | Standard deviation of Gaussian noise added to the data. |
| shuffle | bool | True | Whether to shuffle the samples. |
| random_state | int, RandomState, or None | None | Ensures reproducibility if set. |
In [ ]:
from sklearn.datasets import make_moons
from sklearn.cluster import DBSCAN
import matplotlib.pyplot as plt
import numpy as np

X,y_true = make_moons(n_samples=300, noise=0.1,random_state=42)
plt.scatter(X[:,0],X[:,1], c=y_true)
Out[ ]:
<matplotlib.collections.PathCollection at 0x7f6ac766c610>
No description has been provided for this image
In [ ]:
from sklearn.datasets import make_moons
from sklearn.cluster import DBSCAN
import matplotlib.pyplot as plt
from matplotlib.patches import Circle
import numpy as np

# Generate a small set of points
X, _ = make_moons(n_samples=30, noise=0, random_state=42)
# Alternative dataset:
# from sklearn.datasets import make_blobs
# X, _ = make_blobs(n_samples=50, centers=2, cluster_std=0.3, random_state=0)

# Updated DBSCAN params
eps = 0.7
min_samples = 10
db = DBSCAN(eps=eps, min_samples=min_samples)
labels = db.fit_predict(X)

# Core point mask
core_mask = np.zeros_like(labels, dtype=bool)
core_mask[db.core_sample_indices_] = True

# Plotting
plt.figure(figsize=(8, 6))
ax = plt.gca()

colors = plt.get_cmap('tab10', len(set(labels)))

for i, (point, label) in enumerate(zip(X, labels)):
    if core_mask[i]:
        circle = Circle(point, eps, color='black', fill=False, linestyle='--', linewidth=2)
        ax.add_patch(circle)
    color = 'red' if label == -1 else colors(label)
    plt.plot(point[0], point[1], 'o', color=color, markersize=10)

plt.title("DBSCAN: Core Points with ε-Circles")
plt.xlabel("X1")
plt.ylabel("X2")
plt.axis('equal')
plt.grid(True)
plt.tight_layout()
plt.show()
No description has been provided for this image

Let us visualize DBSCAN¶

In [ ]:
from sklearn.datasets import make_moons
from sklearn.cluster import DBSCAN
import matplotlib.pyplot as plt
from matplotlib.patches import Circle
import numpy as np
In [ ]:
X,y_true = make_moons(n_samples=30,noise=0.02,random_state=42)
for value,color,label in zip([0,1],['green','red'],['Class 0','Class 1']):
  plt.scatter(
      X[y_true==value,0],X[y_true==value,1],c=color, s=80,label=label
  )
plt.legend()
plt.grid(True)
No description has been provided for this image
In [ ]:
eps = 0.7
min_samples=10
db=DBSCAN(eps=eps,min_samples=min_samples)
labels=db.fit_predict(X)
print(labels,db.core_sample_indices_)
[-1  0  1  0  0  0 -1  1 -1  1 -1  0  0 -1 -1  1  1  1 -1  1  1 -1  0 -1
  1  0  0 -1  1  0] [1 9]
In [ ]:
core_mask=np.zeros_like(labels,dtype=bool)
core_mask[db.core_sample_indices_] = True
print(core_mask)
[False  True False False False False False False False  True False False
 False False False False False False False False False False False False
 False False False False False False]
In [ ]:
plt.figure(figsize=(8,6))
ax=plt.gca()
#gathering colors that are of the length of the number of clusters
colors = plt.get_cmap('tab10', len(set(labels)))
for i, (point,label) in enumerate(zip(X,labels)):
  if core_mask[i]:
    circle = Circle(point, eps, color=colors(label), fill=False, linestyle='--', linewidth=2)
    ax.add_patch(circle)
  color = 'red' if label==-1 else colors(label)
  plt.plot(point[0], point[1], 'o', color=color, markersize=10)

plt.title("DBSCAN: Core Points with ε-Circles")
plt.xlabel("X1")
plt.ylabel("X2")
plt.axis('equal')
plt.grid(True)
plt.tight_layout()
plt.show()
No description has been provided for this image

Explanation¶

DBSCAN: Core Points with ε-Circles¶

This plot demonstrates how DBSCAN identifies clusters based on density:

  • Each point on the plot represents a 2D data sample from make_moons().
  • Colors indicate the cluster assignment given by DBSCAN:
    • 🔴 Red points are noise points — they do not belong to any cluster.
    • 🔵🟤 Other colors represent points belonging to detected clusters.
  • Dashed circles (ε-circles) are drawn only around core points:
    • A core point is one that has at least min_samples neighbors within a radius eps.
    • These circles help visualize the density neighborhood for each core point.

This visual is helpful to:

  • Understand how eps and min_samples influence the clustering.
  • See why certain points are treated as noise.
  • Observe the shape and density of clusters in 2D space.

Drawing Circle Basics¶

In [ ]:
fig, ax = plt.subplots()

center = (0, 0)
circle = Circle(center, 2.0, fill=False, color='blue')
ax.add_patch(circle)

ax.set_xlim(-5, 5)
ax.set_ylim(-5, 5)
ax.set_aspect('equal')
plt.grid(True)
No description has been provided for this image
In [ ]:
import matplotlib.pyplot as plt
import numpy as np
from sklearn.datasets import make_blobs
from sklearn.preprocessing import StandardScaler
from scipy.cluster.hierarchy import dendrogram, linkage

# Step 1: Generate simple 2D blob data
X, y_true = make_blobs(n_samples=15, centers=3, cluster_std=1.0, random_state=42)

# Step 2: Scale the data for better cluster performance
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Step 3: Compute linkage matrix using Ward's method
linked = linkage(X_scaled, method='ward')  # 'ward' minimizes within-cluster variance

# Step 4: Plot dendrogram
plt.figure(figsize=(10, 5))
dendrogram(linked,
           orientation='top',
           distance_sort='ascending',
           show_leaf_counts=True)
plt.title("Dendrogram - Hierarchical Clustering")
plt.xlabel("Sample index")
plt.ylabel("Distance")
plt.tight_layout()
plt.show()
No description has been provided for this image
| Parameter | Purpose |
|---|---|
| n_clusters | Number of clusters to form |
| metric | Metric used to compute linkage ('euclidean', 'manhattan', etc.) |
| linkage | Method used to merge clusters ('ward', 'complete', 'average', 'single') |
| distance_threshold | Cut the tree at a given distance instead of setting n_clusters |
In [ ]:
from sklearn.datasets import load_iris
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import AgglomerativeClustering
from scipy.cluster.hierarchy import linkage,dendrogram,fcluster
import matplotlib.pyplot as plt
In [ ]:
iris=load_iris()
X=iris.data
feature_names = iris.feature_names
In [ ]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
In [ ]:
linked = linkage(X_scaled,method='ward')
print(linked)
[[1.01000000e+02 1.42000000e+02 0.00000000e+00 2.00000000e+00]
 [7.00000000e+00 3.90000000e+01 1.21167870e-01 2.00000000e+00]
 [1.00000000e+01 4.80000000e+01 1.21167870e-01 2.00000000e+00]
 [0.00000000e+00 1.70000000e+01 1.31632184e-01 2.00000000e+00]
 [9.00000000e+00 3.40000000e+01 1.31632184e-01 2.00000000e+00]
 [1.28000000e+02 1.32000000e+02 1.31632184e-01 2.00000000e+00]
 [1.27000000e+02 1.38000000e+02 1.33836265e-01 2.00000000e+00]
 [2.00000000e+00 4.70000000e+01 1.33836265e-01 2.00000000e+00]
 [1.90000000e+01 4.60000000e+01 1.43378956e-01 2.00000000e+00]
 [8.00000000e+01 8.10000000e+01 1.43378956e-01 2.00000000e+00]
 [1.00000000e+00 2.50000000e+01 1.66143388e-01 2.00000000e+00]
 [1.20000000e+02 1.43000000e+02 1.66143388e-01 2.00000000e+00]
 [1.10000000e+01 2.40000000e+01 1.70512281e-01 2.00000000e+00]
 [4.00000000e+01 1.53000000e+02 1.72216546e-01 3.00000000e+00]
 [3.00000000e+01 1.54000000e+02 1.72216546e-01 3.00000000e+00]
 [2.90000000e+01 1.57000000e+02 1.78366645e-01 3.00000000e+00]
 [4.00000000e+00 3.70000000e+01 1.78909711e-01 2.00000000e+00]
 [8.80000000e+01 9.50000000e+01 1.87721011e-01 2.00000000e+00]
 [1.36000000e+02 1.48000000e+02 2.11968529e-01 2.00000000e+00]
 [6.30000000e+01 7.80000000e+01 2.11968529e-01 2.00000000e+00]
 [1.30000000e+01 3.80000000e+01 2.11968529e-01 2.00000000e+00]
 [6.50000000e+01 8.60000000e+01 2.15410004e-01 2.00000000e+00]
 [2.80000000e+01 1.51000000e+02 2.19891525e-01 3.00000000e+00]
 [5.00000000e+00 1.60000000e+01 2.27349708e-01 2.00000000e+00]
 [5.50000000e+01 9.90000000e+01 2.27349708e-01 2.00000000e+00]
 [8.20000000e+01 9.20000000e+01 2.37109773e-01 2.00000000e+00]
 [6.60000000e+01 8.40000000e+01 2.42335741e-01 2.00000000e+00]
 [7.40000000e+01 9.70000000e+01 2.42335741e-01 2.00000000e+00]
 [2.70000000e+01 1.63000000e+02 2.43550974e-01 4.00000000e+00]
 [3.50000000e+01 4.90000000e+01 2.56734344e-01 2.00000000e+00]
 [5.70000000e+01 9.30000000e+01 2.60138817e-01 2.00000000e+00]
 [1.16000000e+02 1.37000000e+02 2.60138817e-01 2.00000000e+00]
 [1.20000000e+01 4.50000000e+01 2.63264369e-01 2.00000000e+00]
 [1.23000000e+02 1.26000000e+02 2.66275604e-01 2.00000000e+00]
 [1.12000000e+02 1.39000000e+02 2.66275604e-01 2.00000000e+00]
 [1.49000000e+02 1.56000000e+02 2.66393946e-01 3.00000000e+00]
 [1.60000000e+02 1.82000000e+02 2.69311017e-01 4.00000000e+00]
 [9.60000000e+01 1.74000000e+02 2.73790782e-01 3.00000000e+00]
 [9.10000000e+01 1.69000000e+02 2.85172982e-01 3.00000000e+00]
 [2.00000000e+01 3.10000000e+01 2.86757912e-01 2.00000000e+00]
 [5.80000000e+01 7.50000000e+01 2.88512661e-01 2.00000000e+00]
 [6.90000000e+01 8.90000000e+01 2.95330787e-01 2.00000000e+00]
 [2.30000000e+01 2.60000000e+01 2.97034895e-01 2.00000000e+00]
 [3.00000000e+00 1.65000000e+02 3.02369406e-01 4.00000000e+00]
 [5.10000000e+01 5.60000000e+01 3.12923646e-01 2.00000000e+00]
 [9.00000000e+01 9.40000000e+01 3.12923646e-01 2.00000000e+00]
 [5.00000000e+01 5.20000000e+01 3.12923646e-01 2.00000000e+00]
 [1.07000000e+02 1.30000000e+02 3.12923646e-01 2.00000000e+00]
 [8.00000000e+00 1.70000000e+02 3.13931404e-01 3.00000000e+00]
 [2.10000000e+01 4.40000000e+01 3.23540478e-01 2.00000000e+00]
 [1.41000000e+02 1.45000000e+02 3.39039313e-01 2.00000000e+00]
 [1.58000000e+02 1.99000000e+02 3.44969764e-01 4.00000000e+00]
 [6.70000000e+01 1.75000000e+02 3.46072006e-01 3.00000000e+00]
 [6.80000000e+01 1.19000000e+02 3.73482180e-01 2.00000000e+00]
 [5.40000000e+01 1.33000000e+02 3.73482180e-01 2.00000000e+00]
 [1.40000000e+02 2.00000000e+02 3.84142001e-01 3.00000000e+00]
 [3.60000000e+01 1.89000000e+02 3.89997733e-01 3.00000000e+00]
 [1.47000000e+02 1.81000000e+02 3.92089474e-01 3.00000000e+00]
 [6.00000000e+00 1.62000000e+02 3.92644341e-01 3.00000000e+00]
 [1.17000000e+02 1.31000000e+02 3.96370000e-01 2.00000000e+00]
 [4.20000000e+01 1.93000000e+02 4.04654616e-01 5.00000000e+00]
 [1.05000000e+02 1.35000000e+02 4.05896672e-01 2.00000000e+00]
 [1.11000000e+02 1.83000000e+02 4.14770417e-01 3.00000000e+00]
 [7.10000000e+01 7.30000000e+01 4.19071824e-01 2.00000000e+00]
 [1.10000000e+02 1.15000000e+02 4.28423754e-01 2.00000000e+00]
 [6.10000000e+01 1.88000000e+02 4.30794355e-01 4.00000000e+00]
 [1.21000000e+02 1.50000000e+02 4.35072663e-01 3.00000000e+00]
 [1.64000000e+02 1.86000000e+02 4.43084504e-01 7.00000000e+00]
 [1.72000000e+02 1.78000000e+02 4.48838821e-01 7.00000000e+00]
 [3.20000000e+01 3.30000000e+01 4.53522824e-01 2.00000000e+00]
 [1.24000000e+02 1.61000000e+02 4.59801248e-01 3.00000000e+00]
 [6.40000000e+01 1.67000000e+02 4.60021939e-01 3.00000000e+00]
 [4.30000000e+01 1.92000000e+02 4.65750432e-01 3.00000000e+00]
 [1.59000000e+02 1.91000000e+02 4.65980168e-01 4.00000000e+00]
 [8.30000000e+01 1.34000000e+02 4.66629038e-01 2.00000000e+00]
 [7.90000000e+01 2.02000000e+02 4.74023652e-01 4.00000000e+00]
 [1.02000000e+02 1.84000000e+02 4.76690610e-01 3.00000000e+00]
 [1.03000000e+02 2.07000000e+02 4.83131455e-01 4.00000000e+00]
 [1.77000000e+02 2.13000000e+02 4.83438815e-01 4.00000000e+00]
 [1.00000000e+02 1.68000000e+02 4.84986754e-01 3.00000000e+00]
 [1.44000000e+02 2.20000000e+02 4.94292271e-01 4.00000000e+00]
 [1.71000000e+02 1.96000000e+02 5.00396942e-01 4.00000000e+00]
 [9.80000000e+01 1.80000000e+02 5.14670029e-01 3.00000000e+00]
 [7.20000000e+01 1.46000000e+02 5.29587579e-01 2.00000000e+00]
 [7.60000000e+01 1.90000000e+02 5.30862879e-01 3.00000000e+00]
 [1.80000000e+01 1.73000000e+02 5.35878632e-01 3.00000000e+00]
 [5.30000000e+01 2.23000000e+02 5.37369874e-01 5.00000000e+00]
 [8.70000000e+01 2.03000000e+02 5.42339169e-01 3.00000000e+00]
 [1.25000000e+02 1.29000000e+02 5.42394971e-01 2.00000000e+00]
 [2.20000000e+01 1.66000000e+02 5.60796927e-01 3.00000000e+00]
 [7.00000000e+01 8.50000000e+01 5.70110887e-01 2.00000000e+00]
 [1.04000000e+02 1.55000000e+02 5.70260277e-01 3.00000000e+00]
 [7.70000000e+01 2.27000000e+02 5.87105860e-01 5.00000000e+00]
 [5.90000000e+01 1.95000000e+02 6.15111476e-01 3.00000000e+00]
 [1.18000000e+02 1.22000000e+02 6.17112198e-01 2.00000000e+00]
 [1.87000000e+02 2.21000000e+02 6.28642029e-01 6.00000000e+00]
 [1.52000000e+02 2.01000000e+02 6.40184657e-01 6.00000000e+00]
 [1.13000000e+02 2.16000000e+02 6.68799582e-01 4.00000000e+00]
 [1.40000000e+01 2.35000000e+02 6.90546663e-01 4.00000000e+00]
 [2.04000000e+02 2.34000000e+02 7.07198886e-01 5.00000000e+00]
 [2.05000000e+02 2.26000000e+02 7.14312345e-01 6.00000000e+00]
 [1.94000000e+02 2.40000000e+02 7.58877498e-01 4.00000000e+00]
 [1.79000000e+02 2.08000000e+02 7.92138167e-01 5.00000000e+00]
 [2.15000000e+02 2.28000000e+02 8.02273104e-01 8.00000000e+00]
 [1.08000000e+02 2.33000000e+02 8.04683460e-01 3.00000000e+00]
 [1.76000000e+02 2.45000000e+02 8.17160958e-01 8.00000000e+00]
 [2.18000000e+02 2.22000000e+02 8.32534580e-01 1.00000000e+01]
 [1.14000000e+02 2.47000000e+02 8.42610909e-01 5.00000000e+00]
 [6.20000000e+01 2.37000000e+02 8.67420340e-01 4.00000000e+00]
 [2.14000000e+02 2.30000000e+02 9.01895536e-01 6.00000000e+00]
 [1.50000000e+01 2.19000000e+02 9.09316787e-01 3.00000000e+00]
 [2.06000000e+02 2.56000000e+02 9.28556520e-01 1.30000000e+01]
 [1.97000000e+02 2.38000000e+02 9.29822482e-01 4.00000000e+00]
 [2.12000000e+02 2.24000000e+02 9.30361790e-01 5.00000000e+00]
 [2.25000000e+02 2.43000000e+02 9.61515979e-01 7.00000000e+00]
 [2.11000000e+02 2.44000000e+02 1.04912784e+00 4.00000000e+00]
 [2.54000000e+02 2.63000000e+02 1.05469485e+00 8.00000000e+00]
 [2.10000000e+02 2.52000000e+02 1.05589471e+00 1.00000000e+01]
 [2.41000000e+02 2.42000000e+02 1.12252423e+00 8.00000000e+00]
 [6.00000000e+01 2.32000000e+02 1.14816058e+00 4.00000000e+00]
 [2.50000000e+02 2.59000000e+02 1.18433800e+00 1.20000000e+01]
 [2.39000000e+02 2.46000000e+02 1.19173432e+00 9.00000000e+00]
 [1.09000000e+02 2.09000000e+02 1.20174967e+00 3.00000000e+00]
 [1.98000000e+02 2.17000000e+02 1.32966730e+00 1.00000000e+01]
 [1.85000000e+02 2.51000000e+02 1.33208869e+00 7.00000000e+00]
 [2.36000000e+02 2.64000000e+02 1.36861817e+00 1.20000000e+01]
 [2.49000000e+02 2.53000000e+02 1.43563131e+00 1.30000000e+01]
 [2.48000000e+02 2.60000000e+02 1.45868403e+00 7.00000000e+00]
 [1.06000000e+02 2.75000000e+02 1.53655871e+00 1.30000000e+01]
 [2.62000000e+02 2.65000000e+02 1.65405823e+00 8.00000000e+00]
 [2.67000000e+02 2.73000000e+02 1.75300281e+00 2.00000000e+01]
 [2.29000000e+02 2.70000000e+02 1.80586052e+00 1.50000000e+01]
 [2.61000000e+02 2.71000000e+02 2.01333091e+00 2.20000000e+01]
 [4.10000000e+01 2.69000000e+02 2.01509807e+00 5.00000000e+00]
 [2.57000000e+02 2.66000000e+02 2.01781256e+00 1.30000000e+01]
 [2.31000000e+02 2.76000000e+02 2.14172453e+00 1.70000000e+01]
 [2.68000000e+02 2.74000000e+02 2.30338940e+00 1.50000000e+01]
 [2.58000000e+02 2.78000000e+02 2.69536867e+00 1.70000000e+01]
 [2.85000000e+02 2.86000000e+02 2.86888385e+00 3.20000000e+01]
 [2.55000000e+02 2.87000000e+02 3.44191215e+00 2.50000000e+01]
 [2.84000000e+02 2.88000000e+02 3.93266869e+00 4.50000000e+01]
 [2.72000000e+02 2.81000000e+02 3.95076988e+00 1.80000000e+01]
 [2.77000000e+02 2.82000000e+02 4.06134011e+00 2.90000000e+01]
 [2.79000000e+02 2.91000000e+02 4.22703313e+00 2.60000000e+01]
 [2.83000000e+02 2.89000000e+02 4.24348495e+00 3.00000000e+01]
 [2.80000000e+02 2.92000000e+02 6.60781224e+00 4.90000000e+01]
 [2.90000000e+02 2.93000000e+02 8.00474726e+00 7.10000000e+01]
 [2.94000000e+02 2.96000000e+02 1.26368435e+01 1.01000000e+02]
 [2.95000000e+02 2.97000000e+02 2.72499115e+01 1.50000000e+02]]
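The four columns of each linkage row have a fixed meaning: the indices of the two clusters being merged, the distance at which they merge, and the size of the newly formed cluster. Original points are numbered `0..n-1`, and each merge creates a new cluster index `n, n+1, …`. A minimal sketch on three 1-D points makes this concrete:

```python
# Each linkage row reads: [idx_left, idx_right, merge_distance, new_cluster_size]
import numpy as np
from scipy.cluster.hierarchy import linkage

pts = np.array([[0.0], [0.1], [5.0]])   # two close points and one far point
Z = linkage(pts, method='ward')

first = Z[0]  # the first (closest) merge: points 0 and 1
print(f"Merged clusters {int(first[0])} and {int(first[1])} "
      f"at distance {first[2]:.2f}, new size {int(first[3])}")
```

The first merge joins points 0 and 1 (0.1 apart) into a new cluster of size 2; the second row then merges that cluster (index 3) with the far point.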
In [ ]:
plt.figure(figsize=(10, 5))
dendrogram(linked, orientation='top', distance_sort='ascending', show_leaf_counts=False)
plt.title("Dendrogram – Iris Data")
plt.xlabel("Sample Index")
plt.ylabel("Distance")
Out[ ]:
Text(0, 0.5, 'Distance')
[Figure: dendrogram of the Iris data]
In [ ]:
# Cut the dendrogram into 3 clusters
aglo = AgglomerativeClustering(n_clusters=3, metric='euclidean', linkage='ward')
labels = aglo.fit_predict(X_scaled)
print(labels)
[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 2 1 1 1 1 1 1 1 1 0 0 0 2 0 2 0 2 0 2 2 0 2 0 2 0 2 2 2 2 0 0 0 0
 0 0 0 0 0 2 2 2 2 0 2 0 0 2 2 2 2 0 2 2 2 2 2 0 2 2 0 0 0 0 0 0 2 0 0 0 0
 0 0 0 0 0 0 0 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0]
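Since Iris ships with known species labels, we can sanity-check the clustering against them. The Adjusted Rand Index compares two labelings while ignoring the arbitrary numbering of clusters, which matters here because the cluster ids above are not aligned with the species ids. A self-contained check:

```python
from sklearn.datasets import load_iris
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import adjusted_rand_score

iris = load_iris()
Xs = StandardScaler().fit_transform(iris.data)
pred = AgglomerativeClustering(n_clusters=3, linkage='ward').fit_predict(Xs)

# ARI is permutation-invariant: 1.0 = perfect match, ~0.0 = random labeling
ari = adjusted_rand_score(iris.target, pred)
print(f"Adjusted Rand Index vs. true species: {ari:.3f}")
```

The score lands well above chance but below 1.0, consistent with the familiar overlap between versicolor and virginica.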
In [ ]:
df = pd.DataFrame(data=iris.data,columns=iris.feature_names)
df['cluster']=labels
df[df['cluster']==2].head()
Out[ ]:
sepal length (cm) sepal width (cm) petal length (cm) petal width (cm) cluster
41 4.5 2.3 1.3 0.3 2
53 5.5 2.3 4.0 1.3 2
55 5.7 2.8 4.5 1.3 2
57 4.9 2.4 3.3 1.0 2
59 5.2 2.7 3.9 1.4 2
In [ ]:
plt.figure(figsize=(8,6))

# Only 3 clusters were formed above, so iterate over ids 0..2
for cluster_id in range(3):
  plt.scatter(X[labels == cluster_id, 0], X[labels == cluster_id, 1], label=f"Cluster {cluster_id}")
plt.legend()
[Figure: scatter plot of the 3 clusters on the first two features]
In [ ]:
from sklearn.decomposition import PCA

X_2d = PCA(n_components=2).fit_transform(X_scaled)
print(X_2d)
# Plot every cluster in the 2-D PCA projection (not just the last cluster_id
# left over from the previous loop)
for cluster_id in range(3):
    plt.scatter(X_2d[labels == cluster_id, 0], X_2d[labels == cluster_id, 1], label=f"Cluster {cluster_id}")
plt.legend()
[[-2.26470281  0.4800266 ]
 [-2.08096115 -0.67413356]
 [-2.36422905 -0.34190802]
 [-2.29938422 -0.59739451]
 [-2.38984217  0.64683538]
 [-2.07563095  1.48917752]
 [-2.44402884  0.0476442 ]
 [-2.23284716  0.22314807]
 [-2.33464048 -1.11532768]
 [-2.18432817 -0.46901356]
 [-2.1663101   1.04369065]
 [-2.32613087  0.13307834]
 [-2.2184509  -0.72867617]
 [-2.6331007  -0.96150673]
 [-2.1987406   1.86005711]
 [-2.26221453  2.68628449]
 [-2.2075877   1.48360936]
 [-2.19034951  0.48883832]
 [-1.898572    1.40501879]
 [-2.34336905  1.12784938]
 [-1.914323    0.40885571]
 [-2.20701284  0.92412143]
 [-2.7743447   0.45834367]
 [-1.81866953  0.08555853]
 [-2.22716331  0.13725446]
 [-1.95184633 -0.62561859]
 [-2.05115137  0.24216355]
 [-2.16857717  0.52714953]
 [-2.13956345  0.31321781]
 [-2.26526149 -0.3377319 ]
 [-2.14012214 -0.50454069]
 [-1.83159477  0.42369507]
 [-2.61494794  1.79357586]
 [-2.44617739  2.15072788]
 [-2.10997488 -0.46020184]
 [-2.2078089  -0.2061074 ]
 [-2.04514621  0.66155811]
 [-2.52733191  0.59229277]
 [-2.42963258 -0.90418004]
 [-2.16971071  0.26887896]
 [-2.28647514  0.44171539]
 [-1.85812246 -2.33741516]
 [-2.5536384  -0.47910069]
 [-1.96444768  0.47232667]
 [-2.13705901  1.14222926]
 [-2.0697443  -0.71105273]
 [-2.38473317  1.1204297 ]
 [-2.39437631 -0.38624687]
 [-2.22944655  0.99795976]
 [-2.20383344  0.00921636]
 [ 1.10178118  0.86297242]
 [ 0.73133743  0.59461473]
 [ 1.24097932  0.61629765]
 [ 0.40748306 -1.75440399]
 [ 1.0754747  -0.20842105]
 [ 0.38868734 -0.59328364]
 [ 0.74652974  0.77301931]
 [-0.48732274 -1.85242909]
 [ 0.92790164  0.03222608]
 [ 0.01142619 -1.03401828]
 [-0.11019628 -2.65407282]
 [ 0.44069345 -0.06329519]
 [ 0.56210831 -1.76472438]
 [ 0.71956189 -0.18622461]
 [-0.0333547  -0.43900321]
 [ 0.87540719  0.50906396]
 [ 0.35025167 -0.19631173]
 [ 0.15881005 -0.79209574]
 [ 1.22509363 -1.6222438 ]
 [ 0.1649179  -1.30260923]
 [ 0.73768265  0.39657156]
 [ 0.47628719 -0.41732028]
 [ 1.2341781  -0.93332573]
 [ 0.6328582  -0.41638772]
 [ 0.70266118 -0.06341182]
 [ 0.87427365  0.25079339]
 [ 1.25650912 -0.07725602]
 [ 1.35840512  0.33131168]
 [ 0.66480037 -0.22592785]
 [-0.04025861 -1.05871855]
 [ 0.13079518 -1.56227183]
 [ 0.02345269 -1.57247559]
 [ 0.24153827 -0.77725638]
 [ 1.06109461 -0.63384324]
 [ 0.22397877 -0.28777351]
 [ 0.42913912  0.84558224]
 [ 1.04872805  0.5220518 ]
 [ 1.04453138 -1.38298872]
 [ 0.06958832 -0.21950333]
 [ 0.28347724 -1.32932464]
 [ 0.27907778 -1.12002852]
 [ 0.62456979  0.02492303]
 [ 0.33653037 -0.98840402]
 [-0.36218338 -2.01923787]
 [ 0.28858624 -0.85573032]
 [ 0.09136066 -0.18119213]
 [ 0.22771687 -0.38492008]
 [ 0.57638829 -0.1548736 ]
 [-0.44766702 -1.54379203]
 [ 0.25673059 -0.5988518 ]
 [ 1.84456887  0.87042131]
 [ 1.15788161 -0.69886986]
 [ 2.20526679  0.56201048]
 [ 1.44015066 -0.04698759]
 [ 1.86781222  0.29504482]
 [ 2.75187334  0.8004092 ]
 [ 0.36701769 -1.56150289]
 [ 2.30243944  0.42006558]
 [ 2.00668647 -0.71143865]
 [ 2.25977735  1.92101038]
 [ 1.36417549  0.69275645]
 [ 1.60267867 -0.42170045]
 [ 1.8839007   0.41924965]
 [ 1.2601151  -1.16226042]
 [ 1.4676452  -0.44227159]
 [ 1.59007732  0.67624481]
 [ 1.47143146  0.25562182]
 [ 2.42632899  2.55666125]
 [ 3.31069558  0.01778095]
 [ 1.26376667 -1.70674538]
 [ 2.0377163   0.91046741]
 [ 0.97798073 -0.57176432]
 [ 2.89765149  0.41364106]
 [ 1.33323218 -0.48181122]
 [ 1.7007339   1.01392187]
 [ 1.95432671  1.0077776 ]
 [ 1.17510363 -0.31639447]
 [ 1.02095055  0.06434603]
 [ 1.78834992 -0.18736121]
 [ 1.86364755  0.56229073]
 [ 2.43595373  0.25928443]
 [ 2.30492772  2.62632347]
 [ 1.86270322 -0.17854949]
 [ 1.11414774 -0.29292262]
 [ 1.2024733  -0.81131527]
 [ 2.79877045  0.85680333]
 [ 1.57625591  1.06858111]
 [ 1.3462921   0.42243061]
 [ 0.92482492  0.0172231 ]
 [ 1.85204505  0.67612817]
 [ 2.01481043  0.61388564]
 [ 1.90178409  0.68957549]
 [ 1.15788161 -0.69886986]
 [ 2.04055823  0.8675206 ]
 [ 1.9981471   1.04916875]
 [ 1.87050329  0.38696608]
 [ 1.56458048 -0.89668681]
 [ 1.5211705   0.26906914]
 [ 1.37278779  1.01125442]
 [ 0.96065603 -0.02433167]]
Out[ ]:
<matplotlib.collections.PathCollection at 0x7f6ac42bf050>
[Figure: clusters in the 2-D PCA projection]
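Before trusting the 2-D picture, it is worth checking how much of the original variance the two principal components actually retain. A quick check, recomputing the PCA from scratch so the snippet stands alone:

```python
from sklearn.datasets import load_iris
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

X = StandardScaler().fit_transform(load_iris().data)
pca = PCA(n_components=2).fit(X)

# Fraction of total variance captured by each of the two components
print("Explained variance ratio:", pca.explained_variance_ratio_.round(3))
print("Total:", pca.explained_variance_ratio_.sum().round(3))
```

For standardized Iris the first two components retain well over 90% of the variance, so the 2-D scatter is a faithful summary of cluster separation.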
In [ ]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from scipy.cluster.hierarchy import linkage, dendrogram, fcluster

# Step 1: Create synthetic customer data
np.random.seed(42)
n = 200
age = np.random.normal(40, 12, n).clip(18, 70).astype(int)
income = np.random.normal(60, 20, n).clip(15, 120)
spending_score = np.random.normal(50, 25, n).clip(1, 100)

df = pd.DataFrame({
    'Age': age,
    'Annual Income (k$)': income,
    'Spending Score (1-100)': spending_score
})

# Step 2: Standardize features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(df)

# Step 3: Compute linkage matrix using Ward's method
linked = linkage(X_scaled, method='ward')

# Step 4: Display dendrogram to decide number of clusters
plt.figure(figsize=(10, 5))
dendrogram(linked, orientation='top', distance_sort='ascending', show_leaf_counts=False)
plt.title("Dendrogram – Market Segmentation")
plt.xlabel("Customer Index")
plt.ylabel("Distance")
plt.show()

# Step 5: Cut dendrogram – form 4 clusters
num_clusters = 4
df['Cluster'] = fcluster(linked, num_clusters, criterion='maxclust') - 1  # 0-index

# Step 6: Analyze each segment
summary = df.groupby('Cluster').mean().round(2)
print("\nCluster Profile:")
display(summary)

# Step 7: Visualize segments in 2D
plt.figure(figsize=(8, 6))
for cluster in sorted(df['Cluster'].unique()):
    subset = df[df['Cluster'] == cluster]
    plt.scatter(subset['Annual Income (k$)'], subset['Spending Score (1-100)'],
                label=f'Segment {cluster}', alpha=0.6)
plt.title("Customer Segments by Income & Spending Score")
plt.xlabel("Annual Income (k$)")
plt.ylabel("Spending Score (1-100)")
plt.legend()
plt.grid(True)
plt.show()
df.head(100)
[Figure: dendrogram – market segmentation]
Cluster Profile:
Age Annual Income (k$) Spending Score (1-100)
Cluster
0 30.56 51.55 58.93
1 51.97 55.46 23.93
2 37.41 81.83 23.65
3 44.78 70.09 56.48
[Figure: customer segments by income and spending score]
Out[ ]:
Age Annual Income (k$) Spending Score (1-100) Cluster
0 45 67.155747 10.139309 1
1 38 71.215691 35.015624 2
2 47 81.661025 50.131092 3
3 58 81.076041 51.174515 1
4 37 32.446613 38.748363 0
... ... ... ... ...
95 22 46.141808 63.472751 0
96 43 77.991998 24.068846 2
97 43 66.145990 45.241533 3
98 40 76.257242 28.109544 2
99 37 72.592577 15.430007 2

100 rows × 4 columns
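The choice of 4 segments above was read off the dendrogram by eye; a silhouette sweep gives a quantitative cross-check. The sketch below regenerates the same synthetic customers (same seed) and scores several cut levels. Since this data is pure Gaussian noise with no planted structure, all scores will be modest; the point is the procedure, not the specific values.

```python
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score
from scipy.cluster.hierarchy import linkage, fcluster

np.random.seed(42)
n = 200
df = pd.DataFrame({
    'Age': np.random.normal(40, 12, n).clip(18, 70).astype(int),
    'Annual Income (k$)': np.random.normal(60, 20, n).clip(15, 120),
    'Spending Score (1-100)': np.random.normal(50, 25, n).clip(1, 100),
})
Xs = StandardScaler().fit_transform(df)
Z = linkage(Xs, method='ward')

# Score each candidate cut of the dendrogram; higher silhouette = tighter,
# better-separated clusters (range is -1 to 1)
scores = {}
for k in range(2, 7):
    labels = fcluster(Z, k, criterion='maxclust')
    scores[k] = silhouette_score(Xs, labels)
    print(f"k={k}: silhouette = {scores[k]:.3f}")
```

Picking the k that maximizes the silhouette (or the k just before a sharp drop) is a common complement to eyeballing the dendrogram.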