Machine-Learning-Based-Commit-Quality-Classification-System/step4_kmedoids_clustering.py at main · HarnoorSingh200/Machine-Learning-Based-Commit-Quality-Classification-System · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
import pandas as pd
from KMedoids import KMedoids   # using custom implementation
import numpy as np

# Spreadsheet read by this step, and the spreadsheet it writes at the end.
INPUT_FILE = "data/normalized_for_clustering.xlsx"
OUTPUT_FILE = "data/clustered_commits.xlsx"

# Upper bound on the number of rows handed to clustering
# (limited due to laptop processing capacity; choose your number).
SAMPLE_SIZE = 10000

print("Loading normalized dataset...")
df = pd.read_excel(INPUT_FILE)

# Down-sample only when the dataset exceeds the cap. The fixed random_state
# makes the selection repeatable, and reset_index(drop=True) renumbers the
# sampled rows from 0.
row_count = len(df)
if row_count > SAMPLE_SIZE:
    print(f"Sampling dataset from {row_count} rows down to {SAMPLE_SIZE} rows...")
    sampled = df.sample(n=SAMPLE_SIZE, random_state=42)
    df = sampled.reset_index(drop=True)

# Scaled feature columns that make up each row's clustering vector.
FEATURES_SCALED = [
    "READABILITY_SCALED",
    "Entropy_SCALED",
    "LOC_SCALED",
    "FILES CHANGED_SCALED",
]

print("Preparing clustering input...")
data = df.loc[:, FEATURES_SCALED].values

# The custom KMedoids is given a plain list holding one tuple per row.
data_list = list(map(tuple, data))

# Fit the custom K-Medoids model on the prepared row tuples.
print("Running K-Medoids clustering with k = 3")
kmedoids = KMedoids(n_cluster=3, max_iter=100, tol=0.001)
kmedoids.fit(data_list)

# `clusters` is looked up by medoid below; `medoids` is frozen into a list so
# that every later loop walks the clusters in the same order.
clusters = kmedoids.clusters
medoids = [*kmedoids.medoids]

print("\nCluster Medoids (row indices):", medoids)
print("Cluster sizes:")
for cluster_number, medoid in enumerate(medoids, start=1):
    member_count = len(clusters[medoid])
    print(f"  Cluster {cluster_number}: {member_count} commits")

# Assign cluster labels to dataframe.
# Every row starts at the sentinel -1; each row listed under a medoid in
# `clusters` then receives that medoid's position in `medoids` as its label.
cluster_labels = [-1] * len(df)
for cluster_index, medoid in enumerate(medoids):
    for row_index in clusters[medoid]:
        cluster_labels[row_index] = cluster_index

df["CLUSTER"] = cluster_labels

# NOTE(review): cluster numbers simply follow the order of `medoids`; nothing
# in this script ranks clusters by quality, so confirm that this mapping
# matches the medoids' feature values before relying on the label names.
CLUSTER_MAP = {
    0: "LOW_QUALITY",
    1: "MEDIUM_QUALITY",
    2: "HIGH_QUALITY",
}
df["QUALITY_LABEL"] = df["CLUSTER"].map(CLUSTER_MAP)

# Series.map yields NaN for any CLUSTER value that is not a key of CLUSTER_MAP
# (the -1 sentinel, or an index above 2), and value_counts() skips NaN by
# default, so report such rows explicitly instead of letting them pass
# unnoticed into the output file.
unlabeled_rows = int(df["QUALITY_LABEL"].isna().sum())
if unlabeled_rows:
    print(
        f"WARNING: {unlabeled_rows} rows have no QUALITY_LABEL "
        "(not assigned to a known cluster)."
    )

# Write the labeled rows to the output spreadsheet (index column omitted),
# then print a per-label row count and the next pipeline step.
df.to_excel(OUTPUT_FILE, index=False)

print("\nCluster labeling complete!")
label_counts = df["QUALITY_LABEL"].value_counts()
print(label_counts)
print(f"Saved clustered dataset to: {OUTPUT_FILE}")
print("Proceed to Step 5: Random Forest validation.\n")