|
1 | 1 | """README, Author - Anurag Kumar(mailto:anuragkumarak95@gmail.com)
|
2 |
| -
|
3 | 2 | Requirements:
|
4 | 3 | - sklearn
|
5 | 4 | - numpy
|
6 | 5 | - matplotlib
|
7 |
| -
|
8 | 6 | Python:
|
9 | 7 | - 3.5
|
10 |
| -
|
11 | 8 | Inputs:
|
12 | 9 | - X , a 2D numpy array of features.
|
13 | 10 | - k , number of clusters to create.
|
|
16 | 13 | - maxiter , maximum number of iterations to process.
|
17 | 14 | - heterogeneity , empty list that will be filled with heterogeneity values if passed
|
18 | 15 | to kmeans func.
|
19 |
| -
|
20 | 16 | Usage:
|
21 | 17 | 1. define 'k' value, 'X' features array and 'heterogeneity' empty list
|
22 |
| -
|
23 | 18 | 2. create initial_centroids,
|
24 | 19 | initial_centroids = get_initial_centroids(
|
25 | 20 | X,
|
26 | 21 | k,
|
27 | 22 | seed=0 # seed value for initial centroid generation,
|
28 | 23 | # None for randomness(default=None)
|
29 | 24 | )
|
30 |
| -
|
31 | 25 | 3. find centroids and clusters using kmeans function.
|
32 |
| -
|
33 | 26 | centroids, cluster_assignment = kmeans(
|
34 | 27 | X,
|
35 | 28 | k,
|
|
38 | 31 | record_heterogeneity=heterogeneity,
|
39 | 32 | verbose=True # whether to print logs in console or not.(default=False)
|
40 | 33 | )
|
41 |
| -
|
42 |
| -
|
43 | 34 | 4. Plot the loss function, heterogeneity values for every iteration saved in
|
44 | 35 | heterogeneity list.
|
45 | 36 | plot_heterogeneity(
|
46 | 37 | heterogeneity,
|
47 | 38 | k
|
48 | 39 | )
|
49 |
| -
|
50 | 40 | 5. Transfer a DataFrame into Excel format; it must have a feature called
|
51 | 41 | 'Clust' with the k-means cluster numbers in it.
|
52 |
| -
|
53 |
| -
|
54 | 42 | """
|
55 | 43 | import warnings
|
56 | 44 |
|
@@ -222,7 +210,6 @@ def ReportGenerator(
|
222 | 210 | in order to run the function following libraries must be imported:
|
223 | 211 | import pandas as pd
|
224 | 212 | import numpy as np
|
225 |
| -
|
226 | 213 | >>> data = pd.DataFrame()
|
227 | 214 | >>> data['numbers'] = [1, 2, 3]
|
228 | 215 | >>> data['col1'] = [0.5, 2.5, 4.5]
|
@@ -287,10 +274,10 @@ def ReportGenerator(
|
287 | 274 | .T.reset_index()
|
288 | 275 | .rename(index=str, columns={"level_0": "Features", "level_1": "Type"})
|
289 | 276 | ) # rename columns
|
290 |
| - |
| 277 | + # calculate the size of cluster(count of clientID's) |
291 | 278 | clustersize = report[
|
292 | 279 | (report["Features"] == "dummy") & (report["Type"] == "count")
|
293 |
| - ] # calculate the size of cluster(count of clientID's) |
| 280 | + ].copy() # avoid SettingWithCopyWarning |
294 | 281 | clustersize.Type = (
|
295 | 282 | "ClusterSize" # rename created cluster df to match report column names
|
296 | 283 | )
|
|
0 commit comments