From aea5f0d528b659a81cea13669b637830bbe943b1 Mon Sep 17 00:00:00 2001 From: beqakd Date: Fri, 12 Jun 2020 16:52:10 +0400 Subject: [PATCH 1/7] add visualization of kmneas clust as excel format --- machine_learning/k_means_clust.py | 82 ++++++++++++++++++++++++++++++- 1 file changed, 81 insertions(+), 1 deletion(-) diff --git a/machine_learning/k_means_clust.py b/machine_learning/k_means_clust.py index 86a5dd968779..5acfe4eba722 100644 --- a/machine_learning/k_means_clust.py +++ b/machine_learning/k_means_clust.py @@ -47,10 +47,12 @@ k ) - 5. Have fun.. + 5. Transfers Dataframe into excel format it must have feature called 'Clust' with clust numbers in it. + """ import numpy as np +import pandas as pd from matplotlib import pyplot as plt from sklearn.metrics import pairwise_distances @@ -202,3 +204,81 @@ def kmeans( verbose=True, ) plot_heterogeneity(heterogeneity, k) + + +def ReportGenerator(df, ClusteringVariables, FillMissingReport=None): + """ + Function generates easy-erading clustering report. It takes 2 arguments as an input: + DataFrame - dataframe with predicted cluester column; + FillMissingReport - dcitionary of rules how we are going to fill missing + values of for final report generate (not included in modeling); + in order to run the function following libraries must be imported: + import pandas as pd + import numpy as np + """ + #Fill missing values with given rules + if FillMissingReport is None: + pass + else: + df.fillna(value=FillMissingReport, inplace=True) + df['dummy'] = 1 + numeric_cols = df.select_dtypes(np.number).columns + report = (df # constract report dataframe + .groupby(['Cluster'])[numeric_cols] # group by cluster number + .agg([("sum", np.sum), + ("mean_with_zeros", lambda x: np.mean(np.nan_to_num(x))), + ("mean_without_zeros", lambda x : x.replace(0, np.NaN).mean()), + ("mean_25-75", lambda x : np.mean(np.nan_to_num(sorted(x)[round((len(x)*25/100)):round(len(x)*75/100)]))), + ("mean_with_na", np.mean), + ('min', lambda x: x.min()), + ("5%",lambda x: x.quantile(0.05)), + ("25%",lambda x: x.quantile(0.25)), + ("50%", lambda x:x.quantile(0.50)), + ("75%", lambda x:x.quantile(0.75)), + ("95%",lambda x: x.quantile(0.95)), + ('max', lambda x:x.max()), + ("count", lambda x:x.count()), + ('stdev', lambda x:x.std()), + ('mode', lambda x: x.mode()[0]), + ('median', lambda x:x.median()), + ("# > 0", lambda x:(x>0).sum())]) + .T + .reset_index() + .rename(index=str, columns={"level_0": "Features", 'level_1': 'Type'})) # rename columns + + clustersize = report[(report['Features'] == 'dummy') \ + & (report['Type'] == 'count')] # caclulating size of cluster(count of clientID's) + clustersize.Type = 'ClusterSize' # rename created cluster df to match report column names + clustersize.Features = '# of Customers' + clusterproportion = pd.DataFrame(clustersize.iloc[:, 2:].values / # caclulating proportion of cluster + clustersize.iloc[:, 2:].values.sum()) + clusterproportion['Type'] = '% of Customers' # rename created cluster df to match report column names + clusterproportion['Features'] = 'ClusterProportion' + cols = clusterproportion.columns.tolist() + cols = cols[-2:] + cols[:-2] + clusterproportion = clusterproportion[cols] # rearrange columns to match report + clusterproportion.columns = report.columns + a = pd.DataFrame(abs(report[report['Type'] == 'count'] \ + .iloc[:, 2:].values - clustersize.iloc[:, 2:].values)) # generating df with count of nan values + a['Features'] = 0 + a['Type'] = '# of nan' + a.Features = report[report['Type'] == 'count'].Features.tolist() # filling values in order to match report + cols = a.columns.tolist() + cols = cols[-2:] + cols[:-2] + a = a[cols] # rearrange columns to match report + a.columns = report.columns # rename columns to match report + report = report.drop(report[report.Type == 'count'].index) # drop count values except cluster size + report = pd.concat([report, a, clustersize, clusterproportion], + axis=0) # concat report with clustert size and nan values + report['Mark'] = report['Features'].isin(ClusteringVariables) + cols = report.columns.tolist() + cols = cols[0:2] + cols[-1:] + cols[2:-1] + report = report[cols] + sorter1 = {'ClusterSize': 9, 'ClusterProportion': 8, 'mean_with_zeros': 7, 'mean_with_na': 6, 'max': 5, '50%': 4, 'min': 3, '25%': 2, '75%': 1, + '# of nan': 0, "# > 0": -1, "sum_with_na": -2} + report = (report.assign(Sorter1 = lambda x:x.Type.map(sorter1), + Sorter2 = lambda x:list(reversed(range(len(x))))) + .sort_values(['Sorter1', 'Mark', "Sorter2"], ascending=False) + .drop(['Sorter1', "Sorter2"], axis=1)) + return report + From 3a62fc0d87f5316c16cf8067ddbf2dfc137a33f4 Mon Sep 17 00:00:00 2001 From: beqakd Date: Fri, 12 Jun 2020 20:15:50 +0400 Subject: [PATCH 2/7] style changes --- machine_learning/k_means_clust.py | 144 ++++++++++++++++++++---------- 1 file changed, 95 insertions(+), 49 deletions(-) diff --git a/machine_learning/k_means_clust.py b/machine_learning/k_means_clust.py index 5acfe4eba722..6c7681765fac 100644 --- a/machine_learning/k_means_clust.py +++ b/machine_learning/k_means_clust.py @@ -216,69 +216,115 @@ def ReportGenerator(df, ClusteringVariables, FillMissingReport=None): import pandas as pd import numpy as np """ - #Fill missing values with given rules + # Fill missing values with given rules if FillMissingReport is None: pass else: df.fillna(value=FillMissingReport, inplace=True) - df['dummy'] = 1 + df["dummy"] = 1 numeric_cols = df.select_dtypes(np.number).columns - report = (df # constract report dataframe - .groupby(['Cluster'])[numeric_cols] # group by cluster number - .agg([("sum", np.sum), - ("mean_with_zeros", lambda x: np.mean(np.nan_to_num(x))), - ("mean_without_zeros", lambda x : x.replace(0, np.NaN).mean()), - ("mean_25-75", lambda x : np.mean(np.nan_to_num(sorted(x)[round((len(x)*25/100)):round(len(x)*75/100)]))), - ("mean_with_na", np.mean), - ('min', lambda x: x.min()), - ("5%",lambda x: x.quantile(0.05)), - ("25%",lambda x: x.quantile(0.25)), - ("50%", lambda x:x.quantile(0.50)), - ("75%", lambda x:x.quantile(0.75)), - ("95%",lambda x: x.quantile(0.95)), - ('max', lambda x:x.max()), - ("count", lambda x:x.count()), - ('stdev', lambda x:x.std()), - ('mode', lambda x: x.mode()[0]), - ('median', lambda x:x.median()), - ("# > 0", lambda x:(x>0).sum())]) - .T - .reset_index() - .rename(index=str, columns={"level_0": "Features", 'level_1': 'Type'})) # rename columns - - clustersize = report[(report['Features'] == 'dummy') \ - & (report['Type'] == 'count')] # caclulating size of cluster(count of clientID's) - clustersize.Type = 'ClusterSize' # rename created cluster df to match report column names - clustersize.Features = '# of Customers' - clusterproportion = pd.DataFrame(clustersize.iloc[:, 2:].values / # caclulating proportion of cluster - clustersize.iloc[:, 2:].values.sum()) - clusterproportion['Type'] = '% of Customers' # rename created cluster df to match report column names - clusterproportion['Features'] = 'ClusterProportion' + report = ( + df.groupby(["Cluster"])[ # constract report dataframe + numeric_cols + ] # group by cluster number + .agg( + [ + ("sum", np.sum), + ("mean_with_zeros", lambda x: np.mean(np.nan_to_num(x))), + ("mean_without_zeros", lambda x: x.replace(0, np.NaN).mean()), + ( + "mean_25-75", + lambda x: np.mean( + np.nan_to_num( + sorted(x)[ + round((len(x) * 25 / 100)) : round(len(x) * 75 / 100) + ] + ) + ), + ), + ("mean_with_na", np.mean), + ("min", lambda x: x.min()), + ("5%", lambda x: x.quantile(0.05)), + ("25%", lambda x: x.quantile(0.25)), + ("50%", lambda x: x.quantile(0.50)), + ("75%", lambda x: x.quantile(0.75)), + ("95%", lambda x: x.quantile(0.95)), + ("max", lambda x: x.max()), + ("count", lambda x: x.count()), + ("stdev", lambda x: x.std()), + ("mode", lambda x: x.mode()[0]), + ("median", lambda x: x.median()), + ("# > 0", lambda x: (x > 0).sum()), + ] + ) + .T.reset_index() + .rename(index=str, columns={"level_0": "Features", "level_1": "Type"}) + ) # rename columns + + clustersize = report[ + (report["Features"] == "dummy") & (report["Type"] == "count") + ] # caclulating size of cluster(count of clientID's) + clustersize.Type = ( + "ClusterSize" # rename created cluster df to match report column names + ) + clustersize.Features = "# of Customers" + clusterproportion = pd.DataFrame( + clustersize.iloc[:, 2:].values + / clustersize.iloc[:, 2:].values.sum() # caclulating proportion of cluster + ) + clusterproportion[ + "Type" + ] = "% of Customers" # rename created cluster df to match report column names + clusterproportion["Features"] = "ClusterProportion" cols = clusterproportion.columns.tolist() cols = cols[-2:] + cols[:-2] clusterproportion = clusterproportion[cols] # rearrange columns to match report clusterproportion.columns = report.columns - a = pd.DataFrame(abs(report[report['Type'] == 'count'] \ - .iloc[:, 2:].values - clustersize.iloc[:, 2:].values)) # generating df with count of nan values - a['Features'] = 0 - a['Type'] = '# of nan' - a.Features = report[report['Type'] == 'count'].Features.tolist() # filling values in order to match report + a = pd.DataFrame( + abs( + report[report["Type"] == "count"].iloc[:, 2:].values + - clustersize.iloc[:, 2:].values + ) + ) # generating df with count of nan values + a["Features"] = 0 + a["Type"] = "# of nan" + a.Features = report[ + report["Type"] == "count" + ].Features.tolist() # filling values in order to match report cols = a.columns.tolist() cols = cols[-2:] + cols[:-2] a = a[cols] # rearrange columns to match report a.columns = report.columns # rename columns to match report - report = report.drop(report[report.Type == 'count'].index) # drop count values except cluster size - report = pd.concat([report, a, clustersize, clusterproportion], - axis=0) # concat report with clustert size and nan values - report['Mark'] = report['Features'].isin(ClusteringVariables) + report = report.drop( + report[report.Type == "count"].index + ) # drop count values except cluster size + report = pd.concat( + [report, a, clustersize, clusterproportion], axis=0 + ) # concat report with clustert size and nan values + report["Mark"] = report["Features"].isin(ClusteringVariables) cols = report.columns.tolist() cols = cols[0:2] + cols[-1:] + cols[2:-1] report = report[cols] - sorter1 = {'ClusterSize': 9, 'ClusterProportion': 8, 'mean_with_zeros': 7, 'mean_with_na': 6, 'max': 5, '50%': 4, 'min': 3, '25%': 2, '75%': 1, - '# of nan': 0, "# > 0": -1, "sum_with_na": -2} - report = (report.assign(Sorter1 = lambda x:x.Type.map(sorter1), - Sorter2 = lambda x:list(reversed(range(len(x))))) - .sort_values(['Sorter1', 'Mark', "Sorter2"], ascending=False) - .drop(['Sorter1', "Sorter2"], axis=1)) + sorter1 = { + "ClusterSize": 9, + "ClusterProportion": 8, + "mean_with_zeros": 7, + "mean_with_na": 6, + "max": 5, + "50%": 4, + "min": 3, + "25%": 2, + "75%": 1, + "# of nan": 0, + "# > 0": -1, + "sum_with_na": -2, + } + report = ( + report.assign( + Sorter1=lambda x: x.Type.map(sorter1), + Sorter2=lambda x: list(reversed(range(len(x)))), + ) + .sort_values(["Sorter1", "Mark", "Sorter2"], ascending=False) + .drop(["Sorter1", "Sorter2"], axis=1) + ) return report - From 4f4918527161e2a0312a396b90ef1ed8dff1edaa Mon Sep 17 00:00:00 2001 From: beqakd Date: Fri, 12 Jun 2020 23:31:37 +0400 Subject: [PATCH 3/7] style changes --- machine_learning/k_means_clust.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/machine_learning/k_means_clust.py b/machine_learning/k_means_clust.py index 6c7681765fac..72c463d714a9 100644 --- a/machine_learning/k_means_clust.py +++ b/machine_learning/k_means_clust.py @@ -210,7 +210,7 @@ def ReportGenerator(df, ClusteringVariables, FillMissingReport=None): """ Function generates easy-erading clustering report. It takes 2 arguments as an input: DataFrame - dataframe with predicted cluester column; - FillMissingReport - dcitionary of rules how we are going to fill missing + FillMissingReport - dcitionary of rules how we are going to fill missing values of for final report generate (not included in modeling); in order to run the function following libraries must be imported: import pandas as pd From b523bd5904a6ca3f1e81267d4fe48b0bf6a3aae3 Mon Sep 17 00:00:00 2001 From: beqakd Date: Fri, 19 Jun 2020 15:01:56 +0400 Subject: [PATCH 4/7] Add doctest and typehint! --- machine_learning/k_means_clust.py | 40 +++++++++++++++++++++++++++++-- 1 file changed, 38 insertions(+), 2 deletions(-) diff --git a/machine_learning/k_means_clust.py b/machine_learning/k_means_clust.py index 72c463d714a9..baa1a8503978 100644 --- a/machine_learning/k_means_clust.py +++ b/machine_learning/k_means_clust.py @@ -55,6 +55,9 @@ import pandas as pd from matplotlib import pyplot as plt from sklearn.metrics import pairwise_distances +import warnings + +warnings.filterwarnings("ignore") TAG = "K-MEANS-CLUST/ " @@ -206,15 +209,39 @@ def kmeans( plot_heterogeneity(heterogeneity, k) -def ReportGenerator(df, ClusteringVariables, FillMissingReport=None): +def ReportGenerator( + df: pd.DataFrame, ClusteringVariables: np.array, FillMissingReport=None +) -> pd.DataFrame: """ Function generates easy-erading clustering report. It takes 2 arguments as an input: DataFrame - dataframe with predicted cluester column; - FillMissingReport - dcitionary of rules how we are going to fill missing + FillMissingReport - dictionary of rules how we are going to fill missing values of for final report generate (not included in modeling); in order to run the function following libraries must be imported: import pandas as pd import numpy as np + + >>> data = pd.DataFrame() + >>> data['numbers'] = [1, 2, 3] + >>> data['col1'] = [0.5, 2.5, 4.5] + >>> data['col2'] = [100, 200, 300] + >>> data['col3'] = [10, 20, 30] + >>> data['Cluster'] = [1, 1, 2] + >>> ReportGenerator(data, ['col1', 'col2'], 0) + Features Type Mark 1 2 + 0 # of Customers ClusterSize False 2.000000 1.000000 + 1 % of Customers ClusterProportion False 0.666667 0.333333 + 2 col1 mean_with_zeros True 1.500000 4.500000 + 3 col2 mean_with_zeros True 150.000000 300.000000 + 4 numbers mean_with_zeros False 1.500000 3.000000 + .. ... ... ... ... ... + 99 dummy 5% False 1.000000 1.000000 + 100 dummy 95% False 1.000000 1.000000 + 101 dummy stdev False 0.000000 NaN + 102 dummy mode False 1.000000 1.000000 + 103 dummy median False 1.000000 1.000000 + + [104 rows x 5 columns] """ # Fill missing values with given rules if FillMissingReport is None: @@ -327,4 +354,13 @@ def ReportGenerator(df, ClusteringVariables, FillMissingReport=None): .sort_values(["Sorter1", "Mark", "Sorter2"], ascending=False) .drop(["Sorter1", "Sorter2"], axis=1) ) + report.columns.name = "" + report = report.reset_index() + report.drop(columns=["index"], inplace=True) return report + + +if __name__ == "__main__": + import doctest + + doctest.testmod() From 9a6ba97cf9ee34114f8e84c3b507848054b0a5a9 Mon Sep 17 00:00:00 2001 From: beqakd Date: Fri, 19 Jun 2020 15:08:23 +0400 Subject: [PATCH 5/7] style change --- machine_learning/k_means_clust.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/machine_learning/k_means_clust.py b/machine_learning/k_means_clust.py index baa1a8503978..37cca70f1e50 100644 --- a/machine_learning/k_means_clust.py +++ b/machine_learning/k_means_clust.py @@ -47,7 +47,8 @@ k ) - 5. Transfers Dataframe into excel format it must have feature called 'Clust' with clust numbers in it. + 5. Transfers Dataframe into excel format it must have feature called + 'Clust' with clust numbers in it. """ From 8367113895e02230dd9ce8a5afb8d1348f41a375 Mon Sep 17 00:00:00 2001 From: beqakd <39763019+beqakd@users.noreply.github.com> Date: Fri, 19 Jun 2020 15:41:44 +0400 Subject: [PATCH 6/7] Update machine_learning/k_means_clust.py Co-authored-by: Christian Clauss --- machine_learning/k_means_clust.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/machine_learning/k_means_clust.py b/machine_learning/k_means_clust.py index 37cca70f1e50..d9724f624edd 100644 --- a/machine_learning/k_means_clust.py +++ b/machine_learning/k_means_clust.py @@ -47,8 +47,8 @@ k ) - 5. Transfers Dataframe into excel format it must have feature called - 'Clust' with clust numbers in it. + 5. Transfers Dataframe into excel format it must have feature called + 'Clust' with k means clustering numbers in it. """ From f1e2cec123575ccc9a0a4b38558ee6e0e4aa8f62 Mon Sep 17 00:00:00 2001 From: beqakd <39763019+beqakd@users.noreply.github.com> Date: Fri, 19 Jun 2020 19:40:16 +0400 Subject: [PATCH 7/7] Update machine_learning/k_means_clust.py Co-authored-by: Christian Clauss --- machine_learning/k_means_clust.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/machine_learning/k_means_clust.py b/machine_learning/k_means_clust.py index d9724f624edd..d5fa31135073 100644 --- a/machine_learning/k_means_clust.py +++ b/machine_learning/k_means_clust.py @@ -245,9 +245,7 @@ def ReportGenerator( [104 rows x 5 columns] """ # Fill missing values with given rules - if FillMissingReport is None: - pass - else: + if FillMissingReport: df.fillna(value=FillMissingReport, inplace=True) df["dummy"] = 1 numeric_cols = df.select_dtypes(np.number).columns