From aea5f0d528b659a81cea13669b637830bbe943b1 Mon Sep 17 00:00:00 2001
From: beqakd <bgoga16@freeuni.edu.ge>
Date: Fri, 12 Jun 2020 16:52:10 +0400
Subject: [PATCH 1/7] add visualization of k-means clustering as excel format

---
 machine_learning/k_means_clust.py | 82 ++++++++++++++++++++++++++++++-
 1 file changed, 81 insertions(+), 1 deletion(-)

diff --git a/machine_learning/k_means_clust.py b/machine_learning/k_means_clust.py
index 86a5dd968779..5acfe4eba722 100644
--- a/machine_learning/k_means_clust.py
+++ b/machine_learning/k_means_clust.py
@@ -47,10 +47,12 @@
             k
         )
 
-  5. Have fun..
+  5. Transfers Dataframe into excel format it must have feature called 'Clust' with clust numbers in it.
+
 
 """
 import numpy as np
+import pandas as pd
 from matplotlib import pyplot as plt
 from sklearn.metrics import pairwise_distances
 
@@ -202,3 +204,81 @@ def kmeans(
         verbose=True,
     )
     plot_heterogeneity(heterogeneity, k)
+
+
+def ReportGenerator(df, ClusteringVariables, FillMissingReport=None):
+    """
+    Function generates easy-erading clustering report. It takes 2 arguments as an input:
+        DataFrame - dataframe with predicted cluester column;
+        FillMissingReport - dcitionary of rules how we are going to fill missing 
+        values of for final report generate (not included in modeling);
+    in order to run the function following libraries must be imported:
+        import pandas as pd
+        import numpy as np
+    """
+    #Fill missing values with given rules
+    if FillMissingReport is None:
+        pass
+    else:
+        df.fillna(value=FillMissingReport, inplace=True)
+    df['dummy'] = 1
+    numeric_cols = df.select_dtypes(np.number).columns
+    report = (df  # constract report dataframe
+              .groupby(['Cluster'])[numeric_cols]  # group by cluster number
+              .agg([("sum", np.sum),
+                    ("mean_with_zeros", lambda x: np.mean(np.nan_to_num(x))),
+                    ("mean_without_zeros", lambda x : x.replace(0, np.NaN).mean()),
+                    ("mean_25-75", lambda x : np.mean(np.nan_to_num(sorted(x)[round((len(x)*25/100)):round(len(x)*75/100)]))),
+                    ("mean_with_na", np.mean),
+                    ('min', lambda x: x.min()),
+                    ("5%",lambda x: x.quantile(0.05)), 
+                    ("25%",lambda x: x.quantile(0.25)),
+                    ("50%", lambda x:x.quantile(0.50)),
+                    ("75%", lambda x:x.quantile(0.75)),
+                    ("95%",lambda x: x.quantile(0.95)),
+                    ('max', lambda x:x.max()),
+                    ("count", lambda x:x.count()),
+                    ('stdev', lambda x:x.std()),
+                    ('mode', lambda x: x.mode()[0]),
+                    ('median', lambda x:x.median()),                  
+                    ("# > 0", lambda x:(x>0).sum())])
+              .T
+              .reset_index()
+              .rename(index=str, columns={"level_0": "Features", 'level_1': 'Type'}))  # rename columns
+
+    clustersize = report[(report['Features'] == 'dummy') \
+                         & (report['Type'] == 'count')]  # caclulating size of cluster(count of clientID's)
+    clustersize.Type = 'ClusterSize'  # rename created cluster df to match report column names
+    clustersize.Features = '# of Customers'
+    clusterproportion = pd.DataFrame(clustersize.iloc[:, 2:].values /  # caclulating proportion of cluster
+                                     clustersize.iloc[:, 2:].values.sum())
+    clusterproportion['Type'] = '% of Customers'  # rename created cluster df to match report column names
+    clusterproportion['Features'] = 'ClusterProportion'
+    cols = clusterproportion.columns.tolist()
+    cols = cols[-2:] + cols[:-2]
+    clusterproportion = clusterproportion[cols]  # rearrange columns to match report
+    clusterproportion.columns = report.columns
+    a = pd.DataFrame(abs(report[report['Type'] == 'count'] \
+                         .iloc[:, 2:].values - clustersize.iloc[:, 2:].values))  # generating df with count of nan values
+    a['Features'] = 0
+    a['Type'] = '# of nan'
+    a.Features = report[report['Type'] == 'count'].Features.tolist()  # filling values in order to match report  
+    cols = a.columns.tolist()
+    cols = cols[-2:] + cols[:-2]
+    a = a[cols]  # rearrange columns to match report
+    a.columns = report.columns  # rename columns to match report
+    report = report.drop(report[report.Type == 'count'].index)  # drop count values except cluster size
+    report = pd.concat([report, a, clustersize, clusterproportion],
+                       axis=0)  # concat report with clustert size and nan values
+    report['Mark'] = report['Features'].isin(ClusteringVariables)
+    cols = report.columns.tolist()
+    cols = cols[0:2] + cols[-1:] + cols[2:-1]
+    report = report[cols]
+    sorter1 = {'ClusterSize': 9, 'ClusterProportion': 8, 'mean_with_zeros': 7, 'mean_with_na': 6, 'max': 5, '50%': 4, 'min': 3, '25%': 2, '75%': 1,
+              '# of nan': 0, "# > 0": -1, "sum_with_na": -2}
+    report = (report.assign(Sorter1 = lambda x:x.Type.map(sorter1),
+                            Sorter2 = lambda x:list(reversed(range(len(x)))))
+                    .sort_values(['Sorter1', 'Mark', "Sorter2"], ascending=False)
+                    .drop(['Sorter1', "Sorter2"], axis=1))
+    return report
+

From 3a62fc0d87f5316c16cf8067ddbf2dfc137a33f4 Mon Sep 17 00:00:00 2001
From: beqakd <bgoga16@freeuni.edu.ge>
Date: Fri, 12 Jun 2020 20:15:50 +0400
Subject: [PATCH 2/7] style changes

---
 machine_learning/k_means_clust.py | 144 ++++++++++++++++++++----------
 1 file changed, 95 insertions(+), 49 deletions(-)

diff --git a/machine_learning/k_means_clust.py b/machine_learning/k_means_clust.py
index 5acfe4eba722..6c7681765fac 100644
--- a/machine_learning/k_means_clust.py
+++ b/machine_learning/k_means_clust.py
@@ -216,69 +216,115 @@ def ReportGenerator(df, ClusteringVariables, FillMissingReport=None):
         import pandas as pd
         import numpy as np
     """
-    #Fill missing values with given rules
+    # Fill missing values with given rules
     if FillMissingReport is None:
         pass
     else:
         df.fillna(value=FillMissingReport, inplace=True)
-    df['dummy'] = 1
+    df["dummy"] = 1
     numeric_cols = df.select_dtypes(np.number).columns
-    report = (df  # constract report dataframe
-              .groupby(['Cluster'])[numeric_cols]  # group by cluster number
-              .agg([("sum", np.sum),
-                    ("mean_with_zeros", lambda x: np.mean(np.nan_to_num(x))),
-                    ("mean_without_zeros", lambda x : x.replace(0, np.NaN).mean()),
-                    ("mean_25-75", lambda x : np.mean(np.nan_to_num(sorted(x)[round((len(x)*25/100)):round(len(x)*75/100)]))),
-                    ("mean_with_na", np.mean),
-                    ('min', lambda x: x.min()),
-                    ("5%",lambda x: x.quantile(0.05)), 
-                    ("25%",lambda x: x.quantile(0.25)),
-                    ("50%", lambda x:x.quantile(0.50)),
-                    ("75%", lambda x:x.quantile(0.75)),
-                    ("95%",lambda x: x.quantile(0.95)),
-                    ('max', lambda x:x.max()),
-                    ("count", lambda x:x.count()),
-                    ('stdev', lambda x:x.std()),
-                    ('mode', lambda x: x.mode()[0]),
-                    ('median', lambda x:x.median()),                  
-                    ("# > 0", lambda x:(x>0).sum())])
-              .T
-              .reset_index()
-              .rename(index=str, columns={"level_0": "Features", 'level_1': 'Type'}))  # rename columns
-
-    clustersize = report[(report['Features'] == 'dummy') \
-                         & (report['Type'] == 'count')]  # caclulating size of cluster(count of clientID's)
-    clustersize.Type = 'ClusterSize'  # rename created cluster df to match report column names
-    clustersize.Features = '# of Customers'
-    clusterproportion = pd.DataFrame(clustersize.iloc[:, 2:].values /  # caclulating proportion of cluster
-                                     clustersize.iloc[:, 2:].values.sum())
-    clusterproportion['Type'] = '% of Customers'  # rename created cluster df to match report column names
-    clusterproportion['Features'] = 'ClusterProportion'
+    report = (
+        df.groupby(["Cluster"])[  # constract report dataframe
+            numeric_cols
+        ]  # group by cluster number
+        .agg(
+            [
+                ("sum", np.sum),
+                ("mean_with_zeros", lambda x: np.mean(np.nan_to_num(x))),
+                ("mean_without_zeros", lambda x: x.replace(0, np.NaN).mean()),
+                (
+                    "mean_25-75",
+                    lambda x: np.mean(
+                        np.nan_to_num(
+                            sorted(x)[
+                                round((len(x) * 25 / 100)) : round(len(x) * 75 / 100)
+                            ]
+                        )
+                    ),
+                ),
+                ("mean_with_na", np.mean),
+                ("min", lambda x: x.min()),
+                ("5%", lambda x: x.quantile(0.05)),
+                ("25%", lambda x: x.quantile(0.25)),
+                ("50%", lambda x: x.quantile(0.50)),
+                ("75%", lambda x: x.quantile(0.75)),
+                ("95%", lambda x: x.quantile(0.95)),
+                ("max", lambda x: x.max()),
+                ("count", lambda x: x.count()),
+                ("stdev", lambda x: x.std()),
+                ("mode", lambda x: x.mode()[0]),
+                ("median", lambda x: x.median()),
+                ("# > 0", lambda x: (x > 0).sum()),
+            ]
+        )
+        .T.reset_index()
+        .rename(index=str, columns={"level_0": "Features", "level_1": "Type"})
+    )  # rename columns
+
+    clustersize = report[
+        (report["Features"] == "dummy") & (report["Type"] == "count")
+    ]  # caclulating size of cluster(count of clientID's)
+    clustersize.Type = (
+        "ClusterSize"  # rename created cluster df to match report column names
+    )
+    clustersize.Features = "# of Customers"
+    clusterproportion = pd.DataFrame(
+        clustersize.iloc[:, 2:].values
+        / clustersize.iloc[:, 2:].values.sum()  # caclulating proportion of cluster
+    )
+    clusterproportion[
+        "Type"
+    ] = "% of Customers"  # rename created cluster df to match report column names
+    clusterproportion["Features"] = "ClusterProportion"
     cols = clusterproportion.columns.tolist()
     cols = cols[-2:] + cols[:-2]
     clusterproportion = clusterproportion[cols]  # rearrange columns to match report
     clusterproportion.columns = report.columns
-    a = pd.DataFrame(abs(report[report['Type'] == 'count'] \
-                         .iloc[:, 2:].values - clustersize.iloc[:, 2:].values))  # generating df with count of nan values
-    a['Features'] = 0
-    a['Type'] = '# of nan'
-    a.Features = report[report['Type'] == 'count'].Features.tolist()  # filling values in order to match report  
+    a = pd.DataFrame(
+        abs(
+            report[report["Type"] == "count"].iloc[:, 2:].values
+            - clustersize.iloc[:, 2:].values
+        )
+    )  # generating df with count of nan values
+    a["Features"] = 0
+    a["Type"] = "# of nan"
+    a.Features = report[
+        report["Type"] == "count"
+    ].Features.tolist()  # filling values in order to match report
     cols = a.columns.tolist()
     cols = cols[-2:] + cols[:-2]
     a = a[cols]  # rearrange columns to match report
     a.columns = report.columns  # rename columns to match report
-    report = report.drop(report[report.Type == 'count'].index)  # drop count values except cluster size
-    report = pd.concat([report, a, clustersize, clusterproportion],
-                       axis=0)  # concat report with clustert size and nan values
-    report['Mark'] = report['Features'].isin(ClusteringVariables)
+    report = report.drop(
+        report[report.Type == "count"].index
+    )  # drop count values except cluster size
+    report = pd.concat(
+        [report, a, clustersize, clusterproportion], axis=0
+    )  # concat report with clustert size and nan values
+    report["Mark"] = report["Features"].isin(ClusteringVariables)
     cols = report.columns.tolist()
     cols = cols[0:2] + cols[-1:] + cols[2:-1]
     report = report[cols]
-    sorter1 = {'ClusterSize': 9, 'ClusterProportion': 8, 'mean_with_zeros': 7, 'mean_with_na': 6, 'max': 5, '50%': 4, 'min': 3, '25%': 2, '75%': 1,
-              '# of nan': 0, "# > 0": -1, "sum_with_na": -2}
-    report = (report.assign(Sorter1 = lambda x:x.Type.map(sorter1),
-                            Sorter2 = lambda x:list(reversed(range(len(x)))))
-                    .sort_values(['Sorter1', 'Mark', "Sorter2"], ascending=False)
-                    .drop(['Sorter1', "Sorter2"], axis=1))
+    sorter1 = {
+        "ClusterSize": 9,
+        "ClusterProportion": 8,
+        "mean_with_zeros": 7,
+        "mean_with_na": 6,
+        "max": 5,
+        "50%": 4,
+        "min": 3,
+        "25%": 2,
+        "75%": 1,
+        "# of nan": 0,
+        "# > 0": -1,
+        "sum_with_na": -2,
+    }
+    report = (
+        report.assign(
+            Sorter1=lambda x: x.Type.map(sorter1),
+            Sorter2=lambda x: list(reversed(range(len(x)))),
+        )
+        .sort_values(["Sorter1", "Mark", "Sorter2"], ascending=False)
+        .drop(["Sorter1", "Sorter2"], axis=1)
+    )
     return report
-

From 4f4918527161e2a0312a396b90ef1ed8dff1edaa Mon Sep 17 00:00:00 2001
From: beqakd <bgoga16@freeuni.edu.ge>
Date: Fri, 12 Jun 2020 23:31:37 +0400
Subject: [PATCH 3/7] style changes

---
 machine_learning/k_means_clust.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/machine_learning/k_means_clust.py b/machine_learning/k_means_clust.py
index 6c7681765fac..72c463d714a9 100644
--- a/machine_learning/k_means_clust.py
+++ b/machine_learning/k_means_clust.py
@@ -210,7 +210,7 @@ def ReportGenerator(df, ClusteringVariables, FillMissingReport=None):
     """
     Function generates easy-erading clustering report. It takes 2 arguments as an input:
         DataFrame - dataframe with predicted cluester column;
-        FillMissingReport - dcitionary of rules how we are going to fill missing 
+        FillMissingReport - dcitionary of rules how we are going to fill missing
         values of for final report generate (not included in modeling);
     in order to run the function following libraries must be imported:
         import pandas as pd

From b523bd5904a6ca3f1e81267d4fe48b0bf6a3aae3 Mon Sep 17 00:00:00 2001
From: beqakd <bgoga16@freeuni.edu.ge>
Date: Fri, 19 Jun 2020 15:01:56 +0400
Subject: [PATCH 4/7] Add doctest and typehint!

---
 machine_learning/k_means_clust.py | 40 +++++++++++++++++++++++++++++--
 1 file changed, 38 insertions(+), 2 deletions(-)

diff --git a/machine_learning/k_means_clust.py b/machine_learning/k_means_clust.py
index 72c463d714a9..baa1a8503978 100644
--- a/machine_learning/k_means_clust.py
+++ b/machine_learning/k_means_clust.py
@@ -55,6 +55,9 @@
 import pandas as pd
 from matplotlib import pyplot as plt
 from sklearn.metrics import pairwise_distances
+import warnings
+
+warnings.filterwarnings("ignore")
 
 TAG = "K-MEANS-CLUST/ "
 
@@ -206,15 +209,39 @@ def kmeans(
     plot_heterogeneity(heterogeneity, k)
 
 
-def ReportGenerator(df, ClusteringVariables, FillMissingReport=None):
+def ReportGenerator(
+    df: pd.DataFrame, ClusteringVariables: np.array, FillMissingReport=None
+) -> pd.DataFrame:
     """
     Function generates easy-erading clustering report. It takes 2 arguments as an input:
         DataFrame - dataframe with predicted cluester column;
-        FillMissingReport - dcitionary of rules how we are going to fill missing
+        FillMissingReport - dictionary of rules how we are going to fill missing
         values of for final report generate (not included in modeling);
     in order to run the function following libraries must be imported:
         import pandas as pd
         import numpy as np
+
+    >>> data = pd.DataFrame()
+    >>> data['numbers'] = [1, 2, 3]
+    >>> data['col1'] = [0.5, 2.5, 4.5]
+    >>> data['col2'] = [100, 200, 300]
+    >>> data['col3'] = [10, 20, 30]
+    >>> data['Cluster'] = [1, 1, 2]
+    >>> ReportGenerator(data, ['col1', 'col2'], 0)
+               Features               Type   Mark           1           2
+    0    # of Customers        ClusterSize  False    2.000000    1.000000
+    1    % of Customers  ClusterProportion  False    0.666667    0.333333
+    2              col1    mean_with_zeros   True    1.500000    4.500000
+    3              col2    mean_with_zeros   True  150.000000  300.000000
+    4           numbers    mean_with_zeros  False    1.500000    3.000000
+    ..              ...                ...    ...         ...         ...
+    99            dummy                 5%  False    1.000000    1.000000
+    100           dummy                95%  False    1.000000    1.000000
+    101           dummy              stdev  False    0.000000         NaN
+    102           dummy               mode  False    1.000000    1.000000
+    103           dummy             median  False    1.000000    1.000000
+    <BLANKLINE>
+    [104 rows x 5 columns]
     """
     # Fill missing values with given rules
     if FillMissingReport is None:
@@ -327,4 +354,13 @@ def ReportGenerator(df, ClusteringVariables, FillMissingReport=None):
         .sort_values(["Sorter1", "Mark", "Sorter2"], ascending=False)
         .drop(["Sorter1", "Sorter2"], axis=1)
     )
+    report.columns.name = ""
+    report = report.reset_index()
+    report.drop(columns=["index"], inplace=True)
     return report
+
+
+if __name__ == "__main__":
+    import doctest
+
+    doctest.testmod()

From 9a6ba97cf9ee34114f8e84c3b507848054b0a5a9 Mon Sep 17 00:00:00 2001
From: beqakd <bgoga16@freeuni.edu.ge>
Date: Fri, 19 Jun 2020 15:08:23 +0400
Subject: [PATCH 5/7] style change

---
 machine_learning/k_means_clust.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/machine_learning/k_means_clust.py b/machine_learning/k_means_clust.py
index baa1a8503978..37cca70f1e50 100644
--- a/machine_learning/k_means_clust.py
+++ b/machine_learning/k_means_clust.py
@@ -47,7 +47,8 @@
             k
         )
 
-  5. Transfers Dataframe into excel format it must have feature called 'Clust' with clust numbers in it.
+  5. Transfers Dataframe into excel format it must have feature called 
+  'Clust' with clust numbers in it.
 
 
 """

From 8367113895e02230dd9ce8a5afb8d1348f41a375 Mon Sep 17 00:00:00 2001
From: beqakd <39763019+beqakd@users.noreply.github.com>
Date: Fri, 19 Jun 2020 15:41:44 +0400
Subject: [PATCH 6/7] Update machine_learning/k_means_clust.py

Co-authored-by: Christian Clauss <cclauss@me.com>
---
 machine_learning/k_means_clust.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/machine_learning/k_means_clust.py b/machine_learning/k_means_clust.py
index 37cca70f1e50..d9724f624edd 100644
--- a/machine_learning/k_means_clust.py
+++ b/machine_learning/k_means_clust.py
@@ -47,8 +47,8 @@
             k
         )
 
-  5. Transfers Dataframe into excel format it must have feature called 
-  'Clust' with clust numbers in it.
+  5. Transfers the DataFrame into an Excel-style report; it must have a column
+      called 'Cluster' with the k-means cluster numbers in it.
 
 
 """

From f1e2cec123575ccc9a0a4b38558ee6e0e4aa8f62 Mon Sep 17 00:00:00 2001
From: beqakd <39763019+beqakd@users.noreply.github.com>
Date: Fri, 19 Jun 2020 19:40:16 +0400
Subject: [PATCH 7/7] Update machine_learning/k_means_clust.py

Co-authored-by: Christian Clauss <cclauss@me.com>
---
 machine_learning/k_means_clust.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/machine_learning/k_means_clust.py b/machine_learning/k_means_clust.py
index d9724f624edd..d5fa31135073 100644
--- a/machine_learning/k_means_clust.py
+++ b/machine_learning/k_means_clust.py
@@ -245,9 +245,7 @@ def ReportGenerator(
     [104 rows x 5 columns]
     """
     # Fill missing values with given rules
-    if FillMissingReport is None:
-        pass
-    else:
+    if FillMissingReport:
         df.fillna(value=FillMissingReport, inplace=True)
     df["dummy"] = 1
     numeric_cols = df.select_dtypes(np.number).columns