The objective is to analyze the credit card usage data of a bank's customers to identify their different profiles.
Tools used include Principal Component Analysis, KMeans clustering and hierarchical clustering.
The resulting client profiles are intended to help the bank better target its clients with tailored offers and products.
1. About the data
2. Data cleaning
3. PCA reduction
4. Model: hierarchical clustering, using 4 components
5. Conclusions and recommendations
6. Closing comments
7. References
CC_DF = pd.read_csv('CC GENERAL.csv', sep = ',')
CC_DF.isnull().sum()
CUST_ID                             0
BALANCE                             0
BALANCE_FREQUENCY                   0
PURCHASES                           0
ONEOFF_PURCHASES                    0
INSTALLMENTS_PURCHASES              0
CASH_ADVANCE                        0
PURCHASES_FREQUENCY                 0
ONEOFF_PURCHASES_FREQUENCY          0
PURCHASES_INSTALLMENTS_FREQUENCY    0
CASH_ADVANCE_FREQUENCY              0
CASH_ADVANCE_TRX                    0
PURCHASES_TRX                       0
CREDIT_LIMIT                        1
PAYMENTS                            0
MINIMUM_PAYMENTS                  313
PRC_FULL_PAYMENT                    0
TENURE                              0
dtype: int64
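Only CREDIT_LIMIT (1 missing value) and MINIMUM_PAYMENTS (313) contain nulls. One way to handle them is median imputation, which is robust to the heavy right tails these columns have; this is a sketch on toy data, not necessarily the exact approach used in the notebook:

```python
import numpy as np
import pandas as pd

# Toy stand-in for CC_DF (hypothetical values)
df = pd.DataFrame({
    "CREDIT_LIMIT": [1000.0, np.nan, 3000.0],
    "MINIMUM_PAYMENTS": [100.0, 250.0, np.nan],
})

# Median imputation: robust to the extreme outliers these columns contain
for col in ["CREDIT_LIMIT", "MINIMUM_PAYMENTS"]:
    df[col] = df[col].fillna(df[col].median())

print(df.isnull().sum().sum())  # 0 remaining nulls
```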
Some initial considerations about the data can be drawn from its summary statistics:
CC_DF.describe()
| | BALANCE | BALANCE_FREQUENCY | PURCHASES | ONEOFF_PURCHASES | INSTALLMENTS_PURCHASES | CASH_ADVANCE | PURCHASES_FREQUENCY | ONEOFF_PURCHASES_FREQUENCY | PURCHASES_INSTALLMENTS_FREQUENCY | CASH_ADVANCE_FREQUENCY | CASH_ADVANCE_TRX | PURCHASES_TRX | CREDIT_LIMIT | PAYMENTS | MINIMUM_PAYMENTS | PRC_FULL_PAYMENT | TENURE |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 8950.000000 | 8950.000000 | 8950.000000 | 8950.000000 | 8950.000000 | 8950.000000 | 8950.000000 | 8950.000000 | 8950.000000 | 8950.000000 | 8950.000000 | 8950.000000 | 8950.000000 | 8950.000000 | 8950.000000 | 8950.000000 | 8950.000000 |
| mean | 1564.474828 | 0.877271 | 1003.204834 | 592.437371 | 411.067645 | 978.871112 | 0.490351 | 0.202458 | 0.364437 | 0.135144 | 3.248827 | 14.709832 | 4494.282473 | 1733.143852 | 844.906767 | 0.153715 | 11.517318 |
| std | 2081.531879 | 0.236904 | 2136.634782 | 1659.887917 | 904.338115 | 2097.163877 | 0.401371 | 0.298336 | 0.397448 | 0.200121 | 6.824647 | 24.857649 | 3638.646702 | 2895.063757 | 2332.792322 | 0.292499 | 1.338331 |
| min | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 50.000000 | 0.000000 | 0.019163 | 0.000000 | 6.000000 |
| 25% | 128.281915 | 0.888889 | 39.635000 | 0.000000 | 0.000000 | 0.000000 | 0.083333 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 1600.000000 | 383.276166 | 170.857654 | 0.000000 | 12.000000 |
| 50% | 873.385231 | 1.000000 | 361.280000 | 38.000000 | 89.000000 | 0.000000 | 0.500000 | 0.083333 | 0.166667 | 0.000000 | 0.000000 | 7.000000 | 3000.000000 | 856.901546 | 312.343947 | 0.000000 | 12.000000 |
| 75% | 2054.140036 | 1.000000 | 1110.130000 | 577.405000 | 468.637500 | 1113.821139 | 0.916667 | 0.300000 | 0.750000 | 0.222222 | 4.000000 | 17.000000 | 6500.000000 | 1901.134317 | 788.713501 | 0.142857 | 12.000000 |
| max | 19043.138560 | 1.000000 | 49039.570000 | 40761.250000 | 22500.000000 | 47137.211760 | 1.000000 | 1.000000 | 1.000000 | 1.500000 | 123.000000 | 358.000000 | 30000.000000 | 50721.483360 | 76406.207520 | 1.000000 | 12.000000 |
Next, outliers need to be addressed.
The following table shows that the maximum values of several variables lie far from their means. For instance, the maximum of MINIMUM_PAYMENTS is 32.4 standard deviations above the mean, which clearly points to the presence of outliers in the data.
# Distance of each variable's maximum from its mean, in standard deviations
desc = CC_DF.describe()
CC_DF_Z = pd.DataFrame({
    'Variable': desc.columns,
    'Standard deviations from the mean of MAX':
        ((desc.loc['max'] - desc.loc['mean']) / desc.loc['std']).values
})
CC_DF_Z.sort_values(by = 'Standard deviations from the mean of MAX', ascending = False)
| | Variable | Standard deviations from the mean of MAX |
|---|---|---|
| 14 | MINIMUM_PAYMENTS | 32.390925 |
| 4 | INSTALLMENTS_PURCHASES | 24.425524 |
| 3 | ONEOFF_PURCHASES | 24.199714 |
| 2 | PURCHASES | 22.482254 |
| 5 | CASH_ADVANCE | 22.009887 |
| 10 | CASH_ADVANCE_TRX | 17.546868 |
| 13 | PAYMENTS | 16.921334 |
| 11 | PURCHASES_TRX | 13.810243 |
| 0 | BALANCE | 8.397019 |
| 12 | CREDIT_LIMIT | 7.009671 |
| 9 | CASH_ADVANCE_FREQUENCY | 6.820140 |
| 15 | PRC_FULL_PAYMENT | 2.893291 |
| 7 | ONEOFF_PURCHASES_FREQUENCY | 2.673302 |
| 8 | PURCHASES_INSTALLMENTS_FREQUENCY | 1.599110 |
| 6 | PURCHASES_FREQUENCY | 1.269772 |
| 1 | BALANCE_FREQUENCY | 0.518055 |
| 16 | TENURE | 0.360659 |
I considered two methods for standardization of the data and dealing with outliers: the Robust Scaler and a power transformation.
To get an idea of the results, let's take the variable MINIMUM_PAYMENTS as an example:
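The two candidate methods can be compared on synthetic, heavily right-skewed data. The names CC_RS_demo and CC_T_demo below are stand-ins for the Robust Scaler and power-transformed frames used later; this is a sketch, not the notebook's exact preprocessing:

```python
import numpy as np
import pandas as pd
from sklearn.preprocessing import PowerTransformer, RobustScaler

rng = np.random.default_rng(0)
# Heavily right-skewed toy column standing in for MINIMUM_PAYMENTS
X = pd.DataFrame({"MINIMUM_PAYMENTS": rng.lognormal(mean=5, sigma=1.5, size=1000)})

# Robust Scaler: centers on the median and scales by the IQR; the
# distribution keeps its shape, so the max stays far from the mean
CC_RS_demo = pd.DataFrame(RobustScaler().fit_transform(X), columns=X.columns)

# Power transformation (Yeo-Johnson): reshapes the distribution toward
# Gaussian, pulling extreme values in toward the center
CC_T_demo = pd.DataFrame(PowerTransformer().fit_transform(X), columns=X.columns)

def max_z(s):
    """Distance of the max from the mean, in standard deviations."""
    return (s.max() - s.mean()) / s.std()

print(max_z(CC_RS_demo["MINIMUM_PAYMENTS"]), max_z(CC_T_demo["MINIMUM_PAYMENTS"]))
```

The Robust Scaler shrinks the scale but not the distance of the maximum from the mean, while the power transformation reduces both, which is the pattern the table below shows.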
# CC_*_std and CC_*_z hold, for each version of the data, the standard
# deviation and the distance of the max from the mean (computed as above)
table_1_CC_DF = [CC_DF_std, CC_DF_z]
table_1_CC_RS = [CC_RS_std, CC_RS_z]
table_1_CC_T = [CC_T_std, CC_T_z]
table_1 = pd.DataFrame([table_1_CC_DF, table_1_CC_RS, table_1_CC_T], columns = ['Standard deviation', 'Distance of Max to mean (stdv)'])
table_1.insert(0,'Method', ['Original Data', 'Robust Scaler', 'Power Transformation'])
table_1
| | Method | Standard deviation | Distance of Max to mean (stdv) |
|---|---|---|---|
| 0 | Original Data | 2332.792322 | 32.390925 |
| 1 | Robust Scaler | 3.775626 | 32.390925 |
| 2 | Power Transformation | 1.000056 | 4.036337 |
The table above shows that, for the MINIMUM_PAYMENTS variable, the Robust Scaler reduces the standard deviation but leaves the maximum just as far from the mean (32.4 standard deviations), whereas the power transformation brings the standard deviation to 1 and the maximum to only 4.0 standard deviations from the mean.
The following graph shows the effect of the power transformation on the distribution of the variable MINIMUM_PAYMENTS:
# Graph to compare original data with transformed: one hist and one boxplot for each case
f, axs = plt.subplots(2,2, figsize = (10,6), sharex = False, gridspec_kw = dict(height_ratios = [1,3]))
sns.histplot(data = CC_DF, x = "MINIMUM_PAYMENTS", ax = axs[0,0], legend = False)
sns.boxplot(data = CC_DF, x = "MINIMUM_PAYMENTS", ax = axs[1,0])
sns.histplot(data = CC_T, x ="MINIMUM_PAYMENTS", ax = axs[0,1], legend = False)
sns.boxplot(data = CC_T, x = "MINIMUM_PAYMENTS", ax = axs[1,1])
f.tight_layout()
In the graph above, the left-hand side shows the variable's original distribution, which ranges from 0.02 to 76,406, with a median of 312.3. The right-hand side shows the distribution after the power transformation: the variable now ranges from -5.8 to 8.9, and its maximum value is 4.0 standard deviations away from its mean. The data is now more concentrated around its central values.
The following graph is the equivalent of the graph above, but for another variable: BALANCE. Similar conclusions can be drawn from it.
f, axs = plt.subplots(2,2, figsize = (10,6), sharex = False, gridspec_kw = dict(height_ratios = [1,3]))
# histogram along x-axis without legend
sns.histplot(data = CC_DF, x = "BALANCE", ax = axs[0,0],legend = False)
sns.boxplot(data = CC_DF, x = "BALANCE", ax = axs[1,0])
sns.histplot(data = CC_T, x ="BALANCE", ax = axs[0,1], legend = False)
sns.boxplot(data = CC_T, x = "BALANCE", ax = axs[1,1])
f.tight_layout()
As a final comment on the descriptive statistics of the data, the following correlation heatmap shows some expected correlations, such as those between the variables' amounts, their frequencies (_FREQUENCY) and their transaction counts (_TRX).
Interestingly, PRC_FULL_PAYMENT (how often the client pays the amount due in full) is negatively correlated with several of the other variables.
sns.heatmap(CC_T.corr(), cmap= 'YlGnBu')
Before the PCA reduction, a KMeans analysis on the transformed data was performed but, for brevity, it is omitted from this presentation.
An alternative to improve the results of the analysis is to apply dimensionality reduction with Principal Component Analysis (PCA).
I ran PCA, setting the number of components to 8.
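The PCA fitting step itself is not shown above. A minimal sketch of how PCA_1 and var_explained (used in the plots below) could be produced, using synthetic stand-in data since the real transformed frame is not reproduced here:

```python
import numpy as np
from sklearn.decomposition import PCA

rng = np.random.default_rng(1)
# Synthetic stand-in for the power-transformed data (8950 x 17 in the real set)
CC_T_demo = rng.normal(size=(500, 17))

PCA_1 = PCA(n_components=8)
PCA_1.fit(CC_T_demo)

# Cumulative share of variance explained, plotted on the right-hand panel
var_explained = np.cumsum(PCA_1.explained_variance_ratio_)
print(var_explained.round(3))
```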
# Define optimal number of components
optimal_PC = 4
f, axs = plt.subplots(1,2, figsize = (8,4))
sns.lineplot(x = np.arange(1,9), y = PCA_1.explained_variance_ratio_, ax = axs[0])
sns.lineplot(x = np.arange(1,9), y = var_explained, ax = axs[1])
axs[0].set_title("Scree test")
axs[1].set_title("Accumulated variance explained")
axs[0].set_xlabel('Principal Components')
axs[1].set_xlabel('Principal Components')
axs[0].axvline(x = optimal_PC, color='r', linestyle=':')
axs[1].axhline(y = var_explained[(optimal_PC-1)], color='r', linestyle=':')
f.tight_layout()
For the analysis, I keep only the first 4 components, since the scree plot flattens after the fourth component and the accumulated variance explained is already substantial at that point.
The resulting loadings represent the relationship between the variables and the components, and are useful to give meaning to our 4 components.
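PCA_lv_1 is not constructed in the code shown; it presumably holds the loadings, i.e. the component vectors transposed so that rows are variables. A sketch on stand-in data (the VAR_* names are placeholders for the 17 real variables):

```python
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA

rng = np.random.default_rng(2)
cols = [f"VAR_{i}" for i in range(17)]  # stand-ins for the 17 variables
CC_T_demo = pd.DataFrame(rng.normal(size=(500, 17)), columns=cols)

PCA_1 = PCA(n_components=8).fit(CC_T_demo)

# Loadings: one row per original variable, one column per component
PCA_lv_1 = pd.DataFrame(
    PCA_1.components_.T,
    index=CC_T_demo.columns,
    columns=[f"V{i + 1}" for i in range(8)],
)

# Keep only loadings with absolute value above 0.28, as in the table below
top4 = PCA_lv_1.iloc[:, :4]
print(top4.where(top4.abs() > 0.28, "-"))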
PCA_lv_1.iloc[:, : optimal_PC][(PCA_lv_1 > 0.28) | (PCA_lv_1 < -0.28)].fillna('-')
| | V1 | V2 | V3 | V4 |
|---|---|---|---|---|
| BALANCE | - | 0.433761 | - | - |
| BALANCE_FREQUENCY | - | 0.30332 | - | -0.356114 |
| PURCHASES | 0.353849 | - | - | - |
| ONEOFF_PURCHASES | - | - | 0.50492 | - |
| INSTALLMENTS_PURCHASES | 0.317172 | - | -0.389956 | - |
| CASH_ADVANCE | -0.299421 | - | - | 0.292591 |
| PURCHASES_FREQUENCY | 0.355422 | - | - | - |
| ONEOFF_PURCHASES_FREQUENCY | - | - | 0.470402 | - |
| PURCHASES_INSTALLMENTS_FREQUENCY | 0.309445 | - | -0.447063 | - |
| CASH_ADVANCE_FREQUENCY | -0.291577 | - | - | 0.298498 |
| CASH_ADVANCE_TRX | -0.294386 | - | - | 0.292329 |
| PURCHASES_TRX | 0.364443 | - | - | - |
| CREDIT_LIMIT | - | - | - | - |
| PAYMENTS | - | 0.321255 | - | 0.31056 |
| MINIMUM_PAYMENTS | - | 0.37198 | - | -0.290518 |
| PRC_FULL_PAYMENT | - | - | - | 0.512515 |
| TENURE | - | - | - | -0.296175 |
PC1: Intensity of credit card use, especially for big amounts in installments, and not for cash advances.
- The total purchase amount in the last year is high, and so is the amount per transaction.
- The amount purchased in installments during the last year is also high.
- The frequency of purchases, overall and in installments, is high, meaning most months have at least one purchase.
PC2: Indebtedness. Purchases are not high, but the balance is.
- The monthly average balance is high: these clients have accumulated debt.
- The number of months with a balance is high, so they carry debt almost every month.
- The amounts due and paid to reduce the balance are high, and so are the minimum payments due: payments are being made, but a balance remains.
PC3: Buying one-off, not in installments.
- One-off purchase amounts are high, while installment purchase amounts are low.
- The card is not used for installments, only for one-off purchases.
- The same relation holds for the corresponding frequencies.
PC4: Paying the card in full, and using it especially for cash advances; relatively new clients.
- Paying the card in full every month loads highly on this component.
- Tenure is low: these are relatively new clients.
- The card is also used for cash advances, and not every month (the component is negatively related to balance frequency).
With the four components interpreted, the following graph suggests that some concentrations of data (the darker shadows) might become clusters in our clustering analysis.
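The frame df_pca used in the pairplot below presumably holds the component scores for each client; a sketch of how it could be built, again on stand-in data:

```python
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA

rng = np.random.default_rng(3)
CC_T_demo = pd.DataFrame(rng.normal(size=(500, 17)))

# Project each client onto the principal components
pca = PCA(n_components=8)
scores = pca.fit_transform(CC_T_demo)

df_pca = pd.DataFrame(scores, columns=[f"PC{i + 1}" for i in range(8)])
print(df_pca[["PC1", "PC2", "PC3", "PC4"]].shape)
```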
sns.pairplot(df_pca[['PC1', 'PC2', 'PC3', 'PC4']], kind = "scatter", plot_kws = dict(alpha = 0.03), height = 1.4)
After analyzing different specifications for the model, the conclusion was to use hierarchical clustering with ward linkage. Complete, single and average linkage were also tried: average and single suffered from serious problems, while complete led to results similar to ward's.
DENDROGRAM: three clusters were selected, cutting the dendrogram at a distance of around 160.
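The helper plot_dendrogram is not defined in the code shown; it is presumably similar to the one in the scikit-learn documentation example for plotting an AgglomerativeClustering dendrogram, which converts the fitted model into a SciPy linkage matrix:

```python
import numpy as np
from scipy.cluster.hierarchy import dendrogram
from sklearn.cluster import AgglomerativeClustering

def plot_dendrogram(model, **kwargs):
    # Count the samples under each node of the merge tree
    counts = np.zeros(model.children_.shape[0])
    n_samples = len(model.labels_)
    for i, merge in enumerate(model.children_):
        current_count = 0
        for child_idx in merge:
            if child_idx < n_samples:
                current_count += 1  # leaf node
            else:
                current_count += counts[child_idx - n_samples]
        counts[i] = current_count

    # SciPy linkage matrix: [child_a, child_b, distance, sample count]
    linkage_matrix = np.column_stack(
        [model.children_, model.distances_, counts]
    ).astype(float)
    dendrogram(linkage_matrix, **kwargs)
    return linkage_matrix

# Tiny demonstration on random data (the notebook passes C_model_5 instead)
rng = np.random.default_rng(0)
demo_model = AgglomerativeClustering(
    distance_threshold=0, n_clusters=None, linkage="ward"
).fit(rng.normal(size=(20, 4)))
linkage_matrix = plot_dendrogram(demo_model, no_plot=True)
```

Note that `model.distances_` is only populated when the model is fitted with `distance_threshold` set (or `compute_distances=True`), as is done in the cell below.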
CLUSTER_X = df_pca.iloc[:,:4]
C_model_5 = AgglomerativeClustering(distance_threshold = 0, n_clusters = None, linkage = 'ward')
C_model_5.fit_predict(CLUSTER_X)
plt.title("Hierarchical Clustering Dendrogram: Ward linkage")
plot_dendrogram(C_model_5, truncate_mode="level", p = 12)
plt.axhline(y = 160, color='r', linestyle=':')
plt.xlabel("Number of points in node (or index of point if no parenthesis).")
plt.show()
I will use ward-linkage hierarchical clustering with three clusters.
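C_model_10 appears in the plotting code below without its definition; it is presumably the 3-cluster ward model, which could be fitted as follows (shown on stand-in data rather than the real component scores):

```python
import numpy as np
from sklearn.cluster import AgglomerativeClustering

rng = np.random.default_rng(4)
CLUSTER_X_demo = rng.normal(size=(300, 4))  # stand-in for the first 4 PCs

# Three clusters, ward linkage, as chosen from the dendrogram
C_model_10 = AgglomerativeClustering(n_clusters=3, linkage="ward")
C_model_10.fit(CLUSTER_X_demo)
print(np.unique(C_model_10.labels_))
```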
classes = ['Cluster 0', 'Cluster 1', 'Cluster 2']
fig, ax = plt.subplots(1, 3, figsize=(14, 6))
scatter_0 = ax[0].scatter(CLUSTER_X['PC1'], CLUSTER_X['PC2'], c = C_model_10.labels_, alpha = .8, cmap = 'RdYlGn')
ax[0].set_xlabel("Intensiveness, use installments", fontsize = 16)
ax[0].set_ylabel("Indebtedness", fontsize = 16)
ax[0].legend(handles = scatter_0.legend_elements()[0], labels = classes)
scatter_1 = ax[1].scatter(CLUSTER_X['PC1'], CLUSTER_X['PC3'], c = C_model_10.labels_, alpha = 0.8, cmap = 'RdYlGn')
ax[1].set_xlabel("Intensiveness, use installments", fontsize = 16)
ax[1].set_ylabel("Purchases one-off", fontsize = 16)
ax[1].legend(handles = scatter_1.legend_elements()[0], labels = classes)
scatter_2 = ax[2].scatter(CLUSTER_X['PC1'], CLUSTER_X['PC4'], c = C_model_10.labels_, alpha = 0.8, cmap = 'RdYlGn')
ax[2].set_xlabel("Intensiveness, use installments", fontsize = 16)
ax[2].set_ylabel("Pay in full, use cash in advance & new clients", fontsize = 16)
ax[2].legend(handles = scatter_2.legend_elements()[0], labels = classes)
plt.tight_layout()
plt.show()
fig, ax = plt.subplots(1, 2, figsize=(10, 6))
scatter_0 = ax[0].scatter(CLUSTER_X['PC2'], CLUSTER_X['PC3'], c = C_model_10.labels_, alpha = 0.8, cmap = 'RdYlGn')
ax[0].set_xlabel("Indebtedness", fontsize = 16)
ax[0].set_ylabel("Purchases one-off", fontsize = 16)
ax[0].legend(handles = scatter_0.legend_elements()[0], labels = classes)
scatter_1 = ax[1].scatter(CLUSTER_X['PC2'], CLUSTER_X['PC4'], c = C_model_10.labels_, alpha = 0.8, cmap = 'RdYlGn')
ax[1].set_xlabel("Indebtedness", fontsize = 16)
ax[1].set_ylabel("Pay in full, use cash in advance & new clients", fontsize = 16)
ax[1].legend(handles = scatter_1.legend_elements()[0], labels = classes)
plt.tight_layout()
plt.show()
From this second set of graphs, which shows components 2, 3 and 4, the conclusions are a bit less clear, because clusters 0 and 1 (blue and green) overlap. Some patterns are still visible, though.
From the information in the graphs, and recalling our definitions of the components, we can sketch a client profile for every cluster:
Cluster 0
Cluster 1
Cluster 2
Having described the clusters to the extent that the graphs allow, I go back to the original data set to corroborate or correct the insights about the clients in each cluster.
CC_DF['CLUSTER'] = C_model_10.labels_
# selected_vars = ['BALANCE', 'ONEOFF_PURCHASES', 'INSTALLMENTS_PURCHASES', 'CASH_ADVANCE', 'CREDIT_LIMIT', 'TENURE', 'CLUSTER']
# mean_complete = CC_DF[selected_vars].mean()
mean_complete = CC_DF.mean()
mean_complete = pd.DataFrame(mean_complete)
mean_complete.drop('CLUSTER', axis = 0, inplace = True)
# mean_grouped = CC_DF[selected_vars].groupby('CLUSTER').mean()
mean_grouped = CC_DF.groupby('CLUSTER').mean()
final_table = pd.concat([mean_complete.T, mean_grouped], ignore_index = True, axis = 0)
final_table.rename(index = {0 : 'Complete data', 1 : 'Cluster 0', 2 : 'Cluster 1', 3 : 'Cluster 2'}, inplace = True)
final_table
| | BALANCE | BALANCE_FREQUENCY | PURCHASES | ONEOFF_PURCHASES | INSTALLMENTS_PURCHASES | CASH_ADVANCE | PURCHASES_FREQUENCY | ONEOFF_PURCHASES_FREQUENCY | PURCHASES_INSTALLMENTS_FREQUENCY | CASH_ADVANCE_FREQUENCY | CASH_ADVANCE_TRX | PURCHASES_TRX | CREDIT_LIMIT | PAYMENTS | MINIMUM_PAYMENTS | PRC_FULL_PAYMENT | TENURE |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Complete data | 1564.474828 | 0.877271 | 1003.204834 | 592.437371 | 411.067645 | 978.871112 | 0.490351 | 0.202458 | 0.364437 | 0.135144 | 3.248827 | 14.709832 | 4494.282473 | 1733.143852 | 844.906767 | 0.153715 | 11.517318 |
| Cluster 0 | 933.706163 | 0.849932 | 1249.809328 | 695.620828 | 554.629137 | 184.732268 | 0.666572 | 0.234626 | 0.517996 | 0.025218 | 0.588490 | 18.815924 | 4304.695666 | 1388.925592 | 671.797045 | 0.228381 | 11.605834 |
| Cluster 1 | 2131.115470 | 0.882550 | 0.883766 | 0.753361 | 0.172329 | 1974.015044 | 0.001715 | 0.001300 | 0.000496 | 0.273109 | 6.288332 | 0.020251 | 3982.240437 | 1634.708858 | 992.030606 | 0.045101 | 11.305689 |
| Cluster 2 | 2688.396865 | 0.948175 | 1462.440511 | 982.892297 | 479.750150 | 2069.621381 | 0.556545 | 0.343402 | 0.350929 | 0.285881 | 7.241398 | 20.054939 | 5617.444870 | 2815.673059 | 1163.011032 | 0.068479 | 11.511654 |
Comparing the conclusions drawn from the graphs with the numbers in the table, it can be stated that most of them hold.
There are, however, a couple of observations:
Cluster 1. Most of its data points are concentrated around the middle values of one-off purchases (component 3), which would suggest that these clients buy in one-off purchases; the table, however, does not corroborate this. For other variables related to component 3, the table is more in line with expectations: clients in this cluster do not use their credit cards for purchases in installments, and they also have a low frequency of purchases, a variable whose loading on component 3 is negative (-0.2).
About cluster 2. After reviewing the table, we can confirm that these clients fall within the middle-upper part of component 4 (pay in full, use cash advances, new clients), because they use their credit cards intensively for cash advances and have a relatively low frequency of use.
Examples of tailored strategies
Some examples of the kind of targeted promotions could be:
For clients in cluster 0: frequently offer them the option to pay their purchases in interest-free installments when the amount of the purchase exceeds a certain threshold.
For clients in cluster 1: motivate them to use their credit card more for purchases by offering them a lower rate or lower fees on their cash advance operations once they reach a certain amount of monthly purchases.
For clients in cluster 2: the bank could incentivize these customers to use their cards more intensively, either by offering them lower fees on their cash advance operations, or by increasing their credit limit.
The original data set can be downloaded from: https://www.kaggle.com/arjunbhasin2013/ccdata
The following material online was consulted:
About standardization with outliers and power transformation
Dendrograms:
About PCA
Some materials from labs and homeworks from class 95791 Data Mining (Heinz College, Carnegie Mellon University), taught by Dr. Gabriela Gongora Swartzman were also used.