Statistics, Year 4 · 410878048 · 葉于廷
# packages
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from mpl_toolkits import mplot3d
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from numpy.linalg import eig
import warnings
# ignore all warnings
warnings.filterwarnings("ignore")
Principal Component Analysis (PCA) is a technique for handling multivariate data by reducing the number of original variables. The new variables, called principal components, are linear combinations of the original variables. The goal of PCA is to retain as much of the original variance as possible while reducing the dimensionality. For example, PCA can be applied to the wine and breast cancer data sets to identify the most important features.
Linear correlation of original variables - Using a heat map, we observe the correlations between variables and determine whether some variables are strongly or weakly related.
Range of variables - Before running PCA, we can decide whether standardization is needed.
Transformation results - Using a scree plot and a Pareto plot, we can evaluate the transformation and decide how many principal components to keep.
Sources of principal components - We examine the correlation between the original variables and the principal components.
Discrimination of principal components - By creating 2D and 3D scatter plots with labeled groups, we can determine whether observations can be classified based on the principal components.
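Under the hood, the principal components come from the eigenstructure of the covariance matrix; the notebook imports `eig` from `numpy.linalg` but never uses it, so here is a minimal sketch on synthetic data (using the symmetric-matrix variant `eigh`) showing that sklearn's explained variances are exactly the covariance eigenvalues:

```python
import numpy as np
from numpy.linalg import eigh
from sklearn.decomposition import PCA

rng = np.random.default_rng(0)
X = rng.normal(size=(100, 4))

# eigendecomposition of the sample covariance matrix (ddof=1, like sklearn)
cov = np.cov(X, rowvar=False)
eigvals, eigvecs = eigh(cov)            # eigh returns ascending order
order = np.argsort(eigvals)[::-1]       # sort descending by variance
eigvals, eigvecs = eigvals[order], eigvecs[:, order]

# sklearn's explained_variance_ equals the covariance eigenvalues
pca = PCA().fit(X)
print(np.allclose(pca.explained_variance_, eigvals))
```

The rows of `pca.components_` likewise match the columns of `eigvecs`, up to an arbitrary sign flip per component.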
To simplify the code blocks, I write two functions: one produces the report and graphs after PCA, and the other creates a 3D scatter plot from a given viewing angle.
def pca_report(cov_data, standardization, variables, numbers, case):
    # standardize data if requested
    if standardization:
        scaler = StandardScaler()
        data = scaler.fit_transform(cov_data)
    else:
        data = cov_data
    pca = PCA()
    pca.fit(data)
    explained_variances = pca.explained_variance_[:numbers]
    tick_of_pc = ['PC' + str(x) for x in range(1, len(explained_variances) + 1)]
    if case == 1:
        # create figure and axes
        fig, [ax1, ax2] = plt.subplots(nrows=1, ncols=2, figsize=(13, 4))
        # scree plot
        ax1.plot(range(1, len(explained_variances) + 1), explained_variances, marker='o')
        ax1.set_title('Scree Plot')
        ax1.set_xlabel('Principal Component')
        ax1.set_ylabel('Explained Variance')
        # Pareto plot
        # cumulative share of total variance, in percent
        # (relative to all components, not only the plotted ones)
        percentage = np.cumsum(pca.explained_variance_ratio_[:numbers]) * 100
        # plot bar chart
        color1 = "blue"
        ax2.bar(tick_of_pc, explained_variances, color=color1, alpha=0.5)
        ax2.set_xlabel('The Number of Principal Components')
        ax2.set_ylabel('Explained Variance', color=color1)
        ax2.tick_params(axis='y', color=color1)
        # create secondary y-axis
        ax3 = ax2.twinx()
        # plot cumulative percentage line chart
        ax3.plot(tick_of_pc, percentage, color='purple', marker='o', alpha=0.5)
        ax3.set_ylabel('Cumulative Percentage (%)', color='purple')
        ax3.tick_params(axis='y', labelcolor='purple')
        # set title
        ax2.set_title('Pareto Plot')
        plt.show()
        return
    if case == 2:
        # create dataframe with variable names and PCA coefficients
        pca_df = pd.DataFrame(pca.components_[:numbers, :].T, columns=tick_of_pc)
        pca_df.index = variables
        # round to 2 decimal places
        pca_df = pca_df.round(2)
        # add cumulative proportion and explained variance as the last two rows
        cumulative_proportion = np.cumsum(pca.explained_variance_ratio_[:numbers]) * 100
        explained_variance = pca.explained_variance_[:numbers]
        pca_df.loc['Cumulative Proportion(%)'] = cumulative_proportion.round(2)
        pca_df.loc['Explained Variance'] = explained_variance.round(2)
        return pca_df
    if case == 3:
        # transform data and keep the requested number of components
        transformed_data = pd.DataFrame(pca.transform(data)[:, :numbers], columns=tick_of_pc)
        return transformed_data
def draw_3d(transformed_data, group, angle1, angle2, label):
    fig = plt.figure(figsize=(12, 6))
    ax = fig.add_subplot(111, projection='3d')
    ax.view_init(angle1, angle2)
    # one scatter per group so the legend picks up the group labels
    for i in transformed_data[group].unique():
        draw_data = transformed_data[transformed_data[group] == i]
        ax.scatter(draw_data.PC1, draw_data.PC2, draw_data.PC3, s=40, label=i)
    ax.set_xlabel('PC1')
    ax.set_ylabel('PC2')
    ax.set_zlabel('PC3', labelpad=-5)
    ax.set_title(label)
    ax.legend(loc=(0.8, 0.7), fontsize=10)
    plt.show()
The data is the results of a chemical analysis of wines grown in the same region in Italy by three different cultivators. There are thirteen different measurements taken for different constituents found in the three types of wine.
data = pd.read_excel("wine.xlsx")
data.head()
Index | Alcohol | Malic_Acid | Ash | Ash_Alcanity | Magnesium | Total_Phenols | Flavanoids | Nonflavanoid_Phenols | Proanthocyanins | Color_Intensity | Hue | OD280 | Proline | Customer_Segment |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 14.23 | 1.71 | 2.43 | 15.6 | 127 | 2.80 | 3.06 | 0.28 | 2.29 | 5.64 | 1.04 | 3.92 | 1065 | 1 |
1 | 13.20 | 1.78 | 2.14 | 11.2 | 100 | 2.65 | 2.76 | 0.26 | 1.28 | 4.38 | 1.05 | 3.40 | 1050 | 1 |
2 | 13.16 | 2.36 | 2.67 | 18.6 | 101 | 2.80 | 3.24 | 0.30 | 2.81 | 5.68 | 1.03 | 3.17 | 1185 | 1 |
3 | 14.37 | 1.95 | 2.50 | 16.8 | 113 | 3.85 | 3.49 | 0.24 | 2.18 | 7.80 | 0.86 | 3.45 | 1480 | 1 |
4 | 13.24 | 2.59 | 2.87 | 21.0 | 118 | 2.80 | 2.69 | 0.39 | 1.82 | 4.32 | 1.04 | 2.93 | 735 | 1 |
Strong positive correlations: Flavanoids and Total_Phenols (0.86), OD280 and Flavanoids (0.79), OD280 and Total_Phenols (0.70)
Moderate negative correlations: Nonflavanoid_Phenols and Flavanoids (-0.54), Hue and Malic_Acid (-0.56), Hue and Color_Intensity (-0.52), OD280 and Nonflavanoid_Phenols (-0.50)
# Create correlation matrix - Get Numerical Data
variables = ["Alcohol", "Malic_Acid", "Ash", "Ash_Alcanity", "Magnesium", "Total_Phenols", "Flavanoids", "Nonflavanoid_Phenols", "Proanthocyanins", "Color_Intensity", "Hue", "OD280", "Proline"]
cov_data = data[variables]
cov_corr = cov_data.corr()
# figure size
fig, ax = plt.subplots(figsize=(8,6))
# Set up mask to only show lower triangle
mask = np.zeros_like(cov_corr, dtype=bool)  # np.bool was removed in NumPy 1.24
mask[np.triu_indices_from(mask)] = True
# Plot heatmap
sns.heatmap(cov_corr, mask = mask, annot=True, cmap='plasma', square = False, vmin=-1, vmax=1)
plt.show()
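The pairs above were read off the heatmap by eye; they can also be pulled out programmatically. A sketch using sklearn's built-in copy of the same data (`load_wine`), assuming `wine.xlsx` holds the identical measurements under slightly different column names:

```python
import numpy as np
import pandas as pd
from sklearn.datasets import load_wine

wine = load_wine()
df = pd.DataFrame(wine.data, columns=wine.feature_names)

# keep the upper triangle only, so each pair appears once
corr = df.corr()
upper = corr.where(np.triu(np.ones(corr.shape, dtype=bool), k=1))

# stack into (var1, var2) -> r pairs and filter by absolute strength
pairs = upper.stack().loc[lambda s: s.abs() >= 0.5].sort_values(ascending=False)
print(pairs)
```

The top of the list matches the heatmap reading: `total_phenols` and `flavanoids` at about 0.86.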
Before standardization, the variable Proline has a much larger range than the others, which would dominate the principal components. Since we want every variable to contribute to the PCs, standardization is recommended.
After standardization, all variables have a similar range.
fig, ax = plt.subplots(figsize=(13, 4))
# box plot without standardization
plt.subplot(1, 2, 1)
sns.boxplot(data= cov_data, orient='horizontal')
plt.title('Without Standardization')
# box plot with standardization
plt.subplot(1, 2, 2)
sns.boxplot(data=(cov_data - cov_data.mean()) / cov_data.std(), orient='horizontal')
plt.title('With Standardization')
# show plot
plt.tight_layout()
plt.show()
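One small subtlety worth noting: the right-hand boxplot standardizes with pandas' `std()`, which uses `ddof=1`, while `StandardScaler` inside `pca_report` divides by the population standard deviation (`ddof=0`). The two differ only by a constant factor of sqrt(n/(n-1)), which is harmless for PCA directions but easy to trip over. A quick check:

```python
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler

df = pd.DataFrame({'a': [1.0, 2.0, 3.0, 4.0], 'b': [10.0, 20.0, 30.0, 40.0]})

scaled = StandardScaler().fit_transform(df)     # population std, ddof=0
manual = (df - df.mean()) / df.std(ddof=0)      # matches when ddof is forced to 0

print(np.allclose(scaled, manual))
```

With pandas' default `df.std()` (ddof=1) the two results would differ by that constant factor.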
From the scree plot and Pareto plot, I decide to retain four principal components. For one thing, the scree plot flattens out after the fourth principal component. Additionally, the cumulative variance of the first four principal components is relatively high, at roughly 74 %.
pca_report(cov_data, standardization=1, variables = variables, numbers = 13, case = 1)
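The choice of four components can also be checked numerically: find the smallest k whose cumulative explained variance ratio reaches a target. A sketch, again assuming sklearn's `load_wine` matches `wine.xlsx`:

```python
import numpy as np
from sklearn.datasets import load_wine
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

X = StandardScaler().fit_transform(load_wine().data)
cum = np.cumsum(PCA().fit(X).explained_variance_ratio_)

# smallest number of PCs whose cumulative ratio reaches 70 %
k = int(np.searchsorted(cum, 0.70)) + 1
print(k, round(cum[k - 1], 4))
```

With a 70 % target this selects four components, agreeing with the scree-plot reading.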
Here are the main original variables that influence each principal component.
Displaying the primary sources of the principal components, I find that the principal components draw on almost all of the original variables. In addition, variables that are highly correlated tend to be grouped into the same PC. For example, the heatmap shows that Flavanoids and Total_Phenols are strongly positively correlated, and they are the main contributors to the 1st PC.
pca_report(cov_data, standardization=1, variables = variables, numbers = 13, case = 2)
Variable | PC1 | PC2 | PC3 | PC4 | PC5 | PC6 | PC7 | PC8 | PC9 | PC10 | PC11 | PC12 | PC13 |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Alcohol | 0.14 | -0.48 | -0.21 | -0.02 | -0.27 | -0.21 | -0.06 | -0.40 | 0.51 | 0.21 | -0.23 | -0.27 | 0.01 |
Malic_Acid | -0.25 | -0.22 | 0.09 | 0.54 | 0.04 | -0.54 | 0.42 | -0.07 | -0.08 | -0.31 | 0.08 | 0.12 | 0.03 |
Ash | -0.00 | -0.32 | 0.63 | -0.21 | -0.14 | -0.15 | -0.15 | 0.17 | -0.31 | -0.03 | -0.50 | -0.05 | -0.14 |
Ash_Alcanity | -0.24 | 0.01 | 0.61 | 0.06 | 0.07 | 0.10 | -0.29 | -0.43 | 0.20 | 0.05 | 0.48 | -0.06 | 0.09 |
Magnesium | 0.14 | -0.30 | 0.13 | -0.35 | 0.73 | -0.04 | 0.32 | 0.16 | 0.27 | 0.07 | 0.07 | 0.06 | 0.06 |
Total_Phenols | 0.39 | -0.07 | 0.15 | 0.20 | -0.15 | 0.08 | -0.03 | 0.41 | 0.29 | -0.32 | 0.30 | -0.30 | -0.46 |
Flavanoids | 0.42 | 0.00 | 0.15 | 0.15 | -0.11 | 0.02 | -0.06 | 0.19 | 0.05 | -0.16 | -0.03 | -0.04 | 0.83 |
Nonflavanoid_Phenols | -0.30 | -0.03 | 0.17 | -0.20 | -0.50 | 0.26 | 0.60 | 0.23 | 0.20 | 0.22 | 0.12 | 0.04 | 0.11 |
Proanthocyanins | 0.31 | -0.04 | 0.15 | 0.40 | 0.14 | 0.53 | 0.37 | -0.37 | -0.21 | 0.13 | -0.24 | -0.10 | -0.12 |
Color_Intensity | -0.09 | -0.53 | -0.14 | 0.07 | -0.08 | 0.42 | -0.23 | 0.03 | 0.06 | -0.29 | 0.03 | 0.60 | -0.01 |
Hue | 0.30 | 0.28 | 0.09 | -0.43 | -0.17 | -0.11 | 0.23 | -0.44 | 0.09 | -0.52 | -0.05 | 0.26 | -0.09 |
OD280 | 0.38 | 0.16 | 0.17 | 0.18 | -0.10 | -0.27 | -0.04 | 0.08 | 0.14 | 0.52 | 0.05 | 0.60 | -0.16 |
Proline | 0.29 | -0.36 | -0.13 | -0.23 | -0.16 | -0.12 | 0.08 | -0.12 | -0.58 | 0.16 | 0.54 | -0.08 | 0.01 |
Cumulative Proportion(%) | 36.20 | 55.41 | 66.53 | 73.60 | 80.16 | 85.10 | 89.34 | 92.02 | 94.24 | 96.17 | 97.91 | 99.20 | 100.00 |
Explained Variance | 4.73 | 2.51 | 1.45 | 0.92 | 0.86 | 0.65 | 0.55 | 0.35 | 0.29 | 0.25 | 0.23 | 0.17 | 0.10 |
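The "main sources" reading above can be automated by ranking the absolute loadings of each component. A sketch on sklearn's copy of the wine data (its feature names differ from `wine.xlsx`, e.g. `flavanoids` rather than `Flavanoids`):

```python
import pandas as pd
from sklearn.datasets import load_wine
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

wine = load_wine()
X = StandardScaler().fit_transform(wine.data)
pca = PCA().fit(X)

# rows = original variables, columns = PCs (0 is PC1)
loadings = pd.DataFrame(pca.components_.T, index=wine.feature_names)

# the three variables with the largest absolute loading on PC1
top3 = loadings[0].abs().nlargest(3).index.tolist()
print(top3)
```

Consistent with the table, the phenol-related variables lead PC1.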
transformed_data = pca_report(cov_data, standardization=True, variables=variables, numbers=13, case=3).iloc[:, :4]
transformed_data["group"] = np.where(data["Customer_Segment"] == 1, "first", np.where(data["Customer_Segment"] == 2, "second", "third"))
# use pairplot to display scatter plots of all pairs of principal components
sns.pairplot(transformed_data, hue="group")
transformed_data.to_excel("transformed_data.xlsx")
draw_3d(transformed_data, "group", -152, -28, "Viewed from an angle that looks three-dimensional")
draw_3d(transformed_data, "group", -117, 21, "Viewed from an angle that looks nearly two-dimensional")
Features are computed from a digitized image of a fine needle aspirate (FNA) of a breast mass. They describe characteristics of the cell nuclei present in the image.
from sklearn.datasets import load_breast_cancer
# load the breast cancer dataset once and reuse it
cancer = load_breast_cancer()
breast_data = pd.DataFrame(cancer.data, columns=cancer.feature_names)
# add the target variable as a column to the dataframe
breast_data['target'] = cancer.target
The heatmap indicates that most of these variables are positively correlated, as a great many cells are close to yellow.
# Create correlation matrix - Get Numerical Data
numeric_cols = [x for x in breast_data.columns if x != "target"]
cov_data = breast_data[numeric_cols]
cov_corr = cov_data.corr()
# figure size
fig, ax = plt.subplots(figsize=(20,15))
# Set up mask to only show lower triangle
mask = np.zeros_like(cov_corr, dtype=bool)  # np.bool was removed in NumPy 1.24
mask[np.triu_indices_from(mask)] = True
# Plot heatmap
sns.heatmap(cov_corr, mask = mask, annot=True, cmap='plasma', square = False, vmin=-1, vmax=1)
# plt.xticks(rotation=60)
plt.show()
The boxplot indicates that mean area and worst area have much larger ranges than the other variables, so standardization is performed. After standardization, all variables have a similar range.
fig, ax = plt.subplots(figsize=(13, 10))
# box plot without standardization
plt.subplot(1, 2, 1)
sns.boxplot(data= cov_data, orient='horizontal')
plt.title('Without Standardization')
# box plot with standardization
plt.subplot(1, 2, 2)
sns.boxplot(data=(cov_data - cov_data.mean()) / cov_data.std(), orient='horizontal')
plt.title('With Standardization')
# show plot
plt.tight_layout()
plt.show()
The maximum number of principal components equals the number of original variables; the breast cancer data, for example, has at most 30 principal components. For simpler visualization, I plot only the first 15 PCs. The scree plot shows the most apparent elbow between the 4th and 5th PCs, and retaining four PCs preserves almost 80 % of the variance.
pca_report(cov_data, standardization=1, variables = numeric_cols, numbers = 15, case = 1)
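Rather than eyeballing the scree plot, sklearn's `PCA` also accepts a float in (0, 1) for `n_components` and keeps the smallest number of components whose cumulative explained variance ratio exceeds it:

```python
from sklearn.datasets import load_breast_cancer
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

X = StandardScaler().fit_transform(load_breast_cancer().data)

# keep as many PCs as needed to exceed 80 % of the variance
pca = PCA(n_components=0.80).fit(X)
print(pca.n_components_, round(pca.explained_variance_ratio_.sum(), 4))
```

Note that this strict-threshold rule picks five components here, because four stop just short of 80 % (79.24 %).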
Here are the main original variables that influence each principal component.
pca_report(cov_data, standardization=1, variables = numeric_cols, numbers = 15, case = 2)
Variable | PC1 | PC2 | PC3 | PC4 | PC5 | PC6 | PC7 | PC8 | PC9 | PC10 | PC11 | PC12 | PC13 | PC14 | PC15 |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
mean radius | 0.22 | -0.23 | -0.01 | 0.04 | 0.04 | 0.02 | -0.12 | -0.01 | -0.22 | 0.10 | 0.04 | 0.05 | 0.01 | 0.06 | 0.05 |
mean texture | 0.10 | -0.06 | 0.06 | -0.60 | -0.05 | -0.03 | 0.01 | 0.13 | 0.11 | 0.24 | -0.30 | 0.25 | 0.20 | -0.02 | 0.11 |
mean perimeter | 0.23 | -0.22 | -0.01 | 0.04 | 0.04 | 0.02 | -0.11 | -0.02 | -0.22 | 0.09 | 0.02 | 0.04 | 0.04 | 0.05 | 0.04 |
mean area | 0.22 | -0.23 | 0.03 | 0.05 | 0.01 | -0.00 | -0.05 | 0.03 | -0.20 | 0.07 | 0.11 | 0.07 | 0.07 | 0.01 | -0.01 |
mean smoothness | 0.14 | 0.19 | -0.10 | 0.16 | -0.37 | -0.29 | -0.14 | -0.29 | 0.01 | -0.07 | -0.14 | 0.32 | 0.05 | 0.45 | 0.12 |
mean compactness | 0.24 | 0.15 | -0.07 | 0.03 | 0.01 | -0.01 | 0.03 | -0.15 | -0.17 | 0.01 | -0.31 | -0.10 | 0.23 | 0.01 | -0.23 |
mean concavity | 0.26 | 0.06 | 0.00 | 0.02 | 0.09 | -0.01 | -0.11 | -0.07 | 0.04 | -0.14 | 0.12 | 0.07 | 0.39 | -0.19 | 0.13 |
mean concave points | 0.26 | -0.03 | -0.03 | 0.07 | -0.04 | -0.05 | -0.15 | -0.15 | -0.11 | 0.01 | -0.07 | 0.04 | 0.13 | -0.24 | 0.22 |
mean symmetry | 0.14 | 0.19 | -0.04 | 0.07 | -0.31 | 0.36 | -0.09 | -0.23 | 0.26 | 0.57 | 0.16 | -0.29 | 0.19 | 0.03 | 0.07 |
mean fractal dimension | 0.06 | 0.37 | -0.02 | 0.05 | -0.04 | -0.12 | 0.30 | -0.18 | -0.12 | 0.08 | -0.04 | 0.24 | 0.11 | -0.38 | -0.52 |
radius error | 0.21 | -0.11 | 0.27 | 0.10 | -0.15 | -0.03 | 0.31 | 0.02 | 0.25 | -0.05 | -0.03 | -0.02 | -0.07 | 0.01 | 0.11 |
texture error | 0.02 | 0.09 | 0.37 | -0.36 | -0.19 | -0.03 | -0.09 | -0.48 | -0.25 | -0.29 | 0.34 | -0.31 | -0.17 | -0.01 | -0.03 |
perimeter error | 0.21 | -0.09 | 0.27 | 0.09 | -0.12 | 0.00 | 0.31 | -0.01 | 0.23 | -0.11 | -0.17 | -0.10 | -0.04 | -0.05 | 0.01 |
area error | 0.20 | -0.15 | 0.22 | 0.11 | -0.13 | -0.04 | 0.35 | 0.09 | 0.23 | -0.09 | 0.05 | -0.02 | 0.06 | 0.08 | 0.05 |
smoothness error | 0.01 | 0.20 | 0.31 | 0.04 | -0.23 | -0.34 | -0.24 | 0.57 | -0.14 | 0.16 | 0.08 | -0.29 | 0.15 | -0.20 | -0.02 |
compactness error | 0.17 | 0.23 | 0.15 | -0.03 | 0.28 | 0.07 | 0.02 | 0.12 | -0.15 | 0.04 | -0.21 | -0.26 | 0.01 | 0.49 | -0.17 |
concavity error | 0.15 | 0.20 | 0.18 | 0.00 | 0.35 | 0.06 | -0.21 | 0.06 | 0.36 | -0.14 | 0.35 | 0.25 | 0.16 | 0.13 | -0.25 |
concave points error | 0.18 | 0.13 | 0.22 | 0.07 | 0.20 | -0.03 | -0.37 | -0.11 | 0.27 | 0.09 | -0.34 | -0.01 | -0.49 | -0.20 | -0.06 |
symmetry error | 0.04 | 0.18 | 0.29 | 0.04 | -0.25 | 0.49 | -0.08 | 0.22 | -0.30 | -0.32 | -0.19 | 0.32 | 0.01 | -0.05 | 0.11 |
fractal dimension error | 0.10 | 0.28 | 0.21 | 0.02 | 0.26 | -0.05 | 0.19 | 0.01 | -0.21 | 0.37 | 0.25 | 0.28 | -0.24 | 0.15 | 0.35 |
worst radius | 0.23 | -0.22 | -0.05 | 0.02 | -0.00 | -0.00 | -0.01 | 0.04 | -0.11 | 0.08 | 0.11 | 0.04 | -0.14 | 0.02 | -0.17 |
worst texture | 0.10 | -0.05 | -0.04 | -0.63 | -0.09 | -0.05 | 0.01 | 0.04 | 0.10 | 0.03 | 0.01 | 0.08 | -0.08 | 0.05 | -0.10 |
worst perimeter | 0.24 | -0.20 | -0.05 | 0.01 | 0.01 | 0.01 | -0.00 | 0.03 | -0.11 | 0.05 | 0.05 | -0.01 | -0.10 | 0.01 | -0.18 |
worst area | 0.22 | -0.22 | -0.01 | 0.03 | -0.03 | -0.03 | 0.07 | 0.08 | -0.08 | 0.07 | 0.18 | 0.05 | -0.10 | -0.01 | -0.31 |
worst smoothness | 0.13 | 0.17 | -0.26 | 0.02 | -0.32 | -0.37 | -0.11 | 0.21 | 0.11 | -0.13 | 0.14 | 0.06 | -0.21 | 0.16 | -0.05 |
worst compactness | 0.21 | 0.14 | -0.24 | -0.09 | 0.12 | 0.05 | 0.14 | 0.08 | -0.10 | -0.17 | -0.20 | -0.37 | 0.01 | 0.17 | 0.05 |
worst concavity | 0.23 | 0.10 | -0.17 | -0.07 | 0.19 | 0.03 | -0.06 | 0.07 | 0.16 | -0.31 | 0.19 | -0.09 | 0.22 | -0.07 | 0.20 |
worst concave points | 0.25 | -0.01 | -0.17 | 0.01 | 0.04 | -0.03 | -0.17 | -0.04 | 0.06 | -0.08 | -0.12 | -0.07 | -0.25 | -0.28 | 0.17 |
worst symmetry | 0.12 | 0.14 | -0.27 | -0.04 | -0.24 | 0.50 | -0.02 | 0.23 | 0.06 | -0.03 | 0.16 | 0.04 | -0.26 | 0.01 | -0.14 |
worst fractal dimension | 0.13 | 0.28 | -0.23 | -0.08 | 0.09 | -0.08 | 0.37 | 0.05 | -0.13 | 0.01 | 0.12 | -0.03 | -0.17 | -0.21 | 0.26 |
Cumulative Proportion(%) | 44.27 | 63.24 | 72.64 | 79.24 | 84.73 | 88.76 | 91.01 | 92.60 | 93.99 | 95.16 | 96.14 | 97.01 | 97.81 | 98.34 | 98.65 |
Explained Variance | 13.30 | 5.70 | 2.82 | 1.98 | 1.65 | 1.21 | 0.68 | 0.48 | 0.42 | 0.35 | 0.29 | 0.26 | 0.24 | 0.16 | 0.09 |
transformed_data = pca_report(cov_data, standardization=True, variables=numeric_cols, numbers=15, case=3).iloc[:, :4]
transformed_data["target"] = breast_data["target"]
transformed_data.to_excel("transformed_data.xlsx")
# use pairplot to display scatter plots of all pairs of principal components
sns.pairplot(transformed_data, hue="target")
draw_3d(transformed_data, "target", -152, -28, "Viewed from an angle that looks three-dimensional")
draw_3d(transformed_data, "target", -117, 21, "Viewed from an angle that looks nearly two-dimensional")
In the wine example, we cannot name the PCs because we lack background knowledge of the domain. Despite that, we successfully reduce 13 variables to four PCs while retaining more than 70 % of the variance, and the first two PCs, subjectively, separate the three groups well.
In the breast cancer example, we successfully reduce 30 variables to four PCs while retaining almost 80 % of the variance. From the first PC, we can easily describe the pattern of each group: if a new observation's 1st PC score is above 0, it is more likely to belong to group 0; if it is below 0, it is more likely to belong to group 1.
With PCA, we can not only reduce dimensionality but also retain most of the variance in the data. In the pre-processing stage, examining correlations and ranges lets us estimate how the variables will contribute to the PCs. We then perform PCA and subjectively decide the number of PCs from the scree plot or the cumulative variance. Finally, even though we lose part of the information, we extract a small set of important and simple indexes, which makes future work and interpretation more convenient.
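The claim about the 1st PC score can be quantified with a simple threshold rule. A sketch, not the notebook's actual transform: the sign of PC1 is arbitrary, so both orientations are tried and the better one kept.

```python
import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

cancer = load_breast_cancer()
X = StandardScaler().fit_transform(cancer.data)
pc1 = PCA(n_components=1).fit_transform(X).ravel()

# classify by the sign of the PC1 score; PCA's sign is arbitrary,
# so evaluate both orientations and keep the better one
pred = np.where(pc1 > 0, 0, 1)
acc = max((pred == cancer.target).mean(), ((1 - pred) == cancer.target).mean())
print(round(acc, 3))
```

Even this one-dimensional rule classifies the large majority of observations correctly, which supports using PC1 as a rough discriminant.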