import os
import pandas as pd
import numpy as np
import plotly.express as px
import seaborn as sns
import seaborn.objects as so
import matplotlib as mpl
import matplotlib.pyplot as plt
pd.options.mode.chained_assignment = None

# Print the name of every regular file found directly inside Data/
print('The files in our directory are:')
with os.scandir('Data/') as data_dir_entries:
    for file_path in data_dir_entries:
        if not file_path.is_file():
            continue  # only files are listed; everything else is skipped
        print(file_path.name)

The files in our directory are:
IBM-HR-Employee-Attrition - Workbook.xlsx
IBM-HR-Employee-Attrition.csv

#import the data
import pandas as pd
pd.options.mode.chained_assignment = None

# Relative path of the IBM HR attrition CSV file
# NOTE(review): the name is spelled "imb", presumably a typo for "ibm"; left unchanged here
imb_hr_csv = 'Data/IBM-HR-Employee-Attrition.csv'
# Load the CSV into a dataframe (no extra columns are added)
df_ibm_hr = pd.read_csv(imb_hr_csv, low_memory=False) #Read file into dataframe

# Print the source path, then the column names, non-null counts and dtypes
print('File Name:',imb_hr_csv)
df_ibm_hr.info()

File Name: Data/IBM-HR-Employee-Attrition.csv
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1470 entries, 0 to 1469
Data columns (total 35 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   Age                       1470 non-null   int64 
 1   Attrition                 1470 non-null   object
 2   BusinessTravel            1470 non-null   object
 3   DailyRate                 1470 non-null   int64 
 4   Department                1470 non-null   object
 5   DistanceFromHome          1470 non-null   int64 
 6   Education                 1470 non-null   int64 
 7   EducationField            1470 non-null   object
 8   EmployeeCount             1470 non-null   int64 
 9   EmployeeNumber            1470 non-null   int64 
 10  EnvironmentSatisfaction   1470 non-null   int64 
 11  Gender                    1470 non-null   object
 12  HourlyRate                1470 non-null   int64 
 13  JobInvolvement            1470 non-null   int64 
 14  JobLevel                  1470 non-null   int64 
 15  JobRole                   1470 non-null   object
 16  JobSatisfaction           1470 non-null   int64 
 17  MaritalStatus             1470 non-null   object
 18  MonthlyIncome             1470 non-null   int64 
 19  MonthlyRate               1470 non-null   int64 
 20  NumCompaniesWorked        1470 non-null   int64 
 21  Over18                    1470 non-null   object
 22  OverTime                  1470 non-null   object
 23  PercentSalaryHike         1470 non-null   int64 
 24  PerformanceRating         1470 non-null   int64 
 25  RelationshipSatisfaction  1470 non-null   int64 
 26  StandardHours             1470 non-null   int64 
 27  StockOptionLevel          1470 non-null   int64 
 28  TotalWorkingYears         1470 non-null   int64 
 29  TrainingTimesLastYear     1470 non-null   int64 
 30  WorkLifeBalance           1470 non-null   int64 
 31  YearsAtCompany            1470 non-null   int64 
 32  YearsInCurrentRole        1470 non-null   int64 
 33  YearsSinceLastPromotion   1470 non-null   int64 
 34  YearsWithCurrManager      1470 non-null   int64 
dtypes: int64(26), object(9)
memory usage: 402.1+ KB

# Remove the column-display cap so every column of the frame is rendered
pd.options.display.max_columns = None
display(df_ibm_hr)

def _count_rows_by(column_name):
    """Return each distinct value of `column_name` in df_ibm_hr with its row count.

    The returned frame has two columns: `column_name` itself and 'Rows',
    the number of entries counted for that value.
    """
    grouped = df_ibm_hr.groupby(df_ibm_hr[column_name])
    per_value_counts = grouped[column_name].count()
    return per_value_counts.reset_index(name='Rows')


# One summary table per string-valued column: distinct values and how many rows hold each
df_ibm_hr_attrition = _count_rows_by('Attrition')
display(df_ibm_hr_attrition)

df_ibm_hr_businesstravel = _count_rows_by('BusinessTravel')
display(df_ibm_hr_businesstravel)

df_ibm_hr_department = _count_rows_by('Department')
display(df_ibm_hr_department)

df_ibm_hr_educationfield = _count_rows_by('EducationField')
display(df_ibm_hr_educationfield)

df_ibm_hr_jobrole = _count_rows_by('JobRole')
display(df_ibm_hr_jobrole)

df_ibm_hr_maritalstatus = _count_rows_by('MaritalStatus')
display(df_ibm_hr_maritalstatus)

df_ibm_hr_overtime = _count_rows_by('OverTime')
display(df_ibm_hr_overtime)

# Columns carried forward into the analysis, in display order.
# 'Department' and 'MaritalStatus' were commented out of the original
# selection and stay excluded, as does every other column not listed here.
_useful_columns = [
    'Age',
    'Attrition',
    'BusinessTravel',
    'DistanceFromHome',
    'Education',
    'EnvironmentSatisfaction',
    'Gender',
    'HourlyRate',
    'JobLevel',
    'JobSatisfaction',
    'MonthlyIncome',
    'NumCompaniesWorked',
    'OverTime',
    'PerformanceRating',
    'RelationshipSatisfaction',
    'StockOptionLevel',
    'TotalWorkingYears',
    'TrainingTimesLastYear',
    'WorkLifeBalance',
    'YearsAtCompany',
    'YearsInCurrentRole',
    'YearsSinceLastPromotion',
    'YearsWithCurrManager',
]

# Build the reduced dataframe holding only those columns
df_ibm_hr_filtered = df_ibm_hr[_useful_columns]

# Work on an independent copy. A plain assignment would only create a second
# name for df_ibm_hr_filtered, so the replacements below would also overwrite
# it and the "original dataset" display further down would show codified values.
df_ibm_hr_cat = df_ibm_hr_filtered.copy()

#Replace string data with integer data
df_ibm_hr_cat['Attrition'] = np.where(df_ibm_hr_cat['Attrition'] == 'Yes', 1, 0) #Categorize Attrition as "Yes" = 1, anything else ("No") = 0
df_ibm_hr_cat['Gender'] = np.where(df_ibm_hr_cat['Gender'] == 'Female', 0, 1) #Categorize Gender alphabetically as Female = 0, anything else (Male) = 1
df_ibm_hr_cat['OverTime'] = np.where(df_ibm_hr_cat['OverTime'] == 'Yes', 1, 0) #Categorize OverTime as "Yes" = 1, anything else ("No") = 0
# Encode BusinessTravel by increasing travel frequency; a label missing from this mapping becomes NaN
df_ibm_hr_cat['BusinessTravel'] = df_ibm_hr_cat['BusinessTravel'].map({'Non-Travel': 0, 'Travel_Rarely': 1, 'Travel_Frequently': 2})

# Show the untouched selection first, then the codified copy, for comparison
print("Our original dataset contains:")
display(df_ibm_hr_filtered)
print('\n')
print("And our codified dataset contains:")
display(df_ibm_hr_cat)

Our original dataset contains:


And our codified dataset contains:

from IPython.display import Image, display
# Show the two correlation-coefficient images, in this order
for _image_file in (
    'Images/Correlation_Coefficients.png',
    'Images/Correlation_Coefficient_Examples.png',
):
    display(Image(filename=_image_file))

# Pairwise correlation between all columns of the codified frame
corr = df_ibm_hr_cat.corr()

# Diverging colour palette, returned as a matplotlib colormap
cmap = sns.diverging_palette(20, 230, sep=7, s=95, as_cmap=True)

# Boolean mask that is True on and above the diagonal, so the heatmap
# draws only the lower triangle
_all_cells = np.ones_like(corr, dtype=bool)
mask = np.triu(_all_cells)

# Canvas for the heatmap
f, ax = plt.subplots(figsize=(20, 18))

# Masked heatmap: colour scale fixed to [-1, 1] and centred on 0,
# square cells, each annotated with its value to three decimals
heatmap_tri = sns.heatmap(
    corr,
    mask=mask,
    cmap=cmap,
    vmin=-1,
    vmax=1,
    center=0,
    annot=True,
    fmt='.3f',
    square=True,
    linewidths=.5,
    cbar_kws={"shrink": .5},
)

heatmap_tri.set_title('Triangle Correlation Heatmap - IBM HR Data', fontdict={'fontsize':18}, pad=16);

# Correlation of every column with Attrition, sorted from most positive to most negative.
# Left as a bare expression: a notebook renders it only as the last statement of a cell.
df_ibm_hr_cat.corr()[['Attrition']].sort_values(by='Attrition', ascending=False)

# Attrition column of the correlation matrix, most positive value first
_attrition_corr = df_ibm_hr_cat.corr()[['Attrition']]
_attrition_corr = _attrition_corr.sort_values(by='Attrition', ascending=False)

# Narrow, tall figure because the heatmap has a single column
plt.figure(figsize=(1, 6))
heatmap_attrition = sns.heatmap(
    _attrition_corr,
    vmin=-1,
    vmax=1,
    annot=True,
    fmt='.3f',
    cmap=cmap,
)
heatmap_attrition.set_title('Features Correlating with Attrition', fontdict={'fontsize':18}, pad=16);

# Display image files from Images/ (judging by their names, charts on attrition and overtime)
display(Image(filename='Images/employee_attrition_by_overtime_auth.png'))

# NOTE(review): this loads the same file as the statement above -- possibly a
# copy-paste slip where a different image was intended; confirm.
display(Image(filename='Images/employee_attrition_by_overtime_auth.png'))

display(Image(filename='Images/overtime_auth_by_gender.png'))

display(Image(filename='Images/attrition_rate_by_overtime_auth_and_gender.png'))

	Age	Attrition	BusinessTravel	DailyRate	Department	DistanceFromHome	Education	EducationField	EmployeeCount	EmployeeNumber	EnvironmentSatisfaction	Gender	HourlyRate	JobInvolvement	JobLevel	JobRole	JobSatisfaction	MaritalStatus	MonthlyIncome	MonthlyRate	NumCompaniesWorked	Over18	OverTime	PercentSalaryHike	PerformanceRating	RelationshipSatisfaction	StandardHours	StockOptionLevel	TotalWorkingYears	TrainingTimesLastYear	WorkLifeBalance	YearsAtCompany	YearsInCurrentRole	YearsSinceLastPromotion	YearsWithCurrManager
0	41	Yes	Travel_Rarely	1102	Sales	1	2	Life Sciences	1	1	2	Female	94	3	2	Sales Executive	4	Single	5993	19479	8	Y	Yes	11	3	1	80	0	8	0	1	6	4	0	5
1	49	No	Travel_Frequently	279	Research & Development	8	1	Life Sciences	1	2	3	Male	61	2	2	Research Scientist	2	Married	5130	24907	1	Y	No	23	4	4	80	1	10	3	3	10	7	1	7
2	37	Yes	Travel_Rarely	1373	Research & Development	2	2	Other	1	4	4	Male	92	2	1	Laboratory Technician	3	Single	2090	2396	6	Y	Yes	15	3	2	80	0	7	3	3	0	0	0	0
3	33	No	Travel_Frequently	1392	Research & Development	3	4	Life Sciences	1	5	4	Female	56	3	1	Research Scientist	3	Married	2909	23159	1	Y	Yes	11	3	3	80	0	8	3	3	8	7	3	0
4	27	No	Travel_Rarely	591	Research & Development	2	1	Medical	1	7	1	Male	40	3	1	Laboratory Technician	2	Married	3468	16632	9	Y	No	12	3	4	80	1	6	3	3	2	2	2	2
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
1465	36	No	Travel_Frequently	884	Research & Development	23	2	Medical	1	2061	3	Male	41	4	2	Laboratory Technician	4	Married	2571	12290	4	Y	No	17	3	3	80	1	17	3	3	5	2	0	3
1466	39	No	Travel_Rarely	613	Research & Development	6	1	Medical	1	2062	4	Male	42	2	3	Healthcare Representative	1	Married	9991	21457	4	Y	No	15	3	1	80	1	9	5	3	7	7	1	7
1467	27	No	Travel_Rarely	155	Research & Development	4	3	Life Sciences	1	2064	2	Male	87	4	2	Manufacturing Director	2	Married	6142	5174	1	Y	Yes	20	4	2	80	1	6	0	3	6	2	0	3
1468	49	No	Travel_Frequently	1023	Sales	2	3	Medical	1	2065	4	Male	63	2	2	Sales Executive	2	Married	5390	13243	2	Y	No	14	3	4	80	0	17	3	2	9	6	0	8
1469	34	No	Travel_Rarely	628	Research & Development	8	3	Medical	1	2068	2	Male	82	4	2	Laboratory Technician	3	Married	4404	10228	2	Y	No	12	3	1	80	0	6	3	4	4	3	1	2

	Department	Rows
0	Human Resources	63
1	Research & Development	961
2	Sales	446

	EducationField	Rows
0	Human Resources	27
1	Life Sciences	606
2	Marketing	159
3	Medical	464
4	Other	82
5	Technical Degree	132

	JobRole	Rows
0	Healthcare Representative	131
1	Human Resources	52
2	Laboratory Technician	259
3	Manager	102
4	Manufacturing Director	145
5	Research Director	80
6	Research Scientist	292
7	Sales Executive	326
8	Sales Representative	83

	MaritalStatus	Rows
0	Divorced	327
1	Married	673
2	Single	470

IBM HR Analytics - Employee Attrition & Performance

The Data in Question

The data file includes:

Let's get started

What data do we need to wrangle?

I think we can leave out some of those variables for now.

Let's see if any of the variables are correlated with one another

What is correlation?

Does anything interesting come out here?

Let's look right at our attrition data

Very interesting data here

Let's dig into that `OverTime` data a little further

Let's dive in further, then.

Conclusions

Appendix

Codified Data

Education

EnvironmentSatisfaction

JobInvolvement

JobSatisfaction

PerformanceRating

RelationshipSatisfaction

WorkLifeBalance

	BusinessTravel	Rows
0	Non-Travel	150
1	Travel_Frequently	277
2	Travel_Rarely	1043

IBM HR Analytics - Employee Attrition & Performance

The Data in Question

The data file includes:

Let's get started

What data do we need to wrangle?

I think we can leave out some of those variables for now.

Let's see if any of the variables are correlated with one another

What is correlation?

Does anything interesting come out here?

Let's look right at our attrition data

Very interesting data here

Let's dig into that OverTime data a little further

Let's dive in further, then.

Conclusions

Appendix

Codified Data

Education

EnvironmentSatisfaction

JobInvolvement

JobSatisfaction

PerformanceRating

RelationshipSatisfaction

WorkLifeBalance

Let's dig into that `OverTime` data a little further