# Employee turnover analysis: descriptive plots, a decision tree and a
# naive Bayes classifier (both fitted through caret with 5-fold CV), ROC/AUC
# comparison on a held-out test set, and a final table of predicted
# probabilities.

# Load packages ---------------------------------------------------------

# plyr must be attached before dplyr (Rmisc depends on plyr; attaching plyr
# after dplyr would mask dplyr's verbs).
library(plyr)
library(dplyr)    # filter()
library(ggplot2)  # ggplot()
library(DT)       # datatable(): interactive data table
library(caret)    # createDataPartition(), trainControl(), train()
library(rpart)    # rpart()
library(e1071)    # naiveBayes()
library(pROC)     # roc()
library(Rmisc)    # multiplot(): several plots on one page

# Import data -----------------------------------------------------------

# NOTE(review): the directory part of this path was corrupted (mojibake) in
# the copy under review and has been restored on a best-effort basis.
# Point it at your local copy of HR_comma_sep.csv before running.
hr <- read.csv(
  "D:/R/天善智能/书豪十大案例/Employee turnover prediction\\HR_comma_sep.csv"
)

# Descriptive analysis --------------------------------------------------

str(hr)      # basic structure of the data
summary(hr)  # main descriptive statistics

# The models below need the target variable to be a factor.
hr$left <- factor(hr$left, levels = c("0", "1"))

# Box plot of one numeric column of `hr`, split by whether the employee left.
#
# var: name of the column, as a string; also used as the y-axis label.
# Returns a ggplot object.
box_by_left <- function(var) {
  ggplot(hr, aes(x = left, y = .data[[var]], fill = left)) +
    geom_boxplot() +
    theme_bw() +
    labs(x = "left", y = var)
}

## Satisfaction, performance evaluation, monthly hours, tenure vs. leaving

# Reported finding: employees who left were less satisfied with the
# company, mostly concentrated around 0.4.
box_sat <- box_by_left("satisfaction_level")
box_sat

# Reported finding: employees who left had higher performance evaluations,
# concentrated above 0.8.
box_eva <- box_by_left("last_evaluation")
box_eva

# Reported finding: employees who left worked more hours per month; more
# than half of them exceeded the average (200 hours).
box_mon <- box_by_left("average_montly_hours")
box_mon

# Reported finding: employees who left had been with the company for
# around 4 years.
box_time <- box_by_left("time_spend_company")
box_time

# Show the four box plots on one page, arranged in 2 columns.
multiplot(box_sat, box_eva, box_mon, box_time, cols = 2)

## Number of projects, promotion within five years, salary vs. leaving

# The bar chart needs number_project as a discrete variable.
hr$number_project <- factor(
  hr$number_project,
  levels = c("2", "3", "4", "5", "6", "7")
)

# Percentage-stacked bar chart of leaving, by the levels of one column.
#
# var: name of the column, as a string; also used as the x-axis label.
# Returns a ggplot object.
bar_by_left <- function(var) {
  ggplot(hr, aes(x = as.factor(.data[[var]]), fill = left)) +
    geom_bar(position = "fill") +  # "fill" stacks to 100 % per bar
    theme_bw() +
    # The x axis carries `var`; the y axis is the share within each bar.
    labs(x = var, y = "proportion")
}

# Reported finding: the more projects an employee took part in, the higher
# the turnover rate (excluding the 2-project group).
bar_pro <- bar_by_left("number_project")
bar_pro

# Reported finding: turnover is higher among employees who were not
# promoted within the last five years.
bar_5years <- bar_by_left("promotion_last_5years")
bar_5years

# Reported finding: the higher the salary, the lower the turnover rate.
bar_salary <- bar_by_left("salary")
bar_salary

# Show the three bar charts on one page, arranged in 3 columns.
multiplot(bar_pro, bar_5years, bar_salary, cols = 3)

# Modelling: select high-value employees --------------------------------

# Keep employees that satisfy at least one of the three criteria.
# number_project is a factor at this point, so it is converted back to its
# numeric value before comparing: `>` on an unordered factor returns NA
# (with a warning), which made the third criterion never select a row.
# NOTE(review): with the criterion now effective the selected sample is
# slightly larger, so metrics may differ from previously reported values.
hr_model <- dplyr::filter(
  hr,
  last_evaluation >= 0.70 |
    time_spend_company >= 4 |
    as.integer(as.character(number_project)) > 5
)

# Cross-validation setup ------------------------------------------------

# method = "cv" selects k-fold cross-validation; number = 5 sets k.
train_control <- trainControl(method = "cv", number = 5)

# Train / test split ----------------------------------------------------

set.seed(1234)  # make the split reproducible

# Stratified 70/30 split on the outcome; list = FALSE returns the row
# indices as a matrix/vector instead of a list.
index <- createDataPartition(hr_model$left, p = 0.7, list = FALSE)
traindata <- hr_model[index, ]   # training set
testdata <- hr_model[-index, ]   # remaining rows form the test set

# Test-set predictors: drop the outcome by name rather than by position.
test_x <- testdata[, setdiff(names(testdata), "left"), drop = FALSE]

# Decision tree ---------------------------------------------------------

# left ~ . models the outcome on all other columns; trControl supplies the
# resampling scheme; method selects the algorithm.
rpartmodel <- train(
  left ~ .,
  data = traindata,
  trControl = train_control,
  method = "rpart"
)

pred_rpart <- predict(rpartmodel, test_x)

# Confusion table: rows are predictions, columns are actual values.
con_rpart <- table(pred_rpart, testdata$left)
con_rpart

# Naive Bayes -----------------------------------------------------------

nbmodel <- train(
  left ~ .,
  data = traindata,
  trControl = train_control,
  method = "nb"
)

pred_nb <- predict(nbmodel, test_x)
con_nb <- table(pred_nb, testdata$left)
con_nb

# Model evaluation ------------------------------------------------------

# Draw the ROC curve of a fitted pROC object with its AUC annotated.
#
# roc_obj: object returned by pROC::roc().
# Returns a ggplot object. The coordinates are copied into a data frame so
# the plot does not depend on global variables that change later.
plot_roc <- function(roc_obj) {
  roc_df <- data.frame(
    specificity = roc_obj$specificities,  # true negative rate
    sensitivity = roc_obj$sensitivities   # true positive rate (recall)
  )
  auc_label <- paste("AUC=", round(roc_obj$auc, 3))  # three decimals

  # x is the false positive rate, 1 - specificity.
  ggplot(roc_df, aes(x = 1 - specificity, y = sensitivity)) +
    geom_line(colour = "red") +  # ROC curve
    geom_abline() +              # diagonal reference line
    annotate("text", x = 0.4, y = 0.5, label = auc_label) +
    theme_bw() +
    labs(x = "1 - Specificity", y = "Sensitivities")
}

# roc() needs a numeric predictor, so the predicted classes are converted.
# NOTE(review): these are hard class labels, so each curve has a single
# operating point; class probabilities (type = "prob") would give a full
# curve. Confirm which is intended.
pred_rpart <- as.numeric(as.character(pred_rpart))
pred_nb <- as.numeric(as.character(pred_nb))

# ROC curve of the decision tree.
roc_rpart <- roc(testdata$left, pred_rpart)
p_rpart <- plot_roc(roc_rpart)
p_rpart

# ROC curve of naive Bayes.
roc_nb <- roc(testdata$left, pred_nb)
p_nb <- plot_roc(roc_nb)
p_nb

# Reported result: AUC of the tree (0.93) > AUC of naive Bayes (0.839), so
# the tree model is used for the final predictions.

# Model application -----------------------------------------------------

# type = "prob" returns the predicted probability of each class
# (not leaving / leaving) instead of the predicted class.
pred_end <- predict(rpartmodel, test_x, type = "prob")

# Combine the rounded class probabilities with the predicted class.
data_end <- cbind(round(pred_end, 3), pred_rpart)
names(data_end) <- c("pred.0", "pred.1", "pred")

# Interactive table of the final prediction results.
datatable(data_end)