2. Data Visualization with R

Batur Şeker
4 min read · Jan 30, 2021

Used dataset

This story is a continuation of the previous article in this series.

# Show the current working directory
getwd()

# Set the working directory to the folder that contains the data file
setwd("C:\\Users\\batur\\Desktop\\R Tutorial")

# Read the CSV data file (resolved relative to the working directory)
# and store it as a data frame
bankChurnersData <- read.csv(file = "BankChurners.csv")

# Drop columns 22 and 23 (negative indices remove columns by position)
df <- bankChurnersData[-c(22:23)]

# NOTE: factor() turns every value that is not listed in `levels` into NA,
# so each label below must match the text in the CSV exactly.

# Encode Attrition_Flag column of df as a factor (binary variable)
df$Attrition_Flag <- factor(
  df$Attrition_Flag,
  levels = c("Attrited Customer", "Existing Customer")
)

# Encode Gender column of df as a factor (binary variable)
df$Gender <- factor(df$Gender, levels = c("M", "F"))

# Encode Education_Level column of df as an ordered factor (ordinal variable)
df$Education_Level <- factor(
  df$Education_Level,
  ordered = TRUE,
  levels = c("Unknown", "Uneducated", "High School", "College",
             "Graduate", "Post-Graduate", "Doctorate")
)

# Encode Marital_Status column of df as a factor (nominal variable)
df$Marital_Status <- factor(
  df$Marital_Status,
  levels = c("Married", "Single", "Unknown", "Divorced")
)

# Encode Income_Category column of df as an ordered factor (ordinal variable)
# NOTE(review): the range labels use a plain hyphen ("$40K - $60K"), as in the
# public BankChurners data set; confirm with unique(df$Income_Category).
df$Income_Category <- factor(
  df$Income_Category,
  ordered = TRUE,
  levels = c("Unknown", "Less than $40K", "$40K - $60K", "$60K - $80K",
             "$80K - $120K", "$120K +")
)

# Encode Card_Category column of df as an ordered factor (ordinal variable)
df$Card_Category <- factor(
  df$Card_Category,
  ordered = TRUE,
  levels = c("Blue", "Silver", "Gold", "Platinum")
)

# Split the plotting device into a 2 x 2 grid; subsequent plots fill it row by row
par(mfrow = c(2, 2))

1.PLOT
# Plot Months_on_book against Education_Level.
# With a factor on the x axis, plot() draws one box plot per level.
plot(df$Education_Level, df$Months_on_book,
     xlab = "Education Level", ylab = "Months on Book",
     main = "Edu. Lev. vs Book")

# Add a vertical reference line at x = 3 and a horizontal one at y = 31
abline(v = 3)
abline(h = 31)

2.BOXPLOT
# Box plot of Customer_Age for each Marital_Status group.
# varwidth = TRUE draws box widths proportional to the square root of the
# number of observations in each group.
boxplot(Customer_Age ~ Marital_Status, data = df,
        xlab = "Marital Status",
        ylab = "Customer Age",
        main = "Marital Data Box Plot",
        varwidth = TRUE,
        col = c("green", "yellow", "purple", "red"))

3.DATA DISTRIBUTION
# Compare the distribution of Months_on_book with a normal distribution:
# draw the normal Q-Q plot, then overlay the reference line
with(df, {
  qqnorm(Months_on_book)
  qqline(Months_on_book)
})

4.HISTOGRAM
# Density-scaled histogram of Dependent_count
# (probability = TRUE plots densities instead of counts)
hist(df$Dependent_count, probability = TRUE, breaks = 5,
     col = "red",
     xlab = "Dependent Count",
     main = "Dependent Count Histogram Graph")

# Frequency (count) histogram of Dependent_count
hist(df$Dependent_count, breaks = 5,
     col = "darkmagenta",
     xlab = "Dependent Count",
     main = "Dependent Count Histogram Graph")

# Density-scaled histogram of Customer_Age with a kernel density curve on top.
# The histogram must use probability = TRUE so that it shares the density
# scale of the overlaid curve.
hist(df$Customer_Age, breaks = 15, probability = TRUE, col = "darkmagenta",
     xlab = "Customer Age",
     main = "Customer Age Histogram Graph")

# Estimate the density once, show its summary, then draw it over the histogram
age_density <- density(df$Customer_Age)
age_density
lines(age_density)

5.PIE CHART
# Count the customers in each Marital_Status level
result <- table(df$Marital_Status)

# Use the level names as slice labels and draw the pie chart
marital_status_names <- names(result)
pie(result, labels = marital_status_names, main = "Marital Status Pie Chart")

6.BARPLOT
# Mean Customer_Age for every combination of Income_Category (rows) and
# Education_Level (columns).
# tapply() builds the matrix in one step: rows and columns follow the factor
# level order, and a combination with no customers becomes NA instead of
# raising a "replacement has length zero" error.
matrix_1 <- tapply(df$Customer_Age,
                   list(df$Income_Category, df$Education_Level),
                   FUN = mean)

print(matrix_1)

# Draw a grouped bar plot of matrix_1: one group per education level, one bar
# per income category. The bars are placed side by side because the values
# are means; stacking them would add up averages into a meaningless total.
barplot(matrix_1, beside = TRUE, legend.text = rownames(matrix_1), las = 1,
        xlab = "Education Level", ylab = "Mean Customer Age",
        main = c("Mean of Customer Age", "Bar Plot"))

7.DOTCHART
# Maximum Credit_Limit for each Card_Category.
# tapply() returns one value per factor level (NA for a level without
# customers), named by the level.
max_credit_limit <- tapply(df$Credit_Limit, df$Card_Category, FUN = max)

# Store the result as a one-column matrix whose row names are the card
# categories
matrix_2 <- matrix(max_credit_limit, ncol = 1,
                   dimnames = list(names(max_credit_limit), NULL))

print(matrix_2)

# Draw maximum credit limit by card category as a dot chart
dotchart(matrix_2, labels = rownames(matrix_2),
         cex = .6,
         main = "Maximum Credit Limit By Card Category",
         xlab = "Maximum Credit Limit ",
         ylab = "Card Category",
         pch = 19,
         col = c("red", "blue"),
         lcolor = "gray90",
         cex.main = 2, cex.lab = 1.5)

Next article

--

--