1. Introduction to R

Batur Şeker
5 min read · Jan 30, 2021

Used dataset

#Get working directory
getwd()

#Set working directory (machine-specific path; adjust to where the CSV lives)
setwd("C:\\Users\\batur\\Desktop\\R Tutorial")

#Read the CSV data file and store it as a data frame
bankChurnersData <- read.csv(file = "BankChurners.csv")

#Drop columns 22 and 23 (negative indices exclude columns by position)
df <- bankChurnersData[-c(22:23)]

#Remove bankChurnersData from the workspace; only df is used from here on
rm(bankChurnersData)

#Return the first 10 rows of df
head(df, n = 10)

#Display the internal structure of df
str(df)
Result:
‘data.frame’: 10127 obs. of 21 variables:
$ CLIENTNUM : int 768805383 818770008 713982108 769911858 709106358 713061558 810347208 818906208 710930508 719661558 …
$ Attrition_Flag : chr “Existing Customer” “Existing Customer” “Existing Customer” “Existing Customer” …
$ Customer_Age : int 45 49 51 40 40 44 51 32 37 48 …
$ Gender : chr “M” “F” “M” “F” …
$ Dependent_count : int 3 5 3 4 3 2 4 0 3 2 …
$ Education_Level : chr “High School” “Graduate” “Graduate” “High School” …
$ Marital_Status : chr “Married” “Single” “Married” “Unknown” …
$ Income_Category : chr “$60K — $80K” “Less than $40K” “$80K — $120K” “Less than $40K” …
$ Card_Category : chr “Blue” “Blue” “Blue” “Blue” …
$ Months_on_book : int 39 44 36 34 21 36 46 27 36 36 …
$ Total_Relationship_Count: int 5 6 4 3 5 3 6 2 5 6 …
$ Months_Inactive_12_mon : int 1 1 1 4 1 1 1 2 2 3 …
$ Contacts_Count_12_mon : int 3 2 0 1 0 2 3 2 0 3 …
$ Credit_Limit : num 12691 8256 3418 3313 4716 …
$ Total_Revolving_Bal : int 777 864 0 2517 0 1247 2264 1396 2517 1677 …
$ Avg_Open_To_Buy : num 11914 7392 3418 796 4716 …
$ Total_Amt_Chng_Q4_Q1 : num 1.33 1.54 2.59 1.4 2.17 …
$ Total_Trans_Amt : int 1144 1291 1887 1171 816 1088 1330 1538 1350 1441 …
$ Total_Trans_Ct : int 42 33 20 20 28 24 31 36 24 32 …
$ Total_Ct_Chng_Q4_Q1 : num 1.62 3.71 2.33 2.33 2.5 …
$ Avg_Utilization_Ratio : num 0.061 0.105 0 0.76 0 0.311 0.066 0.048 0.113 0.144 …

#Produce a per-column summary of df
summary(df)

#Return number of rows
nrow(df)
[1] 10127

#Return number of columns
ncol(df)
[1] 21

#Get the dimensions (rows, columns) of df
dim(df)
[1] 10127 21

#Get the column names of df
names(df)
Result:
[1] “CLIENTNUM” “Attrition_Flag” “Customer_Age” “Gender”
[5] “Dependent_count” “Education_Level” “Marital_Status” “Income_Category”
[9] “Card_Category” “Months_on_book” “Total_Relationship_Count” “Months_Inactive_12_mon”
[13] “Contacts_Count_12_mon” “Credit_Limit” “Total_Revolving_Bal” “Avg_Open_To_Buy”
[17] “Total_Amt_Chng_Q4_Q1” “Total_Trans_Amt” “Total_Trans_Ct” “Total_Ct_Chng_Q4_Q1”
[21] “Avg_Utilization_Ratio”

#Encode Attrition_Flag column of df as a factor — Binary variable
df$Attrition_Flag <- factor(
  df$Attrition_Flag,
  levels = c("Attrited Customer", "Existing Customer")
)

#Encode Gender column of df as a factor — Binary variable
df$Gender <- factor(df$Gender, levels = c("M", "F"))

#List the distinct values of Education_Level before choosing the level order
unique(df$Education_Level)

#Encode Education_Level column of df as an ordered factor — Ordinal variable
df$Education_Level <- factor(
  df$Education_Level,
  ordered = TRUE,
  levels = c(
    "Unknown", "Uneducated", "High School", "College",
    "Graduate", "Post-Graduate", "Doctorate"
  )
)

#Encode Marital_Status column of df as a factor — Nominal variable
df$Marital_Status <- factor(
  df$Marital_Status,
  levels = c("Married", "Single", "Unknown", "Divorced")
)

#Encode Income_Category column of df as an ordered factor — Ordinal variable
#NOTE(review): the levels below use a plain hyphen ("$40K - $60K"), presumably
#as stored in BankChurners.csv; a level string that does not match the data
#exactly turns those rows into NA — confirm with unique(df$Income_Category)
df$Income_Category <- factor(
  df$Income_Category,
  ordered = TRUE,
  levels = c(
    "Unknown", "Less than $40K", "$40K - $60K",
    "$60K - $80K", "$80K - $120K", "$120K +"
  )
)

#Encode Card_Category column of df as an ordered factor — Ordinal variable
df$Card_Category <- factor(
  df$Card_Category,
  ordered = TRUE,
  levels = c("Blue", "Silver", "Gold", "Platinum")
)

#Display the internal structure of df after updates
str(df)
Result:
‘data.frame’: 10127 obs. of 21 variables:
$ CLIENTNUM : int 768805383 818770008 713982108 769911858 709106358 713061558 810347208 818906208 710930508 719661558 …
$ Attrition_Flag : Factor w/ 2 levels “Attrited Customer”,..: 2 2 2 2 2 2 2 2 2 2 …
$ Customer_Age : int 45 49 51 40 40 44 51 32 37 48 …
$ Gender : Factor w/ 2 levels “M”,”F”: 1 2 1 2 1 1 1 1 1 1 …
$ Dependent_count : int 3 5 3 4 3 2 4 0 3 2 …
$ Education_Level : Ord.factor w/ 7 levels “Unknown”<”Uneducated”<..: 3 5 5 3 2 5 1 3 2 5 …
$ Marital_Status : Factor w/ 4 levels “Married”,”Single”,..: 1 2 1 3 1 1 1 3 2 2 …
$ Income_Category : Ord.factor w/ 6 levels “Unknown”<”Less than $40K”<..: 4 2 5 2 4 3 6 4 4 5 …
$ Card_Category : Ord.factor w/ 4 levels “Blue”<”Silver”<..: 1 1 1 1 1 1 3 2 1 1 …
$ Months_on_book : int 39 44 36 34 21 36 46 27 36 36 …
$ Total_Relationship_Count: int 5 6 4 3 5 3 6 2 5 6 …
$ Months_Inactive_12_mon : int 1 1 1 4 1 1 1 2 2 3 …
$ Contacts_Count_12_mon : int 3 2 0 1 0 2 3 2 0 3 …
$ Credit_Limit : num 12691 8256 3418 3313 4716 …
$ Total_Revolving_Bal : int 777 864 0 2517 0 1247 2264 1396 2517 1677 …
$ Avg_Open_To_Buy : num 11914 7392 3418 796 4716 …
$ Total_Amt_Chng_Q4_Q1 : num 1.33 1.54 2.59 1.4 2.17 …
$ Total_Trans_Amt : int 1144 1291 1887 1171 816 1088 1330 1538 1350 1441 …
$ Total_Trans_Ct : int 42 33 20 20 28 24 31 36 24 32 …
$ Total_Ct_Chng_Q4_Q1 : num 1.62 3.71 2.33 2.33 2.5 …
$ Avg_Utilization_Ratio : num 0.061 0.105 0 0.76 0 0.311 0.066 0.048 0.113 0.144 …

#Get the length of df (for a data frame this is the number of columns)
length(df)
[1] 21

#Return the maximum value of the Credit_Limit column of df
max(df$Credit_Limit)
[1] 34516

#Return the minimum value of the Credit_Limit column of df
min(df$Credit_Limit)
[1] 1438.3

#Return summation of Avg_Open_To_Buy column of df
sum(df$Avg_Open_To_Buy)
[1] 75639977

#Returns the product of all the values in Avg_Utilization_Ratio
prod(df$Avg_Utilization_Ratio)
[1] 0

#Use Mode to get average for Nominal Variables
#Use Median to get average for Ordinal Variables
#Use Mean to get average for Interval and Ratio Variables

#Calculate mode of Marital_Status — Nominal Variable
names(table(df$Marital_Status))[table(df$Marital_Status)==max(table(df$Marital_Status))]
[1] “Married”

#Calculate median of Income_Category — Ordinal Variable
#Note: df$Income_Category is a factor, but median() needs numeric data, so the
#median is taken over the integer level codes and mapped back to its label
#NOTE(review): a median ending in .5 (even count, middle values in adjacent
#levels) is truncated by the index and returns the lower level
income_codes <- as.integer(df$Income_Category)
levels(df$Income_Category)[median(income_codes, na.rm = TRUE)]
[1] “$80K — $120K”

#Calculate mean of Dependent_count — Interval Variable
#Dependent_count which is demographic variable, number of dependents
mean(df$Dependent_count)
[1] 2.346203

#Calculate mean of Customer_Age — Ratio Variable
mean(df$Customer_Age)
[1] 46.32596

#Calculate standard deviation of Credit_Limit
sd(df$Credit_Limit)
[1] 9088.777

#Calculate variance of Credit_Limit (the square of the standard deviation)
sd(df$Credit_Limit)^2
[1] 82605861

#Get value range of Avg_Open_To_Buy
range(df$Avg_Open_To_Buy)
[1] 3 34516

#Calculate absolute values of Avg_Open_To_Buy (its range shows the values are already non-negative)
abs(df$Avg_Open_To_Buy)

#Calculate the square root of values in Avg_Open_To_Buy
sqrt(df$Avg_Open_To_Buy)

#Calculate the natural logarithm (base e) of the values in Avg_Open_To_Buy
log(df$Avg_Open_To_Buy)

#Calculate the base-10 logarithm of the values in Avg_Open_To_Buy
log10(df$Avg_Open_To_Buy)

#Calculate e^Months_on_book
exp(df$Months_on_book)

#Round values in Total_Amt_Chng_Q4_Q1
round(df$Total_Amt_Chng_Q4_Q1)

Next article

--

--