R Language Tutorial

0 Comments

In this tutorial we will explore R, a programming language built around functions and widely used for statistical analysis.

In this article we’ll explore this language and try to analyse churn data using R.

This data focuses on customers who stop doing business with the company. The loss of a customer is recorded in the Churn column, which is our target. We have data related to the services the company provides, such as phone service, multiple lines, internet service, etc. We also have customer account information such as payment method, monthly charges, etc. Other data includes the customer's gender.

Libraries to import

library(car)
library(plyr)
library(lubridate)
library(reshape2)
library(ggplot2)
library(miscset)
library(tidyverse)
library(gridExtra)

Setting up working directory

# Point R at the project folder; the relative path used when loading the CSV
# is resolved against this directory.
setwd("D:/IBA MSCS/Semester1/MachineLearning/RWork/")
getwd() # print the working directory to confirm the change

Loading Data

# Load the Telco customer-churn CSV.
# stringsAsFactors = TRUE is set explicitly: since R 4.0 read.csv() keeps text
# columns as character by default, whereas the summary()/plot() calls on the
# categorical columns further down expect factors.
df <- read.csv(
  "../Dataset/WA_Fn-UseC_-Telco-Customer-Churn.csv",
  header = TRUE,
  sep = ",",
  stringsAsFactors = TRUE
)

I am making a copy of the dataframe so that all modifications are done on the copy and the original stays unchanged.

tele_customerdata <- cbind(df) # working copy; all later edits go to this object
View(tele_customerdata) # open the data in the interactive data viewer
class(tele_customerdata) # print the class of the loaded dataset
names(tele_customerdata) # list the names of all columns

There are 21 columns and 7043 rows in total. The target feature is the “Churn” column.

str(tele_customerdata) # compact overview: the data type of every column

Finding total count of male and female in gender column and plotting it

# Coerce to factor before summarising/plotting: if the column was read as
# character (the read.csv() default since R 4.0), summary() reports only
# length/class/mode and plot() cannot draw the per-level counts.
# as.factor() is a no-op when the column is already a factor.
summary(as.factor(tele_customerdata$gender))
plot(as.factor(tele_customerdata$gender))

Finding total count of No and Yes in Partner column and plotting it

# Coerce to factor before summarising/plotting: if the column was read as
# character (the read.csv() default since R 4.0), summary() reports only
# length/class/mode and plot() cannot draw the per-level counts.
# as.factor() is a no-op when the column is already a factor.
summary(as.factor(tele_customerdata$Partner))
plot(as.factor(tele_customerdata$Partner))

Finding total count of No and Yes in Dependents column and plotting it

# Coerce to factor before summarising/plotting: if the column was read as
# character (the read.csv() default since R 4.0), summary() reports only
# length/class/mode and plot() cannot draw the per-level counts.
# as.factor() is a no-op when the column is already a factor.
summary(as.factor(tele_customerdata$Dependents))
plot(as.factor(tele_customerdata$Dependents))

Changing the SeniorCitizen column from int to factor
# Pull the SeniorCitizen column out of the data frame and turn it into a factor.
seniorcitizan <- as.factor(tele_customerdata[["SeniorCitizen"]])

# Print the class to confirm that the conversion produced a factor.
class(seniorcitizan)

Converting 0 and 1 into No and Yes of seniorcitizan column

# Call car::recode() explicitly: tidyverse is attached after car, so a bare
# recode() resolves to dplyr::recode(), which does not accept car's
# "old=new;old=new" specification string.
seniorcitizan <- car::recode(seniorcitizan, "0='No';1='Yes'")
plot(seniorcitizan) # bar plot of the No/Yes frequencies

tele_customerdata$SeniorCitizen <- seniorcitizan # write the recoded factor back

Converting ‘No phone service’ into ‘No’ of MultipleLines column

# Fold the 'No phone service' value of MultipleLines into plain 'No' and store
# the result as a factor.
tele_customerdata[["MultipleLines"]] <- as.factor(
  mapvalues(
    tele_customerdata[["MultipleLines"]],
    from = 'No phone service',
    to = 'No'
  )
)

Converting ‘No internet service’ into ‘No’ of OnlineSecurity, OnlineBackup, DeviceProtection, TechSupport, StreamingTV , StreamingMovies columns

# The six internet add-on columns all need the same clean-up, so it is applied
# in one pass over the column names instead of one copy-pasted statement each.
internet_addon_cols <- c(
  "OnlineSecurity", "OnlineBackup", "DeviceProtection",
  "TechSupport", "StreamingTV", "StreamingMovies"
)
tele_customerdata[internet_addon_cols] <- lapply(
  tele_customerdata[internet_addon_cols],
  function(column) {
    # 'No internet service' is treated as a plain 'No'; result stored as factor.
    as.factor(mapvalues(column, from = "No internet service", to = "No"))
  }
)

Now I will plot these columns, along with some others (PhoneService, InternetService, Contract, PaperlessBilling, PaymentMethod), as horizontal bar charts

# Build a horizontal bar chart showing the percentage share of each value of
# one categorical column.
#   data   - data frame that holds the column
#   column - name of the column to plot, given as a string
#   label  - text used for both the plot title and the category axis
# Returns the ggplot object (it is not drawn until printed or arranged).
percent_bar <- function(data, column, label) {
  ggplot(data, aes(x = .data[[column]])) +
    # after_stat(count) is the current spelling of the deprecated ..count..
    geom_bar(
      aes(y = 100 * after_stat(count) / sum(after_stat(count))),
      width = 0.5
    ) +
    ggtitle(label) +
    xlab(label) +
    ylab("Percentage") +
    coord_flip() +
    theme_minimal()
}

# Phone and internet basics, shown as a 2 x 2 grid.
ps <- percent_bar(tele_customerdata, "PhoneService", "Phone Service")
ml <- percent_bar(tele_customerdata, "MultipleLines", "Multiple Lines")
is <- percent_bar(tele_customerdata, "InternetService", "Internet Service")
os <- percent_bar(tele_customerdata, "OnlineSecurity", "Online Security")
grid.arrange(ps, ml, is, os, ncol = 2)

# Internet add-on services.
ob <- percent_bar(tele_customerdata, "OnlineBackup", "Online Backup")
dp <- percent_bar(tele_customerdata, "DeviceProtection", "Device Protection")
ts <- percent_bar(tele_customerdata, "TechSupport", "Tech Support")
stv <- percent_bar(tele_customerdata, "StreamingTV", "Streaming TV")
grid.arrange(ob, dp, ts, stv, ncol = 2)

# Streaming movies plus the contract and billing columns.
sm <- percent_bar(tele_customerdata, "StreamingMovies", "Streaming Movies")
cont <- percent_bar(tele_customerdata, "Contract", "Contract")
pb <- percent_bar(tele_customerdata, "PaperlessBilling", "Paperless Billing")
pm <- percent_bar(tele_customerdata, "PaymentMethod", "Payment Method")
grid.arrange(sm, cont, pb, pm, ncol = 2)

Over half of all customers are on a monthly contract. We can also see that all of these features are informative, so we will use them in our analysis.
We will remove customerID, which is unique for every customer and therefore carries no information for the analysis.

# Assigning NULL to a column deletes it from the data frame.
tele_customerdata[["customerID"]] <- NULL

Correlation Between Variables

# Scatterplots of TotalCharges (y axis) against tenure and against
# MonthlyCharges (x axis) to inspect how the charges relate to each other.
scatterplot(tele_customerdata$tenure,tele_customerdata$TotalCharges)
scatterplot(tele_customerdata$MonthlyCharges,tele_customerdata$TotalCharges)

Looking at the two graphs above, we can say that there is a strong correlation between TotalCharges and MonthlyCharges, and between TotalCharges and tenure.

Plotting Histograms

# Density-scaled histogram of each numeric column with a normal curve (same
# mean and standard deviation as the data) overlaid for comparison.
# na.rm = TRUE keeps the curve parameters defined when a column contains
# missing values; without it a single NA makes mean()/sd() return NA and no
# overlay is drawn. The `probability` argument is spelled out in full rather
# than relying on partial matching of `prob`.
hist(tele_customerdata$MonthlyCharges, probability = TRUE)
curve(
  dnorm(
    x,
    mean = mean(tele_customerdata$MonthlyCharges, na.rm = TRUE),
    sd = sd(tele_customerdata$MonthlyCharges, na.rm = TRUE)
  ),
  add = TRUE
)
hist(tele_customerdata$TotalCharges, probability = TRUE)
curve(
  dnorm(
    x,
    mean = mean(tele_customerdata$TotalCharges, na.rm = TRUE),
    sd = sd(tele_customerdata$TotalCharges, na.rm = TRUE)
  ),
  add = TRUE
)
hist(tele_customerdata$tenure, probability = TRUE)
curve(
  dnorm(
    x,
    mean = mean(tele_customerdata$tenure, na.rm = TRUE),
    sd = sd(tele_customerdata$tenure, na.rm = TRUE)
  ),
  add = TRUE
)

From the charts above, we can say that MonthlyCharges and tenure both appear to follow a normal distribution, while TotalCharges is positively skewed.

From this chart we can say that around 26% of customers left the company and around 74% of customers stayed.

# Frequency table of SeniorCitizen with each value's share of all customers.
sc_df <- data.frame(table(tele_customerdata$SeniorCitizen))
colnames(sc_df) <- c('Senior Citizen','Freq')
sc_df$Perc <- sc_df$Freq / sum(sc_df$Freq) * 100
sc_df

# Side-by-side bar charts of customer counts per category, split by Churn.
# aes() with bare column names replaces the deprecated aes_string().
ggplot(tele_customerdata) +
  geom_bar(aes(x = SeniorCitizen, fill = Churn), position = "dodge")
ggplot(tele_customerdata) +
  geom_bar(aes(x = Partner, fill = Churn), position = "dodge")
ggplot(tele_customerdata) +
  geom_bar(aes(x = Dependents, fill = Churn), position = "dodge")
ggplot(tele_customerdata) +
  geom_bar(aes(x = tenure, fill = Churn), position = "dodge")

# Box plot comparing the tenure distribution of each Churn group.
ggplot(tele_customerdata, aes(y = tenure, x = "", fill = Churn)) +
  geom_boxplot()

This ends our journey with the R language for now… Thank you for reading 🙂


Leave a Reply

Your email address will not be published. Required fields are marked *