How to analyse DataSet in R

In [3]:
# ------------------------------------------------------------------------------
# How to analyse DataSet in R 
# ------------------------------------------------------------------------------

# load the libraries
library(corrgram)
library(mlbench)
library(caret)

# Load the PimaIndiansDiabetes data set and keep a data-frame copy under the
# name used by the rest of the script; dim() reports rows x columns.
data(PimaIndiansDiabetes)
DataSet <- as.data.frame(PimaIndiansDiabetes)
dim(DataSet)

# Split into the class label (column 9) and the predictors (all other columns).
y <- DataSet[[9]]
X <- DataSet[, -9]

# Show the first 20 rows, once from the original object and once from the
# working copy.
head(PimaIndiansDiabetes, 20L)
head(DataSet, 20L)

# Class of every column, returned as a named character vector.
vapply(DataSet, class, character(1))

# Distribution of the class variable: absolute count and percentage share of
# each diabetes level.
yData <- DataSet$diabetes
# Tabulate once and derive both columns from that single table, so the counts
# and the percentages cannot come from two different vectors.
class_counts <- table(yData)
cbind(freq = class_counts, percentage = prop.table(class_counts) * 100)

# Per-column summary of the whole data frame.
summary(DataSet)

# Mean and standard deviation of each predictor column, as named numeric
# vectors (one value per column).
vapply(X, mean, numeric(1))
vapply(X, sd, numeric(1))

# Pairwise Pearson correlations between the predictor columns.
correlations <- cor(X, method = "pearson")
print(correlations)

# Visualise the relationships between the predictors as a correlogram.
corrgram(X)
  1. 768
  2. 9
pregnant glucose pressure triceps insulin mass pedigree age diabetes
6 148 72 35 0 33.6 0.627 50 pos
1 85 66 29 0 26.6 0.351 31 neg
8 183 64 0 0 23.3 0.672 32 pos
1 89 66 23 94 28.1 0.167 21 neg
0 137 40 35 168 43.1 2.288 33 pos
5 116 74 0 0 25.6 0.201 30 neg
3 78 50 32 88 31.0 0.248 26 pos
10 115 0 0 0 35.3 0.134 29 neg
2 197 70 45 543 30.5 0.158 53 pos
8 125 96 0 0 0.0 0.232 54 pos
4 110 92 0 0 37.6 0.191 30 neg
10 168 74 0 0 38.0 0.537 34 pos
10 139 80 0 0 27.1 1.441 57 neg
1 189 60 23 846 30.1 0.398 59 pos
5 166 72 19 175 25.8 0.587 51 pos
7 100 0 0 0 30.0 0.484 32 pos
0 118 84 47 230 45.8 0.551 31 pos
7 107 74 0 0 29.6 0.254 31 pos
1 103 30 38 83 43.3 0.183 33 neg
1 115 70 30 96 34.6 0.529 32 pos
pregnant glucose pressure triceps insulin mass pedigree age diabetes
6 148 72 35 0 33.6 0.627 50 pos
1 85 66 29 0 26.6 0.351 31 neg
8 183 64 0 0 23.3 0.672 32 pos
1 89 66 23 94 28.1 0.167 21 neg
0 137 40 35 168 43.1 2.288 33 pos
5 116 74 0 0 25.6 0.201 30 neg
3 78 50 32 88 31.0 0.248 26 pos
10 115 0 0 0 35.3 0.134 29 neg
2 197 70 45 543 30.5 0.158 53 pos
8 125 96 0 0 0.0 0.232 54 pos
4 110 92 0 0 37.6 0.191 30 neg
10 168 74 0 0 38.0 0.537 34 pos
10 139 80 0 0 27.1 1.441 57 neg
1 189 60 23 846 30.1 0.398 59 pos
5 166 72 19 175 25.8 0.587 51 pos
7 100 0 0 0 30.0 0.484 32 pos
0 118 84 47 230 45.8 0.551 31 pos
7 107 74 0 0 29.6 0.254 31 pos
1 103 30 38 83 43.3 0.183 33 neg
1 115 70 30 96 34.6 0.529 32 pos
pregnant
'numeric'
glucose
'numeric'
pressure
'numeric'
triceps
'numeric'
insulin
'numeric'
mass
'numeric'
pedigree
'numeric'
age
'numeric'
diabetes
'factor'
freq percentage
neg 500 65.10417
pos 268 34.89583
    pregnant         glucose         pressure         triceps     
 Min.   : 0.000   Min.   :  0.0   Min.   :  0.00   Min.   : 0.00  
 1st Qu.: 1.000   1st Qu.: 99.0   1st Qu.: 62.00   1st Qu.: 0.00  
 Median : 3.000   Median :117.0   Median : 72.00   Median :23.00  
 Mean   : 3.845   Mean   :120.9   Mean   : 69.11   Mean   :20.54  
 3rd Qu.: 6.000   3rd Qu.:140.2   3rd Qu.: 80.00   3rd Qu.:32.00  
 Max.   :17.000   Max.   :199.0   Max.   :122.00   Max.   :99.00  
    insulin           mass          pedigree           age        diabetes 
 Min.   :  0.0   Min.   : 0.00   Min.   :0.0780   Min.   :21.00   neg:500  
 1st Qu.:  0.0   1st Qu.:27.30   1st Qu.:0.2437   1st Qu.:24.00   pos:268  
 Median : 30.5   Median :32.00   Median :0.3725   Median :29.00            
 Mean   : 79.8   Mean   :31.99   Mean   :0.4719   Mean   :33.24            
 3rd Qu.:127.2   3rd Qu.:36.60   3rd Qu.:0.6262   3rd Qu.:41.00            
 Max.   :846.0   Max.   :67.10   Max.   :2.4200   Max.   :81.00            
pregnant
3.84505208333333
glucose
120.89453125
pressure
69.10546875
triceps
20.5364583333333
insulin
79.7994791666667
mass
31.992578125
pedigree
0.471876302083333
age
33.2408854166667
pregnant
3.36957806269887
glucose
31.9726181951362
pressure
19.3558071706448
triceps
15.9522175677276
insulin
115.244002351338
mass
7.88416032037545
pedigree
0.331328595012775
age
11.7602315406787
            pregnant    glucose   pressure     triceps     insulin       mass
pregnant  1.00000000 0.12945867 0.14128198 -0.08167177 -0.07353461 0.01768309
glucose   0.12945867 1.00000000 0.15258959  0.05732789  0.33135711 0.22107107
pressure  0.14128198 0.15258959 1.00000000  0.20737054  0.08893338 0.28180529
triceps  -0.08167177 0.05732789 0.20737054  1.00000000  0.43678257 0.39257320
insulin  -0.07353461 0.33135711 0.08893338  0.43678257  1.00000000 0.19785906
mass      0.01768309 0.22107107 0.28180529  0.39257320  0.19785906 1.00000000
pedigree -0.03352267 0.13733730 0.04126495  0.18392757  0.18507093 0.14064695
age       0.54434123 0.26351432 0.23952795 -0.11397026 -0.04216295 0.03624187
            pedigree         age
pregnant -0.03352267  0.54434123
glucose   0.13733730  0.26351432
pressure  0.04126495  0.23952795
triceps   0.18392757 -0.11397026
insulin   0.18507093 -0.04216295
mass      0.14064695  0.03624187
pedigree  1.00000000  0.03356131
age       0.03356131  1.00000000