Introduction

This file contains some examples of the functions related to the DDC routine. More specifically, DetectDeviatingCells, cellMap and checkDataset will be illustrated.

library("cellWise")

# Default options for DetectDeviatingCells:
DDCpars=list(fracNA=0.5,numDiscrete=3,precScale=1e-12,
             tolProb=0.99,corrlim=0.5,combinRule ="wmean",
             includeSelf=T,rowdetect=TRUE,returnBigXimp=F)

Data examples

Clean multivariate normal data

We will first look at a clean generated dataset.

set.seed(12345) # reproducible
n = 100; d = 10
A = matrix(0.9, d, d); diag(A) = 1
A # true covariance matrix
##       [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
##  [1,]  1.0  0.9  0.9  0.9  0.9  0.9  0.9  0.9  0.9   0.9
##  [2,]  0.9  1.0  0.9  0.9  0.9  0.9  0.9  0.9  0.9   0.9
##  [3,]  0.9  0.9  1.0  0.9  0.9  0.9  0.9  0.9  0.9   0.9
##  [4,]  0.9  0.9  0.9  1.0  0.9  0.9  0.9  0.9  0.9   0.9
##  [5,]  0.9  0.9  0.9  0.9  1.0  0.9  0.9  0.9  0.9   0.9
##  [6,]  0.9  0.9  0.9  0.9  0.9  1.0  0.9  0.9  0.9   0.9
##  [7,]  0.9  0.9  0.9  0.9  0.9  0.9  1.0  0.9  0.9   0.9
##  [8,]  0.9  0.9  0.9  0.9  0.9  0.9  0.9  1.0  0.9   0.9
##  [9,]  0.9  0.9  0.9  0.9  0.9  0.9  0.9  0.9  1.0   0.9
## [10,]  0.9  0.9  0.9  0.9  0.9  0.9  0.9  0.9  0.9   1.0
library(MASS) # only needed for the function mvrnorm in the next line:
xclean = mvrnorm(n, rep(0,d), A)
round(xclean[1:3,1:5],2)
##       [,1]  [,2]  [,3]  [,4]  [,5]
## [1,] -0.55 -0.73 -0.09 -0.73 -0.25
## [2,] -0.15 -1.35 -0.66 -1.11 -0.26
## [3,]  0.19  0.26  0.12  0.17 -0.15
DDCxclean = DetectDeviatingCells(xclean,DDCpars)
##  
##  The input data has 100 rows and 10 columns.
##  The computation started at: Tue Dec 06 16:58:53 2016
##  The computation ended at: Tue Dec 06 16:58:53 2016
## 
# TRUE # OK, all parts of the output match

summary(DDCxclean)
##                 Length Class  Mode   
## colInAnalysis     10   -none- numeric
## rowInAnalysis    100   -none- numeric
## namesNotNumeric    0   -none- NULL   
## namesCaseNumber    0   -none- NULL   
## namesNAcol         0   -none- NULL   
## namesNArow         0   -none- NULL   
## namesDiscrete      0   -none- NULL   
## namesZeroScale     0   -none- NULL   
## remX            1000   -none- numeric
## Z               1000   -none- numeric
## k                  1   -none- numeric
## ngbrs            100   -none- numeric
## robcors          100   -none- numeric
## robslopes        100   -none- numeric
## Xest            1000   -none- numeric
## stdResid        1000   -none- numeric
## indcells          11   -none- numeric
## Ti               100   -none- numeric
## indrows            0   -none- numeric
## indall            11   -none- numeric
## Ximp            1000   -none- numeric
round(DDCxclean$Z[1:12,],2)# robustly standardized data:
##       X1    X2    X3    X4    X5    X6    X7    X8    X9   X10
## 1  -0.25 -0.38  0.17 -0.38  0.06  0.02 -0.47 -0.46  0.19 -0.61
## 2   0.11 -0.93 -0.34 -0.71  0.04 -0.18 -0.16 -0.35  0.13 -0.75
## 3   0.41  0.48  0.35  0.41  0.15  0.24  0.34  0.44  0.33  0.44
## 4   0.66  0.57  0.29  0.72  1.06  0.20  0.43  0.74  0.97  0.79
## 5  -0.23 -0.21 -0.33  0.34 -0.69 -0.50 -0.31  0.29 -0.73  0.04
## 6   2.13  1.70  1.87  1.83  1.73  1.54  1.60  1.56  1.84  1.90
## 7  -0.31 -0.16  0.14 -0.19 -0.07 -0.86 -0.27 -0.20 -0.12 -0.42
## 8   0.50  0.44  0.35  0.69  0.53  0.94  0.45  0.76  0.00  0.29
## 9   0.81  0.47  0.22 -0.01  0.78  0.15  0.69  0.76  0.79  0.39
## 10  1.18  1.12  0.94  1.10  1.46  0.67  0.94  1.18  0.82  0.88
## 11  0.30  0.07 -0.01 -0.05  0.59  0.40  0.35  0.61  0.73  0.63
## 12 -1.46 -1.37 -1.16 -1.59 -1.03 -1.64 -1.06 -1.11 -1.02 -0.87
# Due to the high correlations, cells in the same row look similar


DDCxclean$k 
## [1] 9
# The code looked for up to 9 non-self neighbors of each variable.
# (It goes through all of them, unless d is huge.)


DDCxclean$ngbrs # show the neighbors:
##     [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
## X1     1    3    7    9    2    4    5   10    8     6
## X2     2    3    1    4    5    8    9    6   10     7
## X3     3    1    4    5    2    9    8   10    7     6
## X4     4    3    8    2    1   10    7    5    9     6
## X5     5    3    9    8    2    1    7    4   10     6
## X6     6    1    3   10    2    9    5    4    8     7
## X7     7    8    1    9    4    3   10    5    2     6
## X8     8    7    4    5    9    2    1    3   10     6
## X9     9    1    5    3    8   10    2    7    6     4
## X10   10    4    1    9    3    7    6    2    5     8
# i.e. the nearest non-self neighbor of X10 is X4, then X1,...

round(DDCxclean$robcors,2) # correlations with these neighbors:
##     [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
## X1     1 0.94 0.93 0.93 0.93 0.93 0.93 0.92 0.92  0.92
## X2     1 0.93 0.93 0.93 0.93 0.92 0.92 0.91 0.91  0.91
## X3     1 0.94 0.93 0.93 0.93 0.92 0.92 0.92 0.91  0.91
## X4     1 0.93 0.93 0.93 0.93 0.93 0.91 0.91 0.90  0.89
## X5     1 0.93 0.93 0.93 0.93 0.93 0.91 0.91 0.90  0.89
## X6     1 0.92 0.91 0.91 0.91 0.91 0.89 0.89 0.89  0.88
## X7     1 0.94 0.93 0.91 0.91 0.91 0.91 0.91 0.91  0.88
## X8     1 0.94 0.93 0.93 0.92 0.92 0.92 0.92 0.90  0.89
## X9     1 0.93 0.93 0.92 0.92 0.92 0.92 0.91 0.91  0.90
## X10    1 0.93 0.92 0.92 0.92 0.91 0.91 0.91 0.90  0.90
# Per row, the correlation are sorted by decreasing absolute value.

round(DDCxclean$robslopes,2) # slope of each neighbor predicting the column:
##     [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
## X1     1 0.96 0.96 0.99 0.98 0.96 0.94 0.94 0.93  0.96
## X2     1 0.90 0.90 0.90 0.90 0.90 0.92 0.89 0.91  0.90
## X3     1 0.92 0.95 0.93 0.97 0.96 0.92 0.92 0.92  0.97
## X4     1 0.90 0.91 0.95 0.90 0.92 0.92 0.89 0.89  0.90
## X5     1 0.94 0.94 0.93 0.97 0.92 0.93 0.93 0.95  0.92
## X6     1 0.88 0.87 0.88 0.90 0.88 0.86 0.88 0.86  0.88
## X7     1 0.94 0.92 0.91 0.92 0.91 0.91 0.88 0.91  0.89
## X8     1 0.94 0.94 0.92 0.93 0.95 0.90 0.90 0.92  0.91
## X9     1 0.89 0.90 0.89 0.92 0.90 0.91 0.90 0.92  0.90
## X10    1 0.94 0.90 0.92 0.90 0.92 0.93 0.92 0.90  0.90
# For instance, X10 is predicted by its first neighbor with slope 0.94
# and by its second neighbor with slope 0.90 .

round(DDCxclean$Xest[1:12,],2) # the estimated cells:
##       X1    X2    X3    X4    X5    X6    X7    X8    X9   X10
## 1  -0.51 -0.53 -0.51 -0.54 -0.55 -0.39 -0.66 -0.72 -0.54 -0.62
## 2  -0.63 -0.65 -0.65 -0.66 -0.67 -0.51 -0.76 -0.84 -0.65 -0.75
## 3   0.14  0.12  0.13  0.12  0.10  0.26  0.02 -0.03  0.11  0.08
## 4   0.47  0.43  0.46  0.43  0.44  0.58  0.35  0.31  0.44  0.42
## 5  -0.54 -0.54 -0.55 -0.54 -0.59 -0.43 -0.66 -0.72 -0.58 -0.63
## 6   1.78  1.68  1.76  1.71  1.72  1.90  1.68  1.65  1.72  1.78
## 7  -0.55 -0.56 -0.56 -0.57 -0.59 -0.46 -0.69 -0.75 -0.58 -0.66
## 8   0.30  0.26  0.29  0.27  0.25  0.43  0.18  0.14  0.25  0.24
## 9   0.31  0.27  0.29  0.27  0.27  0.42  0.20  0.15  0.28  0.24
## 10  0.92  0.86  0.91  0.87  0.88  1.03  0.81  0.78  0.87  0.88
## 11  0.14  0.11  0.13  0.10  0.11  0.27  0.02 -0.03  0.12  0.08
## 12 -1.69 -1.66 -1.70 -1.69 -1.72 -1.59 -1.84 -1.92 -1.70 -1.84
round(DDCxclean$stdResid[1:12,],1) # the standardized residuals:
##      X1   X2   X3   X4   X5   X6   X7   X8   X9  X10
## 1  -0.2 -0.8  1.6 -0.7  1.0  0.8 -1.0 -0.9  1.5 -1.5
## 2   1.8 -2.7 -0.1 -1.6  1.3  0.4  0.6 -0.1  1.6 -1.6
## 3   0.2  0.5 -0.1  0.2 -0.8 -0.4 -0.1  0.3 -0.1  0.3
## 4   0.0 -0.3 -1.5  0.3  1.4 -1.4 -0.9  0.3  1.3  0.6
## 5   0.1  0.1 -0.4  2.3 -1.6 -0.9 -0.3  2.0 -1.9  1.0
## 6   1.2 -0.1  0.2  0.3 -0.2 -0.6 -0.8 -0.9  0.4  0.5
## 7  -0.2  0.3  1.6  0.2  0.6 -2.1 -0.1  0.2  0.5 -0.6
## 8   0.0 -0.2 -0.6  0.8  0.1  1.6 -0.2  1.0 -1.8 -0.8
## 9   1.2 -0.1 -1.2 -2.1  0.9 -1.2  0.7  0.9  1.1 -0.4
## 10  0.5  0.5 -0.5  0.3  1.5 -1.1 -0.4  0.5 -0.7 -0.6
## 11 -0.3 -1.2 -1.6 -1.7  0.8  0.2 -0.1  0.9  1.4  1.0
## 12 -0.7 -0.8  0.4 -1.5  0.8 -1.5  0.7  0.5  0.7  1.3
DDCxclean$indcells # indices of the cells that were flagged:
##  [1] 102 124 135 177 214 247 349 583 660 847 969
d
## [1] 10
# The standardized residuals of the flagged cells:
round(DDCxclean$stdResid[DDCxclean$indcells],1)
##  [1] -2.7  3.0  2.9  2.6 -2.6 -2.9 -2.7 -2.6  2.7  3.6 -3.2
# They are just barely larger in absolute value than 
# sqrt(qchisq(0.99,1)) = 2.576 and there are not many,
# given that there are 1000 cells here.


# the standardized residuals are roughly standard gaussian:
qqnorm(as.vector(DDCxclean$stdResid)) 

plot of chunk unnamed-chunk-3

# is a straight line through 0 with slope about 1.
# So, nothing stands out in this clean dataset.

# outlyingness of the rows:
qqnorm(DDCxclean$Ti) # looks fairly gaussian too

plot of chunk unnamed-chunk-3

DDCxclean$indrows # no rows are flagged:
## integer(0)
DDCxclean$indall # flagged cells including those in flagged rows:
##  [1] 102 124 135 177 214 247 349 583 660 847 969
round(DDCxclean$Ximp[1:12,],2) # the imputed matrix:
##       X1    X2    X3    X4    X5    X6    X7    X8    X9   X10
## 1  -0.55 -0.73 -0.09 -0.73 -0.25 -0.13 -0.94 -1.00 -0.09 -1.09
## 2  -0.15 -0.65 -0.66 -1.11 -0.26 -0.36 -0.58 -0.88 -0.15 -1.26
## 3   0.19  0.26  0.12  0.17 -0.15  0.14 -0.01  0.06  0.08  0.18
## 4   0.47  0.36  0.05  0.53  0.89  0.09  0.10  0.41  0.83  0.60
## 5  -0.52 -0.52 -0.65  0.10 -1.09 -0.75 -0.76 -0.11 -1.16 -0.31
## 6   2.10  1.65  1.82  1.80  1.65  1.70  1.46  1.38  1.84  1.94
## 7  -0.62 -0.47 -0.12 -0.51 -0.39 -1.19 -0.71 -0.70 -0.44 -0.87
## 8   0.29  0.22  0.11  0.49  0.29  0.97  0.13  0.44 -0.30 -0.01
## 9   0.63  0.25 -0.03 -0.30  0.57  0.02  0.40  0.43  0.62  0.11
## 10  1.05  1.00  0.77  0.96  1.35  0.65  0.69  0.94  0.66  0.70
## 11  0.07 -0.21 -0.30 -0.35  0.35  0.33  0.00  0.26  0.55  0.40
## 12 -1.89 -1.86 -1.59 -2.11 -1.48 -2.12 -1.63 -1.77 -1.49 -1.41
round((DDCxclean$Ximp - xclean)[1:12,],2)
##    X1   X2 X3 X4 X5 X6 X7 X8 X9 X10
## 1   0 0.00  0  0  0  0  0  0  0   0
## 2   0 0.71  0  0  0  0  0  0  0   0
## 3   0 0.00  0  0  0  0  0  0  0   0
## 4   0 0.00  0  0  0  0  0  0  0   0
## 5   0 0.00  0  0  0  0  0  0  0   0
## 6   0 0.00  0  0  0  0  0  0  0   0
## 7   0 0.00  0  0  0  0  0  0  0   0
## 8   0 0.00  0  0  0  0  0  0  0   0
## 9   0 0.00  0  0  0  0  0  0  0   0
## 10  0 0.00  0  0  0  0  0  0  0   0
## 11  0 0.00  0  0  0  0  0  0  0   0
## 12  0 0.00  0  0  0  0  0  0  0   0
# Only a few cells were imputed in this example.

TopGear dataset

Let's now look at the TopGear dataset.

 library(robustHD)
 library(gridExtra)
 data(TopGear)
dim(TopGear)
## [1] 297  32
rownames(TopGear)[1:13] # "1" to "297" are not useful names
##  [1] "1"  "2"  "3"  "4"  "5"  "6"  "7"  "8"  "9"  "10" "11" "12" "13"
rownames(TopGear) = paste(TopGear[,1],TopGear[,2]) 
# Now the rownames are make and model of the cars.
rownames(TopGear)[165] = "Mercedes-Benz G" # name was too long
myTopGear = TopGear[,-31] # removes the subjective variable `Verdict'

# Transform some variables to get roughly gaussianity in the center:
transTG = myTopGear
transTG$Price = log(myTopGear$Price)
transTG$Displacement = log(myTopGear$Displacement)
transTG$BHP = log(myTopGear$BHP)
transTG$Torque = log(myTopGear$Torque)
transTG$TopSpeed = log(myTopGear$TopSpeed)


# Run the method:
DDCtransTG = DetectDeviatingCells(transTG,DDCpars)
##  
##  The input data has 297 rows and 31 columns.
##  
##  The input data contained 19 non-numeric columns (variables).
##  Their column names are:
##  
##  [1] Maker              Model              Type              
##  [4] Fuel               DriveWheel         AdaptiveHeadlights
##  [7] AdjustableSteering AlarmSystem        Automatic         
## [10] Bluetooth          ClimateControl     CruiseControl     
## [13] ElectricSeats      Leather            ParkingSensors    
## [16] PowerSteering      SatNav             ESP               
## [19] Origin            
##  
##  These columns will be ignored in the analysis.
##  We continue with the remaining 12 numeric columns:
##  
##  [1] Price        Cylinders    Displacement BHP          Torque      
##  [6] Acceleration TopSpeed     MPG          Weight       Length      
## [11] Width        Height      
##  
##  The data contained 1 rows with over 50% of NAs.
##  Their row names are:
##  
## [1] Citroen C5 Tourer
##  
##  These rows will be ignored in the analysis.
##  We continue with the remaining 296 rows:
##  
##   [1] Alfa Romeo Giulietta             Alfa Romeo MiTo                 
##   [3] Aston Martin Cygnet              Aston Martin DB9                
##   [5] Aston Martin DB9 Volante         Aston Martin V12 Zagato         
##   [7] Aston Martin Vanquish            Aston Martin Vantage            
##   [9] Aston Martin Vantage Roadster    Audi A1                         
##  [11] Audi A3                          Audi A4                         
##  [13] Audi A4 Allroad                  Audi A5                         
##  [15] Audi A5 Sportback                Audi A6 Avant                   
##  [17] Audi A6 Saloon                   Audi A7 Sportback               
##  [19] Audi A8                          Audi Q3                         
##  [21] Audi Q5                          Audi Q7                         
##  [23] Audi R8                          Audi R8 V10                     
##  [25] Audi RS4 Avant                   Audi TT Coupe                   
##  [27] Audi TT Roadster                 Bentley Continental             
##  [29] Bentley Continental GTC          Bentley Flying Spur             
##  [31] Bentley Mulsanne                 BMW 1 Series                    
##  [33] BMW 1 Series Convertible         BMW 1 Series Coupe              
##  [35] BMW 3 Series                     BMW 3 Series Convertible        
##  [37] BMW 4 Series Coupe               BMW 6 Series                    
##  [39] BMW 6 Series Convertible         BMW 6 Series Gran Coupe         
##  [41] BMW 7 Series                     BMW i3                          
##  [43] BMW M3                           BMW M6                          
##  [45] BMW X1                           BMW X3                          
##  [47] BMW X5                           BMW X6                          
##  [49] BMW Z4                           Bugatti Veyron                  
##  [51] Caterham CSR                     Caterham Super 7                
##  [53] Chevrolet Aveo                   Chevrolet Camaro                
##  [55] Chevrolet Captiva                Chevrolet Cruze                 
##  [57] Chevrolet Orlando                Chevrolet Spark                 
##  [59] Chevrolet Volt                   Chrysler 300C                   
##  [61] Chrysler Delta                   Chrysler Grand Voyager          
##  [63] Chrysler Ypsilon                 Citroen C1                      
##  [65] Citroen C3                       Citroen C3 Picasso              
##  [67] Citroen C4                       Citroen C4 Picasso              
##  [69] Citroen C5                       Citroen DS3                     
##  [71] Citroen DS4                      Citroen DS5                     
##  [73] Corvette C6                      Dacia Duster                    
##  [75] Dacia Sandero                    Ferrari 458                     
##  [77] Ferrari California               Ferrari F12                     
##  [79] Ferrari FF                       Fiat 500                        
##  [81] Fiat 500 Abarth                  Fiat 500L                       
##  [83] Fiat Bravo                       Fiat Doblo                      
##  [85] Fiat Panda                       Fiat Punto Evo                  
##  [87] Ford B-Max                       Ford C-Max                      
##  [89] Ford Fiesta                      Ford Focus                      
##  [91] Ford Focus Estate                Ford Focus ST                   
##  [93] Ford Galaxy                      Ford Kuga                       
##  [95] Ford Mondeo                      Ford S-MAX                      
##  [97] Honda Accord                     Honda Civic                     
##  [99] Honda CR-V                       Honda CR-Z                      
## [101] Honda Insight                    Honda Jazz                      
## [103] Hyundai i10                      Hyundai i20                     
## [105] Hyundai i30                      Hyundai i40                     
## [107] Hyundai i800                     Hyundai ix20                    
## [109] Hyundai ix35                     Hyundai Santa Fe                
## [111] Hyundai Veloster                 Infiniti EX                     
## [113] Infiniti FX                      Infiniti G37                    
## [115] Infiniti M                       Jaguar F-Type                   
## [117] Jaguar XF                        Jaguar XF Sportbrake            
## [119] Jaguar XFR                       Jaguar XJ Series                
## [121] Jaguar XK                        Jeep Compass                    
## [123] Jeep Grand Cherokee              Jeep Wrangler                   
## [125] Kia Cee'd                        Kia Optima                      
## [127] Kia Picanto                      Kia Rio                         
## [129] Kia Sorento                      Kia Soul                        
## [131] Kia Sportage                     Kia Venga                       
## [133] Lamborghini Aventador            Lamborghini Gallardo            
## [135] Land Rover Defender              Land Rover Discovery 4          
## [137] Land Rover Freelander 2          Land Rover Range Rover          
## [139] Land Rover Range Rover Evoque    Land Rover Range Rover Sport    
## [141] Lexus CT 200h                    Lexus GS                        
## [143] Lexus IS                         Lexus RX                        
## [145] Lotus Elise                      Lotus Evora                     
## [147] Lotus Exige S                    Maserati GranTurismo            
## [149] Maserati Quattroporte            Mazda CX-5                      
## [151] Mazda Mazda3                     Mazda MX-5                      
## [153] McLaren MP4-12C                  Mercedes-Benz A-Class           
## [155] Mercedes-Benz B-Class            Mercedes-Benz C-Class           
## [157] Mercedes-Benz C63 AMG            Mercedes-Benz CL-Class          
## [159] Mercedes-Benz CLS Shooting Brake Mercedes-Benz CLS-Class         
## [161] Mercedes-Benz E-Class            Mercedes-Benz E-Class Coupe     
## [163] Mercedes-Benz E63 AMG            Mercedes-Benz G                 
## [165] Mercedes-Benz GL-Class           Mercedes-Benz M-Class           
## [167] Mercedes-Benz R-Class            Mercedes-Benz S-Class           
## [169] Mercedes-Benz SL 63              Mercedes-Benz SL-Class          
## [171] Mercedes-Benz SLK                Mercedes-Benz SLS               
## [173] Mini Clubman                     Mini Convertible                
## [175] Mini Countryman                  Mini Coupe                      
## [177] Mini John Cooper Works           Mini Roadster                   
## [179] Mitsubishi ASX                   Mitsubishi i-MiEV               
## [181] Mitsubishi Mirage                Mitsubishi Outlander            
## [183] Mitsubishi Shogun                Morgan 3 Wheeler                
## [185] Morgan Aero                      Morgan Roadster                 
## [187] Nissan 370Z                      Nissan Juke                     
## [189] Nissan Leaf                      Nissan Micra                    
## [191] Nissan Note                      Nissan Pathfinder               
## [193] Nissan Qashqai                   Nissan X-Trail                  
## [195] Noble M600                       Pagani Huayra                   
## [197] Perodua MYVI                     Peugeot 107                     
## [199] Peugeot 207 CC                   Peugeot 207 SW                  
## [201] Peugeot 208                      Peugeot 3008                    
## [203] Peugeot 308                      Peugeot 308 CC                  
## [205] Peugeot 308 SW                   Peugeot 5008                    
## [207] Peugeot RCZ                      Porsche 911                     
## [209] Porsche Boxster                  Porsche Cayenne                 
## [211] Porsche Panamera                 Proton GEN-2                    
## [213] Proton Satria-Neo                Proton Savvy                    
## [215] Renault Clio                     Renault Megane                  
## [217] Renault Scenic/Grand Scenic      Renault Twingo                  
## [219] Renault Twizy                    Renault Zoe                     
## [221] Rolls-Royce Ghost                Rolls-Royce Phantom             
## [223] Rolls-Royce Phantom Coupe        SEAT Alhambra                   
## [225] SEAT Altea                       SEAT Leon                       
## [227] SEAT Mii                         SEAT Toledo                     
## [229] Skoda Fabia                      Skoda Octavia                   
## [231] Skoda Roomster                   Skoda Yeti                      
## [233] Smart fortwo                     Ssangyong Rodius                
## [235] Subaru BRZ                       Subaru Forester                 
## [237] Subaru Legacy Outback            Subaru XV                       
## [239] Suzuki Alto                      Suzuki Grand Vitara             
## [241] Suzuki Jimny                     Suzuki Splash                   
## [243] Suzuki Swift                     Suzuki Swift Sport              
## [245] Suzuki SX4                       Toyota Auris                    
## [247] Toyota Avensis                   Toyota AYGO                     
## [249] Toyota GT 86                     Toyota iQ                       
## [251] Toyota Land Cruiser              Toyota Land Cruiser V8          
## [253] Toyota Prius                     Toyota RAV4                     
## [255] Toyota Verso                     Toyota Yaris                    
## [257] Vauxhall Adam                    Vauxhall Agila                  
## [259] Vauxhall Ampera                  Vauxhall Antara                 
## [261] Vauxhall Astra                   Vauxhall Astra GTC              
## [263] Vauxhall Astra VXR               Vauxhall Cascada                
## [265] Vauxhall Corsa                   Vauxhall Corsa VXR              
## [267] Vauxhall Insignia                Vauxhall Insignia Sports Tourer 
## [269] Vauxhall Meriva                  Vauxhall Mokka                  
## [271] Vauxhall VXR8                    Vauxhall Zafira                 
## [273] Vauxhall Zafira Tourer           Volkswagen Beetle               
## [275] Volkswagen CC                    Volkswagen Eos                  
## [277] Volkswagen Golf                  Volkswagen Golf Plus            
## [279] Volkswagen Jetta                 Volkswagen Passat               
## [281] Volkswagen Phaeton               Volkswagen Polo                 
## [283] Volkswagen Scirocco              Volkswagen Sharan               
## [285] Volkswagen Tiguan                Volkswagen Touareg              
## [287] Volkswagen Touran                Volkswagen Up                   
## [289] Volvo S60                        Volvo S80                       
## [291] Volvo V40                        Volvo V60                       
## [293] Volvo V70                        Volvo XC60                      
## [295] Volvo XC70                       Volvo XC90                      
##  
##  The data contained 1 columns with zero or tiny median absolute deviation.
##  Their column names are:
##  
## [1] Cylinders
##  
##  These columns will be ignored in the analysis.
##  We continue with the remaining 11 columns:
##  
##  [1] Price        Displacement BHP          Torque       Acceleration
##  [6] TopSpeed     MPG          Weight       Length       Width       
## [11] Height      
##  
##  The final data set we will analyze has 296 rows and 11 columns.
##  
##  The computation started at: Tue Dec 06 16:58:54 2016
##  The computation ended at: Tue Dec 06 16:58:54 2016
## 
# the remaining part of the dataset:
remX = DDCtransTG$remX 
dim(remX)
## [1] 296  11
colSums(is.na(remX))
##        Price Displacement          BHP       Torque Acceleration 
##            0            8            3            3            0 
##     TopSpeed          MPG       Weight       Length        Width 
##            3           11           32           10           15 
##       Height 
##           10
# We have NAs left, mainly in `Weight'.

n = nrow(remX)
d = ncol(remX)
indexDDCcells = matrix(FALSE,nrow=n,ncol=d) 
indexDDCcells[DDCtransTG$indcells] = TRUE
dim(indexDDCcells)
## [1] 296  11
# Analyze the data by column:
standX = scale(remX,apply(remX,2,median,na.rm = TRUE), 
               apply(remX,2,mad,na.rm = TRUE))
dim(standX)
## [1] 296  11
round(standX[1:5,],1) # has NAs where remX does
##                          Price Displacement  BHP Torque Acceleration
## Alfa Romeo Giulietta      -0.4         -0.4 -0.7    0.0          0.6
## Alfa Romeo MiTo           -0.9         -0.7 -0.7   -1.6          0.4
## Aston Martin Cygnet        0.3         -0.8 -0.8   -1.7          0.7
## Aston Martin DB9           2.7          2.1  1.9    1.2         -1.2
## Aston Martin DB9 Volante   2.8          2.1  1.9    1.2         -1.2
##                          TopSpeed  MPG Weight Length Width Height
## Alfa Romeo Giulietta         -0.5  1.0   -0.3   -0.3  -0.2   -0.2
## Alfa Romeo MiTo              -0.4  0.1   -1.0   -0.9  -1.1   -0.3
## Aston Martin Cygnet          -0.9  0.6   -1.3   -3.1  -1.5    0.1
## Aston Martin DB9              2.0 -1.7    0.8    0.6    NA   -1.6
## Aston Martin DB9 Volante      2.0 -1.7    1.0    0.6    NA   -1.6
transTGcol = remX
transTGcol[abs(standX) > sqrt(qchisq(0.99,1))] = NA
round(transTGcol[1:5,],1) # also has NAs in outlying cells
##                          Price Displacement BHP Torque Acceleration
## Alfa Romeo Giulietta      10.0          7.4 4.7    5.5         11.3
## Alfa Romeo MiTo            9.6          7.2 4.7    4.6         10.7
## Aston Martin Cygnet       10.3          7.2 4.6    4.5         11.8
## Aston Martin DB9            NA          8.7 6.2    6.1          4.6
## Aston Martin DB9 Volante    NA          8.7 6.2    6.1          4.6
##                          TopSpeed MPG Weight Length Width Height
## Alfa Romeo Giulietta          4.7  64   1385   4351  1798   1465
## Alfa Romeo MiTo               4.8  49   1090   4063  1720   1446
## Aston Martin Cygnet           4.7  56    988     NA  1680   1500
## Aston Martin DB9              5.2  19   1785   4720    NA   1282
## Aston Martin DB9 Volante      5.2  19   1890   4720    NA   1282
# Make untransformed submatrix of X for labeling the cells in the plot:
tempX = myTopGear[DDCtransTG$rowInAnalysis,DDCtransTG$colInAnalysis]
dim(tempX)
## [1] 296  11
tempX$Price = tempX$Price/1000 # to avoid printing long numbers

xlabels = colnames(tempX)
ylabels = rownames(tempX)
# select the following cars for the cellmap:
yshowindex = c(12,42,56,73,81,94,99,135,150,164,176,198,209,215,234,241,277)
# make two ggplot2 objects:

ggpcol = cellMap(D=tempX,
                 R=standX,
                 indcells=which(is.na(transTGcol)),
                 indrows=integer(0),
                 xlabels=xlabels,
                 ylabels=ylabels,
                 mTitle="By column",
                 yshowindex=yshowindex,
                 showVals=TRUE,
                 hjustYlabels=0.5) 
plot(ggpcol)

plot of chunk unnamed-chunk-5

ggpDDC = cellMap(D=tempX,
                 R=DDCtransTG$stdResid, 
                 indcells=which(indexDDCcells==TRUE),
                 indrows=DDCtransTG$indrows,
                 xlabels=xlabels,
                 ylabels=ylabels,
                 mTitle="DetectDeviatingCells",
                 yshowindex=yshowindex,
                 showVals=TRUE,
                 hjustYlabels=0.5)
plot(ggpDDC)

plot of chunk unnamed-chunk-5

# Creating the pdf:
pdfName = "cellmap_TopGear.pdf"
pdf(pdfName, width = 20, height = 15 )
grid.arrange(ggpcol,ggpDDC,nrow=1) # arranges ggplot2,.. on a page
dev.off()
## png 
##   2

Philips data

We now illustrate DDC on the philips dataset.

data(philips)
dim(philips) 
## [1] 677   9
colnames(philips) = c("X1","X2","X3","X4","X5","X6","X7","X8","X9")
library(robustbase) # for covMcd
MCDphilips = covMcd(philips)
pdfName = "figure_philips_left.pdf"
pdf(pdfName, width = 10, height = 4)
plot(sqrt(mahalanobis(philips,MCDphilips$center,MCDphilips$cov)),
     main="Philips data",ylab="Robust distances",xlab="",pch=20)
abline(h=sqrt(qchisq(0.975,df=9)))
dev.off()
## png 
##   2
DDCphilips = DetectDeviatingCells(philips,DDCpars)
##  
##  The input data has 677 rows and 9 columns.
##  The computation started at: Tue Dec 06 16:58:55 2016
##  The computation ended at: Tue Dec 06 16:58:56 2016
## 
summary(DDCphilips)
##                 Length Class  Mode   
## colInAnalysis      9   -none- numeric
## rowInAnalysis    677   -none- numeric
## namesNotNumeric    0   -none- NULL   
## namesCaseNumber    0   -none- NULL   
## namesNAcol         0   -none- NULL   
## namesNArow         0   -none- NULL   
## namesDiscrete      0   -none- NULL   
## namesZeroScale     0   -none- NULL   
## remX            6093   -none- numeric
## Z               6093   -none- numeric
## k                  1   -none- numeric
## ngbrs             81   -none- numeric
## robcors           81   -none- numeric
## robslopes         81   -none- numeric
## Xest            6093   -none- numeric
## stdResid        6093   -none- numeric
## indcells         155   -none- numeric
## Ti               677   -none- numeric
## indrows            5   -none- numeric
## indall           195   -none- numeric
## Ximp            6093   -none- numeric
qqnorm(as.vector(DDCphilips$Z)) 

plot of chunk unnamed-chunk-9

# rather gaussian, here we only see 2 outliers

round(DDCphilips$stdResid[1:12,],1) # the standardized residuals:
##      X1   X2   X3   X4   X5   X6  X7   X8   X9
## 1  -0.9  2.2  0.0  1.2  0.8 -2.9 2.1  0.1 -1.4
## 2  -0.5  2.0  0.0  3.9  0.8 -1.1 3.5 -0.4 -1.7
## 3  -1.0  2.3 -0.4  0.7  0.6 -2.0 1.4  0.0 -1.7
## 4  -0.6  2.3 -0.1  3.8  1.1 -0.9 3.4 -0.5 -2.0
## 5  -0.6  2.8 -0.2  3.4  0.9 -0.9 3.4  0.2 -1.9
## 6  -0.7  2.6  0.0  4.4  1.0 -1.1 4.2 -0.1 -2.0
## 7   1.0 -0.1 -1.0 -0.3 -1.2 -1.0 1.4  0.8 -0.6
## 8  -0.4 -1.4 -0.7  1.8 -0.7 -2.8 1.9 -0.3 -1.7
## 9   1.0 -0.2  0.0 -0.1 -1.2 -2.1 1.6  1.0 -0.9
## 10  0.2 -1.3 -0.1  1.0 -0.5 -2.9 2.0 -0.1 -1.5
## 11 -0.1 -0.3 -0.1  0.2 -1.1 -1.5 1.9  0.8 -0.8
## 12 -0.1 -0.4 -1.5  2.5  0.0 -3.4 2.3  0.3 -2.5
DDCphilips$indcells # indices of the cells that were flagged:
##   [1]   55   70   95  104  116  144  151  156  161  170  174  175  324  468
##  [15]  682  683  712  747  750  792  985  986  987  988  989  991  992 1006
##  [29] 1007 1008 1010 1011 1196 1199 1450 1452 1455 1474 1529 1787 2033 2035
##  [43] 2036 2037 2045 2047 2094 2101 2114 2116 2126 2135 2267 2312 2355 2365
##  [57] 2464 2499 2592 2743 2775 2777 3120 3216 3226 3227 3228 3248 3386 3393
##  [71] 3395 3397 3399 3401 3412 3413 3414 3415 3416 3417 3423 3424 3425 3426
##  [85] 3427 3428 3430 3431 3437 3445 3456 3457 3460 3464 3465 3466 3468 3469
##  [99] 3470 3990 4064 4066 4067 4068 4076 4078 4088 4145 4147 4792 4794 5036
## [113] 5037 5168 5169 5170 5171 5172 5174 5175 5176 5230 5238 5241 5246 5248
## [127] 5250 5253 5256 5258 5259 5260 5262 5263 5265 5267 5268 5270 5510 5511
## [141] 5512 5513 5514 5515 5516 5517 5518 5519 5520 5569 5869 5873 5882 5890
## [155] 5902
DDCphilips$indrows # flagged rows:
## [1]  26  56  84 334 491
# cellmaps with rectangular blocks:

xblocksize = 1
yblocksize = 15 # 20
xlabels = colnames(philips)
ylabels = 1:dim(philips)[1] #rownames(philips)
n = nrow(philips)
d = ncol(philips)

indrows=which(mahalanobis(philips,MCDphilips$center,MCDphilips$cov) >
                qchisq(0.975,df=d))

ggpMCDphilips = cellMap(D=philips,R=matrix(0,n,d),
                        indcells=integer(0),
                        indrows=indrows,
                        xlabels=xlabels,
                        ylabels=ylabels,
                        mTitle="MCD",
                        xblocksize=xblocksize,
                        yblocksize=yblocksize,
                        autolabel=T)
plot(ggpMCDphilips)

plot of chunk unnamed-chunk-10

ggpDDCphilips = cellMap(D=philips, R=DDCphilips$stdResid,
                        indcells=DDCphilips$indcells,
                        indrows=integer(0), #DDCphilips$indrows,
                        xlabels=xlabels,
                        ylabels=ylabels,
                        mTitle="DetectDeviatingCells",
                        xblocksize=xblocksize,
                        yblocksize=yblocksize,
                        autolabel=T,
                        hjustYlabels=1)
plot(ggpDDCphilips)

plot of chunk unnamed-chunk-10

pdfName = "figure_philips_right.pdf"
pdf(pdfName, width = 6, height = 12)
plot(ggpDDCphilips) 
dev.off()
## png 
##   2

Mortality dataset

Let's now look at the Mortality dataset.

data(mortality)
dim(mortality)
## [1] 198  91
# 198  91
rownames(mortality) 
##   [1] "1816" "1817" "1818" "1819" "1820" "1821" "1822" "1823" "1824" "1825"
##  [11] "1826" "1827" "1828" "1829" "1830" "1831" "1832" "1833" "1834" "1835"
##  [21] "1836" "1837" "1838" "1839" "1840" "1841" "1842" "1843" "1844" "1845"
##  [31] "1846" "1847" "1848" "1849" "1850" "1851" "1852" "1853" "1854" "1855"
##  [41] "1856" "1857" "1858" "1859" "1860" "1861" "1862" "1863" "1864" "1865"
##  [51] "1866" "1867" "1868" "1869" "1870" "1871" "1872" "1873" "1874" "1875"
##  [61] "1876" "1877" "1878" "1879" "1880" "1881" "1882" "1883" "1884" "1885"
##  [71] "1886" "1887" "1888" "1889" "1890" "1891" "1892" "1893" "1894" "1895"
##  [81] "1896" "1897" "1898" "1899" "1900" "1901" "1902" "1903" "1904" "1905"
##  [91] "1906" "1907" "1908" "1909" "1910" "1911" "1912" "1913" "1914" "1915"
## [101] "1916" "1917" "1918" "1919" "1920" "1921" "1922" "1923" "1924" "1925"
## [111] "1926" "1927" "1928" "1929" "1930" "1931" "1932" "1933" "1934" "1935"
## [121] "1936" "1937" "1938" "1939" "1940" "1941" "1942" "1943" "1944" "1945"
## [131] "1946" "1947" "1948" "1949" "1950" "1951" "1952" "1953" "1954" "1955"
## [141] "1956" "1957" "1958" "1959" "1960" "1961" "1962" "1963" "1964" "1965"
## [151] "1966" "1967" "1968" "1969" "1970" "1971" "1972" "1973" "1974" "1975"
## [161] "1976" "1977" "1978" "1979" "1980" "1981" "1982" "1983" "1984" "1985"
## [171] "1986" "1987" "1988" "1989" "1990" "1991" "1992" "1993" "1994" "1995"
## [181] "1996" "1997" "1998" "1999" "2000" "2001" "2002" "2003" "2004" "2005"
## [191] "2006" "2007" "2008" "2009" "2010" "2011" "2012" "2013"
colnames(mortality) 
##  [1] "0"  "1"  "2"  "3"  "4"  "5"  "6"  "7"  "8"  "9"  "10" "11" "12" "13"
## [15] "14" "15" "16" "17" "18" "19" "20" "21" "22" "23" "24" "25" "26" "27"
## [29] "28" "29" "30" "31" "32" "33" "34" "35" "36" "37" "38" "39" "40" "41"
## [43] "42" "43" "44" "45" "46" "47" "48" "49" "50" "51" "52" "53" "54" "55"
## [57] "56" "57" "58" "59" "60" "61" "62" "63" "64" "65" "66" "67" "68" "69"
## [71] "70" "71" "72" "73" "74" "75" "76" "77" "78" "79" "80" "81" "82" "83"
## [85] "84" "85" "86" "87" "88" "89" "90"
DDCmortality = DetectDeviatingCells(mortality,DDCpars) # 3 seconds
##  
##  The input data has 198 rows and 91 columns.
##  The computation started at: Tue Dec 06 16:58:57 2016
##  The computation ended at: Tue Dec 06 16:59:01 2016
## 
remX = DDCmortality$remX
dim(remX)
## [1] 198  91
library(rrcov) # for robust PCA
PCAmortality = PcaHubert(mortality,alpha=0.75,scale=FALSE)

xblocksize = 5
yblocksize = 5
xlabels = colnames(remX) # the _columns_ are in the x-direction
ylabels = rownames(remX)
n = nrow(remX)
d = ncol(remX)

ggpROBPCA = cellMap(D=remX,R=matrix(0,n,d),
                           indcells=integer(0),
                           indrows=which(PCAmortality@flag==FALSE),
                           xlabels=xlabels,
                           ylabels=ylabels,
                           mTitle="By row",
                           xblocksize=xblocksize,
                           yblocksize=yblocksize,
                           autolabel=T)
plot(ggpROBPCA)

plot of chunk unnamed-chunk-13

ggpDDC = cellMap(D=remX, R=DDCmortality$stdResid,
                        indcells=DDCmortality$indcells,
                        indrows=DDCmortality$indrows,
                        xlabels=xlabels,
                        ylabels=ylabels,
                        mTitle="DetectDeviatingCells",
                        xblocksize=xblocksize,
                        yblocksize=yblocksize,
                        autolabel=T)
plot(ggpDDC)

plot of chunk unnamed-chunk-13

pdf(paste("cellmap_mortality_",xblocksize,".pdf",sep=""),width=14,height=12)
grid.arrange(ggpROBPCA,ggpDDC,nrow=1)
dev.off()
## png 
##   2

Glass dataset

Let's now look at the glass dataset.

data(glass)
DDCglass = DetectDeviatingCells(glass,DDCpars) # takes 1 or 2 minutes
##  
##  The input data has 180 rows and 750 columns.
##  
##  The data contained 11 discrete columns with 3 or fewer values.
##  Their column names are:
##  
##  [1] V1  V2  V3  V4  V5  V6  V7  V8  V9  V10 V11
##  
##  These columns will be ignored in the analysis.
##  We continue with the remaining 739 columns:
##  
##   [1] V12  V13  V14  V15  V16  V17  V18  V19  V20  V21  V22  V23  V24  V25 
##  [15] V26  V27  V28  V29  V30  V31  V32  V33  V34  V35  V36  V37  V38  V39 
##  [29] V40  V41  V42  V43  V44  V45  V46  V47  V48  V49  V50  V51  V52  V53 
##  [43] V54  V55  V56  V57  V58  V59  V60  V61  V62  V63  V64  V65  V66  V67 
##  [57] V68  V69  V70  V71  V72  V73  V74  V75  V76  V77  V78  V79  V80  V81 
##  [71] V82  V83  V84  V85  V86  V87  V88  V89  V90  V91  V92  V93  V94  V95 
##  [85] V96  V97  V98  V99  V100 V101 V102 V103 V104 V105 V106 V107 V108 V109
##  [99] V110 V111 V112 V113 V114 V115 V116 V117 V118 V119 V120 V121 V122 V123
## [113] V124 V125 V126 V127 V128 V129 V130 V131 V132 V133 V134 V135 V136 V137
## [127] V138 V139 V140 V141 V142 V143 V144 V145 V146 V147 V148 V149 V150 V151
## [141] V152 V153 V154 V155 V156 V157 V158 V159 V160 V161 V162 V163 V164 V165
## [155] V166 V167 V168 V169 V170 V171 V172 V173 V174 V175 V176 V177 V178 V179
## [169] V180 V181 V182 V183 V184 V185 V186 V187 V188 V189 V190 V191 V192 V193
## [183] V194 V195 V196 V197 V198 V199 V200 V201 V202 V203 V204 V205 V206 V207
## [197] V208 V209 V210 V211 V212 V213 V214 V215 V216 V217 V218 V219 V220 V221
## [211] V222 V223 V224 V225 V226 V227 V228 V229 V230 V231 V232 V233 V234 V235
## [225] V236 V237 V238 V239 V240 V241 V242 V243 V244 V245 V246 V247 V248 V249
## [239] V250 V251 V252 V253 V254 V255 V256 V257 V258 V259 V260 V261 V262 V263
## [253] V264 V265 V266 V267 V268 V269 V270 V271 V272 V273 V274 V275 V276 V277
## [267] V278 V279 V280 V281 V282 V283 V284 V285 V286 V287 V288 V289 V290 V291
## [281] V292 V293 V294 V295 V296 V297 V298 V299 V300 V301 V302 V303 V304 V305
## [295] V306 V307 V308 V309 V310 V311 V312 V313 V314 V315 V316 V317 V318 V319
## [309] V320 V321 V322 V323 V324 V325 V326 V327 V328 V329 V330 V331 V332 V333
## [323] V334 V335 V336 V337 V338 V339 V340 V341 V342 V343 V344 V345 V346 V347
## [337] V348 V349 V350 V351 V352 V353 V354 V355 V356 V357 V358 V359 V360 V361
## [351] V362 V363 V364 V365 V366 V367 V368 V369 V370 V371 V372 V373 V374 V375
## [365] V376 V377 V378 V379 V380 V381 V382 V383 V384 V385 V386 V387 V388 V389
## [379] V390 V391 V392 V393 V394 V395 V396 V397 V398 V399 V400 V401 V402 V403
## [393] V404 V405 V406 V407 V408 V409 V410 V411 V412 V413 V414 V415 V416 V417
## [407] V418 V419 V420 V421 V422 V423 V424 V425 V426 V427 V428 V429 V430 V431
## [421] V432 V433 V434 V435 V436 V437 V438 V439 V440 V441 V442 V443 V444 V445
## [435] V446 V447 V448 V449 V450 V451 V452 V453 V454 V455 V456 V457 V458 V459
## [449] V460 V461 V462 V463 V464 V465 V466 V467 V468 V469 V470 V471 V472 V473
## [463] V474 V475 V476 V477 V478 V479 V480 V481 V482 V483 V484 V485 V486 V487
## [477] V488 V489 V490 V491 V492 V493 V494 V495 V496 V497 V498 V499 V500 V501
## [491] V502 V503 V504 V505 V506 V507 V508 V509 V510 V511 V512 V513 V514 V515
## [505] V516 V517 V518 V519 V520 V521 V522 V523 V524 V525 V526 V527 V528 V529
## [519] V530 V531 V532 V533 V534 V535 V536 V537 V538 V539 V540 V541 V542 V543
## [533] V544 V545 V546 V547 V548 V549 V550 V551 V552 V553 V554 V555 V556 V557
## [547] V558 V559 V560 V561 V562 V563 V564 V565 V566 V567 V568 V569 V570 V571
## [561] V572 V573 V574 V575 V576 V577 V578 V579 V580 V581 V582 V583 V584 V585
## [575] V586 V587 V588 V589 V590 V591 V592 V593 V594 V595 V596 V597 V598 V599
## [589] V600 V601 V602 V603 V604 V605 V606 V607 V608 V609 V610 V611 V612 V613
## [603] V614 V615 V616 V617 V618 V619 V620 V621 V622 V623 V624 V625 V626 V627
## [617] V628 V629 V630 V631 V632 V633 V634 V635 V636 V637 V638 V639 V640 V641
## [631] V642 V643 V644 V645 V646 V647 V648 V649 V650 V651 V652 V653 V654 V655
## [645] V656 V657 V658 V659 V660 V661 V662 V663 V664 V665 V666 V667 V668 V669
## [659] V670 V671 V672 V673 V674 V675 V676 V677 V678 V679 V680 V681 V682 V683
## [673] V684 V685 V686 V687 V688 V689 V690 V691 V692 V693 V694 V695 V696 V697
## [687] V698 V699 V700 V701 V702 V703 V704 V705 V706 V707 V708 V709 V710 V711
## [701] V712 V713 V714 V715 V716 V717 V718 V719 V720 V721 V722 V723 V724 V725
## [715] V726 V727 V728 V729 V730 V731 V732 V733 V734 V735 V736 V737 V738 V739
## [729] V740 V741 V742 V743 V744 V745 V746 V747 V748 V749 V750
##  
##  The data contained 2 columns with zero or tiny median absolute deviation.
##  Their column names are:
##  
## [1] V12 V13
##  
##  These columns will be ignored in the analysis.
##  We continue with the remaining 737 columns:
##  
##   [1] V14  V15  V16  V17  V18  V19  V20  V21  V22  V23  V24  V25  V26  V27 
##  [15] V28  V29  V30  V31  V32  V33  V34  V35  V36  V37  V38  V39  V40  V41 
##  [29] V42  V43  V44  V45  V46  V47  V48  V49  V50  V51  V52  V53  V54  V55 
##  [43] V56  V57  V58  V59  V60  V61  V62  V63  V64  V65  V66  V67  V68  V69 
##  [57] V70  V71  V72  V73  V74  V75  V76  V77  V78  V79  V80  V81  V82  V83 
##  [71] V84  V85  V86  V87  V88  V89  V90  V91  V92  V93  V94  V95  V96  V97 
##  [85] V98  V99  V100 V101 V102 V103 V104 V105 V106 V107 V108 V109 V110 V111
##  [99] V112 V113 V114 V115 V116 V117 V118 V119 V120 V121 V122 V123 V124 V125
## [113] V126 V127 V128 V129 V130 V131 V132 V133 V134 V135 V136 V137 V138 V139
## [127] V140 V141 V142 V143 V144 V145 V146 V147 V148 V149 V150 V151 V152 V153
## [141] V154 V155 V156 V157 V158 V159 V160 V161 V162 V163 V164 V165 V166 V167
## [155] V168 V169 V170 V171 V172 V173 V174 V175 V176 V177 V178 V179 V180 V181
## [169] V182 V183 V184 V185 V186 V187 V188 V189 V190 V191 V192 V193 V194 V195
## [183] V196 V197 V198 V199 V200 V201 V202 V203 V204 V205 V206 V207 V208 V209
## [197] V210 V211 V212 V213 V214 V215 V216 V217 V218 V219 V220 V221 V222 V223
## [211] V224 V225 V226 V227 V228 V229 V230 V231 V232 V233 V234 V235 V236 V237
## [225] V238 V239 V240 V241 V242 V243 V244 V245 V246 V247 V248 V249 V250 V251
## [239] V252 V253 V254 V255 V256 V257 V258 V259 V260 V261 V262 V263 V264 V265
## [253] V266 V267 V268 V269 V270 V271 V272 V273 V274 V275 V276 V277 V278 V279
## [267] V280 V281 V282 V283 V284 V285 V286 V287 V288 V289 V290 V291 V292 V293
## [281] V294 V295 V296 V297 V298 V299 V300 V301 V302 V303 V304 V305 V306 V307
## [295] V308 V309 V310 V311 V312 V313 V314 V315 V316 V317 V318 V319 V320 V321
## [309] V322 V323 V324 V325 V326 V327 V328 V329 V330 V331 V332 V333 V334 V335
## [323] V336 V337 V338 V339 V340 V341 V342 V343 V344 V345 V346 V347 V348 V349
## [337] V350 V351 V352 V353 V354 V355 V356 V357 V358 V359 V360 V361 V362 V363
## [351] V364 V365 V366 V367 V368 V369 V370 V371 V372 V373 V374 V375 V376 V377
## [365] V378 V379 V380 V381 V382 V383 V384 V385 V386 V387 V388 V389 V390 V391
## [379] V392 V393 V394 V395 V396 V397 V398 V399 V400 V401 V402 V403 V404 V405
## [393] V406 V407 V408 V409 V410 V411 V412 V413 V414 V415 V416 V417 V418 V419
## [407] V420 V421 V422 V423 V424 V425 V426 V427 V428 V429 V430 V431 V432 V433
## [421] V434 V435 V436 V437 V438 V439 V440 V441 V442 V443 V444 V445 V446 V447
## [435] V448 V449 V450 V451 V452 V453 V454 V455 V456 V457 V458 V459 V460 V461
## [449] V462 V463 V464 V465 V466 V467 V468 V469 V470 V471 V472 V473 V474 V475
## [463] V476 V477 V478 V479 V480 V481 V482 V483 V484 V485 V486 V487 V488 V489
## [477] V490 V491 V492 V493 V494 V495 V496 V497 V498 V499 V500 V501 V502 V503
## [491] V504 V505 V506 V507 V508 V509 V510 V511 V512 V513 V514 V515 V516 V517
## [505] V518 V519 V520 V521 V522 V523 V524 V525 V526 V527 V528 V529 V530 V531
## [519] V532 V533 V534 V535 V536 V537 V538 V539 V540 V541 V542 V543 V544 V545
## [533] V546 V547 V548 V549 V550 V551 V552 V553 V554 V555 V556 V557 V558 V559
## [547] V560 V561 V562 V563 V564 V565 V566 V567 V568 V569 V570 V571 V572 V573
## [561] V574 V575 V576 V577 V578 V579 V580 V581 V582 V583 V584 V585 V586 V587
## [575] V588 V589 V590 V591 V592 V593 V594 V595 V596 V597 V598 V599 V600 V601
## [589] V602 V603 V604 V605 V606 V607 V608 V609 V610 V611 V612 V613 V614 V615
## [603] V616 V617 V618 V619 V620 V621 V622 V623 V624 V625 V626 V627 V628 V629
## [617] V630 V631 V632 V633 V634 V635 V636 V637 V638 V639 V640 V641 V642 V643
## [631] V644 V645 V646 V647 V648 V649 V650 V651 V652 V653 V654 V655 V656 V657
## [645] V658 V659 V660 V661 V662 V663 V664 V665 V666 V667 V668 V669 V670 V671
## [659] V672 V673 V674 V675 V676 V677 V678 V679 V680 V681 V682 V683 V684 V685
## [673] V686 V687 V688 V689 V690 V691 V692 V693 V694 V695 V696 V697 V698 V699
## [687] V700 V701 V702 V703 V704 V705 V706 V707 V708 V709 V710 V711 V712 V713
## [701] V714 V715 V716 V717 V718 V719 V720 V721 V722 V723 V724 V725 V726 V727
## [715] V728 V729 V730 V731 V732 V733 V734 V735 V736 V737 V738 V739 V740 V741
## [729] V742 V743 V744 V745 V746 V747 V748 V749 V750
##  
##  The final data set we will analyze has 180 rows and 737 columns.
##  
##  The computation started at: Tue Dec 06 16:59:03 2016
##  The computation ended at: Tue Dec 06 17:01:51 2016
## 
remX = DDCglass$remX
dim(remX)
## [1] 180 737
library(rrcov) # for robust PCA:
PCAglass = PcaHubert(glass,alpha=0.75,scale=FALSE)

xblocksize = 5
yblocksize = 5
n = nrow(remX)
d = ncol(remX)
xlabels = rep("",floor(d/xblocksize));
xlabels[1] = "1";
xlabels[floor(d/xblocksize)] = "d"
xlabels
##   [1] "1" ""  ""  ""  ""  ""  ""  ""  ""  ""  ""  ""  ""  ""  ""  ""  "" 
##  [18] ""  ""  ""  ""  ""  ""  ""  ""  ""  ""  ""  ""  ""  ""  ""  ""  "" 
##  [35] ""  ""  ""  ""  ""  ""  ""  ""  ""  ""  ""  ""  ""  ""  ""  ""  "" 
##  [52] ""  ""  ""  ""  ""  ""  ""  ""  ""  ""  ""  ""  ""  ""  ""  ""  "" 
##  [69] ""  ""  ""  ""  ""  ""  ""  ""  ""  ""  ""  ""  ""  ""  ""  ""  "" 
##  [86] ""  ""  ""  ""  ""  ""  ""  ""  ""  ""  ""  ""  ""  ""  ""  ""  "" 
## [103] ""  ""  ""  ""  ""  ""  ""  ""  ""  ""  ""  ""  ""  ""  ""  ""  "" 
## [120] ""  ""  ""  ""  ""  ""  ""  ""  ""  ""  ""  ""  ""  ""  ""  ""  "" 
## [137] ""  ""  ""  ""  ""  ""  ""  ""  ""  ""  "d"
ylabels = rep("",floor(n/yblocksize));
ylabels[1] = "1"
ylabels[floor(n/yblocksize)] = "n";
ylabels
##  [1] "1" ""  ""  ""  ""  ""  ""  ""  ""  ""  ""  ""  ""  ""  ""  ""  "" 
## [18] ""  ""  ""  ""  ""  ""  ""  ""  ""  ""  ""  ""  ""  ""  ""  ""  "" 
## [35] ""  "n"
xtitle = "wavelengths"
ytitle = "glass samples"

ggpROBPCA = cellMap(D=remX, R=matrix(0,n,d),
                           indcells=integer(0),
                           indrows=which(PCAglass@flag==FALSE),
                           xlabels=xlabels,
                           ylabels=ylabels,
                           mTitle="By row",
                           xblocksize=xblocksize,
                           yblocksize=yblocksize,
                           anglex=0,
                           xtitle=xtitle,
                           ytitle=ytitle,
                           sizexy=1.5,
                           autolabel=F)
plot(ggpROBPCA)

plot of chunk unnamed-chunk-16

ggpDDC = cellMap(D=remX, R=DDCglass$stdResid,
                        indcells=DDCglass$indcells,
                        indrows=integer(0), # DDCglass$indrows,
                        xlabels=xlabels,
                        ylabels=ylabels,
                        mTitle="DetectDeviatingCells",
                        xblocksize=xblocksize,
                        yblocksize=yblocksize,
                        anglex=0,
                        xtitle=xtitle,
                        ytitle=ytitle,
                        sizexy=1.5,
                        autolabel=F)
plot(ggpDDC)

plot of chunk unnamed-chunk-16

pdf(paste("cellmap_glass2_",xblocksize,".pdf",sep=""),width=16,height=10)
grid.arrange(ggpROBPCA,ggpDDC,ncol=1)
dev.off()
## png 
##   2

Row and column selection

In this section we test the selection of columns and rows by DDC.

i = c(1,2,3,4,5,6,7,8,9) 
name = c("aa","bb","cc","dd","ee","ff","gg","hh","ii") 
logic = c(TRUE, FALSE, TRUE, FALSE, FALSE, TRUE, TRUE, TRUE, FALSE) 
V1 = c(1.3,NaN,4.5,2.7,-3.3,4.4,-2.1,1.1,-5)
V2 = c(2.3,NA,5,6,7,8,4,3,0.5)
V3 = c(2,Inf,3,-4,5,6,7,-2,8)
Vna = c(1,-4,2,NaN,3,-Inf,NA,6,5)
Vdis = c(1,1,2,2,3,3,3,1,2)
V0s = c(1,1.5,2,2,2,2,2,3,2.5) 
datafr = data.frame(i,name,logic,V1,V2,V3,Vna,Vdis,V0s) 
datafr
##   i name logic   V1  V2  V3  Vna Vdis V0s
## 1 1   aa  TRUE  1.3 2.3   2    1    1 1.0
## 2 2   bb FALSE  NaN  NA Inf   -4    1 1.5
## 3 3   cc  TRUE  4.5 5.0   3    2    2 2.0
## 4 4   dd FALSE  2.7 6.0  -4  NaN    2 2.0
## 5 5   ee FALSE -3.3 7.0   5    3    3 2.0
## 6 6   ff  TRUE  4.4 8.0   6 -Inf    3 2.0
## 7 7   gg  TRUE -2.1 4.0   7   NA    3 2.0
## 8 8   hh  TRUE  1.1 3.0  -2    6    1 3.0
## 9 9   ii FALSE -5.0 0.5   8    5    2 2.5
DDCdatafr=DetectDeviatingCells(datafr,DDCpars)
##  
##  The input data has 9 rows and 9 columns.
##  
##  The input data contained 2 non-numeric columns (variables).
##  Their column names are:
##  
## [1] name  logic
##  
##  These columns will be ignored in the analysis.
##  We continue with the remaining 7 numeric columns:
##  
## [1] i    V1   V2   V3   Vna  Vdis V0s 
##  
##  The data contained 1 columns that were identical to the case number
##  (number of the row).
##  Their column names are:
##  
## [1] i
##  
##  These columns will be ignored in the analysis.
##  We continue with the remaining 6 columns:
##  
## [1] V1   V2   V3   Vna  Vdis V0s 
##  
##  The data contained 1 discrete columns with 3 or fewer values.
##  Their column names are:
##  
## [1] Vdis
##  
##  These columns will be ignored in the analysis.
##  We continue with the remaining 5 columns:
##  
## [1] V1  V2  V3  Vna V0s
##  
##  The data contained 1 columns with zero or tiny median absolute deviation.
##  Their column names are:
##  
## [1] V0s
##  
##  These columns will be ignored in the analysis.
##  We continue with the remaining 4 columns:
##  
## [1] V1  V2  V3  Vna
##  
##  The final data set we will analyze has 9 rows and 4 columns.
##  
##  The computation started at: Tue Dec 06 17:01:54 2016
##  The computation ended at: Tue Dec 06 17:01:54 2016
## 
DDCdatafr
## $colInAnalysis
##  V1  V2  V3 Vna 
##   4   5   6   7 
## 
## $rowInAnalysis
## 1 2 3 4 5 6 7 8 9 
## 1 2 3 4 5 6 7 8 9 
## 
## $namesNotNumeric
## [1] "name"  "logic"
## 
## $namesCaseNumber
## [1] "i"
## 
## $namesNAcol
## NULL
## 
## $namesNArow
## NULL
## 
## $namesDiscrete
## [1] "Vdis"
## 
## $namesZeroScale
## [1] "V0s"
## 
## $remX
##     V1  V2 V3 Vna
## 1  1.3 2.3  2   1
## 2   NA  NA NA  -4
## 3  4.5 5.0  3   2
## 4  2.7 6.0 -4  NA
## 5 -3.3 7.0  5   3
## 6  4.4 8.0  6  NA
## 7 -2.1 4.0  7  NA
## 8  1.1 3.0 -2   6
## 9 -5.0 0.5  8   5
## 
## $Z
##            V1         V2          V3         Vna
## 1  0.07084994 -0.8839003 -0.67459622 -0.67968325
## 2          NA         NA          NA -2.39453799
## 3  0.94150268  0.1692823 -0.42261131 -0.33671231
## 4  0.45176051  0.5593499 -2.18650568          NA
## 5 -1.18071339  0.9494175  0.08135851  0.00625864
## 6  0.91429478  1.3394851  0.33334342          NA
## 7 -0.85421861 -0.2207853  0.58532833          NA
## 8  0.01643414 -0.6108530 -1.68253586  1.03517148
## 9 -1.64324766 -1.5860220  0.83731324  0.69220053
## 
## $k
## [1] 3
## 
## $ngbrs
##     [,1] [,2] [,3] [,4]
## V1     1    2    4    3
## V2     2    1    4    3
## V3     3    1    4    2
## Vna    4    1    2    3
## 
## $robcors
##     [,1]       [,2]       [,3]        [,4]
## V1     1  0.4792831 -0.4585536 -0.39241907
## V2     1  0.4792831 -0.3774730 -0.07576421
## V3     1 -0.3924191 -0.1882915 -0.07576421
## Vna    1 -0.4585536 -0.3774730 -0.18829152
## 
## $robslopes
##     [,1] [,2] [,3] [,4]
## V1     1    0    0    0
## V2     1    0    0    0
## V3     1    0    0    0
## Vna    1    0    0    0
## 
## $Xest
##          V1       V2        V3       Vna
## 1  1.300000 2.300000  2.000000  1.000000
## 2  1.039598 4.566018  4.677129 -4.000000
## 3  4.500000 5.000000  3.000000  2.000000
## 4  2.700000 6.000000 -4.000000  2.981752
## 5 -3.300000 7.000000  5.000000  3.000000
## 6  4.400000 8.000000  6.000000  2.981752
## 7 -2.100000 4.000000  7.000000  2.981752
## 8  1.100000 3.000000 -2.000000  6.000000
## 9 -5.000000 0.500000  8.000000  5.000000
## 
## $stdResid
##            V1         V2          V3         Vna
## 1  0.07084994 -0.8839003 -0.67459622 -0.67968325
## 2          NA         NA          NA -2.39453799
## 3  0.94150268  0.1692823 -0.42261131 -0.33671231
## 4  0.45176051  0.5593499 -2.18650568          NA
## 5 -1.18071339  0.9494175  0.08135851  0.00625864
## 6  0.91429478  1.3394851  0.33334342          NA
## 7 -0.85421861 -0.2207853  0.58532833          NA
## 8  0.01643414 -0.6108530 -1.68253586  1.03517148
## 9 -1.64324766 -1.5860220  0.83731324  0.69220053
## 
## $indcells
## numeric(0)
## 
## $Ti
##         [,1]
## 1 -0.5963351
## 2  2.7953056
## 3 -1.0545595
## 4  0.3713382
## 5 -0.8877966
## 6  0.3236981
## 7 -0.6744908
## 8  0.0000000
## 9  1.2312736
## 
## $indrows
## [1] 2
## 
## $indall
## [1]  2 11 20 29
## 
## $Ximp
##          V1       V2        V3       Vna
## 1  1.300000 2.300000  2.000000  1.000000
## 2  1.039598 4.566018  4.677129 -4.000000
## 3  4.500000 5.000000  3.000000  2.000000
## 4  2.700000 6.000000 -4.000000  2.981752
## 5 -3.300000 7.000000  5.000000  3.000000
## 6  4.400000 8.000000  6.000000  2.981752
## 7 -2.100000 4.000000  7.000000  2.981752
## 8  1.100000 3.000000 -2.000000  6.000000
## 9 -5.000000 0.500000  8.000000  5.000000