This file contains some examples of the functions related to the DDC routine. More specifically,
DetectDeviatingCells
, cellMap
and checkDataset
will be illustrated.
library("cellWise")
# Default options for DetectDeviatingCells:
DDCpars=list(fracNA=0.5,numDiscrete=3,precScale=1e-12,
tolProb=0.99,corrlim=0.5,combinRule ="wmean",
includeSelf=T,rowdetect=TRUE,returnBigXimp=F)
We will first look at a clean generated dataset.
set.seed(12345) # reproducible
n = 100; d = 10
A = matrix(0.9, d, d); diag(A) = 1
A # true covariance matrix
## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
## [1,] 1.0 0.9 0.9 0.9 0.9 0.9 0.9 0.9 0.9 0.9
## [2,] 0.9 1.0 0.9 0.9 0.9 0.9 0.9 0.9 0.9 0.9
## [3,] 0.9 0.9 1.0 0.9 0.9 0.9 0.9 0.9 0.9 0.9
## [4,] 0.9 0.9 0.9 1.0 0.9 0.9 0.9 0.9 0.9 0.9
## [5,] 0.9 0.9 0.9 0.9 1.0 0.9 0.9 0.9 0.9 0.9
## [6,] 0.9 0.9 0.9 0.9 0.9 1.0 0.9 0.9 0.9 0.9
## [7,] 0.9 0.9 0.9 0.9 0.9 0.9 1.0 0.9 0.9 0.9
## [8,] 0.9 0.9 0.9 0.9 0.9 0.9 0.9 1.0 0.9 0.9
## [9,] 0.9 0.9 0.9 0.9 0.9 0.9 0.9 0.9 1.0 0.9
## [10,] 0.9 0.9 0.9 0.9 0.9 0.9 0.9 0.9 0.9 1.0
library(MASS) # only needed for the function mvrnorm in the next line:
xclean = mvrnorm(n, rep(0,d), A)
round(xclean[1:3,1:5],2)
## [,1] [,2] [,3] [,4] [,5]
## [1,] -0.55 -0.73 -0.09 -0.73 -0.25
## [2,] -0.15 -1.35 -0.66 -1.11 -0.26
## [3,] 0.19 0.26 0.12 0.17 -0.15
DDCxclean = DetectDeviatingCells(xclean,DDCpars)
##
## The input data has 100 rows and 10 columns.
## The computation started at: Tue Dec 06 16:58:53 2016
## The computation ended at: Tue Dec 06 16:58:53 2016
##
# TRUE # OK, all parts of the output match
summary(DDCxclean)
## Length Class Mode
## colInAnalysis 10 -none- numeric
## rowInAnalysis 100 -none- numeric
## namesNotNumeric 0 -none- NULL
## namesCaseNumber 0 -none- NULL
## namesNAcol 0 -none- NULL
## namesNArow 0 -none- NULL
## namesDiscrete 0 -none- NULL
## namesZeroScale 0 -none- NULL
## remX 1000 -none- numeric
## Z 1000 -none- numeric
## k 1 -none- numeric
## ngbrs 100 -none- numeric
## robcors 100 -none- numeric
## robslopes 100 -none- numeric
## Xest 1000 -none- numeric
## stdResid 1000 -none- numeric
## indcells 11 -none- numeric
## Ti 100 -none- numeric
## indrows 0 -none- numeric
## indall 11 -none- numeric
## Ximp 1000 -none- numeric
round(DDCxclean$Z[1:12,],2)# robustly standardized data:
## X1 X2 X3 X4 X5 X6 X7 X8 X9 X10
## 1 -0.25 -0.38 0.17 -0.38 0.06 0.02 -0.47 -0.46 0.19 -0.61
## 2 0.11 -0.93 -0.34 -0.71 0.04 -0.18 -0.16 -0.35 0.13 -0.75
## 3 0.41 0.48 0.35 0.41 0.15 0.24 0.34 0.44 0.33 0.44
## 4 0.66 0.57 0.29 0.72 1.06 0.20 0.43 0.74 0.97 0.79
## 5 -0.23 -0.21 -0.33 0.34 -0.69 -0.50 -0.31 0.29 -0.73 0.04
## 6 2.13 1.70 1.87 1.83 1.73 1.54 1.60 1.56 1.84 1.90
## 7 -0.31 -0.16 0.14 -0.19 -0.07 -0.86 -0.27 -0.20 -0.12 -0.42
## 8 0.50 0.44 0.35 0.69 0.53 0.94 0.45 0.76 0.00 0.29
## 9 0.81 0.47 0.22 -0.01 0.78 0.15 0.69 0.76 0.79 0.39
## 10 1.18 1.12 0.94 1.10 1.46 0.67 0.94 1.18 0.82 0.88
## 11 0.30 0.07 -0.01 -0.05 0.59 0.40 0.35 0.61 0.73 0.63
## 12 -1.46 -1.37 -1.16 -1.59 -1.03 -1.64 -1.06 -1.11 -1.02 -0.87
# Due to the high correlations, cells in the same row look similar
DDCxclean$k
## [1] 9
# The code looked for up to 9 non-self neighbors of each variable.
# (It goes through all of them, unless d is huge.)
DDCxclean$ngbrs # show the neighbors:
## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
## X1 1 3 7 9 2 4 5 10 8 6
## X2 2 3 1 4 5 8 9 6 10 7
## X3 3 1 4 5 2 9 8 10 7 6
## X4 4 3 8 2 1 10 7 5 9 6
## X5 5 3 9 8 2 1 7 4 10 6
## X6 6 1 3 10 2 9 5 4 8 7
## X7 7 8 1 9 4 3 10 5 2 6
## X8 8 7 4 5 9 2 1 3 10 6
## X9 9 1 5 3 8 10 2 7 6 4
## X10 10 4 1 9 3 7 6 2 5 8
# i.e. the nearest non-self neighbor of X10 is X4, then X1,...
round(DDCxclean$robcors,2) # correlations with these neighbors:
## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
## X1 1 0.94 0.93 0.93 0.93 0.93 0.93 0.92 0.92 0.92
## X2 1 0.93 0.93 0.93 0.93 0.92 0.92 0.91 0.91 0.91
## X3 1 0.94 0.93 0.93 0.93 0.92 0.92 0.92 0.91 0.91
## X4 1 0.93 0.93 0.93 0.93 0.93 0.91 0.91 0.90 0.89
## X5 1 0.93 0.93 0.93 0.93 0.93 0.91 0.91 0.90 0.89
## X6 1 0.92 0.91 0.91 0.91 0.91 0.89 0.89 0.89 0.88
## X7 1 0.94 0.93 0.91 0.91 0.91 0.91 0.91 0.91 0.88
## X8 1 0.94 0.93 0.93 0.92 0.92 0.92 0.92 0.90 0.89
## X9 1 0.93 0.93 0.92 0.92 0.92 0.92 0.91 0.91 0.90
## X10 1 0.93 0.92 0.92 0.92 0.91 0.91 0.91 0.90 0.90
# Per row, the correlation are sorted by decreasing absolute value.
round(DDCxclean$robslopes,2) # slope of each neighbor predicting the column:
## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
## X1 1 0.96 0.96 0.99 0.98 0.96 0.94 0.94 0.93 0.96
## X2 1 0.90 0.90 0.90 0.90 0.90 0.92 0.89 0.91 0.90
## X3 1 0.92 0.95 0.93 0.97 0.96 0.92 0.92 0.92 0.97
## X4 1 0.90 0.91 0.95 0.90 0.92 0.92 0.89 0.89 0.90
## X5 1 0.94 0.94 0.93 0.97 0.92 0.93 0.93 0.95 0.92
## X6 1 0.88 0.87 0.88 0.90 0.88 0.86 0.88 0.86 0.88
## X7 1 0.94 0.92 0.91 0.92 0.91 0.91 0.88 0.91 0.89
## X8 1 0.94 0.94 0.92 0.93 0.95 0.90 0.90 0.92 0.91
## X9 1 0.89 0.90 0.89 0.92 0.90 0.91 0.90 0.92 0.90
## X10 1 0.94 0.90 0.92 0.90 0.92 0.93 0.92 0.90 0.90
# For instance, X10 is predicted by its first neighbor with slope 0.94
# and by its second neighbor with slope 0.90 .
round(DDCxclean$Xest[1:12,],2) # the estimated cells:
## X1 X2 X3 X4 X5 X6 X7 X8 X9 X10
## 1 -0.51 -0.53 -0.51 -0.54 -0.55 -0.39 -0.66 -0.72 -0.54 -0.62
## 2 -0.63 -0.65 -0.65 -0.66 -0.67 -0.51 -0.76 -0.84 -0.65 -0.75
## 3 0.14 0.12 0.13 0.12 0.10 0.26 0.02 -0.03 0.11 0.08
## 4 0.47 0.43 0.46 0.43 0.44 0.58 0.35 0.31 0.44 0.42
## 5 -0.54 -0.54 -0.55 -0.54 -0.59 -0.43 -0.66 -0.72 -0.58 -0.63
## 6 1.78 1.68 1.76 1.71 1.72 1.90 1.68 1.65 1.72 1.78
## 7 -0.55 -0.56 -0.56 -0.57 -0.59 -0.46 -0.69 -0.75 -0.58 -0.66
## 8 0.30 0.26 0.29 0.27 0.25 0.43 0.18 0.14 0.25 0.24
## 9 0.31 0.27 0.29 0.27 0.27 0.42 0.20 0.15 0.28 0.24
## 10 0.92 0.86 0.91 0.87 0.88 1.03 0.81 0.78 0.87 0.88
## 11 0.14 0.11 0.13 0.10 0.11 0.27 0.02 -0.03 0.12 0.08
## 12 -1.69 -1.66 -1.70 -1.69 -1.72 -1.59 -1.84 -1.92 -1.70 -1.84
round(DDCxclean$stdResid[1:12,],1) # the standardized residuals:
## X1 X2 X3 X4 X5 X6 X7 X8 X9 X10
## 1 -0.2 -0.8 1.6 -0.7 1.0 0.8 -1.0 -0.9 1.5 -1.5
## 2 1.8 -2.7 -0.1 -1.6 1.3 0.4 0.6 -0.1 1.6 -1.6
## 3 0.2 0.5 -0.1 0.2 -0.8 -0.4 -0.1 0.3 -0.1 0.3
## 4 0.0 -0.3 -1.5 0.3 1.4 -1.4 -0.9 0.3 1.3 0.6
## 5 0.1 0.1 -0.4 2.3 -1.6 -0.9 -0.3 2.0 -1.9 1.0
## 6 1.2 -0.1 0.2 0.3 -0.2 -0.6 -0.8 -0.9 0.4 0.5
## 7 -0.2 0.3 1.6 0.2 0.6 -2.1 -0.1 0.2 0.5 -0.6
## 8 0.0 -0.2 -0.6 0.8 0.1 1.6 -0.2 1.0 -1.8 -0.8
## 9 1.2 -0.1 -1.2 -2.1 0.9 -1.2 0.7 0.9 1.1 -0.4
## 10 0.5 0.5 -0.5 0.3 1.5 -1.1 -0.4 0.5 -0.7 -0.6
## 11 -0.3 -1.2 -1.6 -1.7 0.8 0.2 -0.1 0.9 1.4 1.0
## 12 -0.7 -0.8 0.4 -1.5 0.8 -1.5 0.7 0.5 0.7 1.3
DDCxclean$indcells # indices of the cells that were flagged:
## [1] 102 124 135 177 214 247 349 583 660 847 969
d
## [1] 10
# The standardized residuals of the flagged cells:
round(DDCxclean$stdResid[DDCxclean$indcells],1)
## [1] -2.7 3.0 2.9 2.6 -2.6 -2.9 -2.7 -2.6 2.7 3.6 -3.2
# They are just barely larger in absolute value than
# sqrt(qchisq(0.99,1)) = 2.576 and there are not many,
# given that there are 1000 cells here.
# the standardized residuals are roughly standard gaussian:
qqnorm(as.vector(DDCxclean$stdResid))
# is a straight line through 0 with slope about 1.
# So, nothing stands out in this clean dataset.
# outlyingness of the rows:
qqnorm(DDCxclean$Ti) # looks fairly gaussian too
DDCxclean$indrows # no rows are flagged:
## integer(0)
DDCxclean$indall # flagged cells including those in flagged rows:
## [1] 102 124 135 177 214 247 349 583 660 847 969
round(DDCxclean$Ximp[1:12,],2) # the imputed matrix:
## X1 X2 X3 X4 X5 X6 X7 X8 X9 X10
## 1 -0.55 -0.73 -0.09 -0.73 -0.25 -0.13 -0.94 -1.00 -0.09 -1.09
## 2 -0.15 -0.65 -0.66 -1.11 -0.26 -0.36 -0.58 -0.88 -0.15 -1.26
## 3 0.19 0.26 0.12 0.17 -0.15 0.14 -0.01 0.06 0.08 0.18
## 4 0.47 0.36 0.05 0.53 0.89 0.09 0.10 0.41 0.83 0.60
## 5 -0.52 -0.52 -0.65 0.10 -1.09 -0.75 -0.76 -0.11 -1.16 -0.31
## 6 2.10 1.65 1.82 1.80 1.65 1.70 1.46 1.38 1.84 1.94
## 7 -0.62 -0.47 -0.12 -0.51 -0.39 -1.19 -0.71 -0.70 -0.44 -0.87
## 8 0.29 0.22 0.11 0.49 0.29 0.97 0.13 0.44 -0.30 -0.01
## 9 0.63 0.25 -0.03 -0.30 0.57 0.02 0.40 0.43 0.62 0.11
## 10 1.05 1.00 0.77 0.96 1.35 0.65 0.69 0.94 0.66 0.70
## 11 0.07 -0.21 -0.30 -0.35 0.35 0.33 0.00 0.26 0.55 0.40
## 12 -1.89 -1.86 -1.59 -2.11 -1.48 -2.12 -1.63 -1.77 -1.49 -1.41
round((DDCxclean$Ximp - xclean)[1:12,],2)
## X1 X2 X3 X4 X5 X6 X7 X8 X9 X10
## 1 0 0.00 0 0 0 0 0 0 0 0
## 2 0 0.71 0 0 0 0 0 0 0 0
## 3 0 0.00 0 0 0 0 0 0 0 0
## 4 0 0.00 0 0 0 0 0 0 0 0
## 5 0 0.00 0 0 0 0 0 0 0 0
## 6 0 0.00 0 0 0 0 0 0 0 0
## 7 0 0.00 0 0 0 0 0 0 0 0
## 8 0 0.00 0 0 0 0 0 0 0 0
## 9 0 0.00 0 0 0 0 0 0 0 0
## 10 0 0.00 0 0 0 0 0 0 0 0
## 11 0 0.00 0 0 0 0 0 0 0 0
## 12 0 0.00 0 0 0 0 0 0 0 0
# Only a few cells were imputed in this example.
Let's now look at the TopGear dataset.
library(robustHD)
library(gridExtra)
data(TopGear)
dim(TopGear)
## [1] 297 32
rownames(TopGear)[1:13] # "1" to "297" are not useful names
## [1] "1" "2" "3" "4" "5" "6" "7" "8" "9" "10" "11" "12" "13"
rownames(TopGear) = paste(TopGear[,1],TopGear[,2])
# Now the rownames are make and model of the cars.
rownames(TopGear)[165] = "Mercedes-Benz G" # name was too long
myTopGear = TopGear[,-31] # removes the subjective variable `Verdict'
# Transform some variables to get roughly gaussianity in the center:
transTG = myTopGear
transTG$Price = log(myTopGear$Price)
transTG$Displacement = log(myTopGear$Displacement)
transTG$BHP = log(myTopGear$BHP)
transTG$Torque = log(myTopGear$Torque)
transTG$TopSpeed = log(myTopGear$TopSpeed)
# Run the method:
DDCtransTG = DetectDeviatingCells(transTG,DDCpars)
##
## The input data has 297 rows and 31 columns.
##
## The input data contained 19 non-numeric columns (variables).
## Their column names are:
##
## [1] Maker Model Type
## [4] Fuel DriveWheel AdaptiveHeadlights
## [7] AdjustableSteering AlarmSystem Automatic
## [10] Bluetooth ClimateControl CruiseControl
## [13] ElectricSeats Leather ParkingSensors
## [16] PowerSteering SatNav ESP
## [19] Origin
##
## These columns will be ignored in the analysis.
## We continue with the remaining 12 numeric columns:
##
## [1] Price Cylinders Displacement BHP Torque
## [6] Acceleration TopSpeed MPG Weight Length
## [11] Width Height
##
## The data contained 1 rows with over 50% of NAs.
## Their row names are:
##
## [1] Citroen C5 Tourer
##
## These rows will be ignored in the analysis.
## We continue with the remaining 296 rows:
##
## [1] Alfa Romeo Giulietta Alfa Romeo MiTo
## [3] Aston Martin Cygnet Aston Martin DB9
## [5] Aston Martin DB9 Volante Aston Martin V12 Zagato
## [7] Aston Martin Vanquish Aston Martin Vantage
## [9] Aston Martin Vantage Roadster Audi A1
## [11] Audi A3 Audi A4
## [13] Audi A4 Allroad Audi A5
## [15] Audi A5 Sportback Audi A6 Avant
## [17] Audi A6 Saloon Audi A7 Sportback
## [19] Audi A8 Audi Q3
## [21] Audi Q5 Audi Q7
## [23] Audi R8 Audi R8 V10
## [25] Audi RS4 Avant Audi TT Coupe
## [27] Audi TT Roadster Bentley Continental
## [29] Bentley Continental GTC Bentley Flying Spur
## [31] Bentley Mulsanne BMW 1 Series
## [33] BMW 1 Series Convertible BMW 1 Series Coupe
## [35] BMW 3 Series BMW 3 Series Convertible
## [37] BMW 4 Series Coupe BMW 6 Series
## [39] BMW 6 Series Convertible BMW 6 Series Gran Coupe
## [41] BMW 7 Series BMW i3
## [43] BMW M3 BMW M6
## [45] BMW X1 BMW X3
## [47] BMW X5 BMW X6
## [49] BMW Z4 Bugatti Veyron
## [51] Caterham CSR Caterham Super 7
## [53] Chevrolet Aveo Chevrolet Camaro
## [55] Chevrolet Captiva Chevrolet Cruze
## [57] Chevrolet Orlando Chevrolet Spark
## [59] Chevrolet Volt Chrysler 300C
## [61] Chrysler Delta Chrysler Grand Voyager
## [63] Chrysler Ypsilon Citroen C1
## [65] Citroen C3 Citroen C3 Picasso
## [67] Citroen C4 Citroen C4 Picasso
## [69] Citroen C5 Citroen DS3
## [71] Citroen DS4 Citroen DS5
## [73] Corvette C6 Dacia Duster
## [75] Dacia Sandero Ferrari 458
## [77] Ferrari California Ferrari F12
## [79] Ferrari FF Fiat 500
## [81] Fiat 500 Abarth Fiat 500L
## [83] Fiat Bravo Fiat Doblo
## [85] Fiat Panda Fiat Punto Evo
## [87] Ford B-Max Ford C-Max
## [89] Ford Fiesta Ford Focus
## [91] Ford Focus Estate Ford Focus ST
## [93] Ford Galaxy Ford Kuga
## [95] Ford Mondeo Ford S-MAX
## [97] Honda Accord Honda Civic
## [99] Honda CR-V Honda CR-Z
## [101] Honda Insight Honda Jazz
## [103] Hyundai i10 Hyundai i20
## [105] Hyundai i30 Hyundai i40
## [107] Hyundai i800 Hyundai ix20
## [109] Hyundai ix35 Hyundai Santa Fe
## [111] Hyundai Veloster Infiniti EX
## [113] Infiniti FX Infiniti G37
## [115] Infiniti M Jaguar F-Type
## [117] Jaguar XF Jaguar XF Sportbrake
## [119] Jaguar XFR Jaguar XJ Series
## [121] Jaguar XK Jeep Compass
## [123] Jeep Grand Cherokee Jeep Wrangler
## [125] Kia Cee'd Kia Optima
## [127] Kia Picanto Kia Rio
## [129] Kia Sorento Kia Soul
## [131] Kia Sportage Kia Venga
## [133] Lamborghini Aventador Lamborghini Gallardo
## [135] Land Rover Defender Land Rover Discovery 4
## [137] Land Rover Freelander 2 Land Rover Range Rover
## [139] Land Rover Range Rover Evoque Land Rover Range Rover Sport
## [141] Lexus CT 200h Lexus GS
## [143] Lexus IS Lexus RX
## [145] Lotus Elise Lotus Evora
## [147] Lotus Exige S Maserati GranTurismo
## [149] Maserati Quattroporte Mazda CX-5
## [151] Mazda Mazda3 Mazda MX-5
## [153] McLaren MP4-12C Mercedes-Benz A-Class
## [155] Mercedes-Benz B-Class Mercedes-Benz C-Class
## [157] Mercedes-Benz C63 AMG Mercedes-Benz CL-Class
## [159] Mercedes-Benz CLS Shooting Brake Mercedes-Benz CLS-Class
## [161] Mercedes-Benz E-Class Mercedes-Benz E-Class Coupe
## [163] Mercedes-Benz E63 AMG Mercedes-Benz G
## [165] Mercedes-Benz GL-Class Mercedes-Benz M-Class
## [167] Mercedes-Benz R-Class Mercedes-Benz S-Class
## [169] Mercedes-Benz SL 63 Mercedes-Benz SL-Class
## [171] Mercedes-Benz SLK Mercedes-Benz SLS
## [173] Mini Clubman Mini Convertible
## [175] Mini Countryman Mini Coupe
## [177] Mini John Cooper Works Mini Roadster
## [179] Mitsubishi ASX Mitsubishi i-MiEV
## [181] Mitsubishi Mirage Mitsubishi Outlander
## [183] Mitsubishi Shogun Morgan 3 Wheeler
## [185] Morgan Aero Morgan Roadster
## [187] Nissan 370Z Nissan Juke
## [189] Nissan Leaf Nissan Micra
## [191] Nissan Note Nissan Pathfinder
## [193] Nissan Qashqai Nissan X-Trail
## [195] Noble M600 Pagani Huayra
## [197] Perodua MYVI Peugeot 107
## [199] Peugeot 207 CC Peugeot 207 SW
## [201] Peugeot 208 Peugeot 3008
## [203] Peugeot 308 Peugeot 308 CC
## [205] Peugeot 308 SW Peugeot 5008
## [207] Peugeot RCZ Porsche 911
## [209] Porsche Boxster Porsche Cayenne
## [211] Porsche Panamera Proton GEN-2
## [213] Proton Satria-Neo Proton Savvy
## [215] Renault Clio Renault Megane
## [217] Renault Scenic/Grand Scenic Renault Twingo
## [219] Renault Twizy Renault Zoe
## [221] Rolls-Royce Ghost Rolls-Royce Phantom
## [223] Rolls-Royce Phantom Coupe SEAT Alhambra
## [225] SEAT Altea SEAT Leon
## [227] SEAT Mii SEAT Toledo
## [229] Skoda Fabia Skoda Octavia
## [231] Skoda Roomster Skoda Yeti
## [233] Smart fortwo Ssangyong Rodius
## [235] Subaru BRZ Subaru Forester
## [237] Subaru Legacy Outback Subaru XV
## [239] Suzuki Alto Suzuki Grand Vitara
## [241] Suzuki Jimny Suzuki Splash
## [243] Suzuki Swift Suzuki Swift Sport
## [245] Suzuki SX4 Toyota Auris
## [247] Toyota Avensis Toyota AYGO
## [249] Toyota GT 86 Toyota iQ
## [251] Toyota Land Cruiser Toyota Land Cruiser V8
## [253] Toyota Prius Toyota RAV4
## [255] Toyota Verso Toyota Yaris
## [257] Vauxhall Adam Vauxhall Agila
## [259] Vauxhall Ampera Vauxhall Antara
## [261] Vauxhall Astra Vauxhall Astra GTC
## [263] Vauxhall Astra VXR Vauxhall Cascada
## [265] Vauxhall Corsa Vauxhall Corsa VXR
## [267] Vauxhall Insignia Vauxhall Insignia Sports Tourer
## [269] Vauxhall Meriva Vauxhall Mokka
## [271] Vauxhall VXR8 Vauxhall Zafira
## [273] Vauxhall Zafira Tourer Volkswagen Beetle
## [275] Volkswagen CC Volkswagen Eos
## [277] Volkswagen Golf Volkswagen Golf Plus
## [279] Volkswagen Jetta Volkswagen Passat
## [281] Volkswagen Phaeton Volkswagen Polo
## [283] Volkswagen Scirocco Volkswagen Sharan
## [285] Volkswagen Tiguan Volkswagen Touareg
## [287] Volkswagen Touran Volkswagen Up
## [289] Volvo S60 Volvo S80
## [291] Volvo V40 Volvo V60
## [293] Volvo V70 Volvo XC60
## [295] Volvo XC70 Volvo XC90
##
## The data contained 1 columns with zero or tiny median absolute deviation.
## Their column names are:
##
## [1] Cylinders
##
## These columns will be ignored in the analysis.
## We continue with the remaining 11 columns:
##
## [1] Price Displacement BHP Torque Acceleration
## [6] TopSpeed MPG Weight Length Width
## [11] Height
##
## The final data set we will analyze has 296 rows and 11 columns.
##
## The computation started at: Tue Dec 06 16:58:54 2016
## The computation ended at: Tue Dec 06 16:58:54 2016
##
# the remaining part of the dataset:
remX = DDCtransTG$remX
dim(remX)
## [1] 296 11
colSums(is.na(remX))
## Price Displacement BHP Torque Acceleration
## 0 8 3 3 0
## TopSpeed MPG Weight Length Width
## 3 11 32 10 15
## Height
## 10
# We have NAs left, mainly in `Weight'.
n = nrow(remX)
d = ncol(remX)
indexDDCcells = matrix(FALSE,nrow=n,ncol=d)
indexDDCcells[DDCtransTG$indcells] = TRUE
dim(indexDDCcells)
## [1] 296 11
# Analyze the data by column:
standX = scale(remX,apply(remX,2,median,na.rm = TRUE),
apply(remX,2,mad,na.rm = TRUE))
dim(standX)
## [1] 296 11
round(standX[1:5,],1) # has NAs where remX does
## Price Displacement BHP Torque Acceleration
## Alfa Romeo Giulietta -0.4 -0.4 -0.7 0.0 0.6
## Alfa Romeo MiTo -0.9 -0.7 -0.7 -1.6 0.4
## Aston Martin Cygnet 0.3 -0.8 -0.8 -1.7 0.7
## Aston Martin DB9 2.7 2.1 1.9 1.2 -1.2
## Aston Martin DB9 Volante 2.8 2.1 1.9 1.2 -1.2
## TopSpeed MPG Weight Length Width Height
## Alfa Romeo Giulietta -0.5 1.0 -0.3 -0.3 -0.2 -0.2
## Alfa Romeo MiTo -0.4 0.1 -1.0 -0.9 -1.1 -0.3
## Aston Martin Cygnet -0.9 0.6 -1.3 -3.1 -1.5 0.1
## Aston Martin DB9 2.0 -1.7 0.8 0.6 NA -1.6
## Aston Martin DB9 Volante 2.0 -1.7 1.0 0.6 NA -1.6
transTGcol = remX
transTGcol[abs(standX) > sqrt(qchisq(0.99,1))] = NA
round(transTGcol[1:5,],1) # also has NAs in outlying cells
## Price Displacement BHP Torque Acceleration
## Alfa Romeo Giulietta 10.0 7.4 4.7 5.5 11.3
## Alfa Romeo MiTo 9.6 7.2 4.7 4.6 10.7
## Aston Martin Cygnet 10.3 7.2 4.6 4.5 11.8
## Aston Martin DB9 NA 8.7 6.2 6.1 4.6
## Aston Martin DB9 Volante NA 8.7 6.2 6.1 4.6
## TopSpeed MPG Weight Length Width Height
## Alfa Romeo Giulietta 4.7 64 1385 4351 1798 1465
## Alfa Romeo MiTo 4.8 49 1090 4063 1720 1446
## Aston Martin Cygnet 4.7 56 988 NA 1680 1500
## Aston Martin DB9 5.2 19 1785 4720 NA 1282
## Aston Martin DB9 Volante 5.2 19 1890 4720 NA 1282
# Make untransformed submatrix of X for labeling the cells in the plot:
tempX = myTopGear[DDCtransTG$rowInAnalysis,DDCtransTG$colInAnalysis]
dim(tempX)
## [1] 296 11
tempX$Price = tempX$Price/1000 # to avoid printing long numbers
xlabels = colnames(tempX)
ylabels = rownames(tempX)
# select the following cars for the cellmap:
yshowindex = c(12,42,56,73,81,94,99,135,150,164,176,198,209,215,234,241,277)
# make two ggplot2 objects:
ggpcol = cellMap(D=tempX,
R=standX,
indcells=which(is.na(transTGcol)),
indrows=integer(0),
xlabels=xlabels,
ylabels=ylabels,
mTitle="By column",
yshowindex=yshowindex,
showVals=TRUE,
hjustYlabels=0.5)
plot(ggpcol)
ggpDDC = cellMap(D=tempX,
R=DDCtransTG$stdResid,
indcells=which(indexDDCcells==TRUE),
indrows=DDCtransTG$indrows,
xlabels=xlabels,
ylabels=ylabels,
mTitle="DetectDeviatingCells",
yshowindex=yshowindex,
showVals=TRUE,
hjustYlabels=0.5)
plot(ggpDDC)
# Creating the pdf:
pdfName = "cellmap_TopGear.pdf"
pdf(pdfName, width = 20, height = 15 )
grid.arrange(ggpcol,ggpDDC,nrow=1) # arranges ggplot2,.. on a page
dev.off()
## png
## 2
We now illustrate DDC on the philips dataset.
data(philips)
dim(philips)
## [1] 677 9
colnames(philips) = c("X1","X2","X3","X4","X5","X6","X7","X8","X9")
library(robustbase) # for covMcd
MCDphilips = covMcd(philips)
pdfName = "figure_philips_left.pdf"
pdf(pdfName, width = 10, height = 4)
plot(sqrt(mahalanobis(philips,MCDphilips$center,MCDphilips$cov)),
main="Philips data",ylab="Robust distances",xlab="",pch=20)
abline(h=sqrt(qchisq(0.975,df=9)))
dev.off()
## png
## 2
DDCphilips = DetectDeviatingCells(philips,DDCpars)
##
## The input data has 677 rows and 9 columns.
## The computation started at: Tue Dec 06 16:58:55 2016
## The computation ended at: Tue Dec 06 16:58:56 2016
##
summary(DDCphilips)
## Length Class Mode
## colInAnalysis 9 -none- numeric
## rowInAnalysis 677 -none- numeric
## namesNotNumeric 0 -none- NULL
## namesCaseNumber 0 -none- NULL
## namesNAcol 0 -none- NULL
## namesNArow 0 -none- NULL
## namesDiscrete 0 -none- NULL
## namesZeroScale 0 -none- NULL
## remX 6093 -none- numeric
## Z 6093 -none- numeric
## k 1 -none- numeric
## ngbrs 81 -none- numeric
## robcors 81 -none- numeric
## robslopes 81 -none- numeric
## Xest 6093 -none- numeric
## stdResid 6093 -none- numeric
## indcells 155 -none- numeric
## Ti 677 -none- numeric
## indrows 5 -none- numeric
## indall 195 -none- numeric
## Ximp 6093 -none- numeric
qqnorm(as.vector(DDCphilips$Z))
# rather gaussian, here we only see 2 outliers
round(DDCphilips$stdResid[1:12,],1) # the standardized residuals:
## X1 X2 X3 X4 X5 X6 X7 X8 X9
## 1 -0.9 2.2 0.0 1.2 0.8 -2.9 2.1 0.1 -1.4
## 2 -0.5 2.0 0.0 3.9 0.8 -1.1 3.5 -0.4 -1.7
## 3 -1.0 2.3 -0.4 0.7 0.6 -2.0 1.4 0.0 -1.7
## 4 -0.6 2.3 -0.1 3.8 1.1 -0.9 3.4 -0.5 -2.0
## 5 -0.6 2.8 -0.2 3.4 0.9 -0.9 3.4 0.2 -1.9
## 6 -0.7 2.6 0.0 4.4 1.0 -1.1 4.2 -0.1 -2.0
## 7 1.0 -0.1 -1.0 -0.3 -1.2 -1.0 1.4 0.8 -0.6
## 8 -0.4 -1.4 -0.7 1.8 -0.7 -2.8 1.9 -0.3 -1.7
## 9 1.0 -0.2 0.0 -0.1 -1.2 -2.1 1.6 1.0 -0.9
## 10 0.2 -1.3 -0.1 1.0 -0.5 -2.9 2.0 -0.1 -1.5
## 11 -0.1 -0.3 -0.1 0.2 -1.1 -1.5 1.9 0.8 -0.8
## 12 -0.1 -0.4 -1.5 2.5 0.0 -3.4 2.3 0.3 -2.5
DDCphilips$indcells # indices of the cells that were flagged:
## [1] 55 70 95 104 116 144 151 156 161 170 174 175 324 468
## [15] 682 683 712 747 750 792 985 986 987 988 989 991 992 1006
## [29] 1007 1008 1010 1011 1196 1199 1450 1452 1455 1474 1529 1787 2033 2035
## [43] 2036 2037 2045 2047 2094 2101 2114 2116 2126 2135 2267 2312 2355 2365
## [57] 2464 2499 2592 2743 2775 2777 3120 3216 3226 3227 3228 3248 3386 3393
## [71] 3395 3397 3399 3401 3412 3413 3414 3415 3416 3417 3423 3424 3425 3426
## [85] 3427 3428 3430 3431 3437 3445 3456 3457 3460 3464 3465 3466 3468 3469
## [99] 3470 3990 4064 4066 4067 4068 4076 4078 4088 4145 4147 4792 4794 5036
## [113] 5037 5168 5169 5170 5171 5172 5174 5175 5176 5230 5238 5241 5246 5248
## [127] 5250 5253 5256 5258 5259 5260 5262 5263 5265 5267 5268 5270 5510 5511
## [141] 5512 5513 5514 5515 5516 5517 5518 5519 5520 5569 5869 5873 5882 5890
## [155] 5902
DDCphilips$indrows # flagged rows:
## [1] 26 56 84 334 491
# cellmaps with rectangular blocks:
xblocksize = 1
yblocksize = 15 # 20
xlabels = colnames(philips)
ylabels = 1:dim(philips)[1] #rownames(philips)
n = nrow(philips)
d = ncol(philips)
indrows=which(mahalanobis(philips,MCDphilips$center,MCDphilips$cov) >
qchisq(0.975,df=d))
ggpMCDphilips = cellMap(D=philips,R=matrix(0,n,d),
indcells=integer(0),
indrows=indrows,
xlabels=xlabels,
ylabels=ylabels,
mTitle="MCD",
xblocksize=xblocksize,
yblocksize=yblocksize,
autolabel=T)
plot(ggpMCDphilips)
ggpDDCphilips = cellMap(D=philips, R=DDCphilips$stdResid,
indcells=DDCphilips$indcells,
indrows=integer(0), #DDCphilips$indrows,
xlabels=xlabels,
ylabels=ylabels,
mTitle="DetectDeviatingCells",
xblocksize=xblocksize,
yblocksize=yblocksize,
autolabel=T,
hjustYlabels=1)
plot(ggpDDCphilips)
pdfName = "figure_philips_right.pdf"
pdf(pdfName, width = 6, height = 12)
plot(ggpDDCphilips)
dev.off()
## png
## 2
Let's now look at the Mortality dataset.
data(mortality)
dim(mortality)
## [1] 198 91
# 198 91
rownames(mortality)
## [1] "1816" "1817" "1818" "1819" "1820" "1821" "1822" "1823" "1824" "1825"
## [11] "1826" "1827" "1828" "1829" "1830" "1831" "1832" "1833" "1834" "1835"
## [21] "1836" "1837" "1838" "1839" "1840" "1841" "1842" "1843" "1844" "1845"
## [31] "1846" "1847" "1848" "1849" "1850" "1851" "1852" "1853" "1854" "1855"
## [41] "1856" "1857" "1858" "1859" "1860" "1861" "1862" "1863" "1864" "1865"
## [51] "1866" "1867" "1868" "1869" "1870" "1871" "1872" "1873" "1874" "1875"
## [61] "1876" "1877" "1878" "1879" "1880" "1881" "1882" "1883" "1884" "1885"
## [71] "1886" "1887" "1888" "1889" "1890" "1891" "1892" "1893" "1894" "1895"
## [81] "1896" "1897" "1898" "1899" "1900" "1901" "1902" "1903" "1904" "1905"
## [91] "1906" "1907" "1908" "1909" "1910" "1911" "1912" "1913" "1914" "1915"
## [101] "1916" "1917" "1918" "1919" "1920" "1921" "1922" "1923" "1924" "1925"
## [111] "1926" "1927" "1928" "1929" "1930" "1931" "1932" "1933" "1934" "1935"
## [121] "1936" "1937" "1938" "1939" "1940" "1941" "1942" "1943" "1944" "1945"
## [131] "1946" "1947" "1948" "1949" "1950" "1951" "1952" "1953" "1954" "1955"
## [141] "1956" "1957" "1958" "1959" "1960" "1961" "1962" "1963" "1964" "1965"
## [151] "1966" "1967" "1968" "1969" "1970" "1971" "1972" "1973" "1974" "1975"
## [161] "1976" "1977" "1978" "1979" "1980" "1981" "1982" "1983" "1984" "1985"
## [171] "1986" "1987" "1988" "1989" "1990" "1991" "1992" "1993" "1994" "1995"
## [181] "1996" "1997" "1998" "1999" "2000" "2001" "2002" "2003" "2004" "2005"
## [191] "2006" "2007" "2008" "2009" "2010" "2011" "2012" "2013"
colnames(mortality)
## [1] "0" "1" "2" "3" "4" "5" "6" "7" "8" "9" "10" "11" "12" "13"
## [15] "14" "15" "16" "17" "18" "19" "20" "21" "22" "23" "24" "25" "26" "27"
## [29] "28" "29" "30" "31" "32" "33" "34" "35" "36" "37" "38" "39" "40" "41"
## [43] "42" "43" "44" "45" "46" "47" "48" "49" "50" "51" "52" "53" "54" "55"
## [57] "56" "57" "58" "59" "60" "61" "62" "63" "64" "65" "66" "67" "68" "69"
## [71] "70" "71" "72" "73" "74" "75" "76" "77" "78" "79" "80" "81" "82" "83"
## [85] "84" "85" "86" "87" "88" "89" "90"
DDCmortality = DetectDeviatingCells(mortality,DDCpars) # 3 seconds
##
## The input data has 198 rows and 91 columns.
## The computation started at: Tue Dec 06 16:58:57 2016
## The computation ended at: Tue Dec 06 16:59:01 2016
##
remX = DDCmortality$remX
dim(remX)
## [1] 198 91
library(rrcov) # for robust PCA
PCAmortality = PcaHubert(mortality,alpha=0.75,scale=FALSE)
xblocksize = 5
yblocksize = 5
xlabels = colnames(remX) # the _columns_ are in the x-direction
ylabels = rownames(remX)
n = nrow(remX)
d = ncol(remX)
ggpROBPCA = cellMap(D=remX,R=matrix(0,n,d),
indcells=integer(0),
indrows=which(PCAmortality@flag==FALSE),
xlabels=xlabels,
ylabels=ylabels,
mTitle="By row",
xblocksize=xblocksize,
yblocksize=yblocksize,
autolabel=T)
plot(ggpROBPCA)
ggpDDC = cellMap(D=remX, R=DDCmortality$stdResid,
indcells=DDCmortality$indcells,
indrows=DDCmortality$indrows,
xlabels=xlabels,
ylabels=ylabels,
mTitle="DetectDeviatingCells",
xblocksize=xblocksize,
yblocksize=yblocksize,
autolabel=T)
plot(ggpDDC)
pdf(paste("cellmap_mortality_",xblocksize,".pdf",sep=""),width=14,height=12)
grid.arrange(ggpROBPCA,ggpDDC,nrow=1)
dev.off()
## png
## 2
Let's now look at the glass dataset.
data(glass)
DDCglass = DetectDeviatingCells(glass,DDCpars) # takes 1 or 2 minutes
##
## The input data has 180 rows and 750 columns.
##
## The data contained 11 discrete columns with 3 or fewer values.
## Their column names are:
##
## [1] V1 V2 V3 V4 V5 V6 V7 V8 V9 V10 V11
##
## These columns will be ignored in the analysis.
## We continue with the remaining 739 columns:
##
## [1] V12 V13 V14 V15 V16 V17 V18 V19 V20 V21 V22 V23 V24 V25
## [15] V26 V27 V28 V29 V30 V31 V32 V33 V34 V35 V36 V37 V38 V39
## [29] V40 V41 V42 V43 V44 V45 V46 V47 V48 V49 V50 V51 V52 V53
## [43] V54 V55 V56 V57 V58 V59 V60 V61 V62 V63 V64 V65 V66 V67
## [57] V68 V69 V70 V71 V72 V73 V74 V75 V76 V77 V78 V79 V80 V81
## [71] V82 V83 V84 V85 V86 V87 V88 V89 V90 V91 V92 V93 V94 V95
## [85] V96 V97 V98 V99 V100 V101 V102 V103 V104 V105 V106 V107 V108 V109
## [99] V110 V111 V112 V113 V114 V115 V116 V117 V118 V119 V120 V121 V122 V123
## [113] V124 V125 V126 V127 V128 V129 V130 V131 V132 V133 V134 V135 V136 V137
## [127] V138 V139 V140 V141 V142 V143 V144 V145 V146 V147 V148 V149 V150 V151
## [141] V152 V153 V154 V155 V156 V157 V158 V159 V160 V161 V162 V163 V164 V165
## [155] V166 V167 V168 V169 V170 V171 V172 V173 V174 V175 V176 V177 V178 V179
## [169] V180 V181 V182 V183 V184 V185 V186 V187 V188 V189 V190 V191 V192 V193
## [183] V194 V195 V196 V197 V198 V199 V200 V201 V202 V203 V204 V205 V206 V207
## [197] V208 V209 V210 V211 V212 V213 V214 V215 V216 V217 V218 V219 V220 V221
## [211] V222 V223 V224 V225 V226 V227 V228 V229 V230 V231 V232 V233 V234 V235
## [225] V236 V237 V238 V239 V240 V241 V242 V243 V244 V245 V246 V247 V248 V249
## [239] V250 V251 V252 V253 V254 V255 V256 V257 V258 V259 V260 V261 V262 V263
## [253] V264 V265 V266 V267 V268 V269 V270 V271 V272 V273 V274 V275 V276 V277
## [267] V278 V279 V280 V281 V282 V283 V284 V285 V286 V287 V288 V289 V290 V291
## [281] V292 V293 V294 V295 V296 V297 V298 V299 V300 V301 V302 V303 V304 V305
## [295] V306 V307 V308 V309 V310 V311 V312 V313 V314 V315 V316 V317 V318 V319
## [309] V320 V321 V322 V323 V324 V325 V326 V327 V328 V329 V330 V331 V332 V333
## [323] V334 V335 V336 V337 V338 V339 V340 V341 V342 V343 V344 V345 V346 V347
## [337] V348 V349 V350 V351 V352 V353 V354 V355 V356 V357 V358 V359 V360 V361
## [351] V362 V363 V364 V365 V366 V367 V368 V369 V370 V371 V372 V373 V374 V375
## [365] V376 V377 V378 V379 V380 V381 V382 V383 V384 V385 V386 V387 V388 V389
## [379] V390 V391 V392 V393 V394 V395 V396 V397 V398 V399 V400 V401 V402 V403
## [393] V404 V405 V406 V407 V408 V409 V410 V411 V412 V413 V414 V415 V416 V417
## [407] V418 V419 V420 V421 V422 V423 V424 V425 V426 V427 V428 V429 V430 V431
## [421] V432 V433 V434 V435 V436 V437 V438 V439 V440 V441 V442 V443 V444 V445
## [435] V446 V447 V448 V449 V450 V451 V452 V453 V454 V455 V456 V457 V458 V459
## [449] V460 V461 V462 V463 V464 V465 V466 V467 V468 V469 V470 V471 V472 V473
## [463] V474 V475 V476 V477 V478 V479 V480 V481 V482 V483 V484 V485 V486 V487
## [477] V488 V489 V490 V491 V492 V493 V494 V495 V496 V497 V498 V499 V500 V501
## [491] V502 V503 V504 V505 V506 V507 V508 V509 V510 V511 V512 V513 V514 V515
## [505] V516 V517 V518 V519 V520 V521 V522 V523 V524 V525 V526 V527 V528 V529
## [519] V530 V531 V532 V533 V534 V535 V536 V537 V538 V539 V540 V541 V542 V543
## [533] V544 V545 V546 V547 V548 V549 V550 V551 V552 V553 V554 V555 V556 V557
## [547] V558 V559 V560 V561 V562 V563 V564 V565 V566 V567 V568 V569 V570 V571
## [561] V572 V573 V574 V575 V576 V577 V578 V579 V580 V581 V582 V583 V584 V585
## [575] V586 V587 V588 V589 V590 V591 V592 V593 V594 V595 V596 V597 V598 V599
## [589] V600 V601 V602 V603 V604 V605 V606 V607 V608 V609 V610 V611 V612 V613
## [603] V614 V615 V616 V617 V618 V619 V620 V621 V622 V623 V624 V625 V626 V627
## [617] V628 V629 V630 V631 V632 V633 V634 V635 V636 V637 V638 V639 V640 V641
## [631] V642 V643 V644 V645 V646 V647 V648 V649 V650 V651 V652 V653 V654 V655
## [645] V656 V657 V658 V659 V660 V661 V662 V663 V664 V665 V666 V667 V668 V669
## [659] V670 V671 V672 V673 V674 V675 V676 V677 V678 V679 V680 V681 V682 V683
## [673] V684 V685 V686 V687 V688 V689 V690 V691 V692 V693 V694 V695 V696 V697
## [687] V698 V699 V700 V701 V702 V703 V704 V705 V706 V707 V708 V709 V710 V711
## [701] V712 V713 V714 V715 V716 V717 V718 V719 V720 V721 V722 V723 V724 V725
## [715] V726 V727 V728 V729 V730 V731 V732 V733 V734 V735 V736 V737 V738 V739
## [729] V740 V741 V742 V743 V744 V745 V746 V747 V748 V749 V750
##
## The data contained 2 columns with zero or tiny median absolute deviation.
## Their column names are:
##
## [1] V12 V13
##
## These columns will be ignored in the analysis.
## We continue with the remaining 737 columns:
##
## [1] V14 V15 V16 V17 V18 V19 V20 V21 V22 V23 V24 V25 V26 V27
## [15] V28 V29 V30 V31 V32 V33 V34 V35 V36 V37 V38 V39 V40 V41
## [29] V42 V43 V44 V45 V46 V47 V48 V49 V50 V51 V52 V53 V54 V55
## [43] V56 V57 V58 V59 V60 V61 V62 V63 V64 V65 V66 V67 V68 V69
## [57] V70 V71 V72 V73 V74 V75 V76 V77 V78 V79 V80 V81 V82 V83
## [71] V84 V85 V86 V87 V88 V89 V90 V91 V92 V93 V94 V95 V96 V97
## [85] V98 V99 V100 V101 V102 V103 V104 V105 V106 V107 V108 V109 V110 V111
## [99] V112 V113 V114 V115 V116 V117 V118 V119 V120 V121 V122 V123 V124 V125
## [113] V126 V127 V128 V129 V130 V131 V132 V133 V134 V135 V136 V137 V138 V139
## [127] V140 V141 V142 V143 V144 V145 V146 V147 V148 V149 V150 V151 V152 V153
## [141] V154 V155 V156 V157 V158 V159 V160 V161 V162 V163 V164 V165 V166 V167
## [155] V168 V169 V170 V171 V172 V173 V174 V175 V176 V177 V178 V179 V180 V181
## [169] V182 V183 V184 V185 V186 V187 V188 V189 V190 V191 V192 V193 V194 V195
## [183] V196 V197 V198 V199 V200 V201 V202 V203 V204 V205 V206 V207 V208 V209
## [197] V210 V211 V212 V213 V214 V215 V216 V217 V218 V219 V220 V221 V222 V223
## [211] V224 V225 V226 V227 V228 V229 V230 V231 V232 V233 V234 V235 V236 V237
## [225] V238 V239 V240 V241 V242 V243 V244 V245 V246 V247 V248 V249 V250 V251
## [239] V252 V253 V254 V255 V256 V257 V258 V259 V260 V261 V262 V263 V264 V265
## [253] V266 V267 V268 V269 V270 V271 V272 V273 V274 V275 V276 V277 V278 V279
## [267] V280 V281 V282 V283 V284 V285 V286 V287 V288 V289 V290 V291 V292 V293
## [281] V294 V295 V296 V297 V298 V299 V300 V301 V302 V303 V304 V305 V306 V307
## [295] V308 V309 V310 V311 V312 V313 V314 V315 V316 V317 V318 V319 V320 V321
## [309] V322 V323 V324 V325 V326 V327 V328 V329 V330 V331 V332 V333 V334 V335
## [323] V336 V337 V338 V339 V340 V341 V342 V343 V344 V345 V346 V347 V348 V349
## [337] V350 V351 V352 V353 V354 V355 V356 V357 V358 V359 V360 V361 V362 V363
## [351] V364 V365 V366 V367 V368 V369 V370 V371 V372 V373 V374 V375 V376 V377
## [365] V378 V379 V380 V381 V382 V383 V384 V385 V386 V387 V388 V389 V390 V391
## [379] V392 V393 V394 V395 V396 V397 V398 V399 V400 V401 V402 V403 V404 V405
## [393] V406 V407 V408 V409 V410 V411 V412 V413 V414 V415 V416 V417 V418 V419
## [407] V420 V421 V422 V423 V424 V425 V426 V427 V428 V429 V430 V431 V432 V433
## [421] V434 V435 V436 V437 V438 V439 V440 V441 V442 V443 V444 V445 V446 V447
## [435] V448 V449 V450 V451 V452 V453 V454 V455 V456 V457 V458 V459 V460 V461
## [449] V462 V463 V464 V465 V466 V467 V468 V469 V470 V471 V472 V473 V474 V475
## [463] V476 V477 V478 V479 V480 V481 V482 V483 V484 V485 V486 V487 V488 V489
## [477] V490 V491 V492 V493 V494 V495 V496 V497 V498 V499 V500 V501 V502 V503
## [491] V504 V505 V506 V507 V508 V509 V510 V511 V512 V513 V514 V515 V516 V517
## [505] V518 V519 V520 V521 V522 V523 V524 V525 V526 V527 V528 V529 V530 V531
## [519] V532 V533 V534 V535 V536 V537 V538 V539 V540 V541 V542 V543 V544 V545
## [533] V546 V547 V548 V549 V550 V551 V552 V553 V554 V555 V556 V557 V558 V559
## [547] V560 V561 V562 V563 V564 V565 V566 V567 V568 V569 V570 V571 V572 V573
## [561] V574 V575 V576 V577 V578 V579 V580 V581 V582 V583 V584 V585 V586 V587
## [575] V588 V589 V590 V591 V592 V593 V594 V595 V596 V597 V598 V599 V600 V601
## [589] V602 V603 V604 V605 V606 V607 V608 V609 V610 V611 V612 V613 V614 V615
## [603] V616 V617 V618 V619 V620 V621 V622 V623 V624 V625 V626 V627 V628 V629
## [617] V630 V631 V632 V633 V634 V635 V636 V637 V638 V639 V640 V641 V642 V643
## [631] V644 V645 V646 V647 V648 V649 V650 V651 V652 V653 V654 V655 V656 V657
## [645] V658 V659 V660 V661 V662 V663 V664 V665 V666 V667 V668 V669 V670 V671
## [659] V672 V673 V674 V675 V676 V677 V678 V679 V680 V681 V682 V683 V684 V685
## [673] V686 V687 V688 V689 V690 V691 V692 V693 V694 V695 V696 V697 V698 V699
## [687] V700 V701 V702 V703 V704 V705 V706 V707 V708 V709 V710 V711 V712 V713
## [701] V714 V715 V716 V717 V718 V719 V720 V721 V722 V723 V724 V725 V726 V727
## [715] V728 V729 V730 V731 V732 V733 V734 V735 V736 V737 V738 V739 V740 V741
## [729] V742 V743 V744 V745 V746 V747 V748 V749 V750
##
## The final data set we will analyze has 180 rows and 737 columns.
##
## The computation started at: Tue Dec 06 16:59:03 2016
## The computation ended at: Tue Dec 06 17:01:51 2016
##
remX = DDCglass$remX
dim(remX)
## [1] 180 737
library(rrcov) # for robust PCA:
PCAglass = PcaHubert(glass,alpha=0.75,scale=FALSE)
xblocksize = 5
yblocksize = 5
n = nrow(remX)
d = ncol(remX)
xlabels = rep("",floor(d/xblocksize));
xlabels[1] = "1";
xlabels[floor(d/xblocksize)] = "d"
xlabels
## [1] "1" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" ""
## [18] "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" ""
## [35] "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" ""
## [52] "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" ""
## [69] "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" ""
## [86] "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" ""
## [103] "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" ""
## [120] "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" ""
## [137] "" "" "" "" "" "" "" "" "" "" "d"
ylabels = rep("",floor(n/yblocksize));
ylabels[1] = "1"
ylabels[floor(n/yblocksize)] = "n";
ylabels
## [1] "1" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" ""
## [18] "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" ""
## [35] "" "n"
xtitle = "wavelengths"
ytitle = "glass samples"
ggpROBPCA = cellMap(D=remX, R=matrix(0,n,d),
indcells=integer(0),
indrows=which(PCAglass@flag==FALSE),
xlabels=xlabels,
ylabels=ylabels,
mTitle="By row",
xblocksize=xblocksize,
yblocksize=yblocksize,
anglex=0,
xtitle=xtitle,
ytitle=ytitle,
sizexy=1.5,
autolabel=F)
plot(ggpROBPCA)
ggpDDC = cellMap(D=remX, R=DDCglass$stdResid,
indcells=DDCglass$indcells,
indrows=integer(0), # DDCglass$indrows,
xlabels=xlabels,
ylabels=ylabels,
mTitle="DetectDeviatingCells",
xblocksize=xblocksize,
yblocksize=yblocksize,
anglex=0,
xtitle=xtitle,
ytitle=ytitle,
sizexy=1.5,
autolabel=F)
plot(ggpDDC)
pdf(paste("cellmap_glass2_",xblocksize,".pdf",sep=""),width=16,height=10)
grid.arrange(ggpROBPCA,ggpDDC,ncol=1)
dev.off()
## png
## 2
In this section we test the selection of columns and rows by DDC.
i = c(1,2,3,4,5,6,7,8,9)
name = c("aa","bb","cc","dd","ee","ff","gg","hh","ii")
logic = c(TRUE, FALSE, TRUE, FALSE, FALSE, TRUE, TRUE, TRUE, FALSE)
V1 = c(1.3,NaN,4.5,2.7,-3.3,4.4,-2.1,1.1,-5)
V2 = c(2.3,NA,5,6,7,8,4,3,0.5)
V3 = c(2,Inf,3,-4,5,6,7,-2,8)
Vna = c(1,-4,2,NaN,3,-Inf,NA,6,5)
Vdis = c(1,1,2,2,3,3,3,1,2)
V0s = c(1,1.5,2,2,2,2,2,3,2.5)
datafr = data.frame(i,name,logic,V1,V2,V3,Vna,Vdis,V0s)
datafr
## i name logic V1 V2 V3 Vna Vdis V0s
## 1 1 aa TRUE 1.3 2.3 2 1 1 1.0
## 2 2 bb FALSE NaN NA Inf -4 1 1.5
## 3 3 cc TRUE 4.5 5.0 3 2 2 2.0
## 4 4 dd FALSE 2.7 6.0 -4 NaN 2 2.0
## 5 5 ee FALSE -3.3 7.0 5 3 3 2.0
## 6 6 ff TRUE 4.4 8.0 6 -Inf 3 2.0
## 7 7 gg TRUE -2.1 4.0 7 NA 3 2.0
## 8 8 hh TRUE 1.1 3.0 -2 6 1 3.0
## 9 9 ii FALSE -5.0 0.5 8 5 2 2.5
DDCdatafr=DetectDeviatingCells(datafr,DDCpars)
##
## The input data has 9 rows and 9 columns.
##
## The input data contained 2 non-numeric columns (variables).
## Their column names are:
##
## [1] name logic
##
## These columns will be ignored in the analysis.
## We continue with the remaining 7 numeric columns:
##
## [1] i V1 V2 V3 Vna Vdis V0s
##
## The data contained 1 columns that were identical to the case number
## (number of the row).
## Their column names are:
##
## [1] i
##
## These columns will be ignored in the analysis.
## We continue with the remaining 6 columns:
##
## [1] V1 V2 V3 Vna Vdis V0s
##
## The data contained 1 discrete columns with 3 or fewer values.
## Their column names are:
##
## [1] Vdis
##
## These columns will be ignored in the analysis.
## We continue with the remaining 5 columns:
##
## [1] V1 V2 V3 Vna V0s
##
## The data contained 1 columns with zero or tiny median absolute deviation.
## Their column names are:
##
## [1] V0s
##
## These columns will be ignored in the analysis.
## We continue with the remaining 4 columns:
##
## [1] V1 V2 V3 Vna
##
## The final data set we will analyze has 9 rows and 4 columns.
##
## The computation started at: Tue Dec 06 17:01:54 2016
## The computation ended at: Tue Dec 06 17:01:54 2016
##
DDCdatafr
## $colInAnalysis
## V1 V2 V3 Vna
## 4 5 6 7
##
## $rowInAnalysis
## 1 2 3 4 5 6 7 8 9
## 1 2 3 4 5 6 7 8 9
##
## $namesNotNumeric
## [1] "name" "logic"
##
## $namesCaseNumber
## [1] "i"
##
## $namesNAcol
## NULL
##
## $namesNArow
## NULL
##
## $namesDiscrete
## [1] "Vdis"
##
## $namesZeroScale
## [1] "V0s"
##
## $remX
## V1 V2 V3 Vna
## 1 1.3 2.3 2 1
## 2 NA NA NA -4
## 3 4.5 5.0 3 2
## 4 2.7 6.0 -4 NA
## 5 -3.3 7.0 5 3
## 6 4.4 8.0 6 NA
## 7 -2.1 4.0 7 NA
## 8 1.1 3.0 -2 6
## 9 -5.0 0.5 8 5
##
## $Z
## V1 V2 V3 Vna
## 1 0.07084994 -0.8839003 -0.67459622 -0.67968325
## 2 NA NA NA -2.39453799
## 3 0.94150268 0.1692823 -0.42261131 -0.33671231
## 4 0.45176051 0.5593499 -2.18650568 NA
## 5 -1.18071339 0.9494175 0.08135851 0.00625864
## 6 0.91429478 1.3394851 0.33334342 NA
## 7 -0.85421861 -0.2207853 0.58532833 NA
## 8 0.01643414 -0.6108530 -1.68253586 1.03517148
## 9 -1.64324766 -1.5860220 0.83731324 0.69220053
##
## $k
## [1] 3
##
## $ngbrs
## [,1] [,2] [,3] [,4]
## V1 1 2 4 3
## V2 2 1 4 3
## V3 3 1 4 2
## Vna 4 1 2 3
##
## $robcors
## [,1] [,2] [,3] [,4]
## V1 1 0.4792831 -0.4585536 -0.39241907
## V2 1 0.4792831 -0.3774730 -0.07576421
## V3 1 -0.3924191 -0.1882915 -0.07576421
## Vna 1 -0.4585536 -0.3774730 -0.18829152
##
## $robslopes
## [,1] [,2] [,3] [,4]
## V1 1 0 0 0
## V2 1 0 0 0
## V3 1 0 0 0
## Vna 1 0 0 0
##
## $Xest
## V1 V2 V3 Vna
## 1 1.300000 2.300000 2.000000 1.000000
## 2 1.039598 4.566018 4.677129 -4.000000
## 3 4.500000 5.000000 3.000000 2.000000
## 4 2.700000 6.000000 -4.000000 2.981752
## 5 -3.300000 7.000000 5.000000 3.000000
## 6 4.400000 8.000000 6.000000 2.981752
## 7 -2.100000 4.000000 7.000000 2.981752
## 8 1.100000 3.000000 -2.000000 6.000000
## 9 -5.000000 0.500000 8.000000 5.000000
##
## $stdResid
## V1 V2 V3 Vna
## 1 0.07084994 -0.8839003 -0.67459622 -0.67968325
## 2 NA NA NA -2.39453799
## 3 0.94150268 0.1692823 -0.42261131 -0.33671231
## 4 0.45176051 0.5593499 -2.18650568 NA
## 5 -1.18071339 0.9494175 0.08135851 0.00625864
## 6 0.91429478 1.3394851 0.33334342 NA
## 7 -0.85421861 -0.2207853 0.58532833 NA
## 8 0.01643414 -0.6108530 -1.68253586 1.03517148
## 9 -1.64324766 -1.5860220 0.83731324 0.69220053
##
## $indcells
## numeric(0)
##
## $Ti
## [,1]
## 1 -0.5963351
## 2 2.7953056
## 3 -1.0545595
## 4 0.3713382
## 5 -0.8877966
## 6 0.3236981
## 7 -0.6744908
## 8 0.0000000
## 9 1.2312736
##
## $indrows
## [1] 2
##
## $indall
## [1] 2 11 20 29
##
## $Ximp
## V1 V2 V3 Vna
## 1 1.300000 2.300000 2.000000 1.000000
## 2 1.039598 4.566018 4.677129 -4.000000
## 3 4.500000 5.000000 3.000000 2.000000
## 4 2.700000 6.000000 -4.000000 2.981752
## 5 -3.300000 7.000000 5.000000 3.000000
## 6 4.400000 8.000000 6.000000 2.981752
## 7 -2.100000 4.000000 7.000000 2.981752
## 8 1.100000 3.000000 -2.000000 6.000000
## 9 -5.000000 0.500000 8.000000 5.000000