The LUCIDus package is aiming to provide researchers in the genetic epidemiology community with an integrative tool in R to obtain a joint estimation of latent or unknown clusters/subgroups with multi-omics data and phenotypic traits.
This package is an implementation for the novel statistical method proposed in the research paper “A Latent Unknown Clustering Integrating Multi-Omics Data (LUCID) with Phenotypic Traits1” published by the Bioinformatics. LUCID improves the subtype classification which leads to better diagnostics as well as prognostics and could be the potential solution for efficient targeted treatments and successful personalized medicine.
Four main functions, including est_lucid
, boot_lucid
, sem_lucid
, and tune_lucid
, are currently available for model fitting and feature selection. The model outputs can be summarized and visualized using summary_lucid
and plot_lucid
respectively. Predictions could be made with pred_lucid
.
Here are the descriptions of LUCIDus functions:
Function | Description |
---|---|
est_lucid() |
Estimates latent clusters using multi-omics data with/without the outcome of interest, and producing an IntClust object; missing values in biomarker data (Z) are allowed |
boot_lucid() |
Latent cluster estimation through bootstrapping for statistical inference |
sem_lucid() |
Latent cluster estimation with supplemented E-M algorithm for statistical inference |
tune_lucid() |
Grid search for tuning parameters using parallel computing to determine an optimal choice of three tuning parameters with minimum model BIC |
summary_lucid() |
Summarizes the results of integrative clustering based on an IntClust object |
plot_lucid() |
Produces a Sankey diagram for the results of integrative clustering based on an IntClust object |
pred_lucid() |
Predicts latent clusters and outcomes with an IntClust object and new data |
def_initial() |
Defines initial values of model parameters in est_lucid() , sem_lucid() , and tune_lucid() fitting |
def_tune() |
Defines selection options and tuning parameters in est_lucid() , sem_lucid() , and tune_lucid() fitting |
def_tol() |
Defines tolerance settings in est_lucid() , sem_lucid() , and tune_lucid() fitting |
For a simulated data with 10 genetic features (5 causal) and 4 biomarkers (2 causal)
est_lucid()
2set.seed(10)
IntClusFit <- est_lucid(G=G1,Z=Z1,Y=Y1,K=2,family="binary",Pred=TRUE)
#> [1] "1 : E-step finished"
#> # weights: 24 (11 variable)
#> initial value 1386.294361
#> iter 10 value 1263.208242
#> final value 1263.156529
#> converged
#> [1] "1 : Updated parameters"
#> [1] "2 : E-step finished"
#> # weights: 24 (11 variable)
#> initial value 1386.294361
#> iter 10 value 1263.208242
#> final value 1263.156529
#> converged
#> [1] "2 : Updated parameters"
#> [1] "2 : Converged!"
summary_lucid()
summary_lucid(IntClusFit)
#> $Beta
#> Int CG1 CG2 CG3 CG4 CG5 NG1
#> Cluster1 0.000000 0.0000000 0.000000 0.0000000 0.0000000 0.0000000 0.00000000
#> Cluster2 -0.400554 0.8744379 0.726266 0.7927368 0.6009161 0.7625038 -0.05833605
#> NG2 NG3 NG4 NG5
#> Cluster1 0.00000000 0.00000000 0.0000000 0.00000000
#> Cluster2 -0.08475132 0.01254265 -0.1114491 -0.09215688
#>
#> $Mu
#> CZ1 CZ2 NZ1 NZ2
#> Cluster1 -4.991211 -4.968509 -0.01887086 0.04801966
#> Cluster2 5.022779 5.074047 -0.03176532 0.02659571
#>
#> $OR_Gamma
#> Cluster1 Cluster2
#> 1.000000 2.294186
#>
#> $select_G
#> [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
#>
#> $select_Z
#> [1] TRUE TRUE TRUE TRUE
#>
#> $No0G
#> [1] 10
#>
#> $No0Z
#> [1] 4
#>
#> $BIC
#> [1] 31165.95
#>
#> $GIC1
#> [1] 31502.03
#>
#> $GIC2
#> [1] 31069.28
plot_lucid()
tune_lucid()
GridSearch <- tune_lucid(G=G1, Z=Z1, Y=Y1, K=2, Family="binary", USEY = TRUE, NoCores = 2,
LRho_g = 0.008, URho_g = 0.012, NoRho_g = 3,
LRho_z_invcov = 0.04, URho_z_invcov = 0.06, NoRho_z_invcov = 3,
LRho_z_covmu = 90, URho_z_covmu = 100, NoRho_z_covmu = 3)
GridSearch$Results
#> Rho_G Rho_Z_InvCov Rho_Z_CovMu Non0G Non0Z BIC GIC1 GIC2
#> 1 0.008 0.04 90 7 3 31096.86 31331.33 31029.41
#> 2 0.008 0.04 95 7 3 31097.99 31332.46 31030.54
#> 3 0.008 0.04 100 7 3 31099.18 31333.66 31031.74
#> 4 0.008 0.05 90 7 3 31100.14 31334.61 31032.70
#> 5 0.008 0.05 95 7 2 31040.24 31212.19 30990.78
#> 6 0.008 0.05 100 7 3 31102.01 31336.48 31034.56
#> 7 0.008 0.06 90 7 3 31103.95 31338.42 31036.50
#> 8 0.008 0.06 95 7 2 31044.06 31216.00 30994.60
#> 9 0.008 0.06 100 7 2 31045.02 31216.97 30995.56
#> 10 0.010 0.04 90 5 3 31084.02 31302.86 31021.07
#> 11 0.010 0.04 95 5 3 31085.15 31303.99 31022.20
#> 12 0.010 0.04 100 5 3 31086.34 31305.19 31023.39
#> 13 0.010 0.05 90 5 3 31087.30 31306.14 31024.35
#> 14 0.010 0.05 95 5 2 31027.40 31183.72 30982.44
#> 15 0.010 0.05 100 5 3 31089.17 31308.01 31026.22
#> 16 0.010 0.06 90 5 3 31091.11 31309.95 31028.16
#> 17 0.010 0.06 95 5 2 31031.22 31187.53 30986.25
#> 18 0.010 0.06 100 5 2 31032.18 31188.50 30987.22
#> 19 0.012 0.04 90 5 3 31086.41 31305.26 31023.46
#> 20 0.012 0.04 95 5 3 31087.55 31306.39 31024.60
#> 21 0.012 0.04 100 5 3 31088.74 31307.58 31025.79
#> 22 0.012 0.05 90 5 3 31089.70 31308.54 31026.75
#> 23 0.012 0.05 95 5 2 31029.80 31186.11 30984.83
#> 24 0.012 0.05 100 5 3 31091.56 31310.40 31028.61
#> 25 0.012 0.06 90 5 3 31093.50 31312.34 31030.55
#> 26 0.012 0.06 95 5 2 31033.61 31189.93 30988.65
#> 27 0.012 0.06 100 5 2 31034.58 31190.89 30989.62
GridSearch$Optimal
#> Rho_G Rho_Z_InvCov Rho_Z_CovMu Non0G Non0Z BIC GIC1 GIC2
#> 14 0.01 0.05 95 5 2 31027.4 31183.72 30982.44
set.seed(0.01*0.05*95)
IntClusFit <- est_lucid(G=G1,Z=Z1,Y=Y1,K=2,family="binary",Pred=TRUE,
tunepar = def_tune(Select_G=TRUE,Select_Z=TRUE,
Rho_G=0.01,Rho_Z_InvCov=0.05,Rho_Z_CovMu=95))
#> [1] "1 : E-step finished"
#> [1] "1 : Updated parameters"
#> [1] "2 : E-step finished"
#> [1] "2 : Updated parameters"
#> [1] "2 : Converged!"
summary_lucid(IntClusFit)$No0G; summary_lucid(IntClusFit)$No0Z
#> [1] 5
#> [1] 2
colnames(G1)[summary_lucid(IntClusFit)$select_G]; colnames(Z1)[summary_lucid(IntClusFit)$select_Z]
#> [1] "CG1" "CG2" "CG3" "CG4" "CG5"
#> [1] "CZ1" "CZ2"
if(!all(summary_lucid(IntClusFit)$select_G==FALSE)){
G_select <- G1[,summary_lucid(IntClusFit)$select_G]
}
if(!all(summary_lucid(IntClusFit)$select_Z==FALSE)){
Z_select <- Z1[,summary_lucid(IntClusFit)$select_Z]
}
set.seed(10)
IntClusFitFinal <- est_lucid(G=G_select,Z=Z_select,Y=Y1,K=2,family="binary",Pred=TRUE)
#> [1] "1 : E-step finished"
#> # weights: 14 (6 variable)
#> initial value 1386.294361
#> iter 10 value 1264.617679
#> final value 1264.403226
#> converged
#> [1] "1 : Updated parameters"
#> [1] "2 : E-step finished"
#> # weights: 14 (6 variable)
#> initial value 1386.294361
#> iter 10 value 1264.617679
#> final value 1264.403226
#> converged
#> [1] "2 : Updated parameters"
#> [1] "2 : Converged!"
# Re-run the model with covariates in the G->X path
set.seed(10)
IntClusCoFit <- est_lucid(G=G1,CoG=CoG,Z=Z1,Y=Y1,K=2,family="binary",Pred=TRUE)
#> [1] "1 : E-step finished"
#> # weights: 34 (16 variable)
#> initial value 1386.294361
#> iter 10 value 1261.582053
#> iter 20 value 1261.058904
#> final value 1261.057357
#> converged
#> [1] "1 : Updated parameters"
#> [1] "2 : E-step finished"
#> # weights: 34 (16 variable)
#> initial value 1386.294361
#> iter 10 value 1261.582053
#> iter 20 value 1261.058904
#> final value 1261.057357
#> converged
#> [1] "2 : Updated parameters"
#> [1] "2 : Converged!"
# Check important model outputs
summary_lucid(IntClusCoFit)
#> $Beta
#> Int CG1 CG2 CG3 CG4 CG5
#> Cluster1 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000
#> Cluster2 -0.4805745 0.8856772 0.7308921 0.7886249 0.6118021 0.7672202
#> NG1 NG2 NG3 NG4 NG5 GC1
#> Cluster1 0.00000000 0.00000000 0.000000000 0.0000000 0.00000000 0.0000000
#> Cluster2 -0.05390816 -0.08548088 0.004539503 -0.1166383 -0.08833329 -0.0159782
#> GC2 GC3 GC4 GC5
#> Cluster1 0.00000000 0.00000000 0.0000000 0.0000000
#> Cluster2 -0.01873078 -0.04047645 0.1680491 0.0891731
#>
#> $Mu
#> CZ1 CZ2 NZ1 NZ2
#> Cluster1 -4.991211 -4.968509 -0.01887086 0.04801966
#> Cluster2 5.022779 5.074047 -0.03176532 0.02659571
#>
#> $OR_Gamma
#> Cluster1 Cluster2
#> 1.000000 2.294186
#>
#> $select_G
#> [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
#>
#> $select_Z
#> [1] TRUE TRUE TRUE TRUE
#>
#> $No0G
#> [1] 15
#>
#> $No0Z
#> [1] 4
#>
#> $BIC
#> [1] 31199.76
#>
#> $GIC1
#> [1] 31574.92
#>
#> $GIC2
#> [1] 31121.58
# Visualize the results
plot_lucid(IntClusCoFit)
# Re-run feature selection with covariates in both paths
set.seed(10)
IntClusCoFit <- est_lucid(G=G1,CoG=CoG,Z=Z1,Y=Y1,CoY=CoY,K=2,family="binary",Pred=TRUE,
initial=def_initial(), itr_tol=def_tol(),
tunepar = def_tune(Select_G=TRUE,Select_Z=TRUE,
Rho_G=0.02,Rho_Z_InvCov=0.1,Rho_Z_CovMu=93))
#> [1] "1 : E-step finished"
#> [1] "1 : Updated parameters"
#> [1] "2 : E-step finished"
#> [1] "2 : Updated parameters"
#> [1] "2 : Converged!"
# Re-fit with selected features with covariates
set.seed(10)
IntClusCoFitFinal <- est_lucid(G=G_select,CoG=CoG,Z=Z_select,Y=Y1,CoY=CoY,K=2,
family="binary",Pred=TRUE)
#> [1] "1 : E-step finished"
#> # weights: 24 (11 variable)
#> initial value 1386.294361
#> iter 10 value 1262.827184
#> final value 1262.297910
#> converged
#> [1] "1 : Updated parameters"
#> [1] "2 : E-step finished"
#> # weights: 24 (11 variable)
#> initial value 1386.294361
#> iter 10 value 1262.827184
#> final value 1262.297910
#> converged
#> [1] "2 : Updated parameters"
#> [1] "2 : Converged!"
# Visualize the results
plot_lucid(IntClusCoFitFinal)
set.seed(10)
BootCIs <- boot_lucid(G = G1, CoG = CoG, Z = Z1, Y = Y1, CoY = CoY,
useY = TRUE, family = "binary", K = 2, R=50)
BootCIs$Results