knitr::opts_chunk$set(echo = TRUE, 
                      cache.lazy = FALSE,
                      dev = c("svglite", "pdf", "png"),
                      dpi = 300,
                      fig.path = 'figures/',
                      fig.keep = "high")

#added from: https://github.com/yihui/knitr-examples/blob/master/077-wrap-output.Rmd
library(knitr)
hook_output = knit_hooks$get('output')
knit_hooks$set(output = function(x, options) {
  # this hook is used only when the linewidth option is not NULL
  if (!is.null(n <- options$linewidth)) {
    x = knitr:::split_lines(x)
    # any lines wider than n should be wrapped
    if (any(nchar(x) > n)) x = strwrap(x, width = n)
    x = paste(x, collapse = '\n')
  }
  hook_output(x, options)
})


library(tidyverse) # A collection of packages for data science. More about it on 
                   # www.tidyverse.com
library(magrittr)  # A package that provides pipe operators like %>%
library(mgcv)      # A package written by Simon Wood that implements his 
                   # (spline based) GAM specification.
library(glue)      # A package that provides interpolated string functions.


#When you want to reproduce the examples, you have to run this code chunk in advance!!
powerSet <- function(set) {
  
    apply(X      = rep(list(c(T,F)), length = length(set)) %>% expand.grid(.),
          MARGIN = 1,
          FUN    = function(...) set[...])
}


accuracy <- 
  function(y, y_hat, filter = TRUE) mean((y == y_hat)[filter])

class_accuracy <-
  function(y, y_hat, classes = sort(unique(y))) 
    sapply(as.character(classes), function(c) accuracy(y, y_hat, filter = (y == as.numeric(c))))


train_data <- readRDS("data/train_set_with_sofa_predictions.rds")

test_data <- readRDS("data/test_set_with_sofa_predictions.rds")


with(train_data, accuracy(target, sofa_circ))
with(train_data, class_accuracy(target, sofa_circ))


xtabs(~ target + sofa_circ, data = train_data)

      sofa_circ
target      0      1      2      3      4
     0 279912    104      0      0      0
     1    460  19649    152      0      0
     2      1    145  14558     33      0
     3      0      0     47   7283      0
     4      0      0      0     54   1006


with(test_data, accuracy(target, sofa_circ))
with(test_data, class_accuracy(target, sofa_circ))


xtabs(~ target + sofa_circ, data = test_data)

      sofa_circ
target     0     1     2     3     4
     0 61473   232     0     0     0
     1   591  4017   171     0     0
     2     0   442  9882     0     0
     3     0     0   170  3242     0
     4     0     0     0   290   161


with(train_data, accuracy(target, sofa_nocirc))
with(train_data, class_accuracy(target, sofa_nocirc))


xtabs(~ target + sofa_nocirc, data = train_data)

      sofa_nocirc
target      0      1      2      3      4
     0 279249    590    177      0      0
     1  13647   3731   2822     61      0
     2   2341   1342   9994   1060      0
     3     79     49   1112   6081      9
     4      0      2     23    503    532


with(test_data, accuracy(target, sofa_nocirc))
with(test_data, class_accuracy(target, sofa_nocirc))


xtabs(~ target + sofa_nocirc, data = test_data)

      sofa_nocirc
target     0     1     2     3
     0 59810   951   916    28
     1  4333   206   240     0
     2  7584   845  1626   269
     3  1950   497   819   146
     4    95    18   318    20


library(circlize)
#to install package "ComplexHeatmap":
# install.packages("BiocManager")
# BiocManager::install("ComplexHeatmap")
library(ComplexHeatmap)
library(gridBase)

========================================
circlize version 0.4.12
CRAN page: https://cran.r-project.org/package=circlize
Github page: https://github.com/jokergoo/circlize
Documentation: https://jokergoo.github.io/circlize_book/book/

If you use it in published research, please cite:
Gu, Z. circlize implements and enhances circular visualization
  in R. Bioinformatics 2014.

This message can be suppressed by:
  suppressPackageStartupMessages(library(circlize))
========================================


Loading required package: grid

========================================
ComplexHeatmap version 2.6.2
Bioconductor page: http://bioconductor.org/packages/ComplexHeatmap/
Github page: https://github.com/jokergoo/ComplexHeatmap
Documentation: http://jokergoo.github.io/ComplexHeatmap-reference

If you use it in published research, please cite:
Gu, Z. Complex heatmaps reveal patterns and correlations in multidimensional 
  genomic data. Bioinformatics 2016.

This message can be suppressed by:
  suppressPackageStartupMessages(library(ComplexHeatmap))
========================================


method <- "pearson"

corr_matrix <-
  cbind(
    "label"        = sapply(select(test_data, amv:horovitz), 
                            cor,
                            method = method,
                            y = test_data$target),
    "circular"     = sapply(select(test_data, amv:horovitz), 
                            cor,
                            method = method,
                            y = test_data$sofa_circ),
    "non-circular" = sapply(select(test_data, amv:horovitz), 
                            cor,
                            method = method,
                            y = test_data$sofa_nocirc))


plot.new()
circle_size = unit(1, "snpc") # snpc unit gives you a square region

pushViewport(viewport(x = 0, 
                      y = 0.5, 
                      width = circle_size, 
                      height = circle_size, 
                      just = c("left", "center")))


col_scale <- 
  colorRamp2(c(-1, 0, 1), c("#67001F", "#FFFFFF", "#053061"), space ="LAB")

par(omi = gridOMI(), new = TRUE)
par(mar = c(0.1, 0.1, 0.1, 0.1))
circos.par(gap.after = 35, cell.padding = c(0, 0, 0, 0))


  circos.heatmap(corr_matrix[, 3, drop = FALSE], 
                 col = col_scale, 
                 track.height = .1,
                 rownames.side = "outside",
                 rownames.font = 2)
                 #rownames.cex = 0.5 + .25 * rownames(corr_matrix) %in% vars)
  circos.track(track.index = 2, 
               panel.fun = function(x, y) {
                  cn = colnames(corr_matrix)[3]
                  n  = length(cn)
                  circos.text(rep(CELL_META$cell.xlim[2], n) + convert_x(1, "mm"), 
                              1:n - 0.5, cn, 
                              cex = 0.75, 
                              adj = c(0, 0.5), 
                              facing = "bending.inside",
                              niceFacing = TRUE,
                              font = 2)},
              bg.border = NA)
  
                
  circos.heatmap(corr_matrix[,2 , drop = FALSE], 
               col = col_scale, 
               track.height = .1)
  circos.track(track.index = 3,
               panel.fun = function(x, y) {
                  cn = colnames(corr_matrix)[2]
                  n = length(cn)
                  circos.text(rep(CELL_META$cell.xlim[2], n) + convert_x(1, "mm"), 
                              1:n - 0.5, cn, 
                              cex = 0.75, 
                              adj = c(0, 0.5), 
                              facing = "bending.inside",
                              niceFacing = TRUE,
                              font = 2)},
              bg.border = NA)
  
  
  circos.heatmap(corr_matrix[,1 , drop = FALSE], 
               col = col_scale, 
               track.height = .1)
  circos.track(track.index = 4,
               panel.fun = function(x, y) {
                  cn = colnames(corr_matrix)[1]
                  n = length(cn)
                  circos.text(rep(CELL_META$cell.xlim[2], n) + convert_x(1, "mm"), 
                              1:n - 0.5, cn, 
                              cex = 0.75, 
                              adj = c(0, 0.5), 
                              facing = "bending.inside",
                              niceFacing = TRUE,
                              font = 2)},
              bg.border = NA)
  
  circos.clear()

upViewport()

lgd_color <- 
  Legend(at = seq(-1, 1, .2), 
         col_fun = col_scale, 
         title_position = "topcenter", 
         title = "r",
         border = "black")
draw(lgd_color,  x = circle_size, just = "left")

Note: 1 point is out of plotting region in sector 'group', track '2'.

Note: 1 point is out of plotting region in sector 'group', track '2'.

Note: 12 points are out of plotting region in sector 'group', track
'2'.

Note: 1 point is out of plotting region in sector 'group', track '3'.

Note: 1 point is out of plotting region in sector 'group', track '3'.

Note: 8 points are out of plotting region in sector 'group', track '3'.

Note: 1 point is out of plotting region in sector 'group', track '4'.

Note: 1 point is out of plotting region in sector 'group', track '4'.

Note: 5 points are out of plotting region in sector 'group', track '4'.


list_wo_bili <- 
  paste0(c(glue("s({names(test_data)[-c(8, 42, 45:47)]})"), 
           names(test_data)[42]),
        collapse = " + ")

student_nocirc_all <- 
  gam(as.formula(paste0("sofa_nocirc ~ s(bili) + ", list_wo_bili)), 
      data = test_data)

student_circ_all <- 
  gam(as.formula(paste0("sofa_circ ~ s(bili, k=100) + ", list_wo_bili)), 
      data = test_data)


list_wo_bili <- 
  paste0(c(glue("s({names(test_data)[-c(8, 42, 45:47)]})"), 
           names(test_data)[42]),
        collapse = " + ")

student_nocirc_noBili <- 
  gam(as.formula(paste0("sofa_nocirc ~ ", list_wo_bili)), 
      data = test_data)

student_circ_noBili <- 
  gam(as.formula(paste0("sofa_circ ~ ", list_wo_bili)), 
      data = test_data)


summary(student_circ_all)

Family: gaussian 
Link function: identity 

Formula:
sofa_circ ~ s(bili, k = 100) + s(amv) + s(heartrate) + s(leuko) + 
    s(temp) + s(pco2) + s(rr) + s(artph) + s(bun) + s(crea) + 
    s(dbp) + s(fio2) + s(mbp) + s(pao2) + s(sbp) + s(thrombo) + 
    s(lactate) + s(bicarb) + s(crp) + s(haemo) + s(hzv) + s(lympho) + 
    s(natrium) + s(panklip) + s(pct) + s(quinr) + s(saO2) + s(glucose) + 
    s(baseex) + s(chlorid) + s(calcium) + s(kalium) + s(szvo2) + 
    s(urine24) + s(netto) + s(alat) + s(asat) + s(svol) + s(svri) + 
    s(adrenaline) + s(dobutamine) + s(noradrenaline) + s(horovitz) + 
    dopamine

Parametric coefficients:
              Estimate Std. Error  t value Pr(>|t|)    
(Intercept)  0.4509313  0.0003443 1309.567   <2e-16 ***
dopamine    -0.0015186  0.0098316   -0.154    0.877    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Approximate significance of smooth terms:
                    edf Ref.df         F  p-value    
s(bili)          98.550 98.989 19578.150  < 2e-16 ***
s(amv)            8.841  8.992     9.038  < 2e-16 ***
s(heartrate)      7.443  8.302     8.907  < 2e-16 ***
s(leuko)          8.875  8.995    28.253  < 2e-16 ***
s(temp)           8.701  8.967     9.274  < 2e-16 ***
s(pco2)           7.804  8.583    15.586  < 2e-16 ***
s(rr)             8.892  8.995    26.113  < 2e-16 ***
s(artph)          8.989  9.000    18.069  < 2e-16 ***
s(bun)            8.859  8.993    28.867  < 2e-16 ***
s(crea)           8.687  8.968    37.583  < 2e-16 ***
s(dbp)            7.801  8.569     6.109  < 2e-16 ***
s(fio2)           8.341  8.860    12.376  < 2e-16 ***
s(mbp)            5.313  6.554     3.890 0.000375 ***
s(pao2)           1.000  1.000     0.283 0.594680    
s(sbp)            4.070  5.165    13.270  < 2e-16 ***
s(thrombo)        8.897  8.996    37.516  < 2e-16 ***
s(lactate)        8.928  8.998    89.932  < 2e-16 ***
s(bicarb)         5.896  7.067     6.714  < 2e-16 ***
s(crp)            8.903  8.997    42.009  < 2e-16 ***
s(haemo)          8.931  8.998    30.212  < 2e-16 ***
s(hzv)            8.885  8.992    55.912  < 2e-16 ***
s(lympho)         8.842  8.990    41.999  < 2e-16 ***
s(natrium)        8.887  8.993    22.328  < 2e-16 ***
s(panklip)        8.994  9.000    82.565  < 2e-16 ***
s(pct)            8.996  9.000   342.054  < 2e-16 ***
s(quinr)          8.923  8.997    35.126  < 2e-16 ***
s(saO2)           7.065  7.966    10.314  < 2e-16 ***
s(glucose)        6.946  7.935     5.396 3.42e-06 ***
s(baseex)         8.584  8.876     3.987 3.44e-05 ***
s(chlorid)        8.993  9.000    28.114  < 2e-16 ***
s(calcium)        8.763  8.979    29.603  < 2e-16 ***
s(kalium)         8.879  8.993    11.707  < 2e-16 ***
s(szvo2)          8.751  8.976    24.289  < 2e-16 ***
s(urine24)        4.561  5.606    21.103  < 2e-16 ***
s(netto)          6.584  7.747    10.569  < 2e-16 ***
s(alat)           8.892  8.996    26.953  < 2e-16 ***
s(asat)           8.849  8.991    23.471  < 2e-16 ***
s(svol)           8.313  8.836     6.944  < 2e-16 ***
s(svri)           8.878  8.994    11.287  < 2e-16 ***
s(adrenaline)     4.420  4.495     0.173 0.890904    
s(dobutamine)     8.801  8.986    14.971  < 2e-16 ***
s(noradrenaline)  4.420  4.495     0.173 0.890904    
s(horovitz)       4.621  5.806     2.323 0.025717 *  
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Rank: 478/479
R-sq.(adj) =  0.988   Deviance explained = 98.8%
GCV = 0.0096127  Scale est. = 0.009562  n = 80671


summary(student_circ_noBili)

Family: gaussian 
Link function: identity 

Formula:
sofa_circ ~ s(amv) + s(heartrate) + s(leuko) + s(temp) + s(pco2) + 
    s(rr) + s(artph) + s(bun) + s(crea) + s(dbp) + s(fio2) + 
    s(mbp) + s(pao2) + s(sbp) + s(thrombo) + s(lactate) + s(bicarb) + 
    s(crp) + s(haemo) + s(hzv) + s(lympho) + s(natrium) + s(panklip) + 
    s(pct) + s(quinr) + s(saO2) + s(glucose) + s(baseex) + s(chlorid) + 
    s(calcium) + s(kalium) + s(szvo2) + s(urine24) + s(netto) + 
    s(alat) + s(asat) + s(svol) + s(svri) + s(adrenaline) + s(dobutamine) + 
    s(noradrenaline) + s(horovitz) + dopamine

Parametric coefficients:
             Estimate Std. Error t value Pr(>|t|)    
(Intercept)  0.451014   0.001714 263.093  < 2e-16 ***
dopamine    -0.134473   0.049465  -2.719  0.00656 ** 
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Approximate significance of smooth terms:
                   edf Ref.df       F p-value    
s(amv)           8.767  8.983  66.969 < 2e-16 ***
s(heartrate)     8.619  8.940  46.573 < 2e-16 ***
s(leuko)         8.966  9.000  62.462 < 2e-16 ***
s(temp)          8.801  8.985  15.508 < 2e-16 ***
s(pco2)          7.892  8.647  16.848 < 2e-16 ***
s(rr)            8.897  8.995  64.667 < 2e-16 ***
s(artph)         8.678  8.962  63.363 < 2e-16 ***
s(bun)           8.774  8.984 793.710 < 2e-16 ***
s(crea)          8.968  9.000 536.120 < 2e-16 ***
s(dbp)           8.900  8.996  33.352 < 2e-16 ***
s(fio2)          8.968  9.000  53.634 < 2e-16 ***
s(mbp)           8.509  8.920  23.865 < 2e-16 ***
s(pao2)          8.699  8.970  19.481 < 2e-16 ***
s(sbp)           8.657  8.960  65.782 < 2e-16 ***
s(thrombo)       8.982  9.000 375.255 < 2e-16 ***
s(lactate)       8.880  8.995 214.424 < 2e-16 ***
s(bicarb)        9.000  9.000  34.146 < 2e-16 ***
s(crp)           8.970  9.000 114.900 < 2e-16 ***
s(haemo)         8.855  8.992 120.993 < 2e-16 ***
s(hzv)           8.862  8.992 243.880 < 2e-16 ***
s(lympho)        8.973  9.000 686.630 < 2e-16 ***
s(natrium)       8.903  8.994  68.346 < 2e-16 ***
s(panklip)       8.959  8.999 204.805 < 2e-16 ***
s(pct)           8.991  9.000 663.022 < 2e-16 ***
s(quinr)         8.990  9.000 420.922 < 2e-16 ***
s(saO2)          8.410  8.876   8.925 < 2e-16 ***
s(glucose)       8.526  8.922  61.229 < 2e-16 ***
s(baseex)        8.811  8.979  19.517 < 2e-16 ***
s(chlorid)       8.687  8.955 146.420 < 2e-16 ***
s(calcium)       8.948  8.999 178.005 < 2e-16 ***
s(kalium)        8.489  8.902  20.075 < 2e-16 ***
s(szvo2)         8.960  8.999 123.861 < 2e-16 ***
s(urine24)       8.906  8.997 169.002 < 2e-16 ***
s(netto)         4.407  5.541   3.393 0.00354 ** 
s(alat)          8.863  8.994 124.554 < 2e-16 ***
s(asat)          8.983  9.000 490.551 < 2e-16 ***
s(svol)          8.942  8.998 137.503 < 2e-16 ***
s(svri)          8.960  8.999 144.925 < 2e-16 ***
s(adrenaline)    4.327  4.478   0.350 0.90504    
s(dobutamine)    8.958  8.999  26.732 < 2e-16 ***
s(noradrenaline) 4.327  4.478   0.350 0.90504    
s(horovitz)      8.817  8.984  14.577 < 2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Rank: 379/380
R-sq.(adj) =  0.699   Deviance explained =   70%
GCV = 0.23805  Scale est. = 0.237     n = 80671


summary(student_nocirc_all)

Family: gaussian 
Link function: identity 

Formula:
sofa_nocirc ~ s(bili) + s(amv) + s(heartrate) + s(leuko) + s(temp) + 
    s(pco2) + s(rr) + s(artph) + s(bun) + s(crea) + s(dbp) + 
    s(fio2) + s(mbp) + s(pao2) + s(sbp) + s(thrombo) + s(lactate) + 
    s(bicarb) + s(crp) + s(haemo) + s(hzv) + s(lympho) + s(natrium) + 
    s(panklip) + s(pct) + s(quinr) + s(saO2) + s(glucose) + s(baseex) + 
    s(chlorid) + s(calcium) + s(kalium) + s(szvo2) + s(urine24) + 
    s(netto) + s(alat) + s(asat) + s(svol) + s(svri) + s(adrenaline) + 
    s(dobutamine) + s(noradrenaline) + s(horovitz) + dopamine

Parametric coefficients:
             Estimate Std. Error t value Pr(>|t|)    
(Intercept)  0.145619   0.001223 119.031   <2e-16 ***
dopamine    -0.065084   0.035277  -1.845    0.065 .  
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Approximate significance of smooth terms:
                   edf Ref.df       F p-value    
s(bili)          8.658  8.956 341.006  <2e-16 ***
s(amv)           8.989  9.000  24.109  <2e-16 ***
s(heartrate)     8.386  8.854  13.746  <2e-16 ***
s(leuko)         8.950  8.999 177.877  <2e-16 ***
s(temp)          8.941  8.999  13.957  <2e-16 ***
s(pco2)          8.577  8.938  85.480  <2e-16 ***
s(rr)            7.507  8.368  19.634  <2e-16 ***
s(artph)         8.638  8.947  19.972  <2e-16 ***
s(bun)           8.974  9.000 199.799  <2e-16 ***
s(crea)          8.920  8.998  89.147  <2e-16 ***
s(dbp)           4.735  5.953   2.189  0.0379 *  
s(fio2)          8.897  8.996  26.858  <2e-16 ***
s(mbp)           8.615  8.937   7.215  <2e-16 ***
s(pao2)          8.499  8.925   8.084  <2e-16 ***
s(sbp)           7.987  8.697  10.855  <2e-16 ***
s(thrombo)       8.627  8.958 151.360  <2e-16 ***
s(lactate)       8.502  8.920  56.828  <2e-16 ***
s(bicarb)        8.164  8.675  29.731  <2e-16 ***
s(crp)           8.951  8.999  54.393  <2e-16 ***
s(haemo)         8.950  8.999  71.739  <2e-16 ***
s(hzv)           8.955  8.999 318.697  <2e-16 ***
s(lympho)        8.994  9.000 148.275  <2e-16 ***
s(natrium)       8.959  8.999  58.669  <2e-16 ***
s(panklip)       8.965  8.999 345.960  <2e-16 ***
s(pct)           8.974  9.000 129.815  <2e-16 ***
s(quinr)         9.000  9.000 147.735  <2e-16 ***
s(saO2)          8.155  8.758  13.268  <2e-16 ***
s(glucose)       8.700  8.967   9.810  <2e-16 ***
s(baseex)        8.452  8.805  23.238  <2e-16 ***
s(chlorid)       8.590  8.925  38.976  <2e-16 ***
s(calcium)       8.170  8.774  16.816  <2e-16 ***
s(kalium)        6.905  7.830  13.295  <2e-16 ***
s(szvo2)         8.906  8.996  66.918  <2e-16 ***
s(urine24)       8.683  8.968  26.044  <2e-16 ***
s(netto)         7.184  8.230  11.319  <2e-16 ***
s(alat)          8.499  8.923 137.403  <2e-16 ***
s(asat)          8.964  8.999 425.347  <2e-16 ***
s(svol)          8.997  9.000  57.817  <2e-16 ***
s(svri)          8.876  8.994  27.571  <2e-16 ***
s(adrenaline)    4.432  4.496   0.058  0.9947    
s(dobutamine)    8.886  8.995  46.760  <2e-16 ***
s(noradrenaline) 4.432  4.496   0.058  0.9947    
s(horovitz)      9.000  9.000  10.153  <2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Rank: 388/389
R-sq.(adj) =  0.529   Deviance explained = 53.1%
GCV = 0.12124  Scale est. = 0.1207    n = 80671


summary(student_nocirc_noBili)

Family: gaussian 
Link function: identity 

Formula:
sofa_nocirc ~ s(amv) + s(heartrate) + s(leuko) + s(temp) + s(pco2) + 
    s(rr) + s(artph) + s(bun) + s(crea) + s(dbp) + s(fio2) + 
    s(mbp) + s(pao2) + s(sbp) + s(thrombo) + s(lactate) + s(bicarb) + 
    s(crp) + s(haemo) + s(hzv) + s(lympho) + s(natrium) + s(panklip) + 
    s(pct) + s(quinr) + s(saO2) + s(glucose) + s(baseex) + s(chlorid) + 
    s(calcium) + s(kalium) + s(szvo2) + s(urine24) + s(netto) + 
    s(alat) + s(asat) + s(svol) + s(svri) + s(adrenaline) + s(dobutamine) + 
    s(noradrenaline) + s(horovitz) + dopamine

Parametric coefficients:
             Estimate Std. Error t value Pr(>|t|)    
(Intercept)  0.145620   0.001246  116.84   <2e-16 ***
dopamine    -0.067096   0.035876   -1.87   0.0615 .  
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Approximate significance of smooth terms:
                   edf Ref.df       F  p-value    
s(amv)           8.708  8.969  19.466  < 2e-16 ***
s(heartrate)     8.379  8.851  18.245  < 2e-16 ***
s(leuko)         8.945  8.999 127.259  < 2e-16 ***
s(temp)          8.942  8.999  13.710  < 2e-16 ***
s(pco2)          8.648  8.957  81.566  < 2e-16 ***
s(rr)            8.281  8.812  19.930  < 2e-16 ***
s(artph)         8.885  8.992  21.268  < 2e-16 ***
s(bun)           8.997  9.000 483.079  < 2e-16 ***
s(crea)          8.940  8.999 172.022  < 2e-16 ***
s(dbp)           5.935  7.126   5.676 1.35e-06 ***
s(fio2)          8.906  8.997  26.192  < 2e-16 ***
s(mbp)           8.659  8.950   8.416  < 2e-16 ***
s(pao2)          7.983  8.718  11.112  < 2e-16 ***
s(sbp)           7.995  8.700   8.334  < 2e-16 ***
s(thrombo)       8.765  8.983 154.747  < 2e-16 ***
s(lactate)       8.719  8.973  63.021  < 2e-16 ***
s(bicarb)        8.321  8.769  25.663  < 2e-16 ***
s(crp)           8.965  9.000  51.636  < 2e-16 ***
s(haemo)         8.946  8.998  74.662  < 2e-16 ***
s(hzv)           8.885  8.994 344.949  < 2e-16 ***
s(lympho)        8.864  8.993 158.383  < 2e-16 ***
s(natrium)       8.957  8.999  78.725  < 2e-16 ***
s(panklip)       8.973  9.000 291.342  < 2e-16 ***
s(pct)           8.973  9.000 182.341  < 2e-16 ***
s(quinr)         8.997  9.000 150.845  < 2e-16 ***
s(saO2)          7.767  8.521  13.040  < 2e-16 ***
s(glucose)       8.642  8.953  12.522  < 2e-16 ***
s(baseex)        8.612  8.882  20.818  < 2e-16 ***
s(chlorid)       8.771  8.974  58.678  < 2e-16 ***
s(calcium)       6.749  7.753  25.773  < 2e-16 ***
s(kalium)        6.559  7.516  12.404  < 2e-16 ***
s(szvo2)         8.934  8.998  68.693  < 2e-16 ***
s(urine24)       7.867  8.644  32.162  < 2e-16 ***
s(netto)         6.678  7.828   9.655  < 2e-16 ***
s(alat)          8.758  8.981 129.881  < 2e-16 ***
s(asat)          8.962  8.999 516.932  < 2e-16 ***
s(svol)          8.997  9.000  66.533  < 2e-16 ***
s(svri)          8.878  8.994  37.673  < 2e-16 ***
s(adrenaline)    4.410  4.494   0.031    0.998    
s(dobutamine)    8.902  8.997  78.035  < 2e-16 ***
s(noradrenaline) 4.410  4.494   0.031    0.998    
s(horovitz)      9.000  9.000  13.054  < 2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Rank: 379/380
R-sq.(adj) =  0.511   Deviance explained = 51.3%
GCV = 0.12581  Scale est. = 0.12527   n = 80671


corr_matrix[order(abs(corr_matrix[,"circular"]), decreasing=TRUE),][1:5,]


dev_nocirc   <- round(summary(student_circ_noBili)$dev.expl * 100)
dev_withcirc <- round(summary(student_circ_all)$dev.expl * 100)

par(mfrow = c(5,2),  mar=c(2,3,2,3), oma = c(1,1,1,1))
 #first row: bilirubin 
   plot(x = 0:15, y = seq(0,4, length.out=16),
     xlim   = c(0,15), 
     ylim   = c(0,4),
     main   = glue("Student without Bilirubin (D\U00B2={dev_nocirc}%)"),
     ylab   = "", 
     xlab   = "",
     type   = "n")
  text(labels = "not included in student model",
       x      = mean(c(0,15)),
       y      = mean(c(0,4)),
       adj    = .5)
  
  plot(student_circ_all, 
     se     = FALSE, 
     #rug   = TRUE, 
     xlim   = c(0,15), 
     ylim   = c(0,4),
     main   = glue("Student with all Features (D\U00B2={dev_withcirc}%)"),
     ylab   = "", 
     xlab   = "",
     select = 1,
     shift  = coef(student_circ_all)[1])
   rug(unique(test_data$bili))
  mtext("bilirubin", side = 4, line = 1)
  
  #second row: asat
  plot(student_circ_noBili, 
       se     = FALSE, 
       #rug    = TRUE, 
       ylab   = "Feature Shape",
       xlab   = "",
       ylim   = c(-1,4), 
       xlim   = c(0, 550), 
       select = 15)
    abline(h=0, lty = 3)
    rug(unique(test_data$thrombo))
  
  plot(student_circ_all, 
       #se     = TRUE, 
       #rug    = TRUE, 
       ylab   = "",
       xlab   = "",
       ylim   = c(-1,4), 
       xlim   = c(0, 550), 
       select = 16)
    abline(h=0, lty = 3)
      rug(unique(test_data$thrombo))
  mtext("thrombo", side = 4, line = 1)
  
  #third row: quinr
  plot(student_circ_noBili, 
       se     = FALSE, 
       #rug    = TRUE, 
       ylab   = "",
       xlab   = "",
       ylim   = c(-1,4),
       xlim   = c(0, 18),
       select = 20)
    abline(h=0, lty = 3)
    rug(unique(test_data$hzv))
  
  plot(student_circ_all, 
       se     = FALSE, 
       #rug    = TRUE, 
       ylab   = "",
       xlab   = "",
       ylim   = c(-1,4),
       xlim   = c(0, 18),
       select = 21)
    abline(h=0, lty = 3)
    rug(unique(test_data$hzv))
  mtext("hzv", side = 4, line = 1)
  
  #forth row: alat
  plot(student_circ_noBili, 
       se     = FALSE, 
       #rug    = TRUE, 
       #ylab   = "", 
       xlab   = "",
       ylim   = c(-1,4), 
       xlim   = c(0, 7000), 
       select = 38)
    rug(unique(test_data$svri))
    abline(h=0, lty = 3)
  
  plot(student_circ_all, 
       se     = FALSE, 
       #rug    = TRUE, 
       ylab   = "", 
       #xlab   = "",
       ylim   = c(-1,4), 
       xlim   = c(0, 7000),
       select = 39)
    abline(h=0, lty = 3)
    rug(unique(test_data$svri))
  mtext("svri", side = 4, line = 1)
  
  #fifth row: hzv
  plot(student_circ_noBili, 
       se     = FALSE, 
       #rug    = TRUE, 
       ylab   = "",
       #xlab   = "",
       ylim   = c(-1,4),
       #xlim   = c(, ),
       select = 33)
    abline(h=0, lty = 3)
    rug(unique(test_data$urine24))
  
  plot(student_circ_all, 
       se     = FALSE, 
       #rug    = TRUE, 
       ylab   = "",
       xlab   = "",
       ylim   = c(-1,4),
       #xlim   = c(, ),
       select = 34)
    abline(h=0, lty = 3)
    rug(unique(test_data$urine24))
  mtext("urine24", side = 4, line = 1)
par(mfrow = c(1,1), mar = c(5,4,4,2))

Warning message in rug(unique(test_data$bili)):
“some values will be clipped”
Warning message in rug(unique(test_data$thrombo)):
“some values will be clipped”
Warning message in rug(unique(test_data$thrombo)):
“some values will be clipped”
Warning message in rug(unique(test_data$hzv)):
“some values will be clipped”
Warning message in rug(unique(test_data$hzv)):
“some values will be clipped”
Warning message in rug(unique(test_data$svri)):
“some values will be clipped”
Warning message in rug(unique(test_data$svri)):
“some values will be clipped”


ablation_data <- readRDS("data/ablation_set_with_sofa_predictions.rds")


with(ablation_data, accuracy(target, sofa_circ))
with(ablation_data, class_accuracy(target, sofa_circ))


xtabs(~ target + sofa_circ, data = ablation_data)

      sofa_circ
target     0     1
     0 61546   159
     1  4776     3
     2 10296    28
     3  3388    24
     4   451     0


with(ablation_data, accuracy(target, sofa_nocirc))
with(ablation_data, class_accuracy(target, sofa_nocirc))


xtabs(~ target + sofa_nocirc, data = ablation_data)

      sofa_nocirc
target     0     1     2     3
     0 59810   951   916    28
     1  4333   206   240     0
     2  7584   845  1626   269
     3  1950   497   819   146
     4    95    18   318    20


feature_list <- 
  corr_matrix[order(abs(corr_matrix[,"circular"]), decreasing=TRUE),][1:5,] %>%
  rownames(.) %>% 
  paste0("s(", .,")")


fitted_models <-
  #generate the power set of feature_list, and remove the void set 
  powerSet(feature_list)[-(2^length(feature_list))] %>%
  #build the symbolic model description  
  sapply(., function(...) glue("sofa_circ~{glue_collapse(..., sep='+')}")) %>%
  #fit the models to data, and extract key statistics
  tibble(formula_str = .,
         models      = lapply(X = .,
                              FUN  = function(m_str,...) bam(as.formula(m_str), ...),
                              data = test_data),
         data_fit    = floor(sapply(models, function(m) summary(m)$dev.expl) * 100),
         complexity  = sapply(models, function(...) attr(logLik(...), 'df'))) %>%
  #Sort the models so that we can find models that replicate the data well.
  #For this models data_fit should be approximately 100.
  arrange(desc(data_fit), complexity)


head(fitted_models[,-2], n = 10)


selected_model <- gam(sofa_circ ~ s(bili, k = 100), data=test_data)

summary(selected_model)

Family: gaussian 
Link function: identity 

Formula:
sofa_circ ~ s(bili, k = 100)

Parametric coefficients:
             Estimate Std. Error t value Pr(>|t|)    
(Intercept) 0.4509303  0.0003793    1189   <2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Approximate significance of smooth terms:
          edf Ref.df     F p-value    
s(bili) 98.62  98.99 54501  <2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

R-sq.(adj) =  0.985   Deviance explained = 98.5%
GCV = 0.011619  Scale est. = 0.011604  n = 80671


jump_bili <- function(x) (x > 12) + (x > 6) + (x > 2) + (x > 1.2) + 0

plot(selected_model, 
     se    = FALSE, 
     #rug   = TRUE, 
     xlim  = c(0,15), 
     ylim  = c(0,4),
     main = "",
     ylab  = "feature shape", 
     xlab = "bilirubin",
     shift = coef(selected_model)[1])
rug(unique(test_data$bili))
curve(jump_bili, from = 0, to = 15, n = 1001, add = TRUE, col = "blue", lty="dashed")
abline(v = c(1.2, 2, 6, 12), col = "blue", lty="dotted")
curve(jump_bili, from = 0, to = 15, n = 1001, add = TRUE, col = "blue", lty="dashed")
abline(v = c(1.2, 2, 6, 12), col = "blue", lty="dotted")
legend(legend = c("Theoretical", "Estimated"), 
       col    = c("blue", "black"),
       lty    = c("dashed", "solid"),
       bty    = "o",
       bg     = "white",
       x      = "topleft")

Warning message in rug(unique(test_data$bili)):
“some values will be clipped”

	label	circular	non-circular
bili	0.9146881	0.9079398	0.4524725
thrombo	-0.3755966	-0.3686422	-0.2219831
hzv	0.3443854	0.3490582	0.3396028
svri	-0.2962417	-0.2999327	-0.2233361
urine24	-0.2844442	-0.2810587	-0.1547634

formula_str	data_fit	complexity
<chr>	<dbl>	<dbl>
sofa_circ~s(bili)	97	10.99951
sofa_circ~s(bili)+s(thrombo)	97	19.33408
sofa_circ~s(bili)+s(urine24)	97	19.61895
sofa_circ~s(bili)+s(hzv)	97	19.90596
sofa_circ~s(bili)+s(svri)	97	19.92417
sofa_circ~s(bili)+s(thrombo)+s(urine24)	97	27.89822
sofa_circ~s(bili)+s(thrombo)+s(hzv)	97	28.22188
sofa_circ~s(bili)+s(thrombo)+s(svri)	97	28.26034
sofa_circ~s(bili)+s(hzv)+s(urine24)	97	28.58532
sofa_circ~s(bili)+s(svri)+s(urine24)	97	28.66155

Model Distillation: Liver SOFA Example¶

Performance Evaluation¶

Circular Model - Train set accuracy (overall and per class)¶

Circular Model - Test set accuracy (overall and per class)¶

Non-circular Model - Train set accuracy (overall and per class)¶

Non-circular Model - Test set accuracy (overall and per class)¶

Correlation pattern¶

Nullification¶

TEACHER: circular, STUDENT: all features¶

TEACHER: circular, STUDENT: no bili¶

TEACHER: non-circular, STUDENT: all features¶

TEACHER: non-circular, STUDENT: no bili¶

Ablation study¶

Circular Model¶

Non-Circular Model¶

Search¶

Top 10¶

Selected Model¶