LassoDevelopment, RandomForestDevelopment, and LinearMixedModelDevelopment
These classes let one create, compare, and save custom models on varied datasets.
One can do both classification (ie, predict Y or N) and regression (ie, predict a numeric field, like cost).
Not every dataset is ready for model creation as-is; it'll help if you can follow a few data-preparation guidelines. In particular, if your data is longitudinal (ie, more than one row per person), you may want to try LinearMixedModelDevelopment (detailed below).
selectData
library(healthcareai)
library(RODBC)
connection.string <- "
driver={SQL Server};
server=localhost;
database=SAM;
trusted_connection=true
"
# This query should pull only rows for training. They must have a label.
query <- "
SELECT
[PatientEncounterID]
,[PatientID]
,[SystolicBPNBR]
,[LDLNBR]
,[A1CNBR]
,[GenderFLG]
,[ThirtyDayReadmitFLG]
FROM [SAM].[dbo].[HCRDiabetesClinical]
"
df <- selectData(connection.string, query)
head(df)
Note: if you want a CSV example (ie, an example that you can run as-is), see the built-in docs:
library(healthcareai)
?healthcareai
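If you don't have a SQL Server instance handy, you can also develop against a flat file. A minimal sketch, assuming a hypothetical CSV named HCRDiabetesClinical.csv with the same columns as the query above (the file name and na.strings values are assumptions, not the package's bundled example):
library(healthcareai)
# Read a local CSV instead of querying SQL Server; treat blank and NULL cells as missing
df <- read.csv("HCRDiabetesClinical.csv",
               header = TRUE,
               na.strings = c("NULL", "NA", ""))
head(df)
str(df)  # confirm column types before setting up parameters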
SupervisedModelDevelopmentParams
p <- SupervisedModelDevelopmentParams$new()
p$df <- df
p$type <- "classification"
p$impute <- TRUE
p$grainCol <- "PatientEncounterID"
p$predictedCol <- "ThirtyDayReadmitFLG"
p$debug <- FALSE
p$cores <- 1
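The parameters above set up a classification model. As noted earlier, these classes also handle regression; here is a sketch of the same setup pointed at a numeric label instead (using LDLNBR purely as an illustrative target, and assuming the type string is simply "regression"):
# Same workflow, but predicting a numeric column (regression)
p2 <- SupervisedModelDevelopmentParams$new()
p2$df <- df
p2$type <- "regression"              # numeric outcome instead of Y/N
p2$impute <- TRUE
p2$grainCol <- "PatientEncounterID"
p2$predictedCol <- "LDLNBR"          # illustrative numeric target (assumption)
p2$debug <- FALSE
p2$cores <- 1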
Create the models via the LassoDevelopment and RandomForestDevelopment algorithms.
# Run Lasso
lasso <- LassoDevelopment$new(p)
lasso$run()
# Run Random Forest
rf <- RandomForestDevelopment$new(p)
rf$run()
LassoDevelopment Details
This version of Lasso is based on the Grouped Lasso algorithm offered by the grpreg package. We prefer simple models to complicated ones, so when tuning the lambda regularization parameter we use the 1SE rule: we take the model with the fewest coefficients that is still within one standard error of the best-performing model. This way, we provide guidance as to which features (ie, columns) should be kept in the deployed model.
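To make the 1SE rule concrete, here is a small self-contained sketch in plain R (toy numbers, not grpreg's actual interface) of picking the sparsest lambda whose cross-validation error is within one standard error of the best error:
# Toy cross-validation results along a descending lambda path;
# larger lambda -> more regularization -> fewer nonzero coefficients
lambda   <- c(0.50, 0.25, 0.10, 0.05, 0.01)
cv_error <- c(0.310, 0.255, 0.250, 0.249, 0.251)   # mean CV error per lambda
cv_se    <- c(0.012, 0.010, 0.009, 0.009, 0.010)   # standard error per lambda

best       <- which.min(cv_error)                  # lambda with the lowest CV error
threshold  <- cv_error[best] + cv_se[best]         # "within one SE of the best"
candidates <- which(cv_error <= threshold)         # all models that are good enough
lambda1SE  <- lambda[min(candidates)]              # sparsest (largest lambda) among them

lambda1SE  # 0.25 here: a simpler model than the minimum-error fit at lambda = 0.05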
RandomForestDevelopment Details
This version of random forest is based on the wonderful ranger package.
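Since RandomForestDevelopment wraps ranger, it can help to see roughly what a direct ranger call looks like on the df pulled in above. A minimal sketch (this is not the package's internal code; num.trees is an assumed value, and note that ranger itself does no imputation, which is part of what healthcareai handles for you):
library(ranger)

rf_df <- df
rf_df$PatientEncounterID <- NULL                    # drop identifier columns
rf_df$PatientID <- NULL
rf_df <- rf_df[complete.cases(rf_df), ]             # ranger won't impute missing values
rf_df$GenderFLG <- as.factor(rf_df$GenderFLG)
rf_df$ThirtyDayReadmitFLG <- as.factor(rf_df$ThirtyDayReadmitFLG)

fit <- ranger(ThirtyDayReadmitFLG ~ .,
              data = rf_df,
              num.trees = 200,                      # assumed value for illustration
              probability = TRUE,                   # return class probabilities
              seed = 42)

fit$prediction.error                                # out-of-bag prediction error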
ptm <- proc.time()
library(healthcareai)
connection.string <- "
driver={SQL Server};
server=localhost;
database=SAM;
trusted_connection=true
"
# This query should pull only rows for training. They must have a label.
query <- "
SELECT
[PatientEncounterID]
,[SystolicBPNBR]
,[LDLNBR]
,[A1CNBR]
,[GenderFLG]
,[ThirtyDayReadmitFLG]
FROM [SAM].[dbo].[HCRDiabetesClinical]
"
df <- selectData(connection.string, query)
head(df)
set.seed(42)
p <- SupervisedModelDevelopmentParams$new()
p$df <- df
p$type <- "classification"
p$impute <- TRUE
p$grainCol <- "PatientEncounterID"
p$predictedCol <- "ThirtyDayReadmitFLG"
p$debug <- FALSE
p$cores <- 1
# Run Lasso
lasso <- LassoDevelopment$new(p)
lasso$run()
set.seed(42)
# Run Random Forest
rf <- RandomForestDevelopment$new(p)
rf$run()
# Plot ROC
rocs <- list(rf$getROC(), lasso$getROC())
names <- c("Random Forest", "Lasso")
legendLoc <- "bottomright"
plotROCs(rocs, names, legendLoc)
# Plot PR Curve
rocs <- list(rf$getPRCurve(), lasso$getPRCurve())
names <- c("Random Forest", "Lasso")
legendLoc <- "bottomleft"
plotPRCurve(rocs, names, legendLoc)
cat(proc.time() - ptm, "\n")
LinearMixedModelDevelopment Details
This mixed model is designed for longitudinal datasets (ie, those that typically have more than one row per person). The method is based on the lme4 package. It's not as computationally efficient as the random forest algorithm, so it's best to compare it against the other algorithms on smaller datasets and then scale up from there. In particular, this method works best on datasets having fewer than 10,000 rows.
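For intuition about what the mixed model adds, here is a minimal sketch of a comparable fit directly in lme4, with a random intercept per patient to capture repeated encounters for the same person (the glmer formula is an illustrative assumption, not the package's internal code; it uses the df pulled in earlier, which includes PatientID):
library(lme4)

lmm_df <- df
lmm_df <- lmm_df[complete.cases(lmm_df), ]           # lme4 won't impute missing values
lmm_df$ThirtyDayReadmitFLG <- as.factor(lmm_df$ThirtyDayReadmitFLG)

# Fixed effects for the clinical columns, plus a random intercept per patient
fit <- glmer(ThirtyDayReadmitFLG ~ SystolicBPNBR + LDLNBR + A1CNBR + GenderFLG +
               (1 | PatientID),
             data = lmm_df,
             family = binomial)

summary(fit)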
ptm <- proc.time()
library(healthcareai)
connection.string <- "
driver={SQL Server};
server=localhost;
database=SAM;
trusted_connection=true
"
# This query should pull only rows for training. They must have a label.
query <- "
SELECT
[PatientEncounterID]
,[PatientID]
,[SystolicBPNBR]
,[LDLNBR]
,[A1CNBR]
,[GenderFLG]
,[ThirtyDayReadmitFLG]
FROM [SAM].[dbo].[HCRDiabetesClinical]
"
df <- selectData(connection.string, query)
head(df)
set.seed(42)
p <- SupervisedModelDevelopmentParams$new()
p$df <- df
p$type <- "classification"
p$impute <- TRUE
p$grainCol <- "PatientEncounterID"
p$personCol <- "PatientID"
p$predictedCol <- "ThirtyDayReadmitFLG"
p$debug <- FALSE
p$cores <- 1
# Create Mixed Model
lmm <- LinearMixedModelDevelopment$new(p)
lmm$run()
# Remove the person column (PatientID), since random forest can't use it
df$PatientID <- NULL
p$df <- df
p$personCol <- NULL
set.seed(42)
# Run Random Forest
rf <- RandomForestDevelopment$new(p)
rf$run()
# Plot ROC
rocs <- list(lmm$getROC(), rf$getROC())
names <- c("Linear Mixed Model", "Random Forest")
legendLoc <- "bottomright"
plotROCs(rocs, names, legendLoc)
# Plot PR Curve
rocs <- list(lmm$getPRCurve(), rf$getPRCurve())
names <- c("Linear Mixed Model", "Random Forest")
legendLoc <- "bottomleft"
plotPRCurve(rocs, names, legendLoc)
cat(proc.time() - ptm, '\n')
Note: if you want a CSV example (ie, an example that you can run as-is), see the built-in docs:
library(healthcareai)
?healthcareai