Skip to contents

Constructs a `feature_data` object, calculates scaling parameters, and optionally applies initial data transformations.

Usage

create_preprocessed_data(
  data,
  target_col,
  id_col = NULL,
  split_col = NULL,
  scale_option = c("train", "test", "all"),
  scale_method = c("zscore", "minmax", "none"),
  fun_transform = NULL,
  fun_inverse = NULL
)

Arguments

data

A data frame containing the raw data.

target_col

Character vector specifying the name of the target variable column.

id_col

Character vector specifying the name of the ID column (optional).

split_col

Character vector specifying the name of the column used for train/test split (optional).

scale_option

Character string specifying the scaling option. Must be one of "train", "test", or "all". Defaults to "train".

scale_method

Character string specifying the scaling method. Must be one of "zscore" (standardization), "minmax" (normalization), or "none" (no scaling). Defaults to "zscore".

fun_transform

A named list of functions to apply to the data *before* scaling. The names of the list elements should correspond to the columns to transform.

fun_inverse

A named list of functions representing the inverse transformations of `fun_transform`. These are applied during the `inverse_transform` method.

Value

A `feature_data` object.

Details

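This function calculates scaling parameters (mean, standard deviation, min, max) based on the specified `scale_option` and `scale_method`. It then creates a `feature_data` object, storing the original data and the calculated parameters. If `fun_transform` is provided, the specified transformations are applied to the data to additionally calculate the scaling parameters of the transformed data (stored in the `t_mean_vals`, `t_sd_vals`, `t_min_vals` and `t_max_vals` slots); as the example below shows, the data stored in the object is left untransformed (the `transformed` slot is `FALSE`).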

If no `id_col` is given, an additional column `id` with row numbers is created.

If no `split_col` is given, a random `split` column is created.

Examples

# Example usage (replace with your actual data and column names)
df <- data.frame(
  id = 1:10,
  x = runif(10),
  y = rnorm(10),
  z = 1:10,
  split = sample(c(TRUE, FALSE), 10, replace = TRUE)
)

transformations <- list(
  x = log,
  y = \(y) sqrt(5 + y),
  z = \(z) z^2
)

inverse_transformations <- list(
  x = exp,
  y = \(y) y^2 - 5,
  z = sqrt
)

prep_data <- create_preprocessed_data(
  df, id_col = "id", target_col = "y", split_col = "split",
  fun_transform = transformations, fun_inverse = inverse_transformations)
print(prep_data)
#> An object of class "feature_data"
#> Slot "data":
#>    id           x          y  z split
#> 1   1 0.080750138  1.1484116  1  TRUE
#> 2   2 0.834333037 -1.8218177  2  TRUE
#> 3   3 0.600760886 -0.2473253  3 FALSE
#> 4   4 0.157208442 -0.2441996  4 FALSE
#> 5   5 0.007399441 -0.2827054  5 FALSE
#> 6   6 0.466393497 -0.5536994  6  TRUE
#> 7   7 0.497777389  0.6289820  7  TRUE
#> 8   8 0.289767245  2.0650249  8 FALSE
#> 9   9 0.732881987 -1.6309894  9  TRUE
#> 10 10 0.772521511  0.5124269 10  TRUE
#> 
#> Slot "params":
#> An object of class "feature_params"
#> Slot "id_col":
#> [1] "id"
#> 
#> Slot "target_col":
#> [1] "y"
#> 
#> Slot "split_col":
#> [1] "split"
#> 
#> Slot "scale_option":
#> [1] "train"
#> 
#> Slot "scale_method":
#> [1] "zscore"
#> 
#> Slot "mean_vals":
#>          x          y          z 
#>  0.5641096 -0.2861143  5.8333333 
#> 
#> Slot "sd_vals":
#>         x         y         z 
#> 0.2801875 1.2467245 3.6560452 
#> 
#> Slot "min_vals":
#>           x           y           z 
#>  0.08075014 -1.82181766  1.00000000 
#> 
#> Slot "max_vals":
#>         x         y         z 
#>  0.834333  1.148412 10.000000 
#> 
#> Slot "t_mean_vals":
#>          x          y          z 
#> -0.7877854  2.1544767 45.1666667 
#> 
#> Slot "t_sd_vals":
#>          x          y          z 
#>  0.8801146  0.2941755 40.0869887 
#> 
#> Slot "t_min_vals":
#>         x         y         z 
#> -2.516396  1.782746  1.000000 
#> 
#> Slot "t_max_vals":
#>           x           y           z 
#>  -0.1811226   2.4795991 100.0000000 
#> 
#> Slot "fun_transform":
#> $x
#> function (x, base = exp(1))  .Primitive("log")
#> 
#> $y
#> function (y) 
#> sqrt(5 + y)
#> <environment: 0x5653a0c0c630>
#> 
#> $z
#> function (z) 
#> z^2
#> <environment: 0x5653a0c0c630>
#> 
#> 
#> Slot "fun_inverse":
#> $x
#> function (x)  .Primitive("exp")
#> 
#> $y
#> function (y) 
#> y^2 - 5
#> <environment: 0x5653a0c0c630>
#> 
#> $z
#> function (x)  .Primitive("sqrt")
#> 
#> 
#> Slot "transformed":
#> [1] FALSE
#> 
#>