% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/mice.spark.R
\name{sampler.spark}
\alias{sampler.spark}
\title{MICE sampler function}
\usage{
sampler.spark(
  sc,
  data,
  imp_init,
  fromto,
  var_types,
  ud_methods = NULL,
  predictorMatrix = NULL,
  checkpointing,
  checkpoint_frequency = 10,
  printFlag
)
}
\arguments{
\item{sc}{A Spark connection}

\item{data}{A Spark DataFrame, the original data with missing values}

\item{imp_init}{A Spark DataFrame, the original data with missing values, but with initial imputation (by random sampling or mean/median/mode imputation)}

\item{fromto}{A vector of length 2, the range of iterations to perform (from, to)}

\item{var_types}{A named character vector, the variable types of the columns in the data.}

\item{ud_methods}{The user-defined methods for imputing each variable. Beta}

\item{predictorMatrix}{A matrix, the predictor matrix to use for the imputation. Beta}

\item{checkpointing}{Default TRUE. Can be set to FALSE if you are running the package without access to an HDFS directory for checkpointing. It is strongly recommended to keep it set to TRUE to avoid StackOverflow errors.}

\item{checkpoint_frequency}{Advanced parameter, modify with care. If checkpointing = TRUE, how often to checkpoint (default = 10): after processing every 10 variables, the lineage will be cut and the current state of computation will be saved to disk. A low number might slow down computation but enable bigger computations. A number that is too high (or not checkpointing at all) might cause a JVM StackOverflowError, as the lineage will have grown too big.}

\item{printFlag}{A boolean, whether to print debug information.}
}
\value{
The Spark DataFrame with missing values imputed for all variables
}
\description{
This function is the core of the MICE algorithm. It iteratively imputes missing values in a Spark DataFrame using a set of imputation methods based on the variable types.
}
\examples{
# Not run automatically: a working Apache Spark installation is required
\dontrun{
# Demonstrates a direct call to sampler.spark on a small toy dataset
library(sparklyr)
library(dplyr)

# Open a local Spark session
# (Spark must already be installed, e.g. via sparklyr::spark_install())
sc <- spark_connect(master = "local")

# Toy data with at least one missing value in every column
sample_data <- data.frame(
  age = c(25, NA, 35, 28, 45, NA),
  income = c(50000, 60000, NA, 55000, 80000, 52000),
  education = c("High", "Medium", "High", NA, "Medium", "Medium")
)

# Push the toy data to Spark as a Spark DataFrame
sdf <- copy_to(sc, sample_data, "sample_data")

# Declare the type of each variable, as expected by the sampler
var_types <- c(
  age = "Continuous_int",
  income = "Continuous_int",
  education = "Nominal"
)

# Build the starting imputation by filling every gap with a fixed value
# (a simple stand-in for mean/mode imputation)
imp_init <- mutate(
  sdf,
  age = ifelse(is.na(age), 35, age),
  income = ifelse(is.na(income), 60000, income),
  education = ifelse(is.na(education), "Medium", education)
)

# Run the sampler for iterations 1 to 2, with checkpointing switched off
sampled_data <- sampler.spark(
  sc = sc,
  data = sdf,
  imp_init = imp_init,
  fromto = c(1, 2),
  var_types = var_types,
  checkpointing = FALSE,
  printFlag = TRUE
)

# Bring the imputed data back into R and print it
collect(sampled_data)

# Close the Spark connection
spark_disconnect(sc)
}
}
