library(leakr)
#>
#> Attaching package: 'leakr'
#> The following object is masked from 'package:base':
#>
#> %||%

This vignette explores advanced usage patterns for the leakr package, demonstrating how to detect subtle leakage patterns and customise the detection process for specific scenarios. We'll cover complex datasets, configuration options, and best practices for comprehensive leakage detection.
The leakr package can identify several types of data leakage that might compromise model validity, including target leakage, temporal leakage, and duplicate or near-duplicate records shared across train/test splits.
Target leakage can be subtle and context-dependent. Let’s explore some realistic scenarios:
# Create a medical dataset with subtle leakage
set.seed(456)
n <- 500
medical_data <- data.frame(
  patient_id = 1:n,
  age = sample(25:75, n, replace = TRUE),
  bmi = rnorm(n, 25, 4),
  blood_pressure = rnorm(n, 120, 15),
  diagnosis = factor(sample(c("healthy", "diseased"), n, replace = TRUE, prob = c(0.8, 0.2)))
)
# Add a leaky feature: treatment_received (only available post-diagnosis).
# Assign per-group values by subsetting, which avoids ifelse() silently
# recycling a yes-vector that is shorter than the condition.
medical_data$treatment_received <- "no"
diseased <- medical_data$diagnosis == "diseased"
medical_data$treatment_received[diseased] <- sample(
  c("yes", "no"), sum(diseased), replace = TRUE, prob = c(0.9, 0.1)
)
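Before running the audit, a quick base-R cross-tabulation (not a leakr function) makes the leak visible: treatment status is almost perfectly informative about the diagnosis it is meant to help predict.
# Sanity check: the feature is only populated once the outcome is known
table(medical_data$treatment_received, medical_data$diagnosis)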
# Audit the medical data
medical_report <- leakr_audit(
  data = medical_data,
  target = "diagnosis",
  id = "patient_id"
)
print(medical_report)
#> $summary
#> data frame with 0 columns and 0 rows
#>
#> $evidence
#> list()
#>
#> $meta
#> $meta$n_detectors
#> [1] 0
#>
#> $meta$n_issues
#> [1] 0
#>
#> $meta$data_shape
#> [1] 500 6
#>
#> $meta$original_data_shape
#> [1] 500 6
#>
#> $meta$was_sampled
#> [1] FALSE
#>
#> $meta$detectors_run
#> NULL
#>
#> $meta$timestamp
#> [1] "2025-10-22 10:43:35 CEST"
#>
#> $meta$config_used
#> $meta$config_used$sample_size
#> [1] 50000
#>
#> $meta$config_used$correlation_threshold
#> [1] 0.8
#>
#> $meta$config_used$contamination_threshold
#> [1] 0.1
#>
#> $meta$config_used$numeric_severity
#> [1] TRUE
#>
#> $meta$config_used$plot_results
#> [1] FALSE
#>
#> $meta$config_used$parallel
#> [1] FALSE
#>
#> $meta$config_used$seed
#> [1] 123
#>
#>
#>
#> attr(,"class")
#> [1] "leakr_report"# Create financial data with temporal leakage
set.seed(789)
dates <- seq(as.Date("2020-01-01"), as.Date("2023-12-31"), by = "month")
financial_data <- data.frame(
  account_id = 1:200,
  transaction_date = sample(dates, 200, replace = TRUE),
  amount = rlnorm(200, 4, 1),
  account_balance = rnorm(200, 1000, 500),
  default_risk = factor(sample(c("low", "high"), 200, replace = TRUE))
)
# Sort by date
financial_data <- financial_data[order(financial_data$transaction_date), ]
# Add a feature that uses future information (credit score after default assessment).
# Subset assignment draws exactly one score per row; an ifelse() call with two
# short random vectors of different lengths would rely on silent recycling.
high_risk <- financial_data$default_risk == "high"
financial_data$credit_score_updated <- numeric(nrow(financial_data))
financial_data$credit_score_updated[high_risk] <- rnorm(sum(high_risk), 450, 50)
financial_data$credit_score_updated[!high_risk] <- rnorm(sum(!high_risk), 750, 75)
# Create temporal split
financial_data$split <- ifelse(
  financial_data$transaction_date < as.Date("2022-01-01"),
  "train",
  "test"
)
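It is also worth confirming that both outcome classes appear on each side of the temporal split before auditing; a plain contingency table does the job:
# Class balance across the temporal train/test split
table(financial_data$split, financial_data$default_risk)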
# Audit financial data
financial_report <- leakr_audit(
  data = financial_data,
  target = "default_risk",
  split = "split",
  id = "account_id"
)
print(financial_report)
#> $summary
#> data frame with 0 columns and 0 rows
#>
#> $evidence
#> list()
#>
#> $meta
#> $meta$n_detectors
#> [1] 0
#>
#> $meta$n_issues
#> [1] 0
#>
#> $meta$data_shape
#> [1] 200 7
#>
#> $meta$original_data_shape
#> [1] 200 7
#>
#> $meta$was_sampled
#> [1] FALSE
#>
#> $meta$detectors_run
#> NULL
#>
#> $meta$timestamp
#> [1] "2025-10-22 10:43:35 CEST"
#>
#> $meta$config_used
#> $meta$config_used$sample_size
#> [1] 50000
#>
#> $meta$config_used$correlation_threshold
#> [1] 0.8
#>
#> $meta$config_used$contamination_threshold
#> [1] 0.1
#>
#> $meta$config_used$numeric_severity
#> [1] TRUE
#>
#> $meta$config_used$plot_results
#> [1] FALSE
#>
#> $meta$config_used$parallel
#> [1] FALSE
#>
#> $meta$config_used$seed
#> [1] 123
#>
#>
#>
#> attr(,"class")
#> [1] "leakr_report"# Create customer dataset with near-duplicates
set.seed(321)
# Original customers
customers <- data.frame(
  name = c("John Smith", "Jane Doe", "Bob Johnson", "Alice Brown", "Charlie Davis"),
  email = c("john@email.com", "jane@email.com", "bob@email.com", "alice@email.com", "charlie@email.com"),
  age = c(35, 28, 42, 31, 39),
  income = c(50000, 45000, 75000, 55000, 62000),
  purchase_category = factor(c("electronics", "books", "clothing", "electronics", "books"))
)
# Create near-duplicates with slight variations
near_dupes <- customers[1:3, ]
near_dupes$name <- c("J Smith", "Jane D", "Robert Johnson") # Name variations
near_dupes$email <- c("john.smith@email.com", "j.doe@email.com", "bob.johnson@email.com") # Email variations
near_dupes$age <- near_dupes$age + c(1, 0, -1) # Age variations
# Combine datasets
all_customers <- rbind(customers, near_dupes)
all_customers$customer_id <- 1:nrow(all_customers)
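For intuition about what a duplicate detector has to catch, here is a minimal base-R sketch using utils::adist; this is illustrative only, not leakr's internal logic, and the distance threshold of 6 is an arbitrary choice for this toy data.
# Pairwise edit distances between names; small values flag likely near-duplicates
name_dist <- adist(all_customers$name)
near_pairs <- which(name_dist <= 6 & upper.tri(name_dist), arr.ind = TRUE)
near_pairs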
# Audit for duplicates
dup_report <- leakr_audit(
  data = all_customers,
  target = "purchase_category",
  id = "customer_id"
)
print(dup_report)
#> $summary
#> data frame with 0 columns and 0 rows
#>
#> $evidence
#> list()
#>
#> $meta
#> $meta$n_detectors
#> [1] 0
#>
#> $meta$n_issues
#> [1] 0
#>
#> $meta$data_shape
#> [1] 8 6
#>
#> $meta$original_data_shape
#> [1] 8 6
#>
#> $meta$was_sampled
#> [1] FALSE
#>
#> $meta$detectors_run
#> NULL
#>
#> $meta$timestamp
#> [1] "2025-10-22 10:43:35 CEST"
#>
#> $meta$config_used
#> $meta$config_used$sample_size
#> [1] 50000
#>
#> $meta$config_used$correlation_threshold
#> [1] 0.8
#>
#> $meta$config_used$contamination_threshold
#> [1] 0.1
#>
#> $meta$config_used$numeric_severity
#> [1] TRUE
#>
#> $meta$config_used$plot_results
#> [1] FALSE
#>
#> $meta$config_used$parallel
#> [1] FALSE
#>
#> $meta$config_used$seed
#> [1] 123
#>
#>
#>
#> attr(,"class")
#> [1] "leakr_report"# Example of custom configuration for sensitive detection
sensitive_config <- list(
  sample_size = 5000,          # Limit sample size for large datasets
  correlation_threshold = 0.7, # Lower threshold for correlation-based detection
  duplicate_threshold = 0.9    # Threshold for considering records as duplicates
)
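The config_used block in the report below suggests that user-supplied options are merged over leakr's defaults, with unrecognised entries such as duplicate_threshold carried along. Conceptually the merge behaves like base R's modifyList(); here is a sketch using the two default values visible in the earlier reports:
# Illustration only: options override matching defaults, extras are appended
defaults <- list(sample_size = 50000, correlation_threshold = 0.8)
str(modifyList(defaults, sensitive_config))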
# Apply custom configuration
iris_sensitive <- leakr_audit(
  data = iris,
  target = "Species",
  config = sensitive_config
)
print(iris_sensitive)
#> $summary
#> data frame with 0 columns and 0 rows
#>
#> $evidence
#> list()
#>
#> $meta
#> $meta$n_detectors
#> [1] 0
#>
#> $meta$n_issues
#> [1] 0
#>
#> $meta$data_shape
#> [1] 150 5
#>
#> $meta$original_data_shape
#> [1] 150 5
#>
#> $meta$was_sampled
#> [1] FALSE
#>
#> $meta$detectors_run
#> NULL
#>
#> $meta$timestamp
#> [1] "2025-10-22 10:43:36 CEST"
#>
#> $meta$config_used
#> $meta$config_used$sample_size
#> [1] 5000
#>
#> $meta$config_used$correlation_threshold
#> [1] 0.7
#>
#> $meta$config_used$contamination_threshold
#> [1] 0.1
#>
#> $meta$config_used$numeric_severity
#> [1] TRUE
#>
#> $meta$config_used$plot_results
#> [1] FALSE
#>
#> $meta$config_used$parallel
#> [1] FALSE
#>
#> $meta$config_used$seed
#> [1] 123
#>
#> $meta$config_used$duplicate_threshold
#> [1] 0.9
#>
#>
#>
#> attr(,"class")
#> [1] "leakr_report"# Create a large imbalanced dataset
set.seed(555)
large_n <- 10000
large_data <- data.frame(
  feature1 = rnorm(large_n),
  feature2 = sample(letters[1:10], large_n, replace = TRUE),
  feature3 = rnorm(large_n, 100, 20),
  # Imbalanced target
  target = factor(sample(c("rare", "common"), large_n, replace = TRUE, prob = c(0.05, 0.95)))
)
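The sampling step below relies on a stratified_sample() helper, which is not part of base R; if it is not already available in your session, a minimal sketch that draws rows in proportion to class frequency could look like this:
# Minimal stratified sampler (illustrative): per-class quotas proportional
# to class frequency, rows drawn without replacement within each class
stratified_sample <- function(strata, n) {
  counts <- round(table(strata) / length(strata) * n)
  unlist(lapply(names(counts), function(cl) {
    sample(which(strata == cl), counts[[cl]])
  }), use.names = FALSE)
}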
# Use stratified sampling to ensure representation
sample_indices <- stratified_sample(large_data$target, 1000)
sampled_data <- large_data[sample_indices, ]
# Verify sampling maintained class balance
table(large_data$target)
#>
#> common rare
#> 9504 496
table(sampled_data$target)
#>
#> common rare
#> 950 50
# Audit sampled data
large_report <- leakr_audit(
  data = sampled_data,
  target = "target"
)
print(large_report)
#> $summary
#> data frame with 0 columns and 0 rows
#>
#> $evidence
#> list()
#>
#> $meta
#> $meta$n_detectors
#> [1] 0
#>
#> $meta$n_issues
#> [1] 0
#>
#> $meta$data_shape
#> [1] 1000 4
#>
#> $meta$original_data_shape
#> [1] 1000 4
#>
#> $meta$was_sampled
#> [1] FALSE
#>
#> $meta$detectors_run
#> NULL
#>
#> $meta$timestamp
#> [1] "2025-10-22 10:43:36 CEST"
#>
#> $meta$config_used
#> $meta$config_used$sample_size
#> [1] 50000
#>
#> $meta$config_used$correlation_threshold
#> [1] 0.8
#>
#> $meta$config_used$contamination_threshold
#> [1] 0.1
#>
#> $meta$config_used$numeric_severity
#> [1] TRUE
#>
#> $meta$config_used$plot_results
#> [1] FALSE
#>
#> $meta$config_used$parallel
#> [1] FALSE
#>
#> $meta$config_used$seed
#> [1] 123
#>
#>
#>
#> attr(,"class")
#> [1] "leakr_report"# Create complex dataset for comprehensive analysis
complex_data <- data.frame(
  id = 1:300,
  timestamp = seq(as.POSIXct("2023-01-01"), as.POSIXct("2023-12-31"), length.out = 300),
  feature_a = rnorm(300),
  feature_b = sample(LETTERS[1:5], 300, replace = TRUE),
  feature_c = rnorm(300, 50, 10),
  outcome = factor(sample(c("success", "failure"), 300, replace = TRUE))
)
# Add intentional leakage for demonstration
complex_data$leaky_feature <- ifelse(complex_data$outcome == "success", 1, 0)
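Because leaky_feature is derived directly from the outcome, the two are perfectly associated; a plain contingency table confirms this before the audit runs:
# Perfect separation: every "success" row has leaky_feature == 1
table(complex_data$leaky_feature, complex_data$outcome)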
# Generate comprehensive audit
detailed_report <- leakr_audit(
  data = complex_data,
  target = "outcome",
  id = "id"
)
# Generate detailed summary
detailed_summary <- leakr_summarise(detailed_report, top_n = 10, show_config = TRUE)
#> Leakage Audit Report
#> ===================
#> Data shape: 300 x 7
#> Detectors run:
#> Timestamp: 2025-10-22 10:43:36
#>
#> ✓ No leakage issues detected.
print(detailed_summary)
#> data frame with 0 columns and 0 rows

Implement a systematic approach to leakage detection:
# Multi-stage validation function
comprehensive_validation <- function(data, target, id = NULL, split = NULL) {
  cat("Stage 1: Basic data validation\n")
  # Basic preprocessing and validation
  clean_data <- validate_and_preprocess_data(data, target, split, id)

  cat("Stage 2: Initial leakage screening\n")
  # Quick initial screening
  initial_report <- leakr_audit(clean_data, target = target, split = split, id = id)

  cat("Stage 3: Detailed analysis\n")
  # Generate detailed summary
  summary_report <- leakr_summarise(initial_report, top_n = 15, show_config = TRUE)

  # Count critical issues (the report stores per-detector findings in $evidence)
  if (length(initial_report$evidence) > 0) {
    critical_count <- sum(sapply(initial_report$evidence, function(x)
      !is.null(x$severity) && x$severity == "high"))
    if (critical_count > 0) {
      cat("WARNING:", critical_count, "critical issues detected!\n")
    }
  }

  return(list(
    data = clean_data,
    audit = initial_report,
    summary = summary_report
  ))
}
# Example usage
# validation_result <- comprehensive_validation(your_data, "target_column")

Adapt validation to your specific domain:
# Example: E-commerce specific validation
ecommerce_validation <- function(data, target) {
  # Standard audit
  base_report <- leakr_audit(data, target = target)

  # Domain-specific checks
  issues <- list()

  # Check for post-purchase features
  post_purchase_patterns <- c("return", "refund", "satisfaction", "rating")
  feature_names <- names(data)
  for (pattern in post_purchase_patterns) {
    matching_features <- grep(pattern, feature_names, value = TRUE, ignore.case = TRUE)
    if (length(matching_features) > 0) {
      issues <- append(issues, paste("Potential post-purchase feature:",
                                     paste(matching_features, collapse = ", ")))
    }
  }

  if (length(issues) > 0) {
    cat("Domain-specific warnings:\n")
    for (issue in issues) {
      cat("-", issue, "\n")
    }
  }

  return(base_report)
}
# Example e-commerce data
ecommerce_data <- data.frame(
  customer_id = 1:100,
  purchase_amount = rlnorm(100, 4, 1),
  product_category = sample(c("electronics", "books", "clothing"), 100, replace = TRUE),
  customer_satisfaction = sample(1:5, 100, replace = TRUE), # Post-purchase!
  will_repurchase = factor(sample(c("yes", "no"), 100, replace = TRUE))
)
# Validate e-commerce data
ecommerce_report <- ecommerce_validation(ecommerce_data, "will_repurchase")
#> Domain-specific warnings:
#> - Potential post-purchase feature: customer_satisfaction

This vignette has demonstrated advanced leakage detection techniques, including target leakage in medical data, temporal leakage in financial records, near-duplicate customer detection, custom detector configuration, stratified sampling for large imbalanced datasets, and domain-specific validation workflows.
The key to effective leakage detection is understanding your data domain and systematically applying appropriate detection techniques. leakr provides the flexibility to adapt these techniques to your specific requirements whilst maintaining robust detection capabilities.
For integration with popular ML frameworks, see the “Framework Integration” vignette.