orders <- data.frame(
id = c(1L, 2L, 3L, 3L, 4L, 5L),
amount = c(100, 200, 150, 175, 300, 50)
)
customers <- data.frame(
id = c(2L, 3L, 6L),
name = c("Alice", "Bob", "Carol")
)
validate_join(orders, customers, by = "id")
#>
#> ── Join Validation: orders ↔ customers ─────────────────────────────────────────
#> Keys in orders: id
#> Keys in customers: id
#>
#> Item Value
#> ─────────────────────────────────────────── ───────────
#> Relationship many-to-one
#> Key(s) in orders [id] (1 col)
#> Key(s) in customers [id] (1 col)
#> Rows in orders 6
#> Distinct key combos in orders 5
#> Rows in customers 3
#> Distinct key combos in customers 3
#> Overlapping distinct key combos 2
#> Matched row pairs (cartesian) 3
#> Match rate from orders 50.00%
#> Match rate from customers 66.67%
#> Rows only in orders (no match in customers) 3
#> Rows only in customers (no match in orders) 1
#>
#> Duplicates: orders=yes customers=no
Different key names
When the key columns have different names, use a named vector:
products <- data.frame(prod_id = 1:3, price = c(10, 20, 30))
sales <- data.frame(item_id = c(1L, 1L, 2L), qty = c(5, 3, 7))
validate_join(products, sales, by = c("prod_id" = "item_id"))
#>
#> ── Join Validation: products ↔ sales ───────────────────────────────────────────
#> Keys in products: prod_id
#> Keys in sales: item_id
#>
#> Item Value
#> ───────────────────────────────────────── ───────────
#> Relationship one-to-many
#> Key(s) in products [prod_id] (1 col)
#> Key(s) in sales [item_id] (1 col)
#> Rows in products 3
#> Distinct key combos in products 3
#> Rows in sales 3
#> Distinct key combos in sales 2
#> Overlapping distinct key combos 2
#> Matched row pairs (cartesian) 3
#> Match rate from products 66.67%
#> Match rate from sales 100.00%
#> Rows only in products (no match in sales) 1
#> Rows only in sales (no match in products) 0
#>
#> Duplicates: products=no sales=yes
Stat tracking
Track the impact on a numeric column with stat (same
column name in both tables) or stat_x/stat_y
(different column names):
x <- data.frame(id = 1:4, revenue = c(100, 200, 300, 400))
y <- data.frame(id = c(2L, 3L, 5L), cost = c(10, 20, 30))
validate_join(x, y, by = "id", stat_x = "revenue", stat_y = "cost")
#>
#> ── Join Validation: x ↔ y ──────────────────────────────────────────────────────
#> Keys in x: id
#> Keys in y: id
#>
#> Item Value
#> ─────────────────────────────── ──────────
#> Relationship one-to-one
#> Key(s) in x [id] (1 col)
#> Key(s) in y [id] (1 col)
#> Rows in x 4
#> Distinct key combos in x 4
#> Rows in y 3
#> Distinct key combos in y 3
#> Overlapping distinct key combos 2
#> Matched row pairs (cartesian) 2
#> Match rate from x 50.00%
#> Match rate from y 66.67%
#> Rows only in x (no match in y) 2
#> Rows only in y (no match in x) 1
#>
#> ── Stat diagnostics ────────────────────────────────────────────────────────────
#>
#> revenue in x:
#> • Total: 1,000
#> • Matched: 500 (50.00%)
#> • Unmatched: 500 (50.00%)
#>
#> cost in y:
#> • Total: 60
#> • Matched: 30 (50.00%)
#> • Unmatched: 30 (50.00%)
#>
#> Duplicates: x=no y=no