Creating Patient Identifiers and Time-Based Episodes with epidm

Introduction

The epidm R package provides utilities to support patient identity resolution and clinical episode construction.

This vignette introduces two key functions:

  1. uk_patient_id()
    Generates unique patient ids using known patient identifiers with multiple staged rules.

  2. group_time()
    Groups time intervals into clinically meaningful episodes (static or rolling windows).

Both functions are designed for healthcare data, for example SGSS, HES/SUS data and ECDS data where repeated records across systems must be linked or aggregated.

1. uk_patient_id() function

Input Requirements

A dataset should ideally contain:

You must also supply a list mapping your column names using the id = list(...) argument.

Example usage

# 1) Create example data (shown so readers can reproduce)
id_test <-
  data.frame(
    stringsAsFactors = FALSE,
    record_id = c(1L,2L,3L,4L,
                  5L,6L,7L,8L,9L,10L,11L,12L,13L,14L,15L,
                  16L,17L,18L,19L,20L,21L,22L,23L,24L),
    nhs_number = c(9435754422,
                   9435754422,NA,9435754422,5555555555,NA,
                   9435773982,NA,9999999999,NA,9435773982,NA,
                   9435802508,9435802508,NA,NA,9435802508,9435802508,NA,
                   3333333333,NA,9999999999,9435817777,
                   9435817777),
    local_patient_identifier = c(NA,"IG12067",
                                 NA,NA,"IG12067","IG12067","KR2535","KR2535",
                                 "KR2535",NA,NA,NA,"UK8734","UK8734",NA,NA,
                                 "UK8734","UK8734",NA,NA,"JH45204",
                                 "HS45202","HS45202","JH45204"),
    patient_birth_date = c("1993-07-16",
                           "1993-07-16","1993-07-16","1993-07-16",
                           "1993-07-16",NA,"1967-02-10",NA,"1967-02-10",NA,NA,
                           "1967-02-10",NA,NA,"1952-10-22","1952-10-22",
                           "1952-10-22",NA,"1947-09-14","1947-09-14",
                           "1947-09-14","1947-09-14","1947-09-14",
                           "1947-09-14"),
    sex = c("Male","Male",
            "Male","Male",NA,"Male","Female","Female",
            "Female","Female","Female","Female","Male",
            "Male","Male","Male","Male","Male","Male",
            "Male","Male","Male",NA,"Male"),
    forename = c(NA,"DENNIS",
                 NA,NA,"DENNIS",NA,"ELLIE","ELLIE",NA,
                 "ELLIE","ELLIE","ELLIE","IAN","IAN","MALCOLM",
                 "IAN","IAN",NA,"GRANT","ALAN","ALAN","ALAN",
                 "GRANT","ALAN"),
    surname = c(NA,"NEDRY",
                "NEDRY",NA,"NEDRY","NEDRY","SATTLER","SATTLER",
                NA,"SATTLER","SATTLER","SATTLER","M",NA,
                "IAN","MALCOLM","MALCOLM",NA,"ALAN","GRANT",
                "GRANT","GRANT","ALAN","GRANT"),
    postcode = c("HA4 0FF",
                 "HA4 0FF","HA4 0FF",NA,"HA4 0FF","HA4 0FF",
                 "L3 1DZ","L3 1DZ","L3 1DZ","L3 1DZ",NA,"L3 1DZ",
                 "BN14 9EP",NA,"BN14 9EP",NA,NA,NA,"CW6 9TX",
                 "CW6 9TX",NA,NA,NA,NA),
    specimen_date = c("2024-08-14",
                      "2023-02-03","2023-02-07","2023-02-04",
                      "2023-02-09","2024-08-14","2021-03-28","2021-03-28",
                      "2021-03-28","2021-03-28","2021-03-28",
                      "2021-03-28","2024-07-06","2024-07-06","2024-07-06",
                      "2023-10-31","2023-10-31","2023-10-31",
                      "2022-01-23","2022-01-24","2022-01-25","2022-01-26",
                      "2022-01-27","2022-01-28")
  )

# 2) Run uk_patient_id()
result_id <- uk_patient_id(
  id_test,
  id = list(
    nhs_number    = 'nhs_number',
    hospital_number = 'local_patient_identifier',
    date_of_birth = 'patient_birth_date',
    sex_mfu       = 'sex',
    forename      = 'forename',
    surname       = 'surname',
    postcode      = 'postcode'
  ),
  .useStages   = 1:11,      # optional
  .keepStages  = FALSE,     # optional
  .keepValidNHS = FALSE     # optional
)
#> Warning in .f(.x[[i]], ...): NHS number is missing or empty
#> Warning in .f(.x[[i]], ...): NHS number is missing or empty
#> Warning in .f(.x[[i]], ...): NHS number is missing or empty
#> Warning in .f(.x[[i]], ...): NHS number is missing or empty
#> Warning in .f(.x[[i]], ...): NHS number is missing or empty
#> Warning in .f(.x[[i]], ...): NHS number is missing or empty
#> Warning in .f(.x[[i]], ...): NHS number is missing or empty
#> Warning in .f(.x[[i]], ...): NHS number is missing or empty
#> Warning in .f(.x[[i]], ...): NHS number is missing or empty

# 3) Show a compact preview
print(head(result_id), row.names = FALSE)
#>     id record_id nhs_number local_patient_identifier patient_birth_date    sex forename surname postcode
#>  <int>     <int>     <char>                   <char>             <char> <char>   <char>  <char>   <char>
#>      1         1 9435754422                     <NA>         1993-07-16      M     <NA>    <NA>   HA40FF
#>      1         2 9435754422                  IG12067         1993-07-16      M   DENNIS   NEDRY   HA40FF
#>      1         3       <NA>                     <NA>         1993-07-16      M     <NA>   NEDRY   HA40FF
#>      1         4 9435754422                     <NA>         1993-07-16      M     <NA>    <NA>     <NA>
#>      1         5 5555555555                  IG12067         1993-07-16   <NA>   DENNIS   NEDRY   HA40FF
#>      1         6       <NA>                  IG12067               <NA>      M     <NA>   NEDRY   HA40FF
#>  specimen_date
#>         <char>
#>     2024-08-14
#>     2023-02-03
#>     2023-02-07
#>     2023-02-04
#>     2023-02-09
#>     2024-08-14

2. group_time() function

Input Requirements

A dataset should ideally contain:

You must also supply the grouping and date arguments via function parameters:

Purpose

group_time() aggregates:

Two episode rules are supported:

Type Meaning
Static window Based on the first event only; all events within X days of the first event belong to the same episode.
Rolling window The window resets with each event; a new event inside X days of the previous event extends the episode.
# Events example (14‑day static window):
#1) Create example data

episode_test <- structure(
  list(
    pat_id = c(1L, 1L, 1L, 1L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 2L, 2L),
    species = c(rep("E. coli", 7), rep("K. pneumonia", 7)),
    spec_type = c(rep("Blood", 7), rep("Blood", 4), rep("Sputum", 3)),
    sp_date = structure(
      c(
        18262,
        18263,
        18281,
        18282,
        18262,
        18263,
        18281,
        18265,
        18270,
        18281,
        18283,
        18259,
        18260,
        18281
      ),
      class = "Date"
    )
  ),
  row.names = c(NA, -14L),
  class = "data.frame"
)

# 2) Run group_time() for events using a 14-day static window
ep_static <- group_time(
  x = episode_test,
  date_start = 'sp_date',
  window = 14,
  window_type = 'static',
  group_vars = c('pat_id', 'species', 'spec_type'),
  indx_varname = 'static_indx',   # optional
  min_varname = 'min_date',
  # optional (defaults)
  max_varname = 'max_date'      # optional (defaults)
)
# 3) Show a compact preview
print(head(ep_static), row.names = FALSE)
#>  pat_id      species spec_type    sp_date static_indx   min_date   max_date
#>   <int>       <char>    <char>     <Date>      <char>     <Date>     <Date>
#>       1      E. coli     Blood 2020-01-01       1.4.1 2020-01-01 2020-01-02
#>       1      E. coli     Blood 2020-01-02       1.4.1 2020-01-01 2020-01-02
#>       1      E. coli     Blood 2020-01-20       1.4.2 2020-01-20 2020-01-21
#>       1      E. coli     Blood 2020-01-21       1.4.2 2020-01-20 2020-01-21
#>       1 K. pneumonia     Blood 2020-01-04       2.4.1 2020-01-04 2020-01-09
#>       1 K. pneumonia     Blood 2020-01-09       2.4.1 2020-01-04 2020-01-09

# Intervals example (start + end dates):1) Create example interval data (start + end dates)
#1) Create example data
spell_test <- data.frame(
  id = c(rep(99, 6), rep(88, 4), rep(3, 3)),
  provider = c("YXZ", rep("ZXY", 5), rep("XYZ", 4), rep("YZX", 3)),
  spell_start = as.Date(
    c(
      "2020-03-01",
      "2020-07-07",
      "2020-02-08",
      "2020-04-28",
      "2020-03-15",
      "2020-07-01",
      "2020-01-01",
      "2020-01-12",
      "2019-12-25",
      "2020-03-28",
      "2020-01-01",
      NA,
      NA
    )
  ),
  spell_end = as.Date(
    c(
      "2020-03-10",
      "2020-07-26",
      "2020-05-22",
      "2020-04-30",
      "2020-05-20",
      "2020-07-08",
      "2020-01-23",
      "2020-03-30",
      "2020-01-02",
      "2020-04-20",
      "2020-01-01",
      NA,
      NA
    )
  )
)

# 2) Run group_time() for intervals (start + end dates)
spell_episodes <- group_time(
  x = spell_test,
  date_start = 'spell_start',
  date_end = 'spell_end',
  group_vars = c('id', 'provider'),
  indx_varname = 'spell_id',
  # optional
  min_varname = 'spell_min_date',
  # optional
  max_varname = 'spell_max_date'        # optional
)

# 3) Show a compact preview
print(head(spell_episodes), row.names = FALSE)
#>     id provider spell_start  spell_end spell_id spell_min_date spell_max_date
#>  <num>   <char>      <Date>     <Date>   <char>         <Date>         <Date>
#>     88      XYZ  2019-12-25 2020-01-02    1.4.0     2019-12-25     2020-04-20
#>     88      XYZ  2020-01-01 2020-01-23    1.4.0     2019-12-25     2020-04-20
#>      3      YZX  2020-01-01 2020-01-01    2.1.0     2020-01-01     2020-01-01
#>     88      XYZ  2020-01-12 2020-03-30    1.4.0     2019-12-25     2020-04-20
#>     99      ZXY  2020-02-08 2020-05-22    3.5.0     2020-02-08     2020-05-22
#>     99      YXZ  2020-03-01 2020-03-10    4.1.0     2020-03-01     2020-03-10