# Clear the workspace by removing all objects
rm(list = ls())
# Load the pacman package if it is not already installed.
# if (!requireNamespace("pacman", quietly = TRUE)) {
# install.packages("pacman")
# }
#
# pacman::p_load(
# dplyr, # Data manipulation and transformation
# data.table, # Fast data manipulation with data tables
# stringi, # Character string processing
# lubridate, # Date and time handling
# janitor, # Data cleaning and tabulation functions
# parallel, # Parallel computing
# fastLink, # Record linkage and deduplication
# stringdist # String distance computation
# )
# Get the number of detected cores minus 1
# Reserve one core for non-computational tasks to help prevent system slowdowns or unresponsiveness
numCores <- parallel::detectCores() - 1
Example data frames dfA and
#data(samplematch)
dfA <- data.frame(
FIRST_NAME = c("John", "Mary", "Robert", "Michael", "Jennifer", "David", "Karen", "Maria", "Carlos", "James"),
LAST_NAME = c("Smith III", "Johnson!", "Williams123", "Brown", "Jones", "Davis", "Miller", "Garcia", "Martinez", "Andrson"),
BIRTH_DATE = c("1981-05-20", "1990-05-15", "1978-12-10", "1985-08-02", "1993-11-25", "1977-03-30", "1988-06-18", "1991-02-05", "1980-09-12", "1982-07-09"),
gender = c("M", "F", "M", "M", "F", "M", "F", "F", "M", "M")
)
dfB <- data.frame(
RecipientNameFirst = c("John", "Mary", "Robert", "Michael", "Jennifer", "David", "Karenn", "Carloas", "Mariaa", "James"),
RecipientNameLast = c("Smith iv", "Brown-", "Williams", "Jones", "John son", "No Name", "Miller", "Martinez", "Garcia", "Anderson"),
RecipientDateOfBirth = c("1981-05-21", "1992-09-25", "1978-10-12", "1985-08-02", "1993-11-25", "1977-03-30", "1988-06-18", "1980-09-12", "1991-02-05", "1982-07-09"),
gender = c("M", "F", "M", "M", "F", "M", "F", "M", "F", "M")
)
# # Example data frames dfA and dfB
# dfA <- fread(file = 'dfA.csv',
# sep = ",",
# header = T,
# nThread = numCores)
#
# dfB <- fread(file = 'dfB.csv',
# sep = ",",
# header = T,
# nThread = numCores)
knitr::kable(dfA)
John |
Smith III |
1981-05-20 |
M |
Mary |
Johnson! |
1990-05-15 |
F |
Robert |
Williams123 |
1978-12-10 |
M |
Michael |
Brown |
1985-08-02 |
M |
Jennifer |
Jones |
1993-11-25 |
F |
David |
Davis |
1977-03-30 |
M |
Karen |
Miller |
1988-06-18 |
F |
Maria |
Garcia |
1991-02-05 |
F |
Carlos |
Martinez |
1980-09-12 |
M |
James |
Andrson |
1982-07-09 |
M |
knitr::kable(dfB)
John |
Smith iv |
1981-05-21 |
M |
Mary |
Brown- |
1992-09-25 |
F |
Robert |
Williams |
1978-10-12 |
M |
Michael |
Jones |
1985-08-02 |
M |
Jennifer |
John son |
1993-11-25 |
F |
David |
No Name |
1977-03-30 |
M |
Karenn |
Miller |
1988-06-18 |
F |
Carloas |
Martinez |
1980-09-12 |
M |
Mariaa |
Garcia |
1991-02-05 |
F |
James |
Anderson |
1982-07-09 |
M |
Assign a unique key ID to each row
dfA <- dfA |>
dplyr::mutate(row_idA = paste("dfA_", dplyr::row_number(), sep = ""))
dfB <- dfB |>
dplyr::mutate(row_idB = paste("dfB_", dplyr::row_number(), sep = ""))
Create new date variables by splitting date of birth into three
different parts.
dfA <- dfA |>
dplyr::mutate(dob_day = as.numeric(lubridate::day(BIRTH_DATE)),
dob_month = as.numeric(lubridate::month(BIRTH_DATE)),
dob_year = as.numeric(lubridate::year(BIRTH_DATE)),
DOB=BIRTH_DATE)
dfB <- dfB |>
dplyr::mutate(dob_day = as.numeric(lubridate::day(RecipientDateOfBirth)),
dob_month = as.numeric(lubridate::month(RecipientDateOfBirth)),
dob_year = as.numeric(lubridate::year(RecipientDateOfBirth)),
DOB=RecipientDateOfBirth)
Define a function for data cleaning with additional name removal
logic
clean_names <- function(names_column) {
# Step 0: Convert to uppercase
names_column_new <- toupper(names_column)
# Step 1: Remove specified name suffixes
toRemove <- c(" JR", " SR", " IV", " III", " II")
for (tR in toRemove) {
names_column_new <- gsub(tR, "", names_column_new)
}
# Step 2: Convert special characters to ASCII equivalents
names_column_new <- iconv(names_column_new, "latin1", "ASCII//TRANSLIT", sub = "")
# Step 3: Remove punctuation, digits, and all sapces
names_column_new <- gsub("[[:punct:][:digit:]][[:space:]]", "", names_column_new)
# Step 4: Create a new variable with only alphabetic characters
names_column_new <- gsub("[^[:alpha:]]", "", names_column_new)
return(names_column_new)
}
Create no name list
NoNameList <- c(
"NICKNAME",
"NOFAMILYNAME",
"NOFIRSTNAME",
"NOLASTNAME",
"NOMIDDLENAME",
"NONAME",
"NO",
"UNKNOWN",
"UNK",
"UN",
"NA"
)
#
# # Blank out the names in the data if they match any of the strings in the NoNameList
dfA <- dfA |>
dplyr::mutate(FN = dplyr::case_when(
FN %in% NoNameList~ "",
TRUE ~ FN),
LN = dplyr::case_when(
LN %in% NoNameList ~ "",
TRUE ~ LN))
dfB <- dfB |>
dplyr::mutate(FN = dplyr::case_when(
FN %in% NoNameList~ "",
TRUE ~ FN),
LN = dplyr::case_when(
LN %in% NoNameList ~ "",
TRUE ~ LN))
rm(NoNameList)
Delete rows that have missing First Name (FN), Last Name (LN), or
Date of Birth (DOB).
dfA <- dfA |>
dplyr::filter(!is.na(FN) & FN != "" & !is.na( LN) & LN != "" & !is.na(DOB) )
dfB <- dfB |>
dplyr::filter(!is.na(FN) & FN != "" & !is.na( LN) & LN != "" & !is.na(DOB) )
Exact Matching
Exact.match <- merge(dfA, dfB, by=c("FN","LN","DOB", "gender"))
nrow(Exact.match)
## [1] 0
Using the fastLink R package for record linkage
matches.out <- fastLink::fastLink(
dfA = dfA, dfB = dfB,
# Specify the vector of variable names to be used for matching.
# These variable names should exist in both dfA and dfB
varnames = c('FN', 'LN', 'dob_day', 'dob_month', 'dob_year', 'gender'),
# Specify which variables among varnames should be compared using string distance
stringdist.match = c('FN', 'LN'),
# Specify which variables present in stringdist.match can be partially matched
partial.match = c('FN', 'LN'),
# Specify which variables should be matched numerically
# Must be a subset of 'varnames' and must not be present in 'stringdist.match'.
numeric.match = c('dob_day', 'dob_month', 'dob_year'),
# Specify the number of CPU cores to utilize (parallel processing). The default value is NULL.
n.cores = numCores,
return.all = TRUE,
return.df = TRUE
)
##
## ====================
## fastLink(): Fast Probabilistic Record Linkage
## ====================
##
## Calculating matches for each variable.
## Getting counts for parameter estimation.
## Parallelizing calculation using OpenMP. 1 threads out of 12 are used.
## Running the EM algorithm.
## Getting the indices of estimated matches.
## Parallelizing calculation using OpenMP. 1 threads out of 12 are used.
## Deduping the estimated matches.
## Getting the match patterns for each estimated match.
Confusion Matrice
# The output from fastLink() will be a list that includes a matrix where each row
# is a match with the relevant indices of dfA (column 1) and dfB (column 2).
fastLink::confusion(matches.out, threshold = 0.98)
## $confusion.table
## 'True' Matches 'True' Non-Matches
## Declared Matches 7.99 0.01
## Declared Non-Matches 0.00 1.00
##
## $addition.info
## results
## Max Number of Obs to be Matched 9.00
## Sensitivity (%) 99.99
## Specificity (%) 98.56
## Positive Predicted Value (%) 99.82
## Negative Predicted Value (%) 99.90
## False Positive Rate (%) 1.44
## False Negative Rate (%) 0.01
## Correctly Classified (%) 99.83
## F1 Score (%) 99.90
# Examine the EM object:
#matches.out$EM
Summarize the accuracy of the match:
# each column gives the match count, match rate,
# false discovery rate (FDR) and false negative rate (FNR)
# under different cutoffs for matches based on the posterior
# probability of a match.
summary(matches.out)
## 95% 85% 75% Exact
## 1 Match Count 8 8 8 5
## 2 Match Rate 88.727% 88.727% 88.727% 55.556%
## 3 FDR 0.182% 0.182% 0.182%
## 4 FNR 0.003% 0.003% 0.003%
Get fuzzy matches using the results from fastLink
# A threshold of 0.98 is set for match classification
fuzzy_matches <- fastLink::getMatches(dfA, dfB,
fl.out = matches.out,
threshold.match = 0.98)
# dput(names(fuzzy_matches))
# c("FIRST_NAME", "LAST_NAME", "BIRTH_DATE", "gender", "row_idA",
# "dob_day", "dob_month", "dob_year", "DOB", "FIRST_NAME_new",
# "LAST_NAME_new", "FN", "LN", "RecipientNameFirst", "RecipientNameLast",
# "RecipientDateOfBirth", "row_idB", "RecipientNameFirst_new",
# "RecipientNameLast_new", "gamma.1", "gamma.2", "gamma.3", "gamma.4",
# "gamma.5", "gamma.6", "posterior")
## Display nicely...
fuzzy_matches |>
dplyr::mutate( posterior = round(posterior*100, digits = 3)) |>
dplyr::select("posterior", "FIRST_NAME", "LAST_NAME","BIRTH_DATE", "RecipientNameFirst", "RecipientNameLast","RecipientDateOfBirth" ) |>
knitr::kable()
1 |
100.000 |
John |
Smith III |
1981-05-20 |
John |
Smith iv |
1981-05-21 |
3 |
98.586 |
Robert |
Williams123 |
1978-12-10 |
Robert |
Williams |
1978-10-12 |
4 |
99.979 |
Michael |
Brown |
1985-08-02 |
Michael |
Jones |
1985-08-02 |
5 |
99.979 |
Jennifer |
Jones |
1993-11-25 |
Jennifer |
John son |
1993-11-25 |
7 |
100.000 |
Karen |
Miller |
1988-06-18 |
Karenn |
Miller |
1988-06-18 |
9 |
100.000 |
Carlos |
Martinez |
1980-09-12 |
Carloas |
Martinez |
1980-09-12 |
8 |
100.000 |
Maria |
Garcia |
1991-02-05 |
Mariaa |
Garcia |
1991-02-05 |
10 |
100.000 |
James |
Andrson |
1982-07-09 |
James |
Anderson |
1982-07-09 |
Blocking data
blockdata_out <- fastLink::blockData(dfA, dfB,
varnames = c('FN', 'LN', 'dob_day', 'dob_month', 'dob_year'),
# window blocking for numeric variables, where a given observation in dataset A
#will be compared to all observations in dataset B where the value of the blocking
#variable is within ±K of the value of the same variable in dataset A.
#The value of K is the size of the window
# window.block = "birthyear",
# window.size = 1,
# using k-means clustering, so that similar values of string and numeric
# variables are blocked together.
kmeans.block = "FN",
nclusters = 2)
##
## ====================
## blockData(): Blocking Methods for Record Linkage
## ====================
##
## Blocking variables.
## Blocking variable FN using k-means blocking.
## Blocking variable LN using exact blocking.
## Blocking variable dob_day using exact blocking.
## Blocking variable dob_month using exact blocking.
## Blocking variable dob_year using exact blocking.
##
## Combining blocked variables for final blocking assignments.
Aggregating Multiple Matches Together
#Often, we run several different matches for a single data set -
#for instance, when blocking by gender or by some other criterion to reduce the
#number of pairwise comparisons. Here, we walk through how to aggregate those
#multiple matches into a single summary.
blockgender_out <- fastLink::blockData(dfA, dfB, varnames = "gender")
##
## ====================
## blockData(): Blocking Methods for Record Linkage
## ====================
##
## Blocking variables.
## Blocking variable gender using exact blocking.
##
## Combining blocked variables for final blocking assignments.
## Subset dfA into blocks
dfA_block1 <- dfA[blockgender_out$block.1$dfA.inds,]
dfA_block2 <- dfA[blockgender_out$block.2$dfA.inds,]
## Subset dfB into blocks
dfB_block1 <- dfB[blockgender_out$block.1$dfB.inds,]
dfB_block2 <- dfB[blockgender_out$block.2$dfB.inds,]
## Run fastLink on each
fl_out_block1 <- fastLink::fastLink(
dfA_block1, dfB_block1,
varnames = c('FN', 'LN', 'dob_day', 'dob_month', 'dob_year'),
n.cores = numCores
)
##
## ====================
## fastLink(): Fast Probabilistic Record Linkage
## ====================
##
## If you set return.all to FALSE, you will not be able to calculate a confusion table as a summary statistic.
## Calculating matches for each variable.
## Getting counts for parameter estimation.
## Parallelizing calculation using OpenMP. 1 threads out of 12 are used.
## Running the EM algorithm.
## Getting the indices of estimated matches.
## Parallelizing calculation using OpenMP. 1 threads out of 12 are used.
## Deduping the estimated matches.
## Getting the match patterns for each estimated match.
fl_out_block2 <- fastLink::fastLink(
dfA_block2, dfB_block2,
varnames = c('FN', 'LN', 'dob_day', 'dob_month', 'dob_year'),
n.cores = numCores
)
##
## ====================
## fastLink(): Fast Probabilistic Record Linkage
## ====================
##
## If you set return.all to FALSE, you will not be able to calculate a confusion table as a summary statistic.
## Calculating matches for each variable.
## Getting counts for parameter estimation.
## Parallelizing calculation using OpenMP. 1 threads out of 12 are used.
## Running the EM algorithm.
## Getting the indices of estimated matches.
## Parallelizing calculation using OpenMP. 1 threads out of 12 are used.
## Deduping the estimated matches.
## Getting the match patterns for each estimated match.
#Here, we run fastLink() on the subsets of data defined by blocking on gender in the previous section:
## Run fastLink on each
link.1 <- fastLink::fastLink(
dfA_block1, dfB_block1,
varnames = c('FN', 'LN', 'dob_day', 'dob_month', 'dob_year'),
n.cores = numCores
)
##
## ====================
## fastLink(): Fast Probabilistic Record Linkage
## ====================
##
## If you set return.all to FALSE, you will not be able to calculate a confusion table as a summary statistic.
## Calculating matches for each variable.
## Getting counts for parameter estimation.
## Parallelizing calculation using OpenMP. 1 threads out of 12 are used.
## Running the EM algorithm.
## Getting the indices of estimated matches.
## Parallelizing calculation using OpenMP. 1 threads out of 12 are used.
## Deduping the estimated matches.
## Getting the match patterns for each estimated match.
link.2 <- fastLink::fastLink(
dfA_block2, dfB_block2,
varnames = c('FN', 'LN', 'dob_day', 'dob_month', 'dob_year'),
n.cores = numCores
)
##
## ====================
## fastLink(): Fast Probabilistic Record Linkage
## ====================
##
## If you set return.all to FALSE, you will not be able to calculate a confusion table as a summary statistic.
## Calculating matches for each variable.
## Getting counts for parameter estimation.
## Parallelizing calculation using OpenMP. 1 threads out of 12 are used.
## Running the EM algorithm.
## Getting the indices of estimated matches.
## Parallelizing calculation using OpenMP. 1 threads out of 12 are used.
## Deduping the estimated matches.
## Getting the match patterns for each estimated match.
#To aggregate the two matches into a single summary, we use the aggregateEM() function as follows:
agg.out <- fastLink::aggregateEM(em.list = list(link.1, link.2))
summary(agg.out )
## 95% 85% 75% Exact
## 1 Match Count 8 8 8 0
## 2 Match Rate 88.889% 88.889% 88.889% 0%
## 3 FDR 0% 0% 0%
## 4 FNR 0% 0% 0%
fuzzy_matches2 <- fastLink::getMatches(dfA, dfB,
fl.out = agg.out,
threshold.match = 0.98)
# dput(names(fuzzy_matches2))
# ## Display nicely...
# fuzzy_matches2 |>
# dplyr::mutate( posterior = round(posterior*100, digits = 3)) |>
# dplyr::select("posterior", "FIRST_NAME", "LAST_NAME","BIRTH_DATE", "RecipientNameFirst", "RecipientNameLast","RecipientDateOfBirth" ) |>
# knitr::kable()