Skip to content

Commit cc66aab

Browse files
committed
Handle missing values in blocked MPI records (#242)
## Description This PR adds a filtering step to the `get_block_data` function in the `mpi_service`. The filter removes from consideration any MPI candidate that was retrieved only because it belongs to a person cluster containing a patient who satisfied the blocking criteria, but whose own present (i.e. non-missing) values disagree with the incoming record's blocking values. Candidates that are missing a blocked field are kept. This makes matching more precise and reinforces the validity of the blocking step. ## Related Issues #231 ## Additional Notes N/A
1 parent a398914 commit cc66aab

File tree

3 files changed

+142
-14
lines changed

3 files changed

+142
-14
lines changed

src/recordlinker/database/mpi_service.py

+69-1
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,73 @@
1818
from recordlinker import schemas
1919

2020

21+
def _filter_incorrect_blocks(
    record: schemas.PIIRecord,
    patients: typing.Sequence[models.Patient],
    blocking_keys: list[str],
) -> list[models.Patient]:
    """
    Filter a set of candidates returned via blocking from the MPI. The initial
    SQL query returns a collection of candidates comprised of all patients in
    the MPI belonging to a Person cluster for which at *least one* patient
    satisfied blocking criteria. This function filters that candidate set to
    include *only* those patients who either satisfied blocking criteria or
    were missing a value for one or more blocked fields. This eliminates
    patients from consideration who have mismatched blocking information but
    belonged to a Person cluster where a different record had correct blocking
    values.

    A candidate is kept when, for every blocking key the incoming record has
    a value for, the candidate either has no blocking value for that key or
    shares at least one blocking value with the incoming record. Blocking keys
    for which the incoming record yields no values are not compared, since
    there is nothing for a candidate to disagree with.

    :param record: The PIIRecord of the incoming patient.
    :param patients: The initial collection of candidates retrieved from the MPI.
    :param blocking_keys: A list of strings of the fields used for blocking.
    :returns: A filtered list of Patients from the MPI, in their original order.
    """
    # Resolve each blocking key once (not once per candidate) and collect the
    # incoming record's blocking values for it. Keys have already been getattr
    # validated by the caller, so there is no need to check that they exist.
    incoming_vals_by_key = []
    for bk in blocking_keys:
        key = getattr(models.BlockingKey, bk)
        incoming_vals = list(record.blocking_keys(key))
        if incoming_vals:
            incoming_vals_by_key.append((key, incoming_vals))

    def _is_compatible(patient: models.Patient) -> bool:
        # Compare the incoming blocking values to what would be the blocking
        # values of the MPI record, so both sides are compared on e.g. the
        # same number of characters at the beginning/end of a string.
        mpi_record = patient.record
        for key, allowed_vals in incoming_vals_by_key:
            mpi_vals = mpi_record.blocking_keys(key)
            # A missing value on the MPI side counts as compatible. Otherwise
            # at least one incoming value must be present; `any` over a
            # generator stops at the first shared value.
            if mpi_vals and not any(val in mpi_vals for val in allowed_vals):
                # A present-but-disagreeing field rules the candidate out;
                # there is no need to inspect the remaining keys.
                return False
        return True

    # The input sequence is left untouched; a new list is built instead.
    return [patient for patient in patients if _is_compatible(patient)]
86+
87+
2188
def get_block_data(
2289
session: orm.Session, record: schemas.PIIRecord, algorithm_pass: models.AlgorithmPass
2390
) -> typing.Sequence[models.Patient]:
@@ -65,7 +132,8 @@ def get_block_data(
65132
# NOTE: We probably apply the filter here to throw away patients with
66133
# non-empty but wrong blocking fields?
67134
expr = expression.select(models.Patient).where(models.Patient.person_id.in_(base))
68-
return session.execute(expr).scalars().all()
135+
candidates = session.execute(expr).scalars().all()
136+
return _filter_incorrect_blocks(record, candidates, algorithm_pass.blocking_keys)
69137

70138

71139
def insert_patient(

tests/unit/assets/possible_match_default_patient_bundle.json

+7-7
Original file line numberDiff line numberDiff line change
@@ -84,16 +84,16 @@
8484
"birthDate": "1980-01-01",
8585
"gender": "male",
8686
"address": [
87-
{
87+
{
8888
"line": [
8989
"Bay 16",
9090
"Ward Sector 24"
9191
],
92-
"city": "Brooklyn",
93-
"state": "New York",
94-
"postalCode": "54321",
92+
"city": "Boston",
93+
"state": "Massachusetts",
94+
"postalCode": "99999",
9595
"use": "home"
96-
}
96+
}
9797
],
9898
"telecom": [
9999
{
@@ -134,7 +134,7 @@
134134
"birthDate": "1980-01-01",
135135
"gender": "male",
136136
"address": [
137-
{
137+
{
138138
"line": [
139139
"1234 Silversun Strip"
140140
],
@@ -161,4 +161,4 @@
161161
}
162162
}
163163
]
164-
}
164+
}

tests/unit/database/test_mpi_service.py

+66-6
Original file line numberDiff line numberDiff line change
@@ -396,7 +396,9 @@ class TestGetBlockData:
396396
@pytest.fixture
397397
def prime_index(self, session: Session):
398398
person_1 = models.Person()
399+
person_2 = models.Person()
399400
session.add(person_1)
401+
session.add(person_2)
400402
session.flush()
401403

402404
data = [
@@ -498,8 +500,36 @@ def prime_index(self, session: Session):
498500
],
499501
"birthdate": "",
500502
},
501-
models.Person(),
503+
person_2,
502504
),
505+
(
506+
{
507+
"name": [
508+
{
509+
"given": [
510+
"Ferris",
511+
],
512+
"family": "Bueller",
513+
}
514+
],
515+
"birthdate": "1974-11-07",
516+
},
517+
person_2
518+
),
519+
(
520+
{
521+
"name": [
522+
{
523+
"given": [
524+
"Ferris",
525+
],
526+
"family": "Bueller",
527+
}
528+
],
529+
"birthdate": "1983-08-17",
530+
},
531+
person_2
532+
)
503533
]
504534
for datum, person in data:
505535
mpi_service.insert_patient(session, schemas.PIIRecord(**datum), person=person)
@@ -553,6 +583,30 @@ def test_block_empty_block_key(self, session: Session, prime_index: None):
553583
matches = mpi_service.get_block_data(session, schemas.PIIRecord(**data), algorithm_pass)
554584
assert len(matches) == 0
555585

586+
def test_block_filter_mpi_candidates(self, session: Session, prime_index: None):
    """
    Verify that candidates pulled from the MPI are filtered down to those
    that either agree on the blocking fields or are missing information.
    A patient that sits in a retrieved cluster but carries wrong blocking
    values must be dropped from consideration.
    """
    payload = {
        "name": [{"given": ["Ferris"], "family": "Bueller"}],
        "birthdate": "1974-11-07",
    }
    blocking_pass = models.AlgorithmPass(blocking_keys=["BIRTHDATE", "FIRST_NAME"])
    # The person cluster initially holds 3 patients: one agrees on
    # blocking, one has missing values, and one is wrong. Only the
    # wrong one should be thrown away.
    candidates = mpi_service.get_block_data(
        session, schemas.PIIRecord(**payload), blocking_pass
    )
    assert len(candidates) == 2
609+
556610
def test_block_on_birthdate(self, session: Session, prime_index: None):
557611
data = {
558612
"name": [
@@ -600,7 +654,8 @@ def test_block_on_first_name(self, session: Session, prime_index: None):
600654
}
601655
algorithm_pass = models.AlgorithmPass(blocking_keys=["FIRST_NAME"])
602656
matches = mpi_service.get_block_data(session, schemas.PIIRecord(**data), algorithm_pass)
603-
assert len(matches) == 5
657+
# One candidate in MPI person_1 is a Bill, will be ruled out
658+
assert len(matches) == 4
604659

605660
def test_block_on_birthdate_and_first_name(self, session: Session, prime_index: None):
606661
data = {
@@ -617,7 +672,8 @@ def test_block_on_birthdate_and_first_name(self, session: Session, prime_index:
617672
}
618673
algorithm_pass = models.AlgorithmPass(blocking_keys=["BIRTHDATE", "FIRST_NAME"])
619674
matches = mpi_service.get_block_data(session, schemas.PIIRecord(**data), algorithm_pass)
620-
assert len(matches) == 4
675+
# One candidate in MPI person_1 is just a Bill, ruled out
676+
assert len(matches) == 3
621677

622678
def test_block_on_birthdate_first_name_and_last_name(self, session: Session, prime_index: None):
623679
data = {
@@ -636,7 +692,8 @@ def test_block_on_birthdate_first_name_and_last_name(self, session: Session, pri
636692
blocking_keys=["BIRTHDATE", "FIRST_NAME", "LAST_NAME"]
637693
)
638694
matches = mpi_service.get_block_data(session, schemas.PIIRecord(**data), algorithm_pass)
639-
assert len(matches) == 3
695+
# One person in MPI person_1 is just a Bill, ruled out
696+
assert len(matches) == 2
640697
data = {
641698
"name": [
642699
{
@@ -649,7 +706,9 @@ def test_block_on_birthdate_first_name_and_last_name(self, session: Session, pri
649706
"birthdate": "Jan 1 1980",
650707
}
651708
matches = mpi_service.get_block_data(session, schemas.PIIRecord(**data), algorithm_pass)
652-
assert len(matches) == 3
709+
# Blocking uses feature_iter, which yields only the first `given` for a
710+
# single name object, so only the patient with 'Bill' is caught
711+
assert len(matches) == 1
653712
data = {
654713
"name": [
655714
{
@@ -681,7 +740,8 @@ def test_block_on_multiple_names(self, session: Session, prime_index: None):
681740
kwargs={},
682741
)
683742
matches = mpi_service.get_block_data(session, schemas.PIIRecord(**data), algorithm_pass)
684-
assert len(matches) == 5
743+
# One of patients in MPI person_1 is a Bill, so is excluded
744+
assert len(matches) == 4
685745

686746
def test_block_missing_keys(self, session: Session, prime_index: None):
687747
data = {"birthdate": "01/01/1980"}

0 commit comments

Comments
 (0)