Skip to content

Commit 1be1737

Browse files
committed
[DC-1022] Adds script to generate concept_ancestor_extension table.
Signed-off-by: Krishna Kalluri <[email protected]>
1 parent e047a54 commit 1be1737

File tree

2 files changed

+177
-1
lines changed

2 files changed

+177
-1
lines changed

data_steward/common.py

+4-1
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,11 @@
11
# Python imports
22
import os
33

4+
import jinja2
5+
46
# Project imports
57
from constants.bq_utils import VALIDATION_DATASET_REGEX
68
from constants.validation.participants.identity_match import REPORT_DIRECTORY_REGEX
7-
import jinja2
89

910
# AOU required PII tables
1011
PII_WILDCARD = 'pii*'
@@ -112,6 +113,8 @@
112113
CONCEPT, CONCEPT_ANCESTOR, CONCEPT_CLASS, CONCEPT_RELATIONSHIP,
113114
CONCEPT_SYNONYM, DOMAIN, DRUG_STRENGTH, RELATIONSHIP, VOCABULARY
114115
]
116+
CONCEPT_ANCESTOR_EXTENSION = 'concept_ancestor_extension'
117+
115118
# Achilles
116119
ACHILLES_ANALYSIS = 'achilles_analysis'
117120
ACHILLES_RESULTS = 'achilles_results'
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,173 @@
1+
"""
2+
concept_relationship stores two types of relationships
3+
1) hierarchical relationships: 'is_a' / 'subsume', which define the hierarchy of the vocabulary;
4+
2) associative relationships: relationships across the hierarchy such as Drug A
5+
'is an indication of' Condition B. The concept_ancestor table is built based on 'Is A' and 'Subsume' relationships
6+
recursively so any ancestor/descendant pairs (regardless of the levels of separation) are pre-computed for us.
7+
8+
The relationship for LOINC component concepts (e.g. Triglycerides) used to be in the subsumption relationship with lab
9+
concepts (e.g. Triglyceride [Mass or Moles/volume] in Serum or Plasma) in the previous version of vocab, however, the
10+
OMOP vocab team changed this relationship from 'subsume' to 'component of' to align with the LOINC system. As a
11+
consequence, concept_ancestor is missing all of the ancestor/descendant relationships involving LOINC component concepts.
12+
13+
This script generates the concept_ancestor_extension table for all the concepts in the Measurement domain using the LOINC hierarchy.
14+
"""
15+
16+
import argparse
17+
import logging
18+
19+
from common import CONCEPT_ANCESTOR_EXTENSION
20+
from utils import bq
21+
22+
LOGGER = logging.getLogger(__name__)
23+
24+
# BigQuery script template that builds the concept ancestor extension table.
#
# The {project}, {dataset} and {ancestor_extension} placeholders are filled in
# with str.format, so the SQL must not contain any other literal braces.
#
# Steps performed by the script:
#   1. Seed the table with every direct ('Subsumes' / 'Component of') pair whose
#      ancestor is a LOINC Measurement concept and whose descendant is a
#      Measurement concept (levels_of_separation = 1).
#   2. Loop: extend every known pair by one more relationship hop, insert the
#      pairs that are not in the table yet, and stop once an iteration adds
#      nothing new.
#   3. Collapse the table to one row per (ancestor, descendant) pair with the
#      min and max levels of separation.
#
# Self pairs (a concept reached again from itself) are dropped while building
# each iteration's candidates. They can never match an existing row, so if they
# were kept they would be counted as "new" on every pass and the loop would not
# terminate.
# NOTE(review): cycles that do not pass through the ancestor itself can still
# produce ever-growing levels_of_separation; the vocabulary is assumed acyclic.
CONCEPT_ANCESTOR_EXT_QUERY = '''
DECLARE
  num_of_new_records INT64;
  -- Instantiate concept_ancestor_extension with all LOINC measurement concepts and direct descendant concepts
CREATE OR REPLACE TABLE
  `{project}.{dataset}.{ancestor_extension}` ( ancestor_concept_id INT64,
    descendant_concept_id INT64,
    levels_of_separation INT64 ) AS (
  SELECT
    DISTINCT cr.concept_id_1 AS ancestor_concept_id,
    cr.concept_id_2 AS descendant_concept_id,
    1 AS levels_of_separation
  FROM (
    SELECT
      concept_id AS ancestor_concept_id
    FROM
      `{project}.{dataset}.concept` AS c
    WHERE
      c.vocabulary_id = 'LOINC'
      AND domain_id = 'Measurement' ) AS loinc_ids
  JOIN
    `{project}.{dataset}.concept_relationship` AS cr
  ON
    loinc_ids.ancestor_concept_id = cr.concept_id_1
    AND relationship_id IN ('Subsumes',
      'Component of')
    AND cr.concept_id_1 <> cr.concept_id_2
  JOIN
    `{project}.{dataset}.concept` AS c2
  ON
    cr.concept_id_2 = c2.concept_id
    AND c2.domain_id = 'Measurement' );
LOOP
  -- Candidate pairs one hop further down from every pair currently in the table
  CREATE OR REPLACE TEMP TABLE descendants_next_iteration AS (
    SELECT
      DISTINCT cae.ancestor_concept_id,
      cr.concept_id_2 AS descendant_concept_id,
      cae.levels_of_separation + 1 AS levels_of_separation
    FROM
      `{project}.{dataset}.{ancestor_extension}` AS cae
    JOIN
      `{project}.{dataset}.concept_relationship` AS cr
    ON
      cae.descendant_concept_id = cr.concept_id_1
      AND relationship_id IN ('Subsumes',
        'Component of')
      AND cr.concept_id_1 <> cr.concept_id_2
    JOIN
      `{project}.{dataset}.concept` AS c2
    ON
      cr.concept_id_2 = c2.concept_id
      AND c2.domain_id = 'Measurement'
    WHERE
      -- never record a concept as its own descendant
      cae.ancestor_concept_id <> cr.concept_id_2 );
  -- Count the candidates that are not in the table yet (anti-join)
  SET
    num_of_new_records = (
    SELECT
      COUNT(*)
    FROM
      descendants_next_iteration AS cae_new
    LEFT JOIN
      `{project}.{dataset}.{ancestor_extension}` AS cae
    ON
      cae_new.ancestor_concept_id = cae.ancestor_concept_id
      AND cae_new.descendant_concept_id = cae.descendant_concept_id
      AND cae_new.levels_of_separation = cae.levels_of_separation
    WHERE
      cae.ancestor_concept_id IS NULL );
  IF
    num_of_new_records = 0 THEN
    LEAVE
      ;
  END IF
    ;
  -- Insert exactly the rows counted above
  INSERT
    `{project}.{dataset}.{ancestor_extension}`
  SELECT
    cae_new.*
  FROM
    descendants_next_iteration AS cae_new
  LEFT JOIN
    `{project}.{dataset}.{ancestor_extension}` AS cae
  ON
    cae_new.ancestor_concept_id = cae.ancestor_concept_id
    AND cae_new.descendant_concept_id = cae.descendant_concept_id
    AND cae_new.levels_of_separation = cae.levels_of_separation
  WHERE
    cae.ancestor_concept_id IS NULL;
END LOOP
  ;
  -- Collapse to one row per ancestor/descendant pair
CREATE OR REPLACE TABLE
  `{project}.{dataset}.{ancestor_extension}` AS
SELECT
  ancestor_concept_id,
  descendant_concept_id,
  MIN(levels_of_separation) AS min_levels_of_separation,
  MAX(levels_of_separation) AS max_levels_of_separation
FROM
  `{project}.{dataset}.{ancestor_extension}`
GROUP BY
  ancestor_concept_id,
  descendant_concept_id;
'''
127+
128+
129+
def generate_concept_ancestor_extension(project_id, dataset_id):
    """
    Run the script that builds the concept ancestor extension table from the
    concept relationship table for the LOINC hierarchy

    :param project_id: identifier for project id
    :param dataset_id: identifier for dataset
    :return: Bq job result
    """
    # Values substituted into the placeholders of the query template
    template_params = {
        'project': project_id,
        'dataset': dataset_id,
        'ancestor_extension': CONCEPT_ANCESTOR_EXTENSION,
    }
    job = bq.get_client(project_id).query(
        CONCEPT_ANCESTOR_EXT_QUERY.format(**template_params))
    return job.result()
145+
146+
147+
def get_args_parser():
    """
    Build the command line argument parser for this script

    Both options are required. Note that the value of -d/--dataset_id is
    stored on the parsed namespace as ``ehr_dataset_id``.

    :return: the configured argparse.ArgumentParser
    """
    parser = argparse.ArgumentParser(
        description='Generates the concept_ancestor_extension table '
        'from the LOINC hierarchy in concept_relationship.')
    parser.add_argument(
        '-p',
        '--project_id',
        dest='project_id',
        action='store',
        help='Identifies the project containing the dataset.',
        required=True)
    parser.add_argument(
        '-d',
        '--dataset_id',
        # dest kept as-is so existing callers reading args.ehr_dataset_id still work
        dest='ehr_dataset_id',
        action='store',
        help=
        'Identifies the dataset where the concept_ancestor_extension table is to be created.',
        required=True)
    return parser
166+
167+
168+
if __name__ == '__main__':
    # Without any logging configuration the INFO record emitted below is
    # discarded (the default last-resort handler only shows WARNING and above).
    # basicConfig does nothing if the root logger already has handlers.
    logging.basicConfig(level=logging.INFO)

    args_parser = get_args_parser()
    args = args_parser.parse_args()
    concept_ancestor_ext = generate_concept_ancestor_extension(
        args.project_id, args.ehr_dataset_id)
    LOGGER.info(concept_ancestor_ext)

0 commit comments

Comments
 (0)