Skip to content

Commit

Permalink
Merge pull request #167 from cancervariants/staging
Browse files Browse the repository at this point in the history
Staging
  • Loading branch information
korikuzma authored Apr 8, 2022
2 parents 05fd07d + d8133c4 commit 0223fad
Show file tree
Hide file tree
Showing 24 changed files with 546 additions and 523 deletions.
26 changes: 11 additions & 15 deletions .ebextensions/02_app_config.config
Original file line number Diff line number Diff line change
Expand Up @@ -9,42 +9,38 @@ commands:
command: "export $(cat /opt/elasticbeanstalk/deployment/env | xargs)"

container_commands:
01_refseq_gene_symbols_download:
test: test ! -f "/var/app/venv/staging-LQM1lest/lib/python3.8/site-packages/variation/data/refseq_gene_symbols.txt"
command: "aws s3 cp s3://${AWS_BUCKET_NAME}/variation/refseq_gene_symbols.txt /var/app/venv/staging-LQM1lest/lib/python3.8/site-packages/variation/data/refseq_gene_symbols.txt --region us-east-2"
01_uta_permissions:
test: test -d "/var/app/venv/staging-LQM1lest/lib/python3.8/site-packages/uta_tools"
command: "chmod -R 777 /var/app/venv/staging-LQM1lest/lib/python3.8/site-packages/uta_tools/data"

02_mane_grch38_refseq_download:
test: test ! -f "/var/app/venv/staging-LQM1lest/lib/python3.8/site-packages/variation/data/MANE.GRCh38.v0.93.summary.txt"
command: "aws s3 cp s3://${AWS_BUCKET_NAME}/variation/MANE.GRCh38.v0.93.summary.txt /var/app/venv/staging-LQM1lest/lib/python3.8/site-packages/variation/data/MANE.GRCh38.v0.93.summary.txt --region us-east-2"

03_s3_download:
02_s3_download:
test: test ! -d "/usr/local/share/seqrepo"
command: "aws s3 cp s3://${AWS_BUCKET_NAME}/${AWS_SEQREPO_OBJECT} /usr/local/share/seqrepo.zip --region us-east-2"

04_unzip_seqrepo:
03_unzip_seqrepo:
test: test -f "/usr/local/share/seqrepo.zip"
command: "unzip /usr/local/share/seqrepo.zip -d /usr/local/share"

05_seqrepo_permission:
04_seqrepo_permission:
test: test -d "/usr/local/share/seqrepo"
command: "chmod -R 777 /usr/local/share/seqrepo"

06_macosx_permission:
05_macosx_permission:
test: test -d "/usr/local/share/__MACOSX"
command: "chmod -R +wr /usr/local/share/__MACOSX"

07_seqrepo_zip_permission:
06_seqrepo_zip_permission:
test: test -f "/usr/local/share/seqrepo.zip"
command: "chmod +wr /usr/local/share/seqrepo.zip"

08_remove_macosx:
07_remove_macosx:
test: test -d "/usr/local/share/__MACOSX"
command: "rm -R /usr/local/share/__MACOSX"

09_remove_seqrepo_zip:
08_remove_seqrepo_zip:
test: test -f "/usr/local/share/seqrepo.zip"
command: "rm /usr/local/share/seqrepo.zip"

10_data_permission:
9_data_permission:
test: test -d "/usr/local/share/seqrepo"
command: "chmod -R +wrx /usr/local/share/seqrepo"
1 change: 1 addition & 0 deletions .flake8
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
[flake8]
ignore = D205, D400, W503
max-line-length = 88
exclude =
.git
venv
Expand Down
15 changes: 8 additions & 7 deletions Pipfile
Original file line number Diff line number Diff line change
Expand Up @@ -4,27 +4,27 @@ verify_ssl = true
name = "pypi"

[packages]
"ga4gh.vrs" = {version = ">=0.7.2", extras = ["extras"]}
"ga4gh.vrs" = {version = ">=0.7.5.dev1", extras = ["extras"]}
civicpy = "*"
requests = "*"
jsondiff = "*"
pydantic = "*"
requests-cache = "*"
gene-normalizer = ">=0.1.23"
gene-normalizer = ">=0.1.25"
disease-normalizer = ">=0.2.12"
thera-py = ">=0.3.3"
thera-py = ">=0.3.4"
neo4j = "*"
uvicorn = "*"
fastapi = ">=0.72.0"
click = "*"
fastapi = "*"
uvloop = "*"
websockets = "*"
httptools = "*"
typing-extensions = "*"
boto3 = "*"
botocore = "*"
variation-normalizer = ">=0.2.18"
"ga4gh.vrsatile.pydantic" = ">=0.0.6"
variation-normalizer = ">=0.3.0"
"ga4gh.vrsatile.pydantic" = ">=0.0.11"
asyncclick = "*"

[dev-packages]
metakb = {editable = true, path = "."}
Expand All @@ -39,3 +39,4 @@ pytest-cov = "*"
jupyterlab = "*"
jupyter = "*"
ipykernel = "*"
pytest-asyncio = "*"
4 changes: 4 additions & 0 deletions metakb/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,10 @@
logging.getLogger("biocommons.seqrepo.seqaliasdb.seqaliasdb").setLevel(logging.INFO) # noqa: E501
logging.getLogger("biocommons.seqrepo.fastadir.fastadir").setLevel(logging.INFO) # noqa: E501
logging.getLogger("requests_cache.patcher").setLevel(logging.INFO)
logging.getLogger("bioregistry.resource_manager").setLevel(logging.INFO)
logging.getLogger("blib2to3.pgen2.driver").setLevel(logging.INFO)
logging.getLogger("neo4j").setLevel(logging.INFO)
logging.getLogger("asyncio").setLevel(logging.INFO)
logger.handlers = []

if 'METAKB_NORM_EB_PROD' in environ:
Expand Down
99 changes: 46 additions & 53 deletions metakb/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
import tempfile
from zipfile import ZipFile

import click
import asyncclick as click
from disease.database import Database as DiseaseDatabase
from disease.schemas import SourceName as DiseaseSources
from disease.cli import CLI as DiseaseCLI
Expand All @@ -36,6 +36,14 @@
logger.setLevel(logging.DEBUG)


def echo_info(msg: str):
"""Log (as INFO) and echo given message.
:param str msg: message to emit
"""
click.echo(msg)
logger.info(msg)


class CLI:
"""Update database."""

Expand Down Expand Up @@ -106,13 +114,12 @@ class CLI:
"from VICC S3 bucket, and load the database with retrieved "
"data. Exclusive with --load_latest_cdms and load_target_cdm.")
)
def update_metakb_db(db_url: str, db_username: str, db_password: str,
load_normalizers_db: bool,
force_load_normalizers_db: bool,
normalizers_db_url: str,
load_latest_cdms: bool,
load_target_cdm: Optional[Path],
load_latest_s3_cdms: bool):
async def update_metakb_db(
db_url: str, db_username: str, db_password: str,
load_normalizers_db: bool, force_load_normalizers_db: bool,
normalizers_db_url: str, load_latest_cdms: bool,
load_target_cdm: Optional[Path], load_latest_s3_cdms: bool
):
"""Execute data harvest and transformation from resources and upload
to graph datastore.
"""
Expand All @@ -135,13 +142,11 @@ def update_metakb_db(db_url: str, db_username: str, db_password: str,
CLI()._load_normalizers_db(force_load_normalizers_db)

CLI()._harvest_sources()
CLI()._transform_sources()
await CLI()._transform_sources()

# Load neo4j database
start = timer()
msg = "Loading neo4j database..."
click.echo(msg)
logger.info(msg)
echo_info("Loading neo4j database...")

g = Graph(uri=db_url, credentials=(db_username, db_password))
if load_target_cdm:
Expand All @@ -166,9 +171,9 @@ def update_metakb_db(db_url: str, db_username: str, db_password: str,
g.load_from_json(path)
g.close()
end = timer()
msg = f"Successfully loaded neo4j database in {(end-start):.5f} s"
click.echo(f"{msg}\n")
logger.info(msg)
echo_info(
f"Successfully loaded neo4j database in {(end-start):.5f} s\n"
)

s3_cdm_pattern = re.compile(
r"cdm/20[23]\d[01]\d[0123]\d/(.*)_cdm_(.*).json.zip")
Expand All @@ -182,9 +187,7 @@ def _retrieve_s3_cdms(self) -> str:
:raise: FileNotFoundError if unable to find files matching expected
pattern in VICC MetaKB bucket.
"""
msg = "Attempting to fetch CDM files from S3 bucket"
logger.info(msg)
click.echo(msg)
echo_info("Attempting to fetch CDM files from S3 bucket")
s3 = boto3.resource("s3", config=Config(region_name="us-east-2"))
if not s3:
raise ResourceLoadException("Unable to initiate AWS S3 Resource")
Expand Down Expand Up @@ -218,70 +221,60 @@ def _retrieve_s3_cdms(self) -> str:
if newest_version is None:
raise FileNotFoundError("Unable to locate files matching expected "
"resource pattern in VICC s3 bucket")
msg = f"Retrieved CDM files dated {newest_version}"
click.echo(msg)
logger.info(msg)
echo_info(f"Retrieved CDM files dated {newest_version}")
return newest_version

@staticmethod
def _harvest_sources() -> None:
"""Run harvesting procedure for all sources."""
logger.info("Harvesting sources...")
echo_info("Harvesting sources...")
# TODO: Switch to using constant
harvester_sources = {
'civic': CIViCHarvester,
'moa': MOAHarvester
}
total_start = timer()
for source_str, source_class in harvester_sources.items():
harvest_start = f"Harvesting {source_str}..."
click.echo(harvest_start)
logger.info(harvest_start)
echo_info(f"Harvesting {source_str}...")
start = timer()
source: Harvester = source_class()
source_successful = source.harvest()
end = timer()
if not source_successful:
logger.info(f'{source_str} harvest failed.')
echo_info(f'{source_str} harvest failed.')
click.get_current_context().exit()
harvest_finish = \
f"{source_str} harvest finished in {(end - start):.5f} s"
click.echo(harvest_finish)
logger.info(harvest_finish)
echo_info(
f"{source_str} harvest finished in {(end - start):.5f} s")
total_end = timer()
msg = f"Successfully harvested all sources in " \
f"{(total_end - total_start):.5f} s"
click.echo(f"{msg}\n")
logger.info(msg)
echo_info(
f"Successfully harvested all sources in "
f"{(total_end - total_start):.5f} s\n"
)

@staticmethod
def _transform_sources() -> None:
async def _transform_sources() -> None:
"""Run transformation procedure for all sources."""
logger.info("Transforming harvested data to CDM...")
echo_info("Transforming harvested data to CDM...")
# TODO: Switch to using constant
transform_sources = {
'civic': CIViCTransform,
'moa': MOATransform
}
total_start = timer()
for src_str, src_name in transform_sources.items():
transform_start = f"Transforming {src_str}..."
click.echo(transform_start)
logger.info(transform_start)
echo_info(f"Transforming {src_str}...")
start = timer()
source: Transform = src_name()
source.transform()
await source.transform()
end = timer()
transform_end = \
f"{src_str} transform finished in {(end - start):.5f} s."
click.echo(transform_end)
logger.info(transform_end)
echo_info(
f"{src_str} transform finished in {(end - start):.5f} s.")
source.create_json()
total_end = timer()
msg = f"Successfully transformed all sources to CDM in " \
f"{(total_end-total_start):.5f} s"
click.echo(f"{msg}\n")
logger.info(msg)
echo_info(
f"Successfully transformed all sources to CDM in "
f"{(total_end-total_start):.5f} s\n"
)

def _load_normalizers_db(self, load_normalizer_db):
"""Load normalizer database source data.
Expand All @@ -306,7 +299,7 @@ def _load_normalizers_db(self, load_normalizer_db):
name = \
str(normalizer_cli).split()[1].split('.')[0][1:].capitalize()
self._update_normalizer_db(name, load_source, normalizer_cli)
click.echo("Normalizers database loaded.\n")
echo_info("Normalizers database loaded.\n")

@staticmethod
def _check_normalizer(db, sources) -> bool:
Expand Down Expand Up @@ -335,15 +328,15 @@ def _update_normalizer_db(name, load_normalizer, source_cli) -> None:
"""
if load_normalizer:
try:
click.echo(f'\nLoading {name} Normalizer data...')
echo_info(f'\nLoading {name} Normalizer data...')
source_cli.update_normalizer_db(
['--update_all', '--update_merged'])
click.echo(f'Successfully Loaded {name} Normalizer data.\n')
echo_info(f'Successfully Loaded {name} Normalizer data.\n')
except SystemExit as e:
if e.code != 0:
raise e
else:
click.echo(f'{name} Normalizer is already loaded.\n')
echo_info(f'{name} Normalizer is already loaded.\n')

@staticmethod
def _check_db_param(param: str, name: str) -> str:
Expand Down Expand Up @@ -382,4 +375,4 @@ def _help_msg(msg: str = ""):


if __name__ == '__main__':
CLI().update_metakb_db()
CLI().update_metakb_db(_anyio_backend="asyncio")
8 changes: 4 additions & 4 deletions metakb/database.py
Original file line number Diff line number Diff line change
Expand Up @@ -235,7 +235,7 @@ def _add_variation_descriptor(tx, descriptor_in: Dict,
expressions = descriptor.get('expressions')
if expressions:
for expression in expressions:
syntax = expression['syntax'].split(':')[1]
syntax = expression['syntax'].split('.')[1]
key = f"expressions_{syntax}"
if key in descriptor:
descriptor[key].append(expression['value'])
Expand All @@ -247,9 +247,9 @@ def _add_variation_descriptor(tx, descriptor_in: Dict,
'xrefs', 'alternate_labels',
'structural_type',
'molecule_context',
'expressions_transcript',
'expressions_genomic',
'expressions_protein',
'expressions_c',
'expressions_g',
'expressions_p',
'vrs_ref_allele_seq'))]

# handle extensions
Expand Down
26 changes: 14 additions & 12 deletions metakb/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,16 +57,17 @@ def custom_openapi():
response_model=SearchService,
description=search_description,
response_model_exclude_none=True)
def search(variation: Optional[str] = Query(None, description=v_description),
disease: Optional[str] = Query(None, description=d_description),
therapy: Optional[str] = Query(None, description=t_description),
gene: Optional[str] = Query(None, description=g_description),
statement_id: Optional[str] = Query(None, description=s_description), # noqa: E501
detail: Optional[bool] = Query(False, description=detail_description) # noqa: E501
):
async def search(
variation: Optional[str] = Query(None, description=v_description),
disease: Optional[str] = Query(None, description=d_description),
therapy: Optional[str] = Query(None, description=t_description),
gene: Optional[str] = Query(None, description=g_description),
statement_id: Optional[str] = Query(None, description=s_description),
detail: Optional[bool] = Query(False, description=detail_description)
):
"""Search endpoint"""
return query.search(variation, disease, therapy, gene, statement_id,
detail)
resp = await query.search(variation, disease, therapy, gene, statement_id, detail)
return resp


search_statements_summary = (
Expand All @@ -83,15 +84,16 @@ def search(variation: Optional[str] = Query(None, description=v_description),
response_model=SearchStatementsService,
description=search_statements_descr,
response_model_exclude_none=True)
def get_statements(
async def get_statements(
variation: Optional[str] = Query(None, description=v_description),
disease: Optional[str] = Query(None, description=d_description),
therapy: Optional[str] = Query(None, description=t_description),
gene: Optional[str] = Query(None, description=g_description),
statement_id: Optional[str] = Query(None, description=s_description)):
"""Return nested statements for queried concepts"""
return query.search_statements(
variation, disease, therapy, gene, statement_id)
resp = await query.search_statements(variation, disease, therapy, gene,
statement_id)
return resp


id_query_desc = ("Given Meta-KB statement_id, proposition_id, descriptor_id,"
Expand Down
Loading

0 comments on commit 0223fad

Please sign in to comment.