From a86d8b68a19bf8bec52b6ab8125932013ce93732 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Thu, 15 Jan 2026 02:09:31 +0000 Subject: [PATCH 1/8] Initial plan From d1c98ce3f742254aee7625d7227ead7de27e61db Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Thu, 15 Jan 2026 02:14:13 +0000 Subject: [PATCH 2/8] Add tools/load_json.py tool Co-authored-by: gvanrossum-ms <184014243+gvanrossum-ms@users.noreply.github.com> --- tools/load_json.py | 156 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 156 insertions(+) create mode 100644 tools/load_json.py diff --git a/tools/load_json.py b/tools/load_json.py new file mode 100644 index 00000000..a0ac3749 --- /dev/null +++ b/tools/load_json.py @@ -0,0 +1,156 @@ +#!/usr/bin/env python3 +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. + +""" +JSON to SQLite Database Loader + +This tool loads a JSON-serialized podcast database into a SQLite database +that can be queried using tools/query.py. + +Usage: + python tools/load_json.py <index_path> --database <database> + python tools/load_json.py tests/testdata/Episode_53_AdrianTchaikovsky_index -d transcript.db + +The index_path should exclude the "_data.json" suffix. +""" + +import argparse +import asyncio +import os + +from typeagent.aitools import utils +from typeagent.knowpro.convsettings import ConversationSettings +from typeagent.podcasts import podcast +from typeagent.storage.sqlite.provider import SqliteStorageProvider +from typeagent.storage.utils import create_storage_provider + + +async def load_json_to_database( + podcast_file_prefix: str, + dbname: str, + verbose: bool = False, +) -> None: + """Load JSON-serialized podcast data into a SQLite database. 
+ + Args: + podcast_file_prefix: Path to podcast index files (without "_data.json" suffix) + dbname: Path to SQLite database file to create + verbose: Whether to show verbose output + """ + if verbose: + print(f"Loading podcast from JSON: {podcast_file_prefix}") + print(f"Target database: {dbname}") + + # Create settings and storage provider + settings = ConversationSettings() + settings.storage_provider = await create_storage_provider( + settings.message_text_index_settings, + settings.related_term_index_settings, + dbname, + podcast.PodcastMessage, + ) + + # Get the storage provider to check if database is empty + provider = await settings.get_storage_provider() + msgs = await provider.get_message_collection() + + # Check if database already has data + if await msgs.size() > 0: + raise RuntimeError( + f"Database {dbname!r} already has messages. " + "Please use an empty database or specify a new filename." + ) + + # Load podcast from JSON files + with utils.timelog(f"Loading podcast from {podcast_file_prefix!r}"): + conversation = await podcast.Podcast.read_from_file( + podcast_file_prefix, settings, dbname + ) + if isinstance(provider, SqliteStorageProvider): + provider.db.commit() + + # Print statistics + if verbose: + print(f"\nSuccessfully loaded podcast data:") + print(f" {await conversation.messages.size()} messages") + print(f" {await conversation.semantic_refs.size()} semantic refs") + if conversation.semantic_ref_index: + print( + f" {await conversation.semantic_ref_index.size()} semantic ref index entries" + ) + + print(f"\nDatabase created: {dbname}") + print(f"\nTo query the database, use:") + print(f" python tools/query.py --database '{dbname}' --query 'Your question here'") + + +def main(): + """Main entry point.""" + parser = argparse.ArgumentParser( + description="Load JSON-serialized podcast data into a SQLite database", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=""" +Examples: + python tools/load_json.py 
tests/testdata/Episode_53_AdrianTchaikovsky_index -d podcast.db + python tools/load_json.py path/to/index -d output.db -v + +Note: The index path should exclude the "_data.json" suffix. + """, + ) + + parser.add_argument( + "index_path", + help="Path to the podcast index files (excluding the '_data.json' suffix)", + ) + + parser.add_argument( + "-d", + "--database", + required=True, + help="Path to the SQLite database file to create", + ) + + parser.add_argument( + "-v", + "--verbose", + action="store_true", + help="Show verbose output including statistics", + ) + + args = parser.parse_args() + + # Validate index file exists + index_file = args.index_path + "_data.json" + if not os.path.exists(index_file): + raise SystemExit( + f"Error: Podcast index file not found: {index_file}\n" + f"Please verify the path exists and is accessible.\n" + f"Note: The path should exclude the '_data.json' suffix." + ) + + # Check if database already exists + if os.path.exists(args.database): + raise SystemExit( + f"Error: Database file already exists: {args.database}\n" + f"Please remove it first or choose a different filename." 
+ ) + + # Load environment variables for API access + utils.load_dotenv() + + # Run the loading process + try: + asyncio.run( + load_json_to_database( + args.index_path, + args.database, + args.verbose, + ) + ) + except (RuntimeError, ValueError) as err: + raise SystemExit(f"Error: {err}") + + +if __name__ == "__main__": + main() From d80014fd4dbf8f325f9700e18e085a8c85f71112 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Thu, 15 Jan 2026 02:15:50 +0000 Subject: [PATCH 3/8] Remove --podcast flag from query.py, require --database Co-authored-by: gvanrossum-ms <184014243+gvanrossum-ms@users.noreply.github.com> --- tools/query.py | 74 ++++++++++++++------------------------------------ 1 file changed, 20 insertions(+), 54 deletions(-) diff --git a/tools/query.py b/tools/query.py index 3e93ecde..0a109881 100644 --- a/tools/query.py +++ b/tools/query.py @@ -55,7 +55,6 @@ Topic, ) from typeagent.podcasts import podcast -from typeagent.storage.sqlite.provider import SqliteStorageProvider from typeagent.storage.utils import create_storage_provider ### Classes ### @@ -536,24 +535,6 @@ async def main(): args = parser.parse_args() fill_in_debug_defaults(parser, args) - # Validate required podcast argument - if args.podcast is None and args.database is None: - scriptname = sys.argv[0] - raise SystemExit( - f"Error: Either --podcast or --database is required.\n" - f"Usage: python {scriptname} --podcast \n" - f" or: python {scriptname} --database \n" - f"Example: python {scriptname} --podcast tests/testdata/Episode_53_AdrianTchaikovsky_index" - ) - if args.podcast is not None: - index_file = args.podcast + "_data.json" - if not os.path.exists(index_file): - raise SystemExit( - f"Error: Podcast index file not found: {index_file}\n" - f"Please verify the path exists and is accessible.\n" - f"Note: The path should exclude the '_data.json' suffix." 
- ) - if args.logfire: utils.setup_logfire() @@ -564,9 +545,24 @@ async def main(): args.database, podcast.PodcastMessage, ) - query_context = await load_podcast_index( - args.podcast, settings, args.database, args.verbose - ) + + # Load existing database + provider = await settings.get_storage_provider() + msgs = await provider.get_message_collection() + if await msgs.size() == 0: + raise SystemExit( + f"Error: Database {args.database!r} is empty.\n" + f"Please load data into the database first using tools/load_json.py:\n" + f" python tools/load_json.py --database {args.database}\n" + f"Example:\n" + f" python tools/load_json.py tests/testdata/Episode_53_AdrianTchaikovsky_index --database {args.database}" + ) + + with utils.timelog(f"Loading conversation from database {args.database!r}"): + conversation = await podcast.Podcast.create(settings) + + await print_conversation_stats(conversation, args.verbose) + query_context = query.QueryEvalContext(conversation) ar_list, ar_index = load_index_file( args.qafile, "question", QuestionAnswerData, args.verbose @@ -943,12 +939,6 @@ def make_arg_parser(description: str) -> argparse.ArgumentParser: ), ) - parser.add_argument( - "--podcast", - type=str, - default=None, - help="Path to the podcast index files (excluding the '_data.json' suffix)", - ) explain_qa = "a list of questions and answers to test the full pipeline" parser.add_argument( "--qafile", @@ -973,8 +963,8 @@ def make_arg_parser(description: str) -> argparse.ArgumentParser: "-d", "--database", type=str, - default=None, - help="Path to the SQLite database file (default: in-memory)", + required=True, + help="Path to the SQLite database file", ) parser.add_argument( "--query", @@ -1110,30 +1100,6 @@ def fill_in_debug_defaults( ### Data loading ### -async def load_podcast_index( - podcast_file_prefix: str, - settings: ConversationSettings, - dbname: str | None, - verbose: bool = True, -) -> query.QueryEvalContext: - provider = await settings.get_storage_provider() - 
msgs = await provider.get_message_collection() - if await msgs.size() > 0: # Sqlite provider with existing non-empty database - with utils.timelog(f"Reusing database {dbname!r}"): - conversation = await podcast.Podcast.create(settings) - else: - with utils.timelog(f"Loading podcast from {podcast_file_prefix!r}"): - conversation = await podcast.Podcast.read_from_file( - podcast_file_prefix, settings, dbname - ) - if isinstance(provider, SqliteStorageProvider): - provider.db.commit() - - await print_conversation_stats(conversation, verbose) - - return query.QueryEvalContext(conversation) - - def load_index_file[T: Mapping[str, typing.Any]]( file: str | None, selector: str, cls: type[T], verbose: bool = True ) -> tuple[list[T], dict[str, T]]: From 8b25997afa892193babfd9dd4603b7fc9fdab6f5 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Thu, 15 Jan 2026 02:17:58 +0000 Subject: [PATCH 4/8] Improve error handling in load_json.py Co-authored-by: gvanrossum-ms <184014243+gvanrossum-ms@users.noreply.github.com> --- tools/load_json.py | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/tools/load_json.py b/tools/load_json.py index a0ac3749..e8c3cb42 100644 --- a/tools/load_json.py +++ b/tools/load_json.py @@ -58,8 +58,9 @@ async def load_json_to_database( # Check if database already has data if await msgs.size() > 0: raise RuntimeError( - f"Database {dbname!r} already has messages. " - "Please use an empty database or specify a new filename." + f"Database '{dbname}' already contains {await msgs.size()} messages. " + "The database must be empty to load new data. " + "Please use a different database file or remove the existing one." ) # Load podcast from JSON files @@ -129,13 +130,6 @@ def main(): f"Note: The path should exclude the '_data.json' suffix." 
) - # Check if database already exists - if os.path.exists(args.database): - raise SystemExit( - f"Error: Database file already exists: {args.database}\n" - f"Please remove it first or choose a different filename." - ) - # Load environment variables for API access utils.load_dotenv() From 5fab84175c6262affdc1fefff0eca66622d5d051 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Thu, 15 Jan 2026 02:21:15 +0000 Subject: [PATCH 5/8] Fix docstrings in load_json.py to clarify database requirement Co-authored-by: gvanrossum-ms <184014243+gvanrossum-ms@users.noreply.github.com> --- tools/load_json.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/load_json.py b/tools/load_json.py index e8c3cb42..99dce1c6 100644 --- a/tools/load_json.py +++ b/tools/load_json.py @@ -35,7 +35,7 @@ async def load_json_to_database( Args: podcast_file_prefix: Path to podcast index files (without "_data.json" suffix) - dbname: Path to SQLite database file to create + dbname: Path to SQLite database file (must be empty) verbose: Whether to show verbose output """ if verbose: @@ -109,7 +109,7 @@ def main(): "-d", "--database", required=True, - help="Path to the SQLite database file to create", + help="Path to the SQLite database file (must be empty)", ) parser.add_argument( From 40012508cddb19f55d5736bef3de86cdf89bedb6 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Thu, 15 Jan 2026 02:22:20 +0000 Subject: [PATCH 6/8] Address final code review feedback - optimize database queries and improve error message consistency Co-authored-by: gvanrossum-ms <184014243+gvanrossum-ms@users.noreply.github.com> --- tools/load_json.py | 5 +++-- tools/query.py | 6 +++--- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/tools/load_json.py b/tools/load_json.py index 99dce1c6..b3d7eddc 100644 --- a/tools/load_json.py +++ b/tools/load_json.py @@ -56,9 +56,10 @@ 
async def load_json_to_database( msgs = await provider.get_message_collection() # Check if database already has data - if await msgs.size() > 0: + msg_count = await msgs.size() + if msg_count > 0: raise RuntimeError( - f"Database '{dbname}' already contains {await msgs.size()} messages. " + f"Database '{dbname}' already contains {msg_count} messages. " "The database must be empty to load new data. " "Please use a different database file or remove the existing one." ) diff --git a/tools/query.py b/tools/query.py index 0a109881..88934d5d 100644 --- a/tools/query.py +++ b/tools/query.py @@ -551,11 +551,11 @@ async def main(): msgs = await provider.get_message_collection() if await msgs.size() == 0: raise SystemExit( - f"Error: Database {args.database!r} is empty.\n" + f"Error: Database '{args.database}' is empty.\n" f"Please load data into the database first using tools/load_json.py:\n" - f" python tools/load_json.py --database {args.database}\n" + f" python tools/load_json.py -d {args.database}\n" f"Example:\n" - f" python tools/load_json.py tests/testdata/Episode_53_AdrianTchaikovsky_index --database {args.database}" + f" python tools/load_json.py tests/testdata/Episode_53_AdrianTchaikovsky_index -d {args.database}" ) with utils.timelog(f"Loading conversation from database {args.database!r}"): From dd79821ed8424d8ba188823ea8f36fc6eac9dfaf Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Thu, 15 Jan 2026 02:54:48 +0000 Subject: [PATCH 7/8] Address review feedback: simplify error messages, use async context manager, reorder args Co-authored-by: gvanrossum-ms <184014243+gvanrossum-ms@users.noreply.github.com> --- tools/load_json.py | 40 ++++++++++++++-------------------------- tools/query.py | 8 +------- 2 files changed, 15 insertions(+), 33 deletions(-) diff --git a/tools/load_json.py b/tools/load_json.py index b3d7eddc..f5e8d007 100644 --- a/tools/load_json.py +++ b/tools/load_json.py @@ -22,7 +22,6 @@ from 
typeagent.aitools import utils from typeagent.knowpro.convsettings import ConversationSettings from typeagent.podcasts import podcast -from typeagent.storage.sqlite.provider import SqliteStorageProvider from typeagent.storage.utils import create_storage_provider @@ -69,8 +68,8 @@ async def load_json_to_database( conversation = await podcast.Podcast.read_from_file( podcast_file_prefix, settings, dbname ) - if isinstance(provider, SqliteStorageProvider): - provider.db.commit() + async with provider: + pass # Commit happens in __aexit__ # Print statistics if verbose: @@ -91,19 +90,6 @@ def main(): """Main entry point.""" parser = argparse.ArgumentParser( description="Load JSON-serialized podcast data into a SQLite database", - formatter_class=argparse.RawDescriptionHelpFormatter, - epilog=""" -Examples: - python tools/load_json.py tests/testdata/Episode_53_AdrianTchaikovsky_index -d podcast.db - python tools/load_json.py path/to/index -d output.db -v - -Note: The index path should exclude the "_data.json" suffix. 
- """, - ) - - parser.add_argument( - "index_path", - help="Path to the podcast index files (excluding the '_data.json' suffix)", ) parser.add_argument( @@ -120,9 +106,14 @@ def main(): help="Show verbose output including statistics", ) + parser.add_argument( + "index_path", + help="Path to the podcast index files (excluding the '_data.json' suffix)", + ) + args = parser.parse_args() - # Validate index file exists + # Ensure index file exists index_file = args.index_path + "_data.json" if not os.path.exists(index_file): raise SystemExit( @@ -135,16 +126,13 @@ def main(): utils.load_dotenv() # Run the loading process - try: - asyncio.run( - load_json_to_database( - args.index_path, - args.database, - args.verbose, - ) + asyncio.run( + load_json_to_database( + args.index_path, + args.database, + args.verbose, ) - except (RuntimeError, ValueError) as err: - raise SystemExit(f"Error: {err}") + ) if __name__ == "__main__": diff --git a/tools/query.py b/tools/query.py index 88934d5d..f2cc311f 100644 --- a/tools/query.py +++ b/tools/query.py @@ -550,13 +550,7 @@ async def main(): provider = await settings.get_storage_provider() msgs = await provider.get_message_collection() if await msgs.size() == 0: - raise SystemExit( - f"Error: Database '{args.database}' is empty.\n" - f"Please load data into the database first using tools/load_json.py:\n" - f" python tools/load_json.py -d {args.database}\n" - f"Example:\n" - f" python tools/load_json.py tests/testdata/Episode_53_AdrianTchaikovsky_index -d {args.database}" - ) + raise SystemExit(f"Error: Database '{args.database}' is empty.") with utils.timelog(f"Loading conversation from database {args.database!r}"): conversation = await podcast.Podcast.create(settings) From bfab170d16bbc324594a0de88a52e756a1298bda Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Thu, 15 Jan 2026 03:05:16 +0000 Subject: [PATCH 8/8] Fix async context manager placement and format function call 
Co-authored-by: gvanrossum-ms <184014243+gvanrossum-ms@users.noreply.github.com> --- tools/load_json.py | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-) diff --git a/tools/load_json.py b/tools/load_json.py index f5e8d007..573b4a79 100644 --- a/tools/load_json.py +++ b/tools/load_json.py @@ -65,11 +65,10 @@ async def load_json_to_database( # Load podcast from JSON files with utils.timelog(f"Loading podcast from {podcast_file_prefix!r}"): - conversation = await podcast.Podcast.read_from_file( - podcast_file_prefix, settings, dbname - ) async with provider: - pass # Commit happens in __aexit__ + conversation = await podcast.Podcast.read_from_file( + podcast_file_prefix, settings, dbname + ) # Print statistics if verbose: @@ -126,13 +125,7 @@ def main(): utils.load_dotenv() # Run the loading process - asyncio.run( - load_json_to_database( - args.index_path, - args.database, - args.verbose, - ) - ) + asyncio.run(load_json_to_database(args.index_path, args.database, args.verbose)) if __name__ == "__main__":