Skip to content

Commit 48f69c3

Browse files
authored
Add load_json.py tool, remove --podcast flag from query.py (#164)
Co-authored-by: copilot-swe-agent[bot] <[email protected]>
1 parent d8c6a74 commit 48f69c3

2 files changed

Lines changed: 146 additions & 54 deletions

File tree

tools/load_json.py

Lines changed: 132 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,132 @@
1+
#!/usr/bin/env python3
2+
# Copyright (c) Microsoft Corporation.
3+
# Licensed under the MIT License.
4+
5+
"""
6+
JSON to SQLite Database Loader
7+
8+
This tool loads a JSON-serialized podcast database into a SQLite database
9+
that can be queried using tools/query.py.
10+
11+
Usage:
12+
python tools/load_json.py <index_path> --database <db_file>
13+
python tools/load_json.py tests/testdata/Episode_53_AdrianTchaikovsky_index -d transcript.db
14+
15+
The index_path should exclude the "_data.json" suffix.
16+
"""
17+
18+
import argparse
19+
import asyncio
20+
import os
21+
22+
from typeagent.aitools import utils
23+
from typeagent.knowpro.convsettings import ConversationSettings
24+
from typeagent.podcasts import podcast
25+
from typeagent.storage.utils import create_storage_provider
26+
27+
28+
async def load_json_to_database(
    podcast_file_prefix: str,
    dbname: str,
    verbose: bool = False,
) -> None:
    """Load JSON-serialized podcast data into a SQLite database.

    Args:
        podcast_file_prefix: Path to podcast index files (without "_data.json" suffix)
        dbname: Path to SQLite database file (must be empty)
        verbose: Whether to show verbose output

    Raises:
        RuntimeError: If the message collection of the target database
            already contains at least one message.
    """
    if verbose:
        print(f"Loading podcast from JSON: {podcast_file_prefix}")
        print(f"Target database: {dbname}")

    # Create settings and attach a storage provider bound to the target database.
    settings = ConversationSettings()
    settings.storage_provider = await create_storage_provider(
        settings.message_text_index_settings,
        settings.related_term_index_settings,
        dbname,
        podcast.PodcastMessage,
    )

    # Fetch the provider back from the settings so the emptiness check runs
    # against the same provider the loader will use.
    provider = await settings.get_storage_provider()
    msgs = await provider.get_message_collection()

    # Refuse to load into a database that already holds messages.
    msg_count = await msgs.size()
    if msg_count > 0:
        raise RuntimeError(
            f"Database '{dbname}' already contains {msg_count} messages. "
            "The database must be empty to load new data. "
            "Please use a different database file or remove the existing one."
        )

    # Load podcast from JSON files.
    with utils.timelog(f"Loading podcast from {podcast_file_prefix!r}"):
        # NOTE(review): presumably leaving the provider context persists the
        # loaded data (the loader previously in query.py committed explicitly
        # after reading) -- confirm against the provider implementation.
        async with provider:
            conversation = await podcast.Podcast.read_from_file(
                podcast_file_prefix, settings, dbname
            )

    # Print statistics
    if verbose:
        print("\nSuccessfully loaded podcast data:")
        print(f" {await conversation.messages.size()} messages")
        print(f" {await conversation.semantic_refs.size()} semantic refs")
        if conversation.semantic_ref_index:
            print(
                f" {await conversation.semantic_ref_index.size()} semantic ref index entries"
            )

    # Always tell the user where the data went and how to query it.
    print(f"\nDatabase created: {dbname}")
    print("\nTo query the database, use:")
    print(f" python tools/query.py --database '{dbname}' --query 'Your question here'")
86+
87+
88+
def main() -> None:
    """Parse the command line and load a podcast index into a SQLite database.

    Command-line arguments:
        index_path: Path to the podcast index files, excluding the
            "_data.json" suffix.
        -d/--database: Path to the SQLite database file (required).
        -v/--verbose: Show verbose output including statistics.

    Raises:
        SystemExit: If argument parsing fails, if "<index_path>_data.json"
            does not exist, or if loading raises a RuntimeError (for example
            because the target database already contains messages).
    """
    parser = argparse.ArgumentParser(
        description="Load JSON-serialized podcast data into a SQLite database",
    )

    parser.add_argument(
        "-d",
        "--database",
        required=True,
        help="Path to the SQLite database file (must be empty)",
    )

    parser.add_argument(
        "-v",
        "--verbose",
        action="store_true",
        help="Show verbose output including statistics",
    )

    parser.add_argument(
        "index_path",
        help="Path to the podcast index files (excluding the '_data.json' suffix)",
    )

    args = parser.parse_args()

    # Ensure index file exists before doing any heavier setup.
    index_file = args.index_path + "_data.json"
    if not os.path.exists(index_file):
        raise SystemExit(
            f"Error: Podcast index file not found: {index_file}\n"
            f"Please verify the path exists and is accessible.\n"
            f"Note: The path should exclude the '_data.json' suffix."
        )

    # Load environment variables for API access
    utils.load_dotenv()

    # Run the loading process. A RuntimeError (such as the "database must be
    # empty" refusal) is reported in the same "Error: ..." style as the
    # missing-file case instead of as a traceback; the exit status stays 1.
    try:
        asyncio.run(
            load_json_to_database(args.index_path, args.database, args.verbose)
        )
    except RuntimeError as err:
        raise SystemExit(f"Error: {err}") from err
129+
130+
131+
# Run the command-line entry point only when executed as a script,
# so importing this module has no side effects.
if __name__ == "__main__":
    main()

tools/query.py

Lines changed: 14 additions & 54 deletions
Original file line numberDiff line numberDiff line change
@@ -55,7 +55,6 @@
5555
Topic,
5656
)
5757
from typeagent.podcasts import podcast
58-
from typeagent.storage.sqlite.provider import SqliteStorageProvider
5958
from typeagent.storage.utils import create_storage_provider
6059

6160
### Classes ###
@@ -536,24 +535,6 @@ async def main():
536535
args = parser.parse_args()
537536
fill_in_debug_defaults(parser, args)
538537

539-
# Validate required podcast argument
540-
if args.podcast is None and args.database is None:
541-
scriptname = sys.argv[0]
542-
raise SystemExit(
543-
f"Error: Either --podcast or --database is required.\n"
544-
f"Usage: python {scriptname} --podcast <path_to_index>\n"
545-
f" or: python {scriptname} --database <path_to_database>\n"
546-
f"Example: python {scriptname} --podcast tests/testdata/Episode_53_AdrianTchaikovsky_index"
547-
)
548-
if args.podcast is not None:
549-
index_file = args.podcast + "_data.json"
550-
if not os.path.exists(index_file):
551-
raise SystemExit(
552-
f"Error: Podcast index file not found: {index_file}\n"
553-
f"Please verify the path exists and is accessible.\n"
554-
f"Note: The path should exclude the '_data.json' suffix."
555-
)
556-
557538
if args.logfire:
558539
utils.setup_logfire()
559540

@@ -564,9 +545,18 @@ async def main():
564545
args.database,
565546
podcast.PodcastMessage,
566547
)
567-
query_context = await load_podcast_index(
568-
args.podcast, settings, args.database, args.verbose
569-
)
548+
549+
# Load existing database
550+
provider = await settings.get_storage_provider()
551+
msgs = await provider.get_message_collection()
552+
if await msgs.size() == 0:
553+
raise SystemExit(f"Error: Database '{args.database}' is empty.")
554+
555+
with utils.timelog(f"Loading conversation from database {args.database!r}"):
556+
conversation = await podcast.Podcast.create(settings)
557+
558+
await print_conversation_stats(conversation, args.verbose)
559+
query_context = query.QueryEvalContext(conversation)
570560

571561
ar_list, ar_index = load_index_file(
572562
args.qafile, "question", QuestionAnswerData, args.verbose
@@ -943,12 +933,6 @@ def make_arg_parser(description: str) -> argparse.ArgumentParser:
943933
),
944934
)
945935

946-
parser.add_argument(
947-
"--podcast",
948-
type=str,
949-
default=None,
950-
help="Path to the podcast index files (excluding the '_data.json' suffix)",
951-
)
952936
explain_qa = "a list of questions and answers to test the full pipeline"
953937
parser.add_argument(
954938
"--qafile",
@@ -973,8 +957,8 @@ def make_arg_parser(description: str) -> argparse.ArgumentParser:
973957
"-d",
974958
"--database",
975959
type=str,
976-
default=None,
977-
help="Path to the SQLite database file (default: in-memory)",
960+
required=True,
961+
help="Path to the SQLite database file",
978962
)
979963
parser.add_argument(
980964
"--query",
@@ -1110,30 +1094,6 @@ def fill_in_debug_defaults(
11101094
### Data loading ###
11111095

11121096

1113-
async def load_podcast_index(
1114-
podcast_file_prefix: str,
1115-
settings: ConversationSettings,
1116-
dbname: str | None,
1117-
verbose: bool = True,
1118-
) -> query.QueryEvalContext:
1119-
provider = await settings.get_storage_provider()
1120-
msgs = await provider.get_message_collection()
1121-
if await msgs.size() > 0: # Sqlite provider with existing non-empty database
1122-
with utils.timelog(f"Reusing database {dbname!r}"):
1123-
conversation = await podcast.Podcast.create(settings)
1124-
else:
1125-
with utils.timelog(f"Loading podcast from {podcast_file_prefix!r}"):
1126-
conversation = await podcast.Podcast.read_from_file(
1127-
podcast_file_prefix, settings, dbname
1128-
)
1129-
if isinstance(provider, SqliteStorageProvider):
1130-
provider.db.commit()
1131-
1132-
await print_conversation_stats(conversation, verbose)
1133-
1134-
return query.QueryEvalContext(conversation)
1135-
1136-
11371097
def load_index_file[T: Mapping[str, typing.Any]](
11381098
file: str | None, selector: str, cls: type[T], verbose: bool = True
11391099
) -> tuple[list[T], dict[str, T]]:

0 commit comments

Comments
 (0)