Skip to content

Commit 372c07b

Browse files
committed
Some tests with GPT-4o and GPT-4o mini
1 parent 8b6527f commit 372c07b

7 files changed

+43360
-6
lines changed
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
test_data/
Binary file not shown.
Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,66 @@
1+
from pathlib import Path
2+
import re
3+
import requests
4+
from selectolax.parser import HTMLParser
5+
6+
7+
def _safe_filename(raw_name):
    """Return only the final path component of a scraped file name."""
    # The name comes from a web page, so never let it carry directory parts.
    base_name = raw_name.strip().replace("\\", "/").rsplit("/", 1)[-1]
    return "" if base_name in {".", ".."} else base_name


def download_minutes_pdfs(
    start_item=47711,
    end_item=48000,
    output_dir="test_data",
    timeout=30,
):
    """Download council "minutes" PDFs listed on the City of Tulsa item pages.

    Requests the page ``?item=<n>`` for every ``n`` in
    ``range(start_item, end_item)``. On each page, every ``div.row`` whose
    ``div.fileName`` text contains "minutes" (case-insensitive) is downloaded
    using the document id found in the row's ``div.pdfString.hidden`` element.
    Files that already exist in *output_dir* are skipped, and scanning stops at
    the first item page that does not answer with HTTP 200.

    Args:
        start_item: First item number to check (inclusive).
        end_item: Item number at which to stop (exclusive).
        output_dir: Directory the PDFs are written to; created if missing.
        timeout: Seconds to wait on each HTTP request before giving up.

    Exceptions raised by ``requests`` (connection errors, timeouts) are not
    caught here and propagate to the caller.
    """
    base_url = "https://www.cityoftulsa.org/apps/CouncilDocuments"

    # Create the directory that is actually written to, so open() cannot fail
    # on a missing parent.
    target_dir = Path(output_dir)
    target_dir.mkdir(parents=True, exist_ok=True)

    for item_num in range(start_item, end_item):
        url = f"{base_url}?item={item_num}"
        print(f"Checking page: {url}")

        response = requests.get(url, timeout=timeout)
        if response.status_code != 200:
            print(f"Failed to fetch page {item_num}")
            break

        # Parse the HTML using selectolax
        tree = HTMLParser(response.content)

        for row in tree.css("div.row"):
            filename_div = row.css_first("div.fileName")
            if filename_div is None:
                continue

            filename = _safe_filename(filename_div.text())
            if not filename or "minutes" not in filename.lower():
                continue

            full_filepath = target_dir / filename
            if full_filepath.exists():
                print(f"File already exists, skipping: {filename}")
                continue

            # The document id lives in a hidden div of the same row.
            doc_id_div = row.css_first("div.pdfString.hidden")
            if doc_id_div is None:
                print(f"No document ID found for {filename}")
                continue

            doc_id = doc_id_div.text().strip()
            pdf_url = f"https://www.cityoftulsa.org/apps/COTDisplayDocument/?DocumentType=CouncilDocument&DocumentIdentifiers={doc_id}"

            print(f"Downloading: {filename}")
            # Stream into a temporary ".part" file and rename on success, so an
            # interrupted transfer is never mistaken for a finished download.
            partial_path = full_filepath.with_name(full_filepath.name + ".part")
            with requests.get(pdf_url, stream=True, timeout=timeout) as pdf_response:
                if pdf_response.status_code != 200:
                    print(f"Failed to download {filename}")
                    continue
                with open(partial_path, "wb") as f:
                    for chunk in pdf_response.iter_content(chunk_size=8192):
                        f.write(chunk)
            partial_path.replace(full_filepath)
            print(f"Successfully downloaded {filename}")


if __name__ == "__main__":
    download_minutes_pdfs()
Lines changed: 133 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,133 @@
1+
from pathlib import Path
2+
from langchain_community.document_loaders import PyPDFLoader
3+
from openai import OpenAI
4+
import json
5+
6+
import tiktoken
7+
8+
# Initialize OpenAI client
9+
# Module-level client used by match_speakers_with_chatgpt(); built at import time.
# NOTE(review): no credentials are passed here, so the client presumably reads
# them from the environment — confirm before running.
client = OpenAI()
10+
11+
12+
def extract_text_from_pdf(pdf_path):
    """Return the text of the PDF at *pdf_path*, one page per newline-joined chunk.

    The file is loaded with ``PyPDFLoader`` and the ``page_content`` of each
    loaded page is concatenated with a single newline between pages.
    """
    page_texts = []
    for document in PyPDFLoader(str(pdf_path)).load():
        page_texts.append(document.page_content)
    return "\n".join(page_texts)
17+
18+
19+
def get_diarization(diarization_path=None):
    """Load and return the diarization data from a JSON file.

    Args:
        diarization_path: Path of the diarization JSON file. When omitted, the
            2025-02-26 regular council meeting file under
            ``./notebooks/experiments/minutes_diarization/`` is used.

    Returns:
        The parsed JSON content.

    Raises:
        FileNotFoundError: If the file does not exist; the message names the
            path that was tried.
    """
    if diarization_path is None:
        diarization_path = "./notebooks/experiments/minutes_diarization/regular_council_meeting___2025_02_26.diarized.json"
    path = Path(diarization_path)

    try:
        # JSON is read as UTF-8 explicitly so the result does not depend on
        # the platform's default encoding.
        with open(path, "r", encoding="utf-8") as f:
            return json.load(f)
    except FileNotFoundError as err:
        raise FileNotFoundError(f"Diarization JSON file not found: {path}") from err
29+
30+
31+
def simplify_diarization(transcript_data):
    """Collapse diarization segments into a plain-text script grouped by speaker.

    Consecutive entries of ``transcript_data["segments"]`` that share the same
    ``"speaker"`` are merged: their stripped ``"text"`` values are joined with
    single spaces and labelled with the ``"start"`` time of the first segment
    of the run, rendered as ``HH:MM:SS``. Runs whose speaker value is falsy
    are left out. The result starts with a fixed title line.
    """

    def as_clock(seconds: float) -> str:
        """Render a number of seconds as HH:MM:SS."""
        whole_hours, remainder = divmod(seconds, 3600)
        return (
            f"{int(whole_hours):02d}:{int(remainder // 60):02d}:{int(seconds % 60):02d}"
        )

    # First pass: group consecutive segments into turns of
    # [speaker, start_of_first_segment, [texts...]].
    turns = []
    previous_speaker = None
    for segment in transcript_data["segments"]:
        speaker = segment["speaker"]
        snippet = segment["text"].strip()
        if speaker == previous_speaker:
            # Same speaker keeps talking; text seen before any turn has been
            # opened has nowhere to go and is discarded.
            if turns:
                turns[-1][2].append(snippet)
            continue
        turns.append([speaker, segment["start"], [snippet]])
        previous_speaker = speaker

    # Second pass: render each turn. Every turn except the very last one
    # carries a trailing newline, which yields a blank line between entries.
    script = ["Meeting Script - Combined by Speaker"]
    final_index = len(turns) - 1
    for index, (speaker, started_at, snippets) in enumerate(turns):
        if not speaker:
            continue
        entry = f"[{as_clock(started_at)}] {speaker}:\n{' '.join(snippets)}"
        if index != final_index:
            entry += "\n"
        script.append(entry)
    return "\n".join(script)
70+
71+
72+
def match_speakers_with_chatgpt(minutes_text, diarization, model="gpt-4o"):
    """Ask a chat model to map diarization speaker labels to names in the minutes.

    Args:
        minutes_text: Text of the meeting minutes.
        diarization: Diarization content to embed in the prompt; it is
            interpolated into the prompt as-is.
        model: Name of the chat model to query. Defaults to "gpt-4o".

    Returns:
        The model's JSON reply parsed into a Python object. The prompt asks
        for an object keyed by speaker label (e.g. "SPEAKER_00") with the
        identified name or "Unknown" as value.

    Raises:
        ValueError: If the reply carries no content, or (as
            ``json.JSONDecodeError``) if the content is not valid JSON.
    """
    prompt = f"""I have a city council meeting minutes document and a diarization of the audio recording.
The diarization has identified different speakers but doesn't know their names.
Please analyze the minutes text and match the speakers from the diarization with the names mentioned in the minutes.

Minutes text:
{minutes_text}

Diarization segments:
{diarization}

For each speaker in the diarization, please identify who they are based on the minutes text.
If you can't determine who they are, mark them as "Unknown".
Format your response as a JSON object where the keys are the speaker numbers (e.g., "SPEAKER_00")
and the values are the identified names or "Unknown".
"""

    response = client.chat.completions.create(
        model=model,
        messages=[
            {
                "role": "system",
                "content": "You are a helpful assistant that analyzes meeting minutes and audio diarization to identify speakers.",
            },
            {"role": "user", "content": prompt},
        ],
        response_format={"type": "json_object"},
    )

    content = response.choices[0].message.content
    if content is None:
        # Without this guard json.loads(None) fails with an unhelpful TypeError.
        raise ValueError("Model response contained no content to parse as JSON")
    return json.loads(content)
105+
106+
107+
def main():
    """Run the speaker-matching experiment and print each intermediate result.

    Extracts the minutes text from the PDF, loads and simplifies the
    diarization, prints the simplified script and the token counts of both
    inputs (using the "gpt-4o-mini" tiktoken encoding), then prints the
    speaker mapping returned by match_speakers_with_chatgpt().
    """
    pdf_file = Path(
        "./notebooks/experiments/minutes_diarization/test_data/25-173-2_25-173-2 2025-02-26 5PM Minutes.pdf"
    )
    minutes_text = extract_text_from_pdf(pdf_file)

    # Load the raw diarization and collapse it into a per-speaker script.
    simple_diarization = simplify_diarization(get_diarization())
    print(simple_diarization)

    # Report how many tokens each prompt ingredient takes up.
    tokenizer = tiktoken.encoding_for_model("gpt-4o-mini")
    diarization_tokens = len(tokenizer.encode(str(simple_diarization)))
    print(f"Diarization segments length: {diarization_tokens}")
    minutes_tokens = len(tokenizer.encode(minutes_text))
    print(f"Minutes text length: {minutes_tokens}")

    # Ask the model for the speaker-label-to-name mapping and show it.
    print(match_speakers_with_chatgpt(minutes_text, simple_diarization))


if __name__ == "__main__":
    main()

0 commit comments

Comments
 (0)