
Commit f8a4851

Merge pull request #10 from codefortulsa/video-to-transcription
Video to transcription
2 parents d965ea4 + a279415 commit f8a4851

12 files changed, +2609 -201 lines

notebooks/roll_call.ipynb (+329 lines)

@@ -0,0 +1,329 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### This notebook focuses on a roll call to see how it is transcribed\n",
"\n",
"Recognizing short words spoken by different speakers is difficult. This notebook focuses on a roll call vote to see whether changing model parameters can improve the transcription.\n"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"import sys\n",
"import pandas as pd\n",
"sys.path.append(\"../\")\n",
"from pathlib import Path"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### use ffmpeg to get a section of a meeting\n",
"This 30-second clip is a roll call vote."
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Clip successfully extracted to: ../data/video/regular_council_meeting___2025_02_26_clip_4-50_to_5-20.mp4\n"
]
}
],
"source": [
"import subprocess\n",
"from pathlib import Path\n",
"\n",
"# Input and output file paths\n",
"input_file = Path(\"../data/video/regular_council_meeting___2025_02_26.mp4\")\n",
"clip_file = Path(\"../data/video/regular_council_meeting___2025_02_26_clip_4-50_to_5-20.mp4\")\n",
"\n",
"# Parameters for clip extraction\n",
"start_time = \"4:50\"\n",
"duration = \"30\" # 30 seconds\n",
"\n",
"# Run FFmpeg command\n",
"result = subprocess.run(\n",
"    [\n",
"        \"ffmpeg\",\n",
"        \"-i\",\n",
"        str(input_file),\n",
"        \"-ss\",\n",
"        start_time,\n",
"        \"-t\",\n",
"        duration,\n",
"        \"-c\",\n",
"        \"copy\", # Copy codec (fast but might not be frame accurate)\n",
"        \"-avoid_negative_ts\",\n",
"        \"1\",\n",
"        str(clip_file),\n",
"        \"-y\", # Overwrite if exists\n",
"    ],\n",
"    capture_output=True,\n",
"    text=True,\n",
")\n",
"\n",
"# Check if command was successful\n",
"if result.returncode == 0:\n",
"    print(f\"Clip successfully extracted to: {clip_file}\")\n",
"else:\n",
"    print(f\"Error extracting clip: {result.stderr}\")"
]
},
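{
"cell_type": "markdown",
"metadata": {},
"source": [
"The next cell is only a sketch of a frame-accurate variant and was not run here: `-c copy` cuts on keyframes, so re-encoding with `-ss` placed before `-i` should give an exact cut at the cost of speed. The `accurate_clip` path is just for illustration."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Sketch only (not run): frame-accurate extraction by seeking before the\n",
"# input and re-encoding instead of stream-copying.\n",
"accurate_clip = clip_file.with_name(clip_file.stem + \"_accurate.mp4\")\n",
"result = subprocess.run(\n",
"    [\n",
"        \"ffmpeg\",\n",
"        \"-ss\",\n",
"        start_time,\n",
"        \"-i\",\n",
"        str(input_file),\n",
"        \"-t\",\n",
"        duration,\n",
"        \"-c:v\",\n",
"        \"libx264\", # re-encode video for an exact cut\n",
"        \"-c:a\",\n",
"        \"aac\",\n",
"        str(accurate_clip),\n",
"        \"-y\",\n",
"    ],\n",
"    capture_output=True,\n",
"    text=True,\n",
")\n",
"\n",
"if result.returncode == 0:\n",
"    print(f\"Frame-accurate clip extracted to: {accurate_clip}\")\n",
"else:\n",
"    print(f\"Error extracting clip: {result.stderr}\")"
]
},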
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### experiment with model parameters\n",
"\n",
"Using these settings actually made the results worse:\n",
"- min_speakers=3, # Specify at least 3 speakers\n",
"- max_speakers=15, # Limit to at most 15 speakers\n",
"- diarize_min_duration=0.1, # Shorter minimum segment duration\n",
"\n",
"I also tested the medium and large model sizes, but the results were the same as with tiny. A sketch of how these parameters can be passed is shown below.\n"
]
},
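{
"cell_type": "markdown",
"metadata": {},
"source": [
"The next cell is only a sketch of how those parameters could be passed; it assumes `transcribe_video_with_diarization` accepts them as keyword arguments and forwards them to the diarization pipeline, and it was not run here."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from src.videos import transcribe_video_with_diarization\n",
"\n",
"# Sketch only (not run): assumes transcribe_video_with_diarization accepts\n",
"# these diarization keyword arguments and forwards them to the pipeline.\n",
"tuned_transcript = await transcribe_video_with_diarization(\n",
"    clip_file,\n",
"    Path(\"../data/transcripts\"),\n",
"    model_size=\"tiny\",\n",
"    min_speakers=3, # expect at least 3 speakers\n",
"    max_speakers=15, # allow up to 15 speakers\n",
"    diarize_min_duration=0.1, # shorter minimum segment duration\n",
")"
]
},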
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"INFO:src.videos:Transcribing video with speaker diarization: ../data/video/regular_council_meeting___2025_02_26_clip_4-50_to_5-20.mp4\n",
"INFO:src.videos:Output will be saved to: ../data/transcripts/regular_council_meeting___2025_02_26_clip_4-50_to_5-20.diarized.json\n",
"INFO:src.huggingface:Auto-detected device: cpu\n",
"INFO:src.huggingface:Auto-selected compute_type: int8\n",
"INFO:src.huggingface:Loading WhisperX model: tiny on cpu with int8 precision\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "168afa65d3ae4108af591eb1993fe482",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"tokenizer.json: 0%| | 0.00/2.20M [00:00<?, ?B/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "89d35faecb8e447db3ccb95407e2a775",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"config.json: 0%| | 0.00/2.25k [00:00<?, ?B/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "f616039556ee46aaaee2f975f016aeb0",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"vocabulary.txt: 0%| | 0.00/460k [00:00<?, ?B/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "50bd4e88d6084638b91847587cc9ed0a",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"model.bin: 0%| | 0.00/75.5M [00:00<?, ?B/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Lightning automatically upgraded your loaded checkpoint from v1.5.4 to v2.5.0.post0. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint ../../../../Library/Caches/pypoetry/virtualenvs/tgov_scraper-zRR99ne3-py3.11/lib/python3.11/site-packages/whisperx/assets/pytorch_model.bin`\n",
"INFO:src.huggingface:Loading diarization pipeline\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"No language specified, language will be first be detected for each audio file (increases inference time).\n",
">>Performing voice activity detection using Pyannote...\n",
"Model was trained with pyannote.audio 0.0.1, yours is 3.3.2. Bad things might happen unless you revert pyannote.audio to 0.x.\n",
"Model was trained with torch 1.10.0+cu102, yours is 2.4.1. Bad things might happen unless you revert torch to 1.x.\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"INFO:src.huggingface:WhisperX model loaded in 4.50 seconds\n",
"INFO:src.videos:Running initial transcription with batch size 8...\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Detected language: en (0.99) in first 30s of audio...\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"INFO:src.videos:Detected language: en\n",
"INFO:src.videos:Loading alignment model for detected language: en\n",
"INFO:src.videos:Aligning transcription with audio...\n",
"INFO:src.videos:Running speaker diarization...\n",
"/Users/owner/Library/Caches/pypoetry/virtualenvs/tgov_scraper-zRR99ne3-py3.11/lib/python3.11/site-packages/pyannote/audio/models/blocks/pooling.py:104: UserWarning: std(): degrees of freedom is <= 0. Correction should be strictly less than the reduction factor (input numel divided by output numel). (Triggered internally at /Users/runner/work/pytorch/pytorch/pytorch/aten/src/ATen/native/ReduceOps.cpp:1808.)\n",
" std = sequences.std(dim=-1, correction=1)\n",
"INFO:src.videos:Assigning speakers to transcription...\n",
"INFO:src.videos:Processing transcription segments...\n",
"INFO:src.videos:Diarized transcription completed in 30.03 seconds\n",
"INFO:src.videos:Detailed JSON saved to: ../data/transcripts/regular_council_meeting___2025_02_26_clip_4-50_to_5-20.diarized.json\n"
]
}
],
"source": [
"from src.videos import transcribe_video_with_diarization\n",
"\n",
"transcription_dir = Path(\"../data/transcripts\")\n",
"\n",
"transcript_data = await transcribe_video_with_diarization(\n",
"    clip_file,\n",
"    transcription_dir,\n",
"    model_size=\"tiny\",\n",
")"
]
},
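{
"cell_type": "markdown",
"metadata": {},
"source": [
"pandas was imported at the top but not used yet; the next cell is a small sketch (not run here) that tabulates the diarized segments, assuming each segment has `start`, `speaker`, and `text` keys as used by the formatting cell below."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Sketch only (not run): quick tabular look at the diarized segments.\n",
"# Assumes each segment has \"start\", \"speaker\", and \"text\" keys.\n",
"segments_df = pd.DataFrame(\n",
"    [\n",
"        {\n",
"            \"start\": seg[\"start\"],\n",
"            \"speaker\": seg[\"speaker\"],\n",
"            \"text\": seg[\"text\"].strip(),\n",
"        }\n",
"        for seg in transcript_data[\"segments\"]\n",
"    ]\n",
")\n",
"segments_df.head(10)"
]
},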
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "5d97ff70c1c3409da83c10c478f2bfaa",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HTML(value='<h3>Meeting Script</h3><hr><p><b>[00:00:00] SPEAKER_01:</b><br>Thank you, Mr. Huffinds. Any counci…"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"def format_timestamp(seconds: float) -> str:\n",
"    \"\"\"Convert seconds to HH:MM:SS format\"\"\"\n",
"    hours = int(seconds // 3600)\n",
"    minutes = int((seconds % 3600) // 60)\n",
"    secs = int(seconds % 60)\n",
"    return f\"{hours:02d}:{minutes:02d}:{secs:02d}\"\n",
"\n",
"\n",
"from ipywidgets import HTML, VBox, Layout\n",
"from textwrap import fill\n",
"\n",
"# Create formatted HTML output\n",
"html_output = [\"<h3>Meeting Script</h3>\"]\n",
"html_output.append(\"<hr>\")\n",
"\n",
"current_speaker = None\n",
"current_text = []\n",
"current_start = None\n",
"\n",
"for segment in transcript_data[\"segments\"]:\n",
"    if current_speaker != segment[\"speaker\"]:\n",
"        # Output previous speaker's text\n",
"        if current_speaker:\n",
"            timestamp = format_timestamp(current_start)\n",
"            wrapped_text = fill(\" \".join(current_text), width=80)\n",
"            html_output.append(f\"<p><b>[{timestamp}] {current_speaker}:</b><br>\")\n",
"            html_output.append(f\"{wrapped_text}</p>\")\n",
"            html_output.append(\"<hr>\")\n",
"\n",
"        # Start new speaker\n",
"        current_speaker = segment[\"speaker\"]\n",
"        current_text = [segment[\"text\"].strip()]\n",
"        current_start = segment[\"start\"]\n",
"    else:\n",
"        # Continue current speaker\n",
"        current_text.append(segment[\"text\"].strip())\n",
"\n",
"# Output final speaker\n",
"if current_speaker:\n",
"    timestamp = format_timestamp(current_start)\n",
"    wrapped_text = fill(\" \".join(current_text), width=80)\n",
"    html_output.append(f\"<p><b>[{timestamp}] {current_speaker}:</b><br>\")\n",
"    html_output.append(f\"{wrapped_text}</p>\")\n",
"    html_output.append(\"<hr>\")\n",
"\n",
"# Display formatted output\n",
"display(\n",
"    HTML(\n",
"        value=\"\".join(html_output),\n",
"        layout=Layout(width=\"100%\", border=\"1px solid gray\", padding=\"10px\"),\n",
"    )\n",
")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "TGOV Scraper",
"language": "python",
"name": "tgov-scraper"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.9"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
