-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathapp2.py
118 lines (98 loc) · 3.84 KB
/
app2.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
import streamlit as st
import av
import numpy as np
import torch
from transformers import AutoImageProcessor, AutoTokenizer, VisionEncoderDecoderModel
from pytube import YouTube
import os
import openai
from dotenv import load_dotenv
# Pull settings from a local .env file (if present) into the environment,
# then pass the OpenAI key on to the client library.
load_dotenv()
openai.api_key = os.environ.get("OPENAI_API_KEY")

# Run inference on the GPU when torch can see one; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Pretrained checkpoints: frame preprocessor, caption tokenizer, and the
# video-to-text encoder/decoder model (moved onto the selected device).
# NOTE(review): these loads run every time the script body executes; if
# Streamlit re-runs the script per interaction, consider caching them --
# confirm against the deployed Streamlit version.
image_processor = AutoImageProcessor.from_pretrained("MCG-NJU/videomae-base")
tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = VisionEncoderDecoderModel.from_pretrained(
    "Neleac/timesformer-gpt2-video-captioning"
)
model = model.to(device)
# CSS injected into the page to hide Streamlit's main menu and footer.
_HIDE_STREAMLIT_STYLE = """
<style>
#MainMenu {visibility: hidden;}
footer {visibility: hidden;}
</style>
"""


def _caption_video(video_source):
    """Preview *video_source* on the page and caption it if frames were read."""
    st.video(video_source)
    sampled_frames, total_frames = extract_frames(video_source)
    if sampled_frames:
        generate_captions(sampled_frames, total_frames)


def main():
    """Render the app page and caption an uploaded or YouTube-hosted video.

    The user picks between uploading an MP4 file and pasting a YouTube link.
    Either way the resulting video is previewed, frames are extracted from
    it, and a caption is generated when at least one frame was obtained.
    """
    upload_choice = "Upload a video file"
    youtube_choice = "Paste a YouTube link"

    st.markdown(_HIDE_STREAMLIT_STYLE, unsafe_allow_html=True)
    st.title("Video Captioning with TimesFormer-GPT2")

    # Let the user choose where the video comes from.
    option = st.radio("Choose an option:", (upload_choice, youtube_choice))

    if option == upload_choice:
        uploaded = st.file_uploader("Upload a video file", type=["mp4"])
        if uploaded is not None:
            _caption_video(uploaded)
    elif option == youtube_choice:
        url = st.text_input("Paste a YouTube link")
        if not url:
            return
        st.write("Downloading video from YouTube...")
        local_path = download_youtube_video(url)
        if local_path:
            _caption_video(local_path)
def download_youtube_video(youtube_link):
    """Download a YouTube video as an MP4 file and return its local path.

    The first progressive MP4 stream that pytube reports for the video is
    downloaded; no output location is passed, so pytube's default is used.

    Args:
        youtube_link: URL of the YouTube video to fetch.

    Returns:
        The path of the downloaded file, or ``None`` when the video has no
        progressive MP4 stream or the download fails. In both failure cases
        an error message is shown on the page with ``st.error``.
    """
    try:
        yt = YouTube(youtube_link)
        stream = yt.streams.filter(file_extension="mp4", progressive=True).first()
        if stream is None:
            # first() yields None when the filter matches nothing; report that
            # directly instead of surfacing an AttributeError on None.
            st.error(
                "Error downloading YouTube video: "
                "no progressive MP4 stream is available for this link."
            )
            return None
        return stream.download()
    except Exception as e:
        # UI boundary: any pytube/network failure is shown to the user and
        # turned into a None result rather than crashing the page.
        st.error(f"Error downloading YouTube video: {e}")
        return None
def extract_frames(video_file):
    """Sample evenly spaced RGB frames from a video for the captioning model.

    The number of frames sampled is ``model.config.encoder.num_frames``;
    sample positions are spread uniformly over the first video stream.

    Args:
        video_file: Whatever ``av.open`` accepts. Callers in this file pass
            either an uploaded file object or a path string.

    Returns:
        A ``(frames, seg_len)`` tuple. ``frames`` is a list of arrays produced
        by ``frame.to_ndarray(format="rgb24")`` in decode order (empty when no
        frame could be decoded). ``seg_len`` is the number of frames in the
        first video stream.

    Raises:
        IndexError: If the container has no video stream.
    """
    # The context manager closes the container even if decoding fails.
    with av.open(video_file) as container:
        seg_len = container.streams.video[0].frames
        if seg_len <= 0:
            # The stream did not report a frame count; without one every
            # sample index would collapse to 0. Count by decoding once.
            seg_len = sum(1 for _ in container.decode(video=0))
        if seg_len <= 0:
            return [], 0

        clip_len = model.config.encoder.num_frames
        indices = set(
            np.linspace(0, seg_len, num=clip_len, endpoint=False)
            .astype(np.int64)
            .tolist()
        )

        frames = []
        container.seek(0)
        for i, frame in enumerate(container.decode(video=0)):
            if i in indices:
                frames.append(frame.to_ndarray(format="rgb24"))
                if len(frames) == len(indices):
                    # Every requested index has been collected; skip
                    # decoding the remainder of the video.
                    break
    return frames, seg_len
def _expand_caption(caption):
    """Ask the OpenAI chat model to expand *caption*; return None on failure."""
    try:
        completion = openai.ChatCompletion.create(
            model="gpt-3.5-turbo",
            messages=[
                {
                    "role": "system",
                    "content": (
                        "You are given a caption line from a video summarizer model. "
                        "Expand it into a few more lines. "
                        "Don't get too creative to avoid writing something that is not in the video"
                    ),
                },
                {"role": "user", "content": f"Caption: {caption}"},
            ],
        )
        if not completion:
            return None
        return completion.choices[0].message["content"]
    except Exception as e:
        # UI boundary: a missing key, network problem or API error must not
        # hide the caption the local model already produced.
        st.warning(f"Could not expand the caption with OpenAI: {e}")
        return None


def generate_captions(frames, seg_len):
    """Caption the sampled frames and display the result on the page.

    The frames are preprocessed with the module-level ``image_processor``,
    captioned with ``model.generate`` (beam search, 10-20 tokens) and decoded
    with ``tokenizer``. The caption is then sent to the OpenAI chat API to be
    expanded into a few lines. If that request fails or returns nothing, the
    raw model caption is shown instead.

    Args:
        frames: Sampled video frames in a form ``image_processor`` accepts
            (``extract_frames`` supplies a list of RGB arrays).
        seg_len: Frame count of the source video. Currently unused; kept so
            existing callers continue to work.
    """
    st.write("Generating captions...")
    gen_kwargs = {
        "min_length": 10,
        "max_length": 20,
        "num_beams": 8,
    }
    pixel_values = image_processor(frames, return_tensors="pt").pixel_values.to(device)
    tokens = model.generate(pixel_values, **gen_kwargs)
    caption = tokenizer.batch_decode(tokens, skip_special_tokens=True)[0]

    expanded = _expand_caption(caption)

    st.write("Generated Caption:")
    if expanded:
        st.write(expanded)
    else:
        st.write(caption)
# Build the page only when this file is executed as the main script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()