Extract specific frames of youtube video without downloading video
Question:
I need to extract specific frames of an online video to work on an algorithm but I don’t want to download the whole video because that would make it highly inefficient.
For starters, I tried working with youtube videos. I can download whole of the video using youtube-dl
in this way:
# Download the complete video with youtube-dl to a fixed output path.
download_options = {'outtmpl': r'OUTPUT_DIRECTORY_HERE'}
with youtube_dl.YoutubeDL(download_options) as downloader:
    downloader.download([url])
And then I can capture individual frames.
I need to avoid downloading the whole video. After some research, I have found that ffmpeg
might help me do this. I found no way to download just the frames so if this is not possible, the second option is that I can download specific portions of the video. One such example in linux is here but I couldn’t find any solution for python.
What is a good way to download just the frames, or portions of videos (in python) without downloading the entire thing?
Answers:
I tried what @AyeshaKhan shared in the comments.
After importing `cv2`, `numpy`, and `youtube_dl`:
# Grab up to 10 frames, one every `frame_step` frames, from the 144p stream
# of a YouTube video without downloading the whole file.
video_url = saved_url  # the YouTube page URL (BUG FIX: original assigned
                       # `url` but passed `video_url` to extract_info)
ydl = youtube_dl.YoutubeDL({})
# Probe the available stream formats; nothing is downloaded here.
info_dict = ydl.extract_info(video_url, download=False)
formats = info_dict.get('formats') or []
print("Obtaining frames")
for f in formats:
    if f.get('format_note') == '144p':
        stream_url = f.get('url')
        cap = cv2.VideoCapture(stream_url)
        frame_step = 300   # skip 300 frames, i.e. 10 s at 30 fps
        max_shots = 10     # stop after 10 saved frames
        shot = 0
        frame_pos = 0
        while shot < max_shots:
            ret, frame = cap.read()
            if not ret:    # end of stream or read failure
                break
            # BUG FIX: the original called .format(count) on a string with
            # no "{}" placeholder — a no-op; the shot index alone names the file.
            cv2.imwrite(r"PATHshot" + str(shot) + ".png", frame)
            shot += 1
            frame_pos += frame_step
            # Seek to the next wanted frame. Use the named constant instead
            # of the magic number 1; also drop the cv2.waitKey(30) call,
            # which only adds a 30 ms delay when no window is displayed.
            cap.set(cv2.CAP_PROP_POS_FRAMES, frame_pos)
        cap.release()
        break  # only the first matching 144p format is needed
The answer in the comments was downloading all of the frames, so the `count` I pass to `cap.set()` ensures that frames are skipped as per my requirement.
Additionally, x
here limits the number to 10.
However, I am still not sure whether this method is actually capturing only the specified frames, or whether it is capturing all of the frames and just saving the specified ones to my local storage. I needed the former.
But this is still fast enough and works for me!
Just to add on to the current answer, performance can further be enhanced using multiprocessing. For example, if you wanted to split up the video into frames and process them independently in num_cpu processes:
import os
from functools import partial
from multiprocessing.pool import Pool
import cv2
import youtube_dl
def process_video_parallel(url, skip_frames, process_number):
    """Save up to 10 shots, `skip_frames` apart, from this worker's slice.

    Args:
        url: direct stream URL openable by cv2.VideoCapture.
        skip_frames: number of frames to jump between saved shots
            (e.g. 300 = ~10 s at 30 fps).
        process_number: index of this worker (0 .. cpu_count-1); selects
            which contiguous slice of the video it is responsible for.
    """
    cap = cv2.VideoCapture(url)
    num_processes = os.cpu_count()
    frames_per_process = int(cap.get(cv2.CAP_PROP_FRAME_COUNT)) // num_processes
    start = frames_per_process * process_number
    cap.set(cv2.CAP_PROP_POS_FRAMES, start)
    shot = 0
    offset = 0
    while shot < 10 and offset < frames_per_process:
        ret, frame = cap.read()
        if not ret:
            break
        # Include the worker index so parallel workers do not overwrite each
        # other's files (the original wrote PATHshot0..9.png in every process,
        # and its .format(count) call had no placeholder — a no-op).
        cv2.imwrite(r"PATHshot" + str(process_number) + "_" + str(shot) + ".png",
                    frame)
        shot += 1
        offset += skip_frames
        # BUG FIX: seek relative to this worker's slice. The original did
        # cap.set(1, count) — an ABSOLUTE seek that ignored `start`, so every
        # process read the same frames from the beginning of the video.
        cap.set(cv2.CAP_PROP_POS_FRAMES, start + offset)
    cap.release()
# Fan the frame extraction out across one worker per CPU core.
# BUG FIX: the __main__ guard is required for multiprocessing on platforms
# that use the "spawn" start method (Windows, macOS): without it, every
# worker re-executes this module-level code and spawns workers of its own.
if __name__ == "__main__":
    video_url = "..."  # the YouTube page URL
    ydl = youtube_dl.YoutubeDL({})
    # Probe stream metadata only; nothing is downloaded here.
    info_dict = ydl.extract_info(video_url, download=False)
    formats = info_dict.get('formats') or []
    print("Obtaining frames")
    for f in formats:
        if f.get('format_note') == '144p':
            stream_url = f.get('url')
            cpu_count = os.cpu_count()
            with Pool(cpu_count) as pool:
                # Each worker saves shots from its own slice of the video,
                # skipping 300 frames (~10 s at 30 fps) between shots.
                pool.map(partial(process_video_parallel, stream_url, 300),
                         range(cpu_count))
            break  # only the first matching 144p format is needed
For the purposes of this application, since images are just being saved from the video, this may not result in a huge improvement (maybe a few seconds), but if additional algorithms needed to be applied on the frames, it could be beneficial.
Alternative to @danielcahall,
This method uses Ray for parallelization instead of Pool
Note: the first run may take longer because the Ray components need to be initialized; subsequent runs will be faster.
from timeit import default_timer as timer
import os, ray, shutil
import cv2
import youtube_dl
# Ensure test_fol exists and is empty before frames are written into it.
try:
    os.makedirs('test_fol')
except FileExistsError:
    # Left over from a previous run: wipe it so stale frames don't mix in.
    # (BUG FIX: the original bare `except:` also swallowed unrelated errors
    # such as permission failures, masking real problems.)
    shutil.rmtree('test_fol')
    os.makedirs('test_fol')
# Start the local Ray runtime (the first call is slow while workers spin up).
ray.init()
@ray.remote
def process_video_parallel(url, total_frames, process_number):
    """Save this worker's contiguous slice of frames to test_fol/.

    Args:
        url: direct stream URL openable by cv2.VideoCapture.
        total_frames: (estimated) total frame count of the video.
        process_number: index of this worker (0 .. cpu_count-1).
    """
    cap = cv2.VideoCapture(url)
    num_processes = os.cpu_count()
    frames_per_process = int(total_frames) // num_processes
    start = frames_per_process * process_number
    # BUG FIX: the integer division drops `total_frames % num_processes`
    # trailing frames; give that remainder to the last worker.
    if process_number == num_processes - 1:
        end = int(total_frames)
    else:
        end = start + frames_per_process
    cap.set(cv2.CAP_PROP_POS_FRAMES, start)
    count = start
    while count < end:
        ret, frame = cap.read()
        if not ret:  # stream ended earlier than the estimate
            break
        # Global frame index as the filename keeps output from all workers
        # collision-free and in order.
        cv2.imwrite(f"test_fol/{count}.jpg", frame)
        count += 1
    cap.release()
# Time the whole extraction and dispatch one Ray task per CPU core.
t1 = timer()
video_url = "..."  # the YouTube page URL
ydl = youtube_dl.YoutubeDL({})
# Probe stream metadata only; nothing is downloaded here.
info_dict = ydl.extract_info(video_url, download=False)
duration = info_dict['duration']  # video length in seconds
formats = info_dict.get('formats') or []
for f in formats:
    if f.get('format_note') == '144p':
        stream_url = f.get('url')
        cpu_count = os.cpu_count()
        # duration * 31 over-estimates the frame count (assuming ~30 fps) so
        # the workers' slices cover the whole video; each worker stops early
        # when the stream actually ends. The remote tasks return None, so
        # ray.get() is used only to block until all of them finish
        # (the original bound the result to an unused `data` variable).
        ray.get([process_video_parallel.remote(stream_url, int(duration * 31), x)
                 for x in range(cpu_count)])
        break  # only the first matching 144p format is needed
print("Total Time", timer() - t1)
I need to extract specific frames of an online video to work on an algorithm but I don’t want to download the whole video because that would make it highly inefficient.
For starters, I tried working with youtube videos. I can download whole of the video using youtube-dl
in this way:
# Download the complete video with youtube-dl to a fixed output path.
download_options = {'outtmpl': r'OUTPUT_DIRECTORY_HERE'}
with youtube_dl.YoutubeDL(download_options) as downloader:
    downloader.download([url])
And then I can capture individual frames.
I need to avoid downloading the whole video. After some research, I have found that ffmpeg
might help me do this. I found no way to download just the frames so if this is not possible, the second option is that I can download specific portions of the video. One such example in linux is here but I couldn’t find any solution for python.
What is a good way to download just the frames, or portions of videos (in python) without downloading the entire thing?
I tried what @AyeshaKhan shared in the comments.
After importing `cv2`, `numpy`, and `youtube_dl`:
# Grab up to 10 frames, one every `frame_step` frames, from the 144p stream
# of a YouTube video without downloading the whole file.
video_url = saved_url  # the YouTube page URL (BUG FIX: original assigned
                       # `url` but passed `video_url` to extract_info)
ydl = youtube_dl.YoutubeDL({})
# Probe the available stream formats; nothing is downloaded here.
info_dict = ydl.extract_info(video_url, download=False)
formats = info_dict.get('formats') or []
print("Obtaining frames")
for f in formats:
    if f.get('format_note') == '144p':
        stream_url = f.get('url')
        cap = cv2.VideoCapture(stream_url)
        frame_step = 300   # skip 300 frames, i.e. 10 s at 30 fps
        max_shots = 10     # stop after 10 saved frames
        shot = 0
        frame_pos = 0
        while shot < max_shots:
            ret, frame = cap.read()
            if not ret:    # end of stream or read failure
                break
            # BUG FIX: the original called .format(count) on a string with
            # no "{}" placeholder — a no-op; the shot index alone names the file.
            cv2.imwrite(r"PATHshot" + str(shot) + ".png", frame)
            shot += 1
            frame_pos += frame_step
            # Seek to the next wanted frame. Use the named constant instead
            # of the magic number 1; also drop the cv2.waitKey(30) call,
            # which only adds a 30 ms delay when no window is displayed.
            cap.set(cv2.CAP_PROP_POS_FRAMES, frame_pos)
        cap.release()
        break  # only the first matching 144p format is needed
The answer in the comments was downloading all of the frames, so the `count` I pass to `cap.set()` ensures that frames are skipped as per my requirement.
Additionally, x
here limits the number to 10.
However, I am still not sure whether this method is actually capturing only the specified frames, or whether it is capturing all of the frames and just saving the specified ones to my local storage. I needed the former.
But this is still fast enough and works for me!
Just to add on to the current answer, performance can further be enhanced using multiprocessing. For example, if you wanted to split up the video into frames and process them independently in num_cpu processes:
import os
from functools import partial
from multiprocessing.pool import Pool
import cv2
import youtube_dl
def process_video_parallel(url, skip_frames, process_number):
    """Save up to 10 shots, `skip_frames` apart, from this worker's slice.

    Args:
        url: direct stream URL openable by cv2.VideoCapture.
        skip_frames: number of frames to jump between saved shots
            (e.g. 300 = ~10 s at 30 fps).
        process_number: index of this worker (0 .. cpu_count-1); selects
            which contiguous slice of the video it is responsible for.
    """
    cap = cv2.VideoCapture(url)
    num_processes = os.cpu_count()
    frames_per_process = int(cap.get(cv2.CAP_PROP_FRAME_COUNT)) // num_processes
    start = frames_per_process * process_number
    cap.set(cv2.CAP_PROP_POS_FRAMES, start)
    shot = 0
    offset = 0
    while shot < 10 and offset < frames_per_process:
        ret, frame = cap.read()
        if not ret:
            break
        # Include the worker index so parallel workers do not overwrite each
        # other's files (the original wrote PATHshot0..9.png in every process,
        # and its .format(count) call had no placeholder — a no-op).
        cv2.imwrite(r"PATHshot" + str(process_number) + "_" + str(shot) + ".png",
                    frame)
        shot += 1
        offset += skip_frames
        # BUG FIX: seek relative to this worker's slice. The original did
        # cap.set(1, count) — an ABSOLUTE seek that ignored `start`, so every
        # process read the same frames from the beginning of the video.
        cap.set(cv2.CAP_PROP_POS_FRAMES, start + offset)
    cap.release()
# Fan the frame extraction out across one worker per CPU core.
# BUG FIX: the __main__ guard is required for multiprocessing on platforms
# that use the "spawn" start method (Windows, macOS): without it, every
# worker re-executes this module-level code and spawns workers of its own.
if __name__ == "__main__":
    video_url = "..."  # the YouTube page URL
    ydl = youtube_dl.YoutubeDL({})
    # Probe stream metadata only; nothing is downloaded here.
    info_dict = ydl.extract_info(video_url, download=False)
    formats = info_dict.get('formats') or []
    print("Obtaining frames")
    for f in formats:
        if f.get('format_note') == '144p':
            stream_url = f.get('url')
            cpu_count = os.cpu_count()
            with Pool(cpu_count) as pool:
                # Each worker saves shots from its own slice of the video,
                # skipping 300 frames (~10 s at 30 fps) between shots.
                pool.map(partial(process_video_parallel, stream_url, 300),
                         range(cpu_count))
            break  # only the first matching 144p format is needed
For the purposes of this application, since images are just being saved from the video, this may not result in a huge improvement (maybe a few seconds), but if additional algorithms needed to be applied on the frames, it could be beneficial.
Alternative to @danielcahall,
This method uses Ray for parallelization instead of Pool
Note: the first run may take longer because the Ray components need to be initialized; subsequent runs will be faster.
from timeit import default_timer as timer
import os, ray, shutil
import cv2
import youtube_dl
# Ensure test_fol exists and is empty before frames are written into it.
try:
    os.makedirs('test_fol')
except FileExistsError:
    # Left over from a previous run: wipe it so stale frames don't mix in.
    # (BUG FIX: the original bare `except:` also swallowed unrelated errors
    # such as permission failures, masking real problems.)
    shutil.rmtree('test_fol')
    os.makedirs('test_fol')
# Start the local Ray runtime (the first call is slow while workers spin up).
ray.init()
@ray.remote
def process_video_parallel(url, total_frames, process_number):
    """Save this worker's contiguous slice of frames to test_fol/.

    Args:
        url: direct stream URL openable by cv2.VideoCapture.
        total_frames: (estimated) total frame count of the video.
        process_number: index of this worker (0 .. cpu_count-1).
    """
    cap = cv2.VideoCapture(url)
    num_processes = os.cpu_count()
    frames_per_process = int(total_frames) // num_processes
    start = frames_per_process * process_number
    # BUG FIX: the integer division drops `total_frames % num_processes`
    # trailing frames; give that remainder to the last worker.
    if process_number == num_processes - 1:
        end = int(total_frames)
    else:
        end = start + frames_per_process
    cap.set(cv2.CAP_PROP_POS_FRAMES, start)
    count = start
    while count < end:
        ret, frame = cap.read()
        if not ret:  # stream ended earlier than the estimate
            break
        # Global frame index as the filename keeps output from all workers
        # collision-free and in order.
        cv2.imwrite(f"test_fol/{count}.jpg", frame)
        count += 1
    cap.release()
# Time the whole extraction and dispatch one Ray task per CPU core.
t1 = timer()
video_url = "..."  # the YouTube page URL
ydl = youtube_dl.YoutubeDL({})
# Probe stream metadata only; nothing is downloaded here.
info_dict = ydl.extract_info(video_url, download=False)
duration = info_dict['duration']  # video length in seconds
formats = info_dict.get('formats') or []
for f in formats:
    if f.get('format_note') == '144p':
        stream_url = f.get('url')
        cpu_count = os.cpu_count()
        # duration * 31 over-estimates the frame count (assuming ~30 fps) so
        # the workers' slices cover the whole video; each worker stops early
        # when the stream actually ends. The remote tasks return None, so
        # ray.get() is used only to block until all of them finish
        # (the original bound the result to an unused `data` variable).
        ray.get([process_video_parallel.remote(stream_url, int(duration * 31), x)
                 for x in range(cpu_count)])
        break  # only the first matching 144p format is needed
print("Total Time", timer() - t1)