videomatch/videohash.py at main · rtlnl/videomatch

History

1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32

33

34

35

36

37

38

39

40

41

42

43

44

45

46

47

48

49

50

51

52

53

54

55

56

57

58

59

60

61

62

63

64

65

66

67

68

69

70

71

72

73

74

75

76

77

78

79

80

81

82

83

84

85

86

87

88

89

90

91

92

93

94

95

96

97

98

99

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

126

127

128

129

130

131

132

133

134

135

136

137

138

139

140

141

142

143

144

145

146

147

148

149

150

151

152

153

154

155

156

157

158

159

160

161

162

163

import os

import urllib.request

import shutil

import logging

import hashlib

import time

from PIL import Image

import imagehash

from moviepy.editor import VideoFileClip

from moviepy.video.fx.all import crop

import numpy as np

from pytube import YouTube

import subprocess as sp

from config import FPS, VIDEO_DIRECTORY

def filepath_from_url(url):
    """Map a video url to the path where its local copy is stored.

    Args:
        url (str): The url of the input video.

    Returns:
        (str): Path inside VIDEO_DIRECTORY whose file name is the md5 hex digest of the url.
    """
    # The digest of the url is used as the file name, so the same url always maps to the same path
    url_digest = hashlib.md5(url.encode()).hexdigest()
    return os.path.join(VIDEO_DIRECTORY, url_digest)

def download_video_from_url(url):
    """Download the video at the given url, unless a local copy already exists.

    The local path is derived from the url by filepath_from_url, so a url that
    was downloaded before is not downloaded again.

    Args:
        url (str): The url of the input video

    Returns:
        filepath (str): Filepath to the downloaded video from the url.

    Raises:
        OSError: If the url cannot be opened or the file cannot be written
            (urllib.error.URLError is a subclass of OSError).
    """
    start = time.time()

    # Generate filepath from url
    filepath = filepath_from_url(url)

    # Nothing to do when the video was downloaded earlier
    if os.path.exists(filepath):
        logging.info(f"Skipping downloading from {url} because {filepath} already exists.")
        return filepath

    # For YouTube links
    if url.startswith(('https://www.youtube.com', 'youtube.com', 'http://www.youtube.com')):
        # pytube wants the target directory and the file name as separate arguments
        file_dir, filename = os.path.split(filepath)
        logging.info(f"file_dir = {file_dir}")
        logging.info(f"filename = {filename}")
        YouTube(url).streams.get_highest_resolution().download(file_dir, skip_existing = False, filename = filename)
        logging.info(f"Downloaded YouTube video from {url} to {filepath} in {time.time() - start:.1f} seconds.")
        return filepath

    # Works for basically all links, except youtube
    # NOTE(review): urlopen also accepts non-http schemes (e.g. file://) - validate the url
    # upstream if it can come from untrusted users.
    # The data is written to a temporary ".part" file first and only moved to filepath once the
    # copy completed. Otherwise an interrupted download would leave a truncated file behind that
    # the os.path.exists check above would treat as a finished download on the next call.
    partial_path = filepath + '.part'
    try:
        with urllib.request.urlopen(url) as response, open(partial_path, 'wb') as fileout:
            logging.info(f"Starting copyfileobj on {response}")
            shutil.copyfileobj(response, fileout, length=16*1024*1024)
        os.replace(partial_path, filepath)
    finally:
        # After a successful os.replace the temporary file is gone; it only remains on failure
        if os.path.exists(partial_path):
            os.remove(partial_path)

    logging.info(f"Downloaded video from {url} to {filepath} in {time.time() - start:.1f} seconds.")
    return filepath

def change_ffmpeg_fps(clip, fps=FPS):
    """Restart the ffmpeg process behind a clip so that it decodes at another frame rate.

    Args:
        clip (moviepy.editor.VideoFileClip): Input clip.
        fps (int): The desired frame rate for the clip.

    Returns:
        clip (moviepy.editor.VideoFileClip): The same clip object, now reading at the desired
            frames per second.
    """
    # Hacking the ffmpeg call based on
    # https://github.com/Zulko/moviepy/blob/master/moviepy/video/io/ffmpeg_reader.py#L126
    reader = clip.reader

    # Rebuild the command of the running ffmpeg process, appending an fps filter
    # to every argument that starts with "scale=".
    # This has to happen before the reader is closed, while reader.proc is still available.
    ffmpeg_cmd = []
    for arg in reader.proc.args:
        if arg.startswith("scale="):
            arg = arg + ",fps=%d" % fps
        ffmpeg_cmd.append(arg)

    # Stop the current process and start a new one with the adjusted command
    reader.close()
    reader.proc = sp.Popen(
        ffmpeg_cmd,
        bufsize=reader.bufsize,
        stdout=sp.PIPE,
        stderr=sp.PIPE,
        stdin=sp.DEVNULL,
    )

    # Keep the frame rate known to the clip and to its reader in sync with the new process
    clip.fps = fps
    reader.fps = fps

    # Read one frame from the new process and store it as the reader's last read frame
    reader.lastread = reader.read_frame()
    return clip

def crop_video(clip, crop_percentage=0.75, w=224, h=224):
    """Keep the central part of a clip and resize it to a fixed size.

    Args:
        clip (moviepy.editor.VideoFileClip): Clip to be cropped.
        crop_percentage (float): Fraction of the original width and height that remains after cropping.
        w (int): Final width the video clip will be resized to.
        h (int): Final height the video clip will be resized to.

    Returns:
        (moviepy.editor.VideoFileClip): Cropped and resized clip.
    """
    # Original size, which combined with crop_percentage determines the size of the cropped area
    orig_w, orig_h = clip.size
    logging.info(f"Cropping and resizing video to ({w}, {h})")

    # With the default of 0.75, 75% of the width and height around the center is kept
    centered = crop(
        clip,
        x_center=orig_w/2,
        y_center=orig_h/2,
        width=int(orig_w*crop_percentage),
        height=int(crop_percentage*orig_h),
    )

    # The cropped clip is resized to (w, h) - for faster computation of hashes
    return centered.resize((w, h))

def compute_hash(frame, hash_size=16):
    """Compute the perceptual hash (phash) of a single frame.

    Args:
        frame (numpy.ndarray): Frame from the video.
        hash_size (int): Size of the required hash, passed on to imagehash.phash.

    Returns:
        The object returned by imagehash.phash. compute_hashes reads its `.hash` attribute,
        presumably a boolean array of shape (hash_size, hash_size) - TODO confirm.
    """
    # imagehash works on PIL images, so the frame is converted first
    pixels = np.array(frame)
    pil_image = Image.fromarray(pixels)
    return imagehash.phash(pil_image, hash_size)

def binary_array_to_uint8s(arr):
    """Pack an array of bits into a list of integers, 8 bits per integer.

    The bits are read in row-major order and the first bit of every group of 8
    becomes the most significant bit of the resulting value.

    Args:
        arr (numpy.ndarray): Binary (boolean or 0/1) array of any shape, e.g. the
            (hash_size, hash_size) bit array of a perceptual hash.

    Returns:
        (list): Python ints in the range 0-255, one per group of 8 bits. When the number of
            bits is not a multiple of 8, the last value is built from the remaining bits only
            (without padding), e.g. the bits 1, 0, 1 give 5.
    """
    # Flatten first, so the input may have any number of dimensions
    bits = np.asarray(arr, dtype=bool).ravel()

    # All complete groups of 8 bits are packed in one go
    complete = bits.size - bits.size % 8
    values = np.packbits(bits[:complete]).tolist()

    # np.packbits would pad an incomplete group with zeros on the right, which changes its
    # value. The leftover bits are therefore converted separately as a short binary number.
    if complete < bits.size:
        leftover = ''.join('1' if bit else '0' for bit in bits[complete:])
        values.append(int(leftover, 2))

    return values

def compute_hashes(url: str, fps=FPS):
    """Compute hashes of the video at the given url.

    Args:
        url (str): Url of the input video.
        fps (int): Frame rate at which frames are read from the video and hashed.

    Yields:
        ({str: int, str: numpy.ndarray}): Dict with the frame number ("frame", computed as
            1 + index * fps) and the corresponding hash ("hash", a uint8 array).
    """
    # Try downloading the video from url. If that fails, load it directly from the url instead
    # Then crop the video
    try:
        filepath = download_video_from_url(url)
        clip = crop_video(VideoFileClip(filepath))
    except IOError:
        logging.warning(f"Falling back to direct streaming from {url} because the downloaded video failed.")
        clip = crop_video(VideoFileClip(url))

    try:
        for index, frame in enumerate(change_ffmpeg_fps(clip, fps).iter_frames()):
            # Each frame is a triplet of size (height, width, 3) of the video since it is RGB
            # The hash itself is of size (hash_size, hash_size)
            # The uint8 version packs 8 bits of the hash into one value, so with the default
            # hash_size of 16 it consists of 16 * 16 / 8 = 32 values
            hashed = np.array(binary_array_to_uint8s(compute_hash(frame).hash), dtype='uint8')
            yield {"frame": 1+index*fps, "hash": hashed}
    finally:
        # Release the clip's reader (and the ffmpeg process behind it), also when the
        # consumer stops iterating early or an error occurs
        clip.close()

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

videohash.py

videohash.py

Files

videohash.py

Latest commit

History

videohash.py

File metadata and controls