-
Notifications
You must be signed in to change notification settings - Fork 1.4k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Split big files #75
Split big files #75
Changes from 3 commits
c186602
a577691
7b594c4
ef4e2c3
4d3c47b
5623c67
e4d38ea
22e9fa3
44569e1
f3d41d1
2ffddca
a0394d7
644e696
b0f0224
eee54eb
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -6,6 +6,22 @@ | |
import traceback | ||
import sys | ||
|
||
import subprocess | ||
import os.path | ||
from dejavu.decoder import get_duration | ||
|
||
class SplitError(Exception):
    """
    Raised when ffmpeg fails to write one chunk while splitting an audio file.

    Attributes:
        file_path: path of the input file that was being split.
        output_file: path of the chunk ffmpeg was asked to write.
        error_code: exit code returned by ffmpeg.
    """

    def __init__(self, file_path, output_file, error_code):
        # Hand the arguments to Exception so that `args` is populated. An
        # exception whose `args` is empty cannot be rebuilt by pickle, which
        # matters as soon as it has to cross a process boundary.
        Exception.__init__(self, file_path, output_file, error_code)
        self.file_path = file_path
        self.error_code = error_code
        self.output_file = output_file

    def __str__(self):
        return ("Splitting of file ({0}) to ({1}) failed. "
                "ffmpeg returned error code: {2}").format(
                    self.file_path, self.output_file, self.error_code)
|
||
|
||
|
||
|
||
class Dejavu(object): | ||
|
||
|
@@ -16,6 +32,9 @@ class Dejavu(object): | |
OFFSET = 'offset' | ||
OFFSET_SECS = 'offset_seconds' | ||
|
||
SPLIT_DIR = "split_dir" | ||
OVERWRITE_WHEN_SPLITING = 1 | ||
|
||
def __init__(self, config): | ||
super(Dejavu, self).__init__() | ||
|
||
|
@@ -43,7 +62,7 @@ def get_fingerprinted_songs(self): | |
song_name = song[self.db.FIELD_SONGNAME] | ||
self.songnames_set.add(song_name) | ||
|
||
def fingerprint_directory(self, path, extensions, nprocesses=None): | ||
def fingerprint_directory(self, path, extensions, nprocesses=None, splited=False, splited_song_name=""): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. yes, I agree to the note. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. will change it to "threat_as_split" There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. fixed the spelling error |
||
# Try to use the maximum amount of processes if not given. | ||
try: | ||
nprocesses = nprocesses or multiprocessing.cpu_count() | ||
|
@@ -71,6 +90,8 @@ def fingerprint_directory(self, path, extensions, nprocesses=None): | |
# Send off our tasks | ||
iterator = pool.imap_unordered(_fingerprint_worker, | ||
worker_input) | ||
if splited and splited_song_name: | ||
sid = self.db.insert_song(splited_song_name) | ||
|
||
# Loop till we have all of them | ||
while True: | ||
|
@@ -85,8 +106,8 @@ def fingerprint_directory(self, path, extensions, nprocesses=None): | |
# Print traceback because we can't reraise it here | ||
traceback.print_exc(file=sys.stdout) | ||
else: | ||
sid = self.db.insert_song(song_name) | ||
|
||
if not splited: | ||
sid = self.db.insert_song(song_name) | ||
self.db.insert_hashes(sid, hashes) | ||
self.db.set_song_fingerprinted(sid) | ||
self.get_fingerprinted_songs() | ||
|
@@ -111,6 +132,61 @@ def fingerprint_file(self, filepath, song_name=None): | |
self.db.set_song_fingerprinted(sid) | ||
self.get_fingerprinted_songs() | ||
|
||
def fingerprint_with_duration_check(self, input_file, minutes=5, song_name=None, processes=None):
    """
    Fingerprint `input_file`, cutting it into chunks first when it is long.

    Files shorter than `minutes` are handed straight to `fingerprint_file`.
    Longer files are cut with ffmpeg into consecutive chunks of `minutes`
    minutes each, written to `<dir of input_file>/<SPLIT_DIR>/<song_name>/`.
    That directory is then passed to `fingerprint_directory` with
    `splited=True`, so the hashes of every chunk are stored under the
    single song `song_name`.

    :param input_file: path of the audio file to fingerprint.
    :param minutes: chunk length in minutes; also the duration threshold
        below which the file is not split.
    :param song_name: name to store the song under; defaults to the file
        name of `input_file` without its extension.
    :param processes: forwarded to `fingerprint_directory` as `nprocesses`.
    :raises SplitError: if ffmpeg exits with a non-zero code for a chunk.
    :raises OSError: if a chunk directory is missing and cannot be created.
    """
    # NOTE(review): get_duration() is assumed to return seconds, since it is
    # compared against minutes * 60 -- confirm against dejavu.decoder.
    duration = get_duration(input_file)
    split_length = minutes * 60
    if duration < split_length:
        # Short enough to fingerprint in one go; keep the caller's name.
        return self.fingerprint_file(input_file, song_name=song_name)

    base_name, extension = os.path.splitext(os.path.basename(input_file))
    song_name = song_name or base_name

    # don't refingerprint already fingerprinted files
    if song_name in self.songnames_set:
        print("%s already fingerprinted, continuing..." % song_name)
        return

    split_root = os.path.join(os.path.dirname(input_file), self.SPLIT_DIR)
    output_path = os.path.join(split_root, song_name)
    for directory in (split_root, output_path):
        try:
            os.mkdir(directory)
        except OSError:
            # An already existing directory is fine: the split root is
            # shared by every long file, so it must not be wiped and
            # recreated here. Anything else (permissions, a regular file
            # in the way, ...) is a real failure. OSError is used because
            # WindowsError does not exist on other platforms.
            if not os.path.isdir(directory):
                raise

    # "-y" overwrites chunk files left over from an earlier run, "-n" makes
    # ffmpeg refuse to touch them.
    overwrite_flag = "-y" if self.OVERWRITE_WHEN_SPLITING else "-n"

    # A single writable handle on the null device is shared by all ffmpeg
    # runs and closed once splitting is finished.
    with open(os.devnull, "w") as devnull:
        start_offset = 0
        while start_offset < duration:
            # The last chunk usually ends at the end of the file rather
            # than on a full `split_length` boundary.
            end_offset = min(start_offset + split_length, duration)
            output_file = os.path.join(
                output_path,
                "start_sec{0}_end_sec{1}{2}".format(start_offset, end_offset, extension))
            conversion_command = [
                'ffmpeg',
                '-i', input_file,
                "-acodec", "copy",  # fastest conversion possible: 1:1 stream copy
                overwrite_flag,
                "-vn",  # drop any video streams if there are any
                '-ss', str(start_offset),
                '-t', str(split_length),
                output_file,
            ]
            retcode = subprocess.call(conversion_command, stderr=devnull)
            if retcode != 0:
                raise SplitError(input_file, output_file, retcode)
            start_offset += split_length

    # fingerprint_directory() inserts the song row itself when splited=True,
    # so it is not inserted here as well; doing both would leave a second,
    # hash-less row that is already marked as fingerprinted.
    # NOTE(review): hashes of each chunk presumably carry offsets relative to
    # the start of the chunk, not of the original file -- confirm whether
    # reported offsets need correcting.
    self.fingerprint_directory(output_path, [extension], nprocesses=processes,
                               splited=True, splited_song_name=song_name)
#TODO: delete files in the output_split_path after FP | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This should probably be done for this PR. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. ok, will add it in the evening |
||
|
||
def find_matches(self, samples, Fs=fingerprint.DEFAULT_FS): | ||
hashes = fingerprint.fingerprint(samples, Fs=Fs) | ||
return self.db.return_matches(hashes) | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,27 @@ | ||
from dejavu import Dejavu | ||
import warnings | ||
import json | ||
warnings.filterwarnings("ignore") | ||
|
||
# load config from a JSON file (or anything outputting a python dictionary) | ||
with open("dejavu.cnf.SAMPLE") as f: | ||
config = json.load(f) | ||
|
||
|
||
if __name__ == '__main__': | ||
''' | ||
This will split audio files that are too long into sections. | ||
There are probably better libs to do this. | ||
''' | ||
|
||
# create a Dejavu instance | ||
djv = Dejavu(config) | ||
|
||
# Fingerprint a single long mp3, splitting it into chunks first | ||
# short_song = "./hugeFilesTests/Josh-Woodward--I-Want-To-Destroy-Something-Beautiful.mp3" | ||
# djv.fingerprint_with_duration_check(short_song, minutes=3) | ||
|
||
long_song = "./hugeFilesTests/Roger Penrose - Forbidden crystal symmetry in mathematics and architecture.mp3" | ||
djv.fingerprint_with_duration_check(long_song, minutes=3, song_name="RogerPenrose1",processes=3) | ||
|
||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
overwrites what?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It's about overwriting the temp files that were split with ffmpeg.
It probably wouldn't be needed if we delete the temporary directory.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
probably should have coded it as "always overwrite" with no conditions
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
renamed the constant to OVERWRITE_TEMP_FILES_WHEN_SPLITING