Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Split big files #75

Closed
wants to merge 15 commits into from
1 change: 1 addition & 0 deletions INSTALLATION.md
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ Install the dependencies:
sudo yum install numpy scipy python-matplotlib ffmpeg portaudio-devel
pip install PyAudio
pip install pydub
pip install audioread # https://github.com/sampsyo/audioread

Now setup virtualenv ([howto?](http://www.pythoncentral.io/how-to-install-virtualenv-python/)):

Expand Down
82 changes: 79 additions & 3 deletions dejavu/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,22 @@
import traceback
import sys

import subprocess
import os.path
from dejavu.decoder import get_duration

class SplitError(Exception):
    """Raised when ffmpeg fails to split an input file into a chunk.

    Attributes:
        file_path: path of the file that was being split.
        output_file: path of the chunk ffmpeg was asked to write.
        error_code: non-zero return code reported by ffmpeg.
    """

    def __init__(self, file_path, output_file, error_code):
        # Forward the arguments to Exception so that ``args`` is populated.
        # With empty ``args`` the exception cannot be re-created when it is
        # unpickled (e.g. when it has to cross a process boundary).
        super(SplitError, self).__init__(file_path, output_file, error_code)
        self.file_path = file_path
        self.output_file = output_file
        self.error_code = error_code

    def __str__(self):
        return ("Splitting of file({0}) failed to ({1}). "
                "ffmpeg returned error code: {2}").format(
                    self.file_path, self.output_file, self.error_code)




class Dejavu(object):

Expand All @@ -16,6 +32,9 @@ class Dejavu(object):
OFFSET = 'offset'
OFFSET_SECS = 'offset_seconds'

SPLIT_DIR = "split_dir"
OVERWRITE_WHEN_SPLITING = 1
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

overwrites what?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's about overwriting the temporary files that were produced by splitting with ffmpeg.
It probably wouldn't be needed if we deleted the temporary directory.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

probably should have coded it as "always overwrite" with no conditions

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

renamed the constant to OVERWRITE_TEMP_FILES_WHEN_SPLITING


def __init__(self, config):
super(Dejavu, self).__init__()

Expand Down Expand Up @@ -43,7 +62,7 @@ def get_fingerprinted_songs(self):
song_name = song[self.db.FIELD_SONGNAME]
self.songnames_set.add(song_name)

def fingerprint_directory(self, path, extensions, nprocesses=None):
def fingerprint_directory(self, path, extensions, nprocesses=None, splited=False, splited_song_name=""):
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

splitted is not great English grammar, I'd say 'splitting' or 'split' are better wordings here.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yes, I agree to the note.
probably : threat_as_split_from_big_file would be even better

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

will change it to "threat_as_split"

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

fixed the spelling error
variable is now "treat_as_split"

# Try to use the maximum amount of processes if not given.
try:
nprocesses = nprocesses or multiprocessing.cpu_count()
Expand Down Expand Up @@ -71,6 +90,8 @@ def fingerprint_directory(self, path, extensions, nprocesses=None):
# Send off our tasks
iterator = pool.imap_unordered(_fingerprint_worker,
worker_input)
if splited and splited_song_name:
sid = self.db.insert_song(splited_song_name)

# Loop till we have all of them
while True:
Expand All @@ -85,8 +106,8 @@ def fingerprint_directory(self, path, extensions, nprocesses=None):
# Print traceback because we can't reraise it here
traceback.print_exc(file=sys.stdout)
else:
sid = self.db.insert_song(song_name)

if not splited:
sid = self.db.insert_song(song_name)
self.db.insert_hashes(sid, hashes)
self.db.set_song_fingerprinted(sid)
self.get_fingerprinted_songs()
Expand All @@ -111,6 +132,61 @@ def fingerprint_file(self, filepath, song_name=None):
self.db.set_song_fingerprinted(sid)
self.get_fingerprinted_songs()

def fingerprint_with_duration_check(self, input_file, minutes=5, song_name=None, processes=None):
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

does minutes here mean that any song longer than minutes will be split up and read by two separate processes?

does this actually save memory? or are we just replacing N memory with one processor with N/2 memory with 2 processors (faster but still same memory usage)?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

  • on the "minutes" part of the question:
    yes; this is the maximum limit for single file fingerprinting;
    if file is shorter than "minutes" the standard file_fingerprinting is applied.
  • on the "Memory" part of the question:
    Distributes memory usage in time:
    It fingerprints some portions from the file.
    Then releases that memory used by those portions.
    Again fingerprints some portions and so on.

duration = get_duration(input_file)
split_length = minutes * 60
if duration < split_length:
return self.fingerprint_file(input_file)
songname, extension = os.path.splitext(os.path.basename(input_file))
song_name = song_name or songname
# don't refingerprint already fingerprinted files
if song_name in self.songnames_set:
print "%s already fingerprinted, continuing..." % song_name
return
file_directory = os.path.dirname(input_file)
output_split_path = os.path.join(file_directory, self.SPLIT_DIR)
try:
os.mkdir(output_split_path)
except WindowsError:
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What error would this be?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

directory already exists
EDIT:
Are you having a problem with that exception? Is it available on your operating system?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I will look for an OS-independent solution with better handling,
such as described here: http://stackoverflow.com/questions/17619079/python-why-does-os-makedirs-cause-windowserror

pass
output_path = os.path.join(output_split_path, song_name)
try:
os.mkdir(output_path)
except WindowsError:
pass
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why are you ignoring the error here with just a pass?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

directory already exists

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

actually i can remove the tree and recreate it again if the directory already exists

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

probably not a good idea when fingerprinting two parallel long files...

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ah, ...
https://docs.python.org/2/library/os.html#os.makedirs says it creates the directories recursively, so I will merge those two calls to os.mkdir into one os.makedirs call


start_offset = 0
end_offset = split_length
retcode = 0
sid = self.db.insert_song(song_name)
while start_offset < duration:
output_file = os.path.join(output_path, "start_sec{0}_end_sec{1}{2}".format(start_offset, end_offset, extension))
convertion_command = [ 'ffmpeg',
'-i', input_file,
"-acodec", "copy", #fastest convertion possible 1:1 copy
["-n","-y"][self.OVERWRITE_WHEN_SPLITING], # always overwrite existing files
"-vn", # Drop any video streams if there are any
'-ss', str(start_offset),
'-t', str(split_length),
output_file]
#songname for the input
retcode = subprocess.call(convertion_command, stderr=open(os.devnull))
if retcode != 0:
raise SplitError(input_file, output_file, retcode)
start_offset += split_length
end_offset += split_length
end_offset = min(end_offset, duration)

# song_name = song_name or songname
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

get rid of commented out code

# song_name, hashes = _fingerprint_worker(output_file,
# self.limit,
# song_name=song_name)
# self.db.insert_hashes(sid, hashes)
self.db.set_song_fingerprinted(sid)
self.get_fingerprinted_songs()
self.fingerprint_directory(output_path, [extension], nprocesses=processes, splited=True, splited_song_name=song_name)
#TODO: delete files in the output_split_path after FP
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This should probably be done for this PR.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ok, will add it in the evening


def find_matches(self, samples, Fs=fingerprint.DEFAULT_FS):
hashes = fingerprint.fingerprint(samples, Fs=Fs)
return self.db.return_matches(hashes)
Expand Down
13 changes: 13 additions & 0 deletions dejavu/decoder.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,19 @@
from pydub.utils import audioop
import wavio

# pip install audioread
# https://github.com/sampsyo/audioread
import audioread

def get_duration(file_path):
    """Return the duration of the audio file at ``file_path``.

    The value is the ``duration`` attribute reported by ``audioread`` for
    the opened file (seconds, per the audioread documentation).
    """
    # The ``with`` statement closes the file on exit, so no explicit
    # ``close()`` call is needed (the original closed the handle twice).
    with audioread.audio_open(file_path) as f:
        return f.duration



def find_files(path, extensions):
# Allow both with ".mp3" and without "mp3" to be used for extensions
extensions = [e.replace(".", "") for e in extensions]
Expand Down
43 changes: 22 additions & 21 deletions example.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,29 +6,30 @@
# load config from a JSON file (or anything outputting a python dictionary)
with open("dejavu.cnf.SAMPLE") as f:
config = json.load(f)
if __name__ == '__main__':

# create a Dejavu instance
djv = Dejavu(config)
# create a Dejavu instance
djv = Dejavu(config)

# Fingerprint all the mp3's in the directory we give it
djv.fingerprint_directory("mp3", [".mp3"])
# Fingerprint all the mp3's in the directory we give it
djv.fingerprint_directory("mp3", [".mp3"])

# Recognize audio from a file
from dejavu.recognize import FileRecognizer
song = djv.recognize(FileRecognizer, "mp3/Sean-Fournier--Falling-For-You.mp3")
print "From file we recognized: %s\n" % song
# Recognize audio from a file
from dejavu.recognize import FileRecognizer
song = djv.recognize(FileRecognizer, "mp3/Sean-Fournier--Falling-For-You.mp3")
print "From file we recognized: %s\n" % song

# Or recognize audio from your microphone for `secs` seconds
from dejavu.recognize import MicrophoneRecognizer
secs = 5
song = djv.recognize(MicrophoneRecognizer, seconds=secs)
if song is None:
print "Nothing recognized -- did you play the song out loud so your mic could hear it? :)"
else:
print "From mic with %d seconds we recognized: %s\n" % (secs, song)
# Or recognize audio from your microphone for `secs` seconds
from dejavu.recognize import MicrophoneRecognizer
secs = 5
song = djv.recognize(MicrophoneRecognizer, seconds=secs)
if song is None:
print "Nothing recognized -- did you play the song out loud so your mic could hear it? :)"
else:
print "From mic with %d seconds we recognized: %s\n" % (secs, song)

# Or use a recognizer without the shortcut, in anyway you would like
from dejavu.recognize import FileRecognizer
recognizer = FileRecognizer(djv)
song = recognizer.recognize_file("mp3/Josh-Woodward--I-Want-To-Destroy-Something-Beautiful.mp3")
print "No shortcut, we recognized: %s\n" % song
# Or use a recognizer without the shortcut, in anyway you would like
from dejavu.recognize import FileRecognizer
recognizer = FileRecognizer(djv)
song = recognizer.recognize_file("mp3/Josh-Woodward--I-Want-To-Destroy-Something-Beautiful.mp3")
print "No shortcut, we recognized: %s\n" % song
Binary file not shown.
Binary file not shown.
27 changes: 27 additions & 0 deletions long_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
"""Manual check for fingerprinting a long audio file.

Audio files that are too long are split into sections before being
fingerprinted. There are probably better libraries to do the splitting.
"""
from dejavu import Dejavu
import warnings
import json
warnings.filterwarnings("ignore")

# load config from a JSON file (or anything outputting a python dictionary)
with open("dejavu.cnf.SAMPLE") as f:
    config = json.load(f)


if __name__ == '__main__':
    # create a Dejavu instance
    djv = Dejavu(config)

    # Fingerprint all the mp3's in the directory we give it
    # short_song = "./hugeFilesTests/Josh-Woodward--I-Want-To-Destroy-Something-Beautiful.mp3"
    # djv.fingerprint_with_duration_check(short_song, minutes=3)

    long_song = "./hugeFilesTests/Roger Penrose - Forbidden crystal symmetry in mathematics and architecture.mp3"
    djv.fingerprint_with_duration_check(long_song, minutes=3,
                                        song_name="RogerPenrose1",
                                        processes=3)