worldveil · thesunlover · Mar 22, 2015 · Mar 23, 2015 · Mar 23, 2015 · Mar 30, 2015
diff --git a/INSTALLATION.md b/INSTALLATION.md
@@ -21,6 +21,7 @@ Install the dependencies:
     sudo yum install numpy scipy python-matplotlib ffmpeg portaudio-devel
     pip install PyAudio
     pip install pydub
+    pip install audioread  # https://github.com/sampsyo/audioread
 
 Now setup virtualenv ([howto?](http://www.pythoncentral.io/how-to-install-virtualenv-python/)):
 

diff --git a/dejavu/__init__.py b/dejavu/__init__.py
@@ -6,6 +6,22 @@
 import traceback
 import sys
 
+import subprocess
+import os.path
+from dejavu.decoder import get_duration
+
+class SplitError(Exception):
+    """Raised when ffmpeg returns a non-zero exit code while splitting a file."""
+    def __init__(self, file_path, output_file, error_code):
+        # Pass the fields on so that args, repr() and pickling keep them.
+        super(SplitError, self).__init__(file_path, output_file, error_code)
+        self.file_path = file_path
+        self.error_code = error_code
+        self.output_file = output_file
+
+    def __str__(self):
+        return "Spliting of file({0}) failed to ({1}). ffmpeg returned error code: {2}".format(self.file_path, self.output_file, self.error_code)
+
 
 class Dejavu(object):
 
@@ -16,6 +32,9 @@ class Dejavu(object):
     OFFSET = 'offset'
     OFFSET_SECS = 'offset_seconds'
 
+    SPLIT_DIR = "split_dir"
+    OVERWRITE_WHEN_SPLITING = 1
+
     def __init__(self, config):
         super(Dejavu, self).__init__()
 
@@ -43,7 +62,7 @@ def get_fingerprinted_songs(self):
             song_name = song[self.db.FIELD_SONGNAME]
             self.songnames_set.add(song_name)
 
-    def fingerprint_directory(self, path, extensions, nprocesses=None):
+    def fingerprint_directory(self, path, extensions, nprocesses=None, splited=False, splited_song_name=""):
         # Try to use the maximum amount of processes if not given.
         try:
             nprocesses = nprocesses or multiprocessing.cpu_count()
@@ -71,6 +90,8 @@ def fingerprint_directory(self, path, extensions, nprocesses=None):
         # Send off our tasks
         iterator = pool.imap_unordered(_fingerprint_worker,
                                        worker_input)
+        if splited and splited_song_name:
+            sid = self.db.insert_song(splited_song_name)
 
         # Loop till we have all of them
         while True:
@@ -85,8 +106,8 @@ def fingerprint_directory(self, path, extensions, nprocesses=None):
                 # Print traceback because we can't reraise it here
                 traceback.print_exc(file=sys.stdout)
             else:
-                sid = self.db.insert_song(song_name)
-
+                if not splited:
+                    sid = self.db.insert_song(song_name)
                 self.db.insert_hashes(sid, hashes)
                 self.db.set_song_fingerprinted(sid)
                 self.get_fingerprinted_songs()
@@ -111,6 +132,61 @@ def fingerprint_file(self, filepath, song_name=None):
             self.db.set_song_fingerprinted(sid)
             self.get_fingerprinted_songs()
 
+    def fingerprint_with_duration_check(self, input_file, minutes=5, song_name=None, processes=None):
+        """
+        Fingerprint input_file, first splitting it with ffmpeg into chunks of
+        `minutes` minutes when it is at least that long; shorter files go
+        straight to fingerprint_file(). Chunks are written to
+        <dir of input_file>/SPLIT_DIR/<song_name>/ and fingerprinted under
+        song_name (default: the file name without its extension).
+        Raises ValueError if minutes <= 0 and SplitError if ffmpeg fails.
+        """
+        if minutes <= 0:
+            raise ValueError("minutes must be positive, got %r" % (minutes,))
+        duration = get_duration(input_file)
+        split_length = minutes * 60
+        if duration < split_length:
+            # Forward song_name so a caller-supplied name is not dropped.
+            return self.fingerprint_file(input_file, song_name=song_name)
+        songname, extension = os.path.splitext(os.path.basename(input_file))
+        song_name = song_name or songname
+        # don't refingerprint already fingerprinted files
+        if song_name in self.songnames_set:
+            print "%s already fingerprinted, continuing..." % song_name
+            return
+        output_path = os.path.join(os.path.dirname(input_file), self.SPLIT_DIR, song_name)
+        try:
+            os.makedirs(output_path)
+        except OSError:
+            # WindowsError only exists on Windows; OSError is portable. An
+            # existing directory is fine, any other failure is re-raised.
+            if not os.path.isdir(output_path):
+                raise
+
+        start_offset = 0
+        end_offset = split_length
+        with open(os.devnull, 'w') as devnull:  # writable, closed when done
+            while start_offset < duration:
+                output_file = os.path.join(output_path, "start_sec{0}_end_sec{1}{2}".format(start_offset, end_offset, extension))
+                convertion_command = ['ffmpeg',
+                                      '-i', input_file,
+                                      "-acodec", "copy",  # fastest conversion: 1:1 stream copy
+                                      ["-n", "-y"][self.OVERWRITE_WHEN_SPLITING],  # "-y" overwrites existing chunks, "-n" never does
+                                      "-vn",  # Drop any video streams if there are any
+                                      '-ss', str(start_offset),
+                                      '-t', str(split_length),
+                                      output_file]
+                retcode = subprocess.call(convertion_command, stderr=devnull)
+                if retcode != 0:
+                    raise SplitError(input_file, output_file, retcode)
+                start_offset += split_length
+                end_offset = min(end_offset + split_length, duration)
+
+        # fingerprint_directory(splited=True) inserts the song row itself and
+        # marks it fingerprinted once hashes are stored, so it is not done here.
+        self.fingerprint_directory(output_path, [extension], nprocesses=processes, splited=True, splited_song_name=song_name)
+        #TODO: delete files in the output_split_path after FP
+
     def find_matches(self, samples, Fs=fingerprint.DEFAULT_FS):
         hashes = fingerprint.fingerprint(samples, Fs=Fs)
         return self.db.return_matches(hashes)

diff --git a/dejavu/decoder.py b/dejavu/decoder.py
@@ -5,6 +5,19 @@
 from pydub.utils import audioop
 import wavio
 
+# pip install audioread
+# https://github.com/sampsyo/audioread
+import audioread
+
+def get_duration(file_path):
+    """Return the duration of the audio file at file_path (audioread's `duration`)."""
+    # The with-statement closes the file on exit, so no explicit close() is
+    # needed and the value can be returned straight from inside the block.
+    with audioread.audio_open(file_path) as f:
+        return f.duration
+
+
+
 def find_files(path, extensions):
     # Allow both with ".mp3" and without "mp3" to be used for extensions
     extensions = [e.replace(".", "") for e in extensions]

diff --git a/example.py b/example.py
@@ -6,29 +6,30 @@
 # load config from a JSON file (or anything outputting a python dictionary)
 with open("dejavu.cnf.SAMPLE") as f:
     config = json.load(f)
+if __name__ == '__main__':
 
-# create a Dejavu instance
-djv = Dejavu(config)
+    # create a Dejavu instance
+    djv = Dejavu(config)
 
-# Fingerprint all the mp3's in the directory we give it
-djv.fingerprint_directory("mp3", [".mp3"])
+    # Fingerprint all the mp3's in the directory we give it
+    djv.fingerprint_directory("mp3", [".mp3"])
 
-# Recognize audio from a file
-from dejavu.recognize import FileRecognizer
-song = djv.recognize(FileRecognizer, "mp3/Sean-Fournier--Falling-For-You.mp3")
-print "From file we recognized: %s\n" % song
+    # Recognize audio from a file
+    from dejavu.recognize import FileRecognizer
+    song = djv.recognize(FileRecognizer, "mp3/Sean-Fournier--Falling-For-You.mp3")
+    print "From file we recognized: %s\n" % song
 
-# Or recognize audio from your microphone for `secs` seconds
-from dejavu.recognize import MicrophoneRecognizer
-secs = 5
-song = djv.recognize(MicrophoneRecognizer, seconds=secs)
-if song is None:
-	print "Nothing recognized -- did you play the song out loud so your mic could hear it? :)"
-else:
-	print "From mic with %d seconds we recognized: %s\n" % (secs, song)
+    # Or recognize audio from your microphone for `secs` seconds
+    from dejavu.recognize import MicrophoneRecognizer
+    secs = 5
+    song = djv.recognize(MicrophoneRecognizer, seconds=secs)
+    if song is None:
+        print "Nothing recognized -- did you play the song out loud so your mic could hear it? :)"
+    else:
+        print "From mic with %d seconds we recognized: %s\n" % (secs, song)
 
-# Or use a recognizer without the shortcut, in anyway you would like
-from dejavu.recognize import FileRecognizer
-recognizer = FileRecognizer(djv)
-song = recognizer.recognize_file("mp3/Josh-Woodward--I-Want-To-Destroy-Something-Beautiful.mp3")
-print "No shortcut, we recognized: %s\n" % song
+    # Or use a recognizer without the shortcut, in any way you would like
+    from dejavu.recognize import FileRecognizer
+    recognizer = FileRecognizer(djv)
+    song = recognizer.recognize_file("mp3/Josh-Woodward--I-Want-To-Destroy-Something-Beautiful.mp3")
+    print "No shortcut, we recognized: %s\n" % song
diff --git a/hugeFilesTests/Josh-Woodward--I-Want-To-Destroy-Something-Beautiful.mp3 b/hugeFilesTests/Josh-Woodward--I-Want-To-Destroy-Something-Beautiful.mp3
diff --git a/...FilesTests/Roger Penrose - Forbidden crystal symmetry in mathematics and architecture.mp3 b/...FilesTests/Roger Penrose - Forbidden crystal symmetry in mathematics and architecture.mp3
diff --git a/long_test.py b/long_test.py
@@ -0,0 +1,27 @@
+from dejavu import Dejavu
+import warnings
+import json
+warnings.filterwarnings("ignore")
+
+# load config from a JSON file (or anything outputting a python dictionary)
+with open("dejavu.cnf.SAMPLE") as f:
+    config = json.load(f)
+
+
+if __name__ == '__main__':
+    # Fingerprint a long recording with Dejavu.fingerprint_with_duration_check,
+    # which splits audio files that are too long into sections before
+    # fingerprinting them.
+    # There are probably better libs to do this.
+
+    # create a Dejavu instance
+    djv = Dejavu(config)
+
+    # Example with a short file (commented out):
+    # short_song = "./hugeFilesTests/Josh-Woodward--I-Want-To-Destroy-Something-Beautiful.mp3"
+    # djv.fingerprint_with_duration_check(short_song, minutes=3)
+
+    long_song = "./hugeFilesTests/Roger Penrose - Forbidden crystal symmetry in mathematics and architecture.mp3"
+    djv.fingerprint_with_duration_check(long_song, minutes=3, song_name="RogerPenrose1", processes=3)
+
+