@@ -355,6 +355,222 @@ def seg_align_eval(dump_dir, tagged_dir, out_file, verbose=False):
355
355
356
356
f .write ('\n ' )
357
357
358
def _dtw_path(local_cost):
    """Return the cheapest monotonic path through a local-cost matrix.

    ``local_cost`` is a non-empty list of equally long, non-empty rows.
    Costs are accumulated from cell (0, 0) using up / left / diagonal
    predecessors, then the path is traced back from the bottom-right cell.
    Ties during backtracking are resolved in the order up, left, diagonal.

    Returns a list of ``(row, column)`` tuples ordered from (0, 0) to the
    bottom-right cell.
    """
    n_rows, n_cols = len(local_cost), len(local_cost[0])

    # Accumulated error matrix (copy so the caller's rows are left untouched)
    acc = [list(row) for row in local_cost]
    for i in range(n_rows):
        for j in range(n_cols):
            if i == 0 and j == 0:
                continue
            if i == 0:
                acc[i][j] += acc[i][j - 1]
            elif j == 0:
                acc[i][j] += acc[i - 1][j]
            else:
                acc[i][j] += min(acc[i - 1][j], acc[i][j - 1], acc[i - 1][j - 1])

    # Backtrack
    i, j = n_rows - 1, n_cols - 1
    path = [(i, j)]
    while (i, j) != (0, 0):
        if i == 0:
            j -= 1
        elif j == 0:
            i -= 1
        else:
            up, left, diag = acc[i - 1][j], acc[i][j - 1], acc[i - 1][j - 1]
            best = min(up, left, diag)
            if up == best:
                i -= 1
            elif left == best:
                j -= 1
            else:
                i -= 1
                j -= 1
        path.append((i, j))
    path.reverse()
    return path


def line_align(songs, dump_dir, boundary_algorithm='olda', label_algorithm='fmc2d', do_twinnet=True):
    """Align lyric sections and lines to audio segments and dump one YAML file per song.

    For every song the lyrics are fetched and syllable-counted, syllable
    onsets are detected on the voice track, the mixed track is segmented
    with msaf, the msaf labels are mapped to 'chorus' / 'verse' / 'other',
    and lyric sections are matched to audio segments with dynamic
    programming. Line start times are then spread over each aligned section
    proportionally to the syllable count of each line.

    Args:
        songs: a single song dict or a list of them. Each dict is read for
            the keys 'path', 'song', 'artist' and 'genre'.
        dump_dir: directory the '<artist>_<song>.yml' files are written to.
        boundary_algorithm: passed to ``msaf.process`` as ``boundaries_id``.
        label_algorithm: passed to ``msaf.process`` as ``labels_id``.
        do_twinnet: when true, ``twinnet.twinnet_process`` is run once on all
            song paths before the per-song loop.

    Returns:
        None. Results are written to disk.
    """
    logging.info('Beginning alignment...')

    if isinstance(songs, dict):
        songs = [songs]

    # Module initializations
    snd = SND(silencedb=-15)
    sc = SyllableCounter()

    # Perform MaD TwinNet in one batch
    if do_twinnet:
        twinnet.twinnet_process([song['path'] for song in songs])
    else:
        logging.info('Skipping MaD TwinNet')

    for song in songs:
        logging.info('Processing {} by {}'.format(song['song'], song['artist']))

        start_time = time.time()

        # Get file names
        # NOTE(review): the '_voice.wav' file is presumably produced by the
        # TwinNet step above -- confirm it exists when do_twinnet is False.
        mixed_path = song['path']
        voice_path = os.path.splitext(mixed_path)[0] + '_voice.wav'

        # Get lyrics from Genius
        lyrics = get_lyrics(song['song'], song['artist'])

        # Get syllable count from lyrics
        formatted_lyrics = sc.build_lyrics(lyrics)
        syl_lyrics = sc.get_syllable_count_lyrics(formatted_lyrics)
        sc_syllables = sc.get_syllable_count_per_section(syl_lyrics)

        # Get syllable count from SND
        # NOTE(review): consumed below with a single forward cursor, so this
        # assumes the returned syllable times are in ascending order.
        snd_syllables = snd.run(voice_path)

        # Structural segmentation analysis on original audio
        sections, labels = msaf.process(mixed_path, boundaries_id=boundary_algorithm, labels_id=label_algorithm)

        # Save instrumental section indices
        # TODO: fix instrumentalization -- segments with density <= 0.7 were
        # meant to be appended here instead of contributing to
        # labels_density; until then this list stays empty.
        instrumentals = []

        # Get SND counts, densities per label
        max_count = 0
        labels_density = {}
        i_s = 0
        for label, seg_start, seg_end in zip(labels, sections[:-1], sections[1:]):
            count = 0
            while i_s < len(snd_syllables) and snd_syllables[i_s] < seg_end:
                count += 1
                i_s += 1
            max_count = max(max_count, count)

            # A zero-length segment would otherwise yield nan/inf and poison
            # the stdev computed below.
            seg_len = seg_end - seg_start
            density = count / seg_len if seg_len > 0 else 0.0

            stats = labels_density.setdefault(label, [[], []])
            stats[0].append(count)
            stats[1].append(density)

        # Normalize SND syllable counts (all counts are 0 when max_count is
        # 0, so dividing by 1 keeps them at 0 instead of raising)
        norm = max_count or 1
        for label in labels_density:
            labels_density[label][0] = [count / norm for count in labels_density[label][0]]

        # Normalize SSA syllable counts
        gt_max_syl = max(section[1] for section in sc_syllables)
        chorus_counts = [section[1] for section in sc_syllables if section[0] == 'chorus']
        if chorus_counts and gt_max_syl:
            gt_chorus_syl = mean(count / gt_max_syl for count in chorus_counts)
        else:
            # No chorus in the lyrics (or only empty sections): there is
            # nothing to match against, so no segment is tagged as chorus.
            gt_chorus_syl = 0

        # Find label most similar to chorus
        min_label = None
        if gt_chorus_syl:
            min_label = labels[0]
            min_distance = float('inf')
            for label, (counts, densities) in labels_density.items():
                # stdev needs at least two samples
                if len(counts) < 2:
                    continue

                # TODO: Fix distance scales
                mean_syl = mean(counts)
                std_den = stdev(densities)
                distance = sqrt(((mean_syl - gt_chorus_syl) / gt_chorus_syl) ** 2 + std_den ** 2)

                if distance < min_distance:
                    min_distance = distance
                    min_label = label

        # Relabel: chorus label -> 'chorus', other repeated labels -> 'verse',
        # labels occurring once -> 'other'; instrumental segments are dropped
        occurrences = defaultdict(int)
        for label in labels:
            occurrences[label] += 1

        has_chorus = min_label is not None
        relabels = []
        for i, label in enumerate(labels):
            if i in instrumentals:
                continue
            if has_chorus and label == min_label:
                relabels.append('chorus')
            elif occurrences[label] > 1:
                relabels.append('verse')
            else:
                relabels.append('other')

        if not relabels:
            logging.error('Whole song tagged as instrumental! Skipping...')
            continue

        # Match lyric sections (rows) to relabelled audio segments (columns)
        local_cost = [[dp_err_matrix[lyric_section[0]][seg_label] for seg_label in relabels]
                      for lyric_section in sc_syllables]
        path = _dtw_path(local_cost)

        # Process alignment: alignment[k] lists the lyric section indices
        # assigned to audio segment k (or the marker 'instrumental')
        alignment = [[] for _ in range(len(labels))]
        for i in instrumentals:
            alignment[i].append('instrumental')

        section_id = 0
        j_prev = 0
        for (i, j) in path:
            if j != j_prev:
                section_id += 1
                j_prev = j
            while 'instrumental' in alignment[section_id]:
                section_id += 1
            alignment[section_id].append(i)

        end_time = time.time()

        align_data = {'song': song['song'],
                      'artist': song['artist'],
                      'genre': song['genre'],
                      'process time': end_time - start_time,
                      'duration': round((sections[-1] - sections[0]).item(), 2),
                      'align': []}

        # Lyric section index behind each entry of align_data['align']; kept
        # separately so the YAML schema is unchanged
        entry_lyric_ids = []

        cur_lyric_section = -1
        for i, section in enumerate(alignment):
            for n, lyric_section in enumerate(section):
                if lyric_section == cur_lyric_section:
                    continue
                # Lyric sections sharing one audio segment split it evenly
                break_point = round((sections[i] + n * (sections[i + 1] - sections[i]) / len(section)).item(), 2)
                if cur_lyric_section != 'instrumental' and align_data['align']:
                    align_data['align'][-1]['end'] = break_point
                if lyric_section != 'instrumental':
                    align_data['align'].append({'label': sc_syllables[lyric_section][0],
                                                'syllables': sc_syllables[lyric_section][1],
                                                'start': break_point,
                                                'lines': []})
                    entry_lyric_ids.append(lyric_section)
                cur_lyric_section = lyric_section

        # The last lyric section is only closed inside the loop when an
        # instrumental follows it; otherwise it runs to the end of the last
        # segment (not to its own start, which would give it zero duration).
        if 'end' not in align_data['align'][-1]:
            align_data['align'][-1]['end'] = round(sections[len(alignment)].item(), 2)

        # Spread line start times over each section by syllable share
        for entry, lyric_id in zip(align_data['align'], entry_lyric_ids):
            duration = entry['end'] - entry['start']
            line_start = entry['start']
            total_syls = entry['syllables']
            for j, line in enumerate(formatted_lyrics[lyric_id][1]):
                line_text = ' '.join(line)
                line_syls = sum(syl_lyrics[lyric_id][1][j])
                line_duration = line_syls / total_syls * duration if total_syls else 0

                entry['lines'].append({'start': line_start,
                                       'text': line_text})

                line_start += line_duration

        file_name = '{}_{}.yml'.format(song['artist'], song['song']).replace(' ', '')
        file_path = os.path.join(dump_dir, file_name)

        with open(file_path, 'w') as f:
            yaml.dump(align_data, f, default_flow_style=False)
572
+
573
+
358
574
def iter_boundary_label_algorithms (songs , dump_dir , tagged_dir , evals_dir , do_twinnet = False , verbose = True ):
359
575
for b_alg in msaf .get_all_boundary_algorithms ():
360
576
if b_alg == 'example' :
0 commit comments