From 24446e63c2e8eb52c2042a2cf6c4262bcb8de418 Mon Sep 17 00:00:00 2001 From: Jose Perez-Silva Date: Mon, 23 Feb 2026 14:12:23 +0000 Subject: [PATCH 1/3] avoid downloadData to delete and download with this change the module will not delete everything and redownload. instead for an already existing file it will check the md5sum and if matching the one in the csv file, it will skip that. if the file is not in the csv or the md5sum is not matching (indicating a corrupted file) it will proceed as usual (delete and re-download). --- .../Hive/RunnableDB/HiveDownloadData.pm | 25 ++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) diff --git a/modules/Bio/EnsEMBL/Analysis/Hive/RunnableDB/HiveDownloadData.pm b/modules/Bio/EnsEMBL/Analysis/Hive/RunnableDB/HiveDownloadData.pm index 02db4fa66..7e417945e 100644 --- a/modules/Bio/EnsEMBL/Analysis/Hive/RunnableDB/HiveDownloadData.pm +++ b/modules/Bio/EnsEMBL/Analysis/Hive/RunnableDB/HiveDownloadData.pm @@ -249,9 +249,32 @@ sub pre_cleanup { my ($self) = @_; my $filepath = catfile($self->param_required('output_dir'), basename($self->param_required('url'))); + if (-e $filepath) { - unlink $filepath; + if ($self->param_is_defined('md5sum')) { + my $digest = Digest::MD5->new(); + open(my $fh, $filepath) || $self->throw("Could not open '$filepath': $!"); + binmode $fh; + my $md5sum = $digest->addfile($fh)->hexdigest; + close($fh) || $self->throw("Could not close '$filepath': $!"); + + if ($md5sum eq $self->param('md5sum')) { + $self->say_with_header("File '$filepath' already exists and md5sum matches ($md5sum), keeping it to skip download"); + } + else { + $self->say_with_header("File '$filepath' exists but md5sum mismatch (expected: ".$self->param('md5sum').", got: $md5sum), removing for re-download"); + unlink $filepath or $self->warning("Could not remove '$filepath': $!"); + } + } + else { + $self->say_with_header("File '$filepath' already exists but no md5sum defined, removing it to force a clean download"); + 
unlink $filepath or $self->warning("Could not remove '$filepath': $!"); + } + } + else { + $self->say_with_header("File '$filepath' does not exist, nothing to clean up"); } } 1; + From f484c99efa024f51aea3eff491fcd1a283a3c55a Mon Sep 17 00:00:00 2001 From: Jose Perez-Silva Date: Wed, 4 Mar 2026 09:43:46 +0000 Subject: [PATCH 2/3] Added better reporting For debugging and clarity purposes --- modules/Bio/EnsEMBL/Analysis/Hive/RunnableDB/HiveDownloadData.pm | 1 + 1 file changed, 1 insertion(+) diff --git a/modules/Bio/EnsEMBL/Analysis/Hive/RunnableDB/HiveDownloadData.pm b/modules/Bio/EnsEMBL/Analysis/Hive/RunnableDB/HiveDownloadData.pm index 7e417945e..4c7aa8d5c 100644 --- a/modules/Bio/EnsEMBL/Analysis/Hive/RunnableDB/HiveDownloadData.pm +++ b/modules/Bio/EnsEMBL/Analysis/Hive/RunnableDB/HiveDownloadData.pm @@ -260,6 +260,7 @@ sub pre_cleanup { if ($md5sum eq $self->param('md5sum')) { $self->say_with_header("File '$filepath' already exists and md5sum matches ($md5sum), keeping it to skip download"); + $self->complete_early("File already present and verified"); } else { $self->say_with_header("File '$filepath' exists but md5sum mismatch (expected: ".$self->param('md5sum').", got: $md5sum), removing for re-download"); From 50cbedaa5b8ad8de9c7cb5dc942f11a86d19cd37 Mon Sep 17 00:00:00 2001 From: Jose Perez-Silva Date: Fri, 6 Mar 2026 15:55:53 +0000 Subject: [PATCH 3/3] Fix HiveDownloadData to correctly handle pre-existing files pre_cleanup now only deletes files with md5 mismatch or no md5 defined, leaving verified files for fetch_input to handle. fetch_input exits early when a file is present and md5-verified, querying the read_length_table to decide whether to dataflow to HiveCalculateReadLength or skip it, avoiding duplicate job INSERT errors on re-runs and manual file placement. 
These changes were required because of a misunderstanding of the behaviour of pre_cleanup, and to cater for the niche but possible scenario in which files are added manually: the read length calculation for such files might have been skipped under the previous logic. --- .../Analysis/Hive/Config/StarScallopRnaseq.pm | 2 ++ .../Hive/RunnableDB/HiveDownloadData.pm | 36 ++++++++++++++++--- 2 files changed, 33 insertions(+), 5 deletions(-) diff --git a/modules/Bio/EnsEMBL/Analysis/Hive/Config/StarScallopRnaseq.pm b/modules/Bio/EnsEMBL/Analysis/Hive/Config/StarScallopRnaseq.pm index ca07ef763..5c2da2972 100644 --- a/modules/Bio/EnsEMBL/Analysis/Hive/Config/StarScallopRnaseq.pm +++ b/modules/Bio/EnsEMBL/Analysis/Hive/Config/StarScallopRnaseq.pm @@ -436,6 +436,8 @@ sub pipeline_analyses { -parameters => { output_dir => $self->o('input_dir'), download_method => $self->o('download_method'), + read_length_table => $self->o('read_length_table'), + uncompress => 0, }, -flow_into => { diff --git a/modules/Bio/EnsEMBL/Analysis/Hive/RunnableDB/HiveDownloadData.pm b/modules/Bio/EnsEMBL/Analysis/Hive/RunnableDB/HiveDownloadData.pm index 4c7aa8d5c..58393da46 100644 --- a/modules/Bio/EnsEMBL/Analysis/Hive/RunnableDB/HiveDownloadData.pm +++ b/modules/Bio/EnsEMBL/Analysis/Hive/RunnableDB/HiveDownloadData.pm @@ -99,6 +99,35 @@ sub param_defaults { sub fetch_input { my ($self) = @_; + # Early exit if file already present and verified + my $filepath = catfile($self->param_required('output_dir'), basename($self->param_required('url'))); + if (-e $filepath && $self->param_is_defined('md5sum')) { + my $digest = Digest::MD5->new(); + open(my $fh, $filepath) || $self->throw("Could not open '$filepath': $!"); + binmode $fh; + my $md5sum = $digest->addfile($fh)->hexdigest; + close($fh) || $self->throw("Could not close '$filepath': $!"); + if ($md5sum eq $self->param('md5sum')) { + $self->say_with_header("File '$filepath' already exists and md5sum matches, skipping download", 1); + my $final_file = 
$self->param('uncompress') ? ($filepath =~ /^(\S+)\.\w+$/)[0] : $filepath; + + my $already_processed = 0; + eval { + my $sth = $self->db->dbc->prepare('SELECT read_length FROM '.$self->param('read_length_table').' WHERE fastq = ?'); + $sth->execute(basename($final_file)); + my $row = $sth->fetchrow_arrayref; + $already_processed = 1 if $row; + $sth->finish; + }; + + unless ($already_processed) { + $self->dataflow_output_id({filename => $final_file}, $self->param('_branch_to_flow_to')); + } + $self->input_job->autoflow(0); + $self->complete_early("File already present and verified"); + } + } + if ($] =~ '^5.024') { $self->throw("Perl 5.24 doesn't work with this module. If you manage to make it work, please submit a pull-request"); } @@ -258,14 +287,11 @@ sub pre_cleanup { my $md5sum = $digest->addfile($fh)->hexdigest; close($fh) || $self->throw("Could not close '$filepath': $!"); - if ($md5sum eq $self->param('md5sum')) { - $self->say_with_header("File '$filepath' already exists and md5sum matches ($md5sum), keeping it to skip download"); - $self->complete_early("File already present and verified"); - } - else { + unless ($md5sum eq $self->param('md5sum')) { $self->say_with_header("File '$filepath' exists but md5sum mismatch (expected: ".$self->param('md5sum').", got: $md5sum), removing for re-download"); unlink $filepath or $self->warning("Could not remove '$filepath': $!"); } + # if md5 matches: do nothing here, fetch_input handles the early exit } else { $self->say_with_header("File '$filepath' already exists but no md5sum defined, removing it to force a clean download");