diff --git a/app/master/build.py b/app/master/build.py index 3a910a5..743830e 100644 --- a/app/master/build.py +++ b/app/master/build.py @@ -190,8 +190,7 @@ def complete_subjob(self, subjob_id, payload=None): self._mark_subjob_complete(subjob_id) except Exception: - self._logger.exception('Error while completing subjob; marking build as failed.') - self.mark_failed('Error occurred while completing subjob {}.'.format(subjob_id)) + self._logger.exception('Error while processing subjob {} payload'.format(subjob_id)) raise def _parse_payload_for_atom_exit_code(self, subjob_id): diff --git a/app/slave/cluster_slave.py b/app/slave/cluster_slave.py index e82cf7c..1d30d0b 100644 --- a/app/slave/cluster_slave.py +++ b/app/slave/cluster_slave.py @@ -359,13 +359,15 @@ def _execute_subjob(self, build_id, subjob_id, executor, atomic_commands): files = {'file': ('payload', open(results_file, 'rb'), 'application/x-compressed')} self._idle_executors.put(executor) # work is done; mark executor as idle - resp = self._network.post(results_url, data=data, files=files) - if resp.ok: - self._logger.info('Build {}, Subjob {} completed and sent results to master.', build_id, subjob_id) - else: - self._logger.error( - ('Build {}, Subjob {} encountered an error when sending results to master.' - '\n\tStatus Code {}\n\t{}').format(build_id, subjob_id, resp.status_code, resp.text)) + for attempt in range(3): + resp = self._network.post(results_url, data=data, files=files) + if resp.status_code == 200: + self._logger.info('Build {}, Subjob {} completed and sent results to master.', build_id, subjob_id) + break + else: + self._logger.error( + ('Build {}, Subjob {} encountered an error when sending results to master.' + '\n\tStatus Code {} attempt {}\n\t{}').format(build_id, subjob_id, resp.status_code, attempt+1, resp.text)) def _notify_master_of_state_change(self, new_state): """ diff --git a/app/util/fs.py b/app/util/fs.py index de354b9..f0bc4b4 100644 --- a/app/util/fs.py +++ b/app/util/fs.py @@ -83,6 +83,9 @@ def extract_tar(archive_file, target_dir=None, delete=False): if not target_dir: target_dir, _ = os.path.split(archive_file) # default to same directory as tar file + if not tarfile.is_tarfile(archive_file): + raise Exception("Not a tarfile: {}".format(archive_file)) + try: with tarfile.open(archive_file, 'r:gz') as f: f.extractall(target_dir) diff --git a/app/util/network.py b/app/util/network.py index b016c29..46897c9 100644 --- a/app/util/network.py +++ b/app/util/network.py @@ -51,7 +51,7 @@ def get(self, *args, **kwargs): return self._request('GET', *args, **kwargs) # todo: may be a bad idea to retry -- what if post was successful but just had a response error? - @retry_on_exception_exponential_backoff(exceptions=(requests.ConnectionError,)) + @retry_on_exception_exponential_backoff(exceptions=(requests.ConnectionError,requests.Timeout), initial_delay=1.0) def post(self, *args, **kwargs): """ Send a POST request to a url. Arguments to this method, unless otherwise documented below in _request(), are @@ -75,7 +75,7 @@ def post_with_digest(self, url, request_params, secret, error_on_failure=False): error_on_failure=error_on_failure) # todo: may be a bad idea to retry -- what if put was successful but just had a response error? - @retry_on_exception_exponential_backoff(exceptions=(requests.ConnectionError,)) + @retry_on_exception_exponential_backoff(exceptions=(requests.ConnectionError,requests.Timeout)) def put(self, *args, **kwargs): """ Send a PUT request to a url. Arguments to this method, unless otherwise documented below in _request(), are