diff --git a/cms/djangoapps/contentstore/tasks.py b/cms/djangoapps/contentstore/tasks.py index 914838a67f56..59d7aabd1a8e 100644 --- a/cms/djangoapps/contentstore/tasks.py +++ b/cms/djangoapps/contentstore/tasks.py @@ -1120,8 +1120,10 @@ def validate_user(): return def get_urls(content): - """Returns all urls foundafter href and src in content. - Excludes urls that are only '#'.""" + """ + Returns all urls found after href and src in content. + Excludes urls that are only '#'. + """ regex = r'\s+(?:href|src)=["\'](?!#)([^"\']*)["\']' url_list = re.findall(regex, content) return url_list @@ -1149,9 +1151,33 @@ def convert_to_standard_url(url, course_key): else: return url + def scan_course_for_links(course_key): + """ + Returns a list of all urls in a course. + Returns: [ [block_id1, url1], [block_id2, url2], ... ] + """ + verticals = modulestore().get_items(course_key, qualifiers={'category': 'vertical'}, revision=ModuleStoreEnum.RevisionOption.published_only) + blocks = [] + urls_to_validate = [] + + for vertical in verticals: + blocks.extend(vertical.get_children()) + + for block in blocks: + block_id = str(block.usage_key) + block_info = get_block_info(block) + block_data = block_info['data'] + + url_list = get_urls(block_data) + urls_to_validate += [[block_id, url] for url in url_list] + + return urls_to_validate + async def validate_url_access(session, url_data, course_key): - """Returns status of a url request. - url_list is [id, url]""" + """ + Returns the status of a url request + Returns: {block_id1, url1, status} + """ block_id, url = url_data result = {'block_id': block_id, 'url': url} standardized_url = convert_to_standard_url(url, course_key) @@ -1160,12 +1186,14 @@ async def validate_url_access(session, url_data, course_key): result.update({'status': response.status}) except Exception as e: result.update({'status': None}) - print('[Validate url error]', type(e), e, url) + LOGGER.debug(f'[Link Check] Request error when validating {url}: {str(e)}') return result async def validate_urls_access_in_batches(url_list, course_key, batch_size=100): - """Returns the statuses of a list of url requests. - url_list is [block_id, url]""" + """ + Returns the statuses of a list of url requests. + Returns: [ {block_id1, url1, status}, {block_id2, url2, status}, ... ] + """ responses = [] url_count = len(url_list) @@ -1175,67 +1203,72 @@ async def validate_urls_access_in_batches(url_list, course_key, batch_size=100): tasks = [validate_url_access(session, url_data, course_key) for url_data in batch] batch_results = await asyncio.gather(*tasks) responses.extend(batch_results) - print(f'batch {i // batch_size+1} of {url_count // batch_size + 1}') + LOGGER.debug(f'[Link Check] request batch {i // batch_size+1} of {url_count // batch_size + 1}') return responses - def scan_course_for_links(course_key): + def filter_by_status(results): """ - Returns a list of links that are broken or locked. - [block_id, link, is_locked] + Filter results by status. + 200: OK. No need to do more + 403: Forbidden. Record as locked link. + None: Error. Retry up to 3 times. + Other: Failure. Record as broken link. + Returns: + filtered_results: [ [block_id1, url1, is_locked], ... ] + retry_list: [ [block_id1, url1], ... ] """ - verticals = modulestore().get_items(course_key, qualifiers={'category': 'vertical'}, revision=ModuleStoreEnum.RevisionOption.published_only) - blocks = [] - links_to_validate = [] - - for vertical in verticals: - blocks.extend(vertical.get_children()) - - for block in blocks: - block_id = str(block.usage_key) - block_info = get_block_info(block) - block_data = block_info['data'] - - url_list = get_urls(block_data) - links_to_validate += [[block_id, url] for url in url_list] - - return links_to_validate + filtered_results = [] + retry_list = [] + for result in results: + if result['status'] == None: + retry_list.append([result['block_id'], result['url']]) + elif result['status'] == 200: + continue + elif result['status'] == 403 and is_studio_url(result['url']): + filtered_results.append([result['block_id'], result['url'], True]) + else: + filtered_results.append([result['block_id'], result['url'], False]) + + return filtered_results, retry_list user = validate_user() self.status.set_state('Scanning') course_key = CourseKey.from_string(course_key_string) - links_list = scan_course_for_links(course_key) - results = asyncio.run(validate_urls_access_in_batches(links_list, course_key, batch_size=100)) - - final_results = [] - for result in results: - if result['status'] == None: # Request error - print('retry') # TODO retry - if result['status'] == 200: # OK - print('remove from list') # TODO remove - elif result['status'] == 403 and is_studio_url(result['url']): - final_results.append([result['block_id'], result['url'], True]) - else: - final_results.append([result['block_id'], result['url'], False]) + url_list = scan_course_for_links(course_key) + validated_url_list = asyncio.run(validate_urls_access_in_batches(url_list, course_key, batch_size=100)) + broken_or_locked_urls, retry_list = filter_by_status(validated_url_list) + + # Retry urls that failed due to connection error + retry_count = 3 + for i in range(0, retry_count): + if retry_list: + LOGGER.debug(f'[Link Check] retry attempt #{i+1}') + retry_validated_url_list = asyncio.run(validate_urls_access_in_batches(retry_list, course_key, batch_size=100)) + retry_results, retry_list = filter_by_status(retry_validated_url_list) + broken_or_locked_urls.extend(retry_results) + + if retry_list: + LOGGER.debug(f'[Link Check] {len(retry_list)} requests failed due to connection error') try: self.status.increment_completed_steps() file_name = str(course_key) - links_file = NamedTemporaryFile(prefix=file_name + '.', suffix='.json') - LOGGER.debug('json file being generated at %s', links_file.name) + broken_links_file = NamedTemporaryFile(prefix=file_name + '.', suffix='.json') + LOGGER.debug(f'[Link Check] json file being generated at {broken_links_file.name}') - with open(links_file.name, 'w') as file: - json.dump(final_results, file, indent=4) + with open(broken_links_file.name, 'w') as file: + json.dump(broken_or_locked_urls, file, indent=4) artifact = UserTaskArtifact(status=self.status, name='BrokenLinks') - artifact.file.save(name=os.path.basename(links_file.name), content=File(links_file)) + artifact.file.save(name=os.path.basename(broken_links_file.name), content=File(broken_links_file)) artifact.save() # catch all exceptions so we can record useful error messages - except Exception as exception: # pylint: disable=broad-except + except Exception as e: # pylint: disable=broad-except LOGGER.exception('Error checking links for course %s', course_key, exc_info=True) if self.status.state != UserTaskStatus.FAILED: - self.status.fail({'raw_error_msg': str(exception)}) + self.status.fail({'raw_error_msg': str(e)}) return