Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: add helper function send_request for PlaywrightCrawler using APIRequestContext bound to the browser context #1134

Open
wants to merge 3 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,10 @@

from playwright.async_api import TimeoutError as PlaywrightTimeoutError

from crawlee import HttpHeaders
from crawlee._types import BasicCrawlingContext
from crawlee._utils.docs import docs_group
from crawlee.crawlers import AbstractHttpParser, ParsedHttpCrawlingContext, PlaywrightCrawlingContext
from crawlee.crawlers._playwright._types import PlaywrightHttpResponse

if TYPE_CHECKING:
from collections.abc import Awaitable, Callable, Sequence
Expand Down Expand Up @@ -186,7 +186,7 @@ async def from_playwright_crawling_context(
context_kwargs['_infinite_scroll'] = context_kwargs.pop('infinite_scroll')
# This might not be always available.
protocol_guess = await context_kwargs['_page'].evaluate('() => performance.getEntries()[0].nextHopProtocol')
http_response = await _PlaywrightHttpResponse.from_playwright_response(
http_response = await PlaywrightHttpResponse.from_playwright_response(
response=context.response, protocol=protocol_guess or ''
)
# block_requests is useful only on pre-navigation contexts. It is useless here.
Expand Down Expand Up @@ -240,26 +240,3 @@ async def dummy_block_requests(

context_kwargs['block_requests'] = context_kwargs.pop('block_requests', dummy_block_requests)
return cls(**context_kwargs)


@dataclass(frozen=True)
class _PlaywrightHttpResponse:
"""Wrapper class for playwright `Response` object to implement `HttpResponse` protocol."""

http_version: str
status_code: int
headers: HttpHeaders
_content: bytes

def read(self) -> bytes:
return self._content

@classmethod
async def from_playwright_response(cls, response: Response, protocol: str) -> Self:
headers = HttpHeaders(response.headers)
status_code = response.status
# Used http protocol version cannot be obtained from `Response` and has to be passed as additional argument.
http_version = protocol
_content = await response.body()

return cls(http_version=http_version, status_code=status_code, headers=headers, _content=_content)
16 changes: 13 additions & 3 deletions src/crawlee/crawlers/_playwright/_playwright_crawler.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@

from ._playwright_crawling_context import PlaywrightCrawlingContext
from ._playwright_pre_nav_crawling_context import PlaywrightPreNavCrawlingContext
from ._utils import block_requests, infinite_scroll
from ._utils import block_requests, infinite_scroll, prepare_send_request_function

TCrawlingContext = TypeVar('TCrawlingContext', bound=PlaywrightCrawlingContext)
TStatisticsState = TypeVar('TStatisticsState', bound=StatisticsState, default=StatisticsState)
Expand Down Expand Up @@ -168,6 +168,8 @@ def __init__(
kwargs.setdefault('_logger', logging.getLogger(__name__))
self._pre_navigation_hooks: list[Callable[[PlaywrightPreNavCrawlingContext], Awaitable[None]]] = []

self._use_http_client = bool(kwargs.get('http_client'))
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is there any chance we could make an HttpClient implementation (it could be private to the crawlee.crawlers._playwright package) and use it instead of just overriding context.send_request? I admit I didn't read the code thoroughly, but it looks like you have most of the code in place already.


super().__init__(**kwargs)

async def _open_page(
Expand All @@ -180,11 +182,15 @@ async def _open_page(
# Create a new browser page
crawlee_page = await self._browser_pool.new_page(proxy_info=context.proxy_info)

send_request = (
context.send_request if self._use_http_client else prepare_send_request_function(crawlee_page.page)
)

pre_navigation_context = PlaywrightPreNavCrawlingContext(
request=context.request,
session=context.session,
add_requests=context.add_requests,
send_request=context.send_request,
send_request=send_request,
push_data=context.push_data,
use_state=context.use_state,
proxy_info=context.proxy_info,
Expand Down Expand Up @@ -238,11 +244,15 @@ async def _navigate(

extract_links = self._create_extract_links_function(context)

send_request = (
context.send_request if self._use_http_client else prepare_send_request_function(context.page)
)

error = yield PlaywrightCrawlingContext(
request=context.request,
session=context.session,
add_requests=context.add_requests,
send_request=context.send_request,
send_request=send_request,
push_data=context.push_data,
use_state=context.use_state,
proxy_info=context.proxy_info,
Expand Down
31 changes: 30 additions & 1 deletion src/crawlee/crawlers/_playwright/_types.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,15 @@
from __future__ import annotations

from typing import Protocol
from dataclasses import dataclass
from typing import TYPE_CHECKING, Protocol

from crawlee import HttpHeaders
from crawlee._utils.docs import docs_group

if TYPE_CHECKING:
from playwright.async_api import APIResponse, Response
from typing_extensions import Self


@docs_group('Functions')
class BlockRequestsFunction(Protocol):
Expand All @@ -22,3 +28,26 @@ async def __call__(
url_patterns: List of URL patterns to block. If None, uses default patterns.
extra_url_patterns: Additional URL patterns to append to the main patterns list.
"""


@dataclass(frozen=True)
class PlaywrightHttpResponse:
    """Wrapper class for playwright `Response` and `APIResponse` objects to implement `HttpResponse` protocol."""

    # Negotiated protocol version; neither playwright type exposes it, so it is
    # supplied by the caller of `from_playwright_response`.
    http_version: str
    status_code: int
    headers: HttpHeaders
    _content: bytes

    def read(self) -> bytes:
        """Return the raw response body."""
        return self._content

    @classmethod
    async def from_playwright_response(cls, response: Response | APIResponse, protocol: str) -> Self:
        """Build an instance from a playwright `Response` or `APIResponse`.

        The HTTP protocol version cannot be read from the response object itself,
        hence the explicit `protocol` argument.
        """
        body = await response.body()
        return cls(
            http_version=protocol,
            status_code=response.status,
            headers=HttpHeaders(response.headers),
            _content=body,
        )
18 changes: 18 additions & 0 deletions src/crawlee/crawlers/_playwright/_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,14 @@
from contextlib import suppress
from typing import TYPE_CHECKING

from ._types import PlaywrightHttpResponse

if TYPE_CHECKING:
from playwright.async_api import Page
from playwright.async_api import Request as PlaywrightRequest

from crawlee._types import HttpHeaders, HttpMethod, SendRequestFunction

_DEFAULT_BLOCK_REQUEST_URL_PATTERNS = [
'.css',
'.webp',
Expand Down Expand Up @@ -108,3 +112,17 @@ async def block_requests(

if specific_files:
await page.route(f'**/{{{",".join(specific_files)}}}*', lambda route, _: route.abort())


def prepare_send_request_function(page: Page) -> SendRequestFunction:
    """Create a `SendRequestFunction` that performs requests through the page's browser context."""

    async def send_request(
        url: str, *, method: HttpMethod = 'GET', headers: HttpHeaders | dict[str, str] | None = None
    ) -> PlaywrightHttpResponse:
        # It is necessary to pass `set_extra_http_headers` passed earlier to `Playwright`
        # TODO: https://github.com/apify/crawlee-python/issues/1055
        plain_headers = dict(headers) if headers else None
        # The fetch runs inside the browser context, so it reuses its cookies and proxies.
        api_response = await page.request.fetch(url_or_request=url, method=method, headers=plain_headers)
        return await PlaywrightHttpResponse.from_playwright_response(response=api_response, protocol='')

    return send_request
45 changes: 45 additions & 0 deletions tests/unit/crawlers/_playwright/test_playwright_crawler.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
)
from crawlee.fingerprint_suite._browserforge_adapter import get_available_header_values
from crawlee.fingerprint_suite._consts import BROWSER_TYPE_HEADER_KEYWORD
from crawlee.http_clients import HttpxHttpClient
from crawlee.proxy_configuration import ProxyConfiguration
from crawlee.sessions import SessionPool
from crawlee.statistics import Statistics
Expand Down Expand Up @@ -563,3 +564,47 @@ async def request_handler(context: PlaywrightCrawlingContext) -> None:
assert crawler.statistics.error_tracker.total == 3 * max_retries
assert crawler.statistics.error_tracker.unique_error_count == 2
assert len(kvs_content) == 4


async def test_send_request(server_url: URL) -> None:
    """Check that `send_request` goes through the browser context and matches the page's user agent.

    With no explicit `http_client`, `send_request` is bound to the Playwright page, so the
    user-agent it sends must be identical to the one the browser itself used.
    """
    check_data: dict[str, Any] = {}

    crawler = PlaywrightCrawler()

    @crawler.router.default_handler
    async def request_handler(context: PlaywrightCrawlingContext) -> None:
        # Capture what the browser itself sent...
        response = await context.response.text()
        check_data['default'] = dict(json.loads(response))
        # ...and what `send_request` sends for the same endpoint.
        send_request_response = await context.send_request(str(server_url / 'user-agent'))
        check_data['send_request'] = dict(json.loads(send_request_response.read()))

    await crawler.run([str(server_url / 'user-agent')])

    assert check_data['default'].get('user-agent') is not None
    assert check_data['send_request'].get('user-agent') is not None

    # Browser navigation and `send_request` share one browser context, so the headers agree.
    assert check_data['default'] == check_data['send_request']


async def test_send_request_with_client(server_url: URL) -> None:
    """Check that an explicit `http_client` takes precedence over the browser-bound `send_request`.

    When the crawler is constructed with an `http_client`, `send_request` must use that
    client's headers (here a fixed user-agent) instead of the browser context's.
    """
    check_data: dict[str, Any] = {}

    crawler = PlaywrightCrawler(
        http_client=HttpxHttpClient(header_generator=None, headers={'user-agent': 'My User-Agent'})
    )

    @crawler.router.default_handler
    async def request_handler(context: PlaywrightCrawlingContext) -> None:
        # Capture what the browser itself sent...
        response = await context.response.text()
        check_data['default'] = dict(json.loads(response))
        # ...and what `send_request` sends via the explicit HTTP client.
        send_request_response = await context.send_request(str(server_url / 'user-agent'))
        check_data['send_request'] = dict(json.loads(send_request_response.read()))

    await crawler.run([str(server_url / 'user-agent')])

    assert check_data['default'].get('user-agent') is not None
    # The explicit client's fixed header must be used, not the browser's.
    assert check_data['send_request']['user-agent'] == 'My User-Agent'

    # The browser and the explicit client send different user agents, so the payloads differ.
    assert check_data['default'] != check_data['send_request']
4 changes: 2 additions & 2 deletions uv.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading