Skip to content

Commit

Permalink
feat: add playwright fingerprint injector [WIP]
Browse files Browse the repository at this point in the history
Closes: #401
  • Loading branch information
vdusek committed Sep 17, 2024
1 parent 3c3dfe8 commit ccd37e6
Show file tree
Hide file tree
Showing 12 changed files with 155 additions and 18 deletions.
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,7 @@ sortedcollections = ">=2.1.0"
tldextract = ">=5.1.0"
typer = ">=0.12.0"
typing-extensions = ">=4.1.0"
playwright-stealth = "^1.0.6"

[tool.poetry.group.dev.dependencies]
build = "~1.2.0"
Expand Down
6 changes: 6 additions & 0 deletions src/crawlee/browsers/_base_browser_controller.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@

from playwright.async_api import Page

from crawlee.browsers._types import BrowserType
from crawlee.proxy_configuration import ProxyInfo


Expand Down Expand Up @@ -50,6 +51,11 @@ def has_free_capacity(self) -> bool:
def is_browser_connected(self) -> bool:
    """Return whether the browser is currently connected."""

@property
@abstractmethod
def browser_type(self) -> BrowserType:
    """Return the type of the browser, as one of the `BrowserType` literal values."""

@abstractmethod
async def new_page(
self,
Expand Down
6 changes: 3 additions & 3 deletions src/crawlee/browsers/_base_browser_plugin.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,15 +3,15 @@
from __future__ import annotations

from abc import ABC, abstractmethod
from typing import TYPE_CHECKING, Any, Literal
from typing import TYPE_CHECKING, Any

if TYPE_CHECKING:
from collections.abc import Mapping
from types import TracebackType

from crawlee._types import BrowserType
from crawlee.browsers._base_browser_controller import BaseBrowserController


class BaseBrowserPlugin(ABC):
"""An abstract base class for browser plugins.
Expand All @@ -24,7 +24,7 @@ class BaseBrowserPlugin(ABC):

@property
@abstractmethod
def browser_type(self) -> Literal['chromium', 'firefox', 'webkit']:
def browser_type(self) -> BrowserType:
"""Return the browser type name."""

@property
Expand Down
6 changes: 3 additions & 3 deletions src/crawlee/browsers/_browser_pool.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,14 +7,14 @@
from collections import defaultdict
from datetime import timedelta
from logging import getLogger
from typing import TYPE_CHECKING, Any, Literal
from typing import TYPE_CHECKING, Any
from weakref import WeakValueDictionary

from crawlee._utils.crypto import crypto_random_object_id
from crawlee._utils.recurring_task import RecurringTask
from crawlee.browsers._base_browser_controller import BaseBrowserController
from crawlee.browsers._playwright_browser_plugin import PlaywrightBrowserPlugin
from crawlee.browsers._types import CrawleePage
from crawlee.browsers._types import BrowserType, CrawleePage

if TYPE_CHECKING:
from collections.abc import Mapping, Sequence
Expand Down Expand Up @@ -94,7 +94,7 @@ def with_default_plugin(
cls,
*,
headless: bool | None = None,
browser_type: Literal['chromium', 'firefox', 'webkit'] | None = None,
browser_type: BrowserType | None = None,
**kwargs: Any,
) -> BrowserPool:
"""Create a new instance with a single `BaseBrowserPlugin` configured with the provided options.
Expand Down
33 changes: 31 additions & 2 deletions src/crawlee/browsers/_playwright_browser_controller.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,14 @@
from __future__ import annotations

from datetime import datetime, timedelta, timezone
from typing import TYPE_CHECKING, Any
from typing import TYPE_CHECKING, Any, cast

from playwright.async_api import Page
from typing_extensions import override

from crawlee.browsers._base_browser_controller import BaseBrowserController
from crawlee.browsers._types import BrowserType
from crawlee.fingerprint_suite import FingerprintInjector

if TYPE_CHECKING:
from collections.abc import Mapping
Expand All @@ -26,16 +28,25 @@ class PlaywrightBrowserController(BaseBrowserController):
"""

AUTOMATION_LIBRARY = 'playwright'
_DEFAULT_FINGERPRINT_INJECTOR = FingerprintInjector()

def __init__(self, browser: Browser, *, max_open_pages_per_browser: int = 20) -> None:
def __init__(
self,
browser: Browser,
*,
max_open_pages_per_browser: int = 20,
fingerprint_injector: FingerprintInjector = _DEFAULT_FINGERPRINT_INJECTOR,
) -> None:
"""Create a new instance.
Args:
browser: The browser instance to control.
max_open_pages_per_browser: The maximum number of pages that can be open at the same time.
fingerprint_injector: The fingerprint injector to use for modifying HTTP headers.
"""
self._browser = browser
self._max_open_pages_per_browser = max_open_pages_per_browser
self._fingerprint_injector = fingerprint_injector

self._pages = list[Page]()
self._last_page_opened_at = datetime.now(timezone.utc)
Expand Down Expand Up @@ -70,6 +81,11 @@ def has_free_capacity(self) -> bool:
def is_browser_connected(self) -> bool:
    # The connection state is tracked by the wrapped browser object itself.
    connected = self._browser.is_connected()
    return connected

@property
@override
def browser_type(self) -> BrowserType:
    # `cast` performs no runtime validation; it only narrows the reported
    # name to the `BrowserType` literal for static type checkers.
    reported_name = self._browser.browser_type.name
    return cast(BrowserType, reported_name)

@override
async def new_page(
self,
Expand All @@ -91,6 +107,19 @@ async def new_page(

page = await self._browser.new_page(**page_options)

# Modify HTTP headers to look like a real browser.
await page.route(
url='**/*',
handler=lambda route, request: self._fingerprint_injector.inject_fingerprint(
route=route,
request=request,
browser_type=self.browser_type,
),
)

# TODO: maybe?
# await page.set_extra_http_headers()

# Handle page close event
page.on(event='close', f=self._on_page_close)

Expand Down
8 changes: 5 additions & 3 deletions src/crawlee/browsers/_playwright_browser_plugin.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
from __future__ import annotations

from logging import getLogger
from typing import TYPE_CHECKING, Any, Literal
from typing import TYPE_CHECKING, Any

from playwright.async_api import Playwright, async_playwright
from typing_extensions import override
Expand All @@ -15,6 +15,8 @@
from collections.abc import Mapping
from types import TracebackType

from crawlee.browsers._types import BrowserType

logger = getLogger(__name__)


Expand All @@ -29,7 +31,7 @@ class PlaywrightBrowserPlugin(BaseBrowserPlugin):
def __init__(
self,
*,
browser_type: Literal['chromium', 'firefox', 'webkit'] = 'chromium',
browser_type: BrowserType = 'chromium',
browser_options: Mapping[str, Any] | None = None,
page_options: Mapping[str, Any] | None = None,
max_open_pages_per_browser: int = 20,
Expand All @@ -53,7 +55,7 @@ def __init__(

@property
@override
def browser_type(self) -> Literal['chromium', 'firefox', 'webkit']:
def browser_type(self) -> BrowserType:
return self._browser_type

@property
Expand Down
3 changes: 2 additions & 1 deletion src/crawlee/browsers/_types.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,11 +6,12 @@
if TYPE_CHECKING:
from playwright.async_api import Page

BrowserType = Literal['chromium', 'firefox', 'webkit']

@dataclass
class CrawleePage:
"""Represents a page object within a browser, with additional metadata for tracking and management."""

id: str
browser_type: Literal['chromium', 'firefox', 'webkit']
browser_type: BrowserType
page: Page
1 change: 1 addition & 0 deletions src/crawlee/fingerprint_suite/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1,2 @@
from ._fingerprint_injector import FingerprintInjector
from ._header_generator import HeaderGenerator
12 changes: 12 additions & 0 deletions src/crawlee/fingerprint_suite/_consts.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,21 @@
# ruff: noqa: E501

# TODO:
# See https://www.useragents.me/ for getting agents?

COMMON_ACCEPT = 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7'

COMMON_ACCEPT_LANGUAGE = 'en-US,en;q=0.9'

# Playwright default headers (user-agents and sec-ch) for headless browsers.
# The Sec-Ch-Ua* values are defined for Chromium only; Firefox and WebKit get just a User-Agent.
PW_CHROMIUM_HEADLESS_DEFAULT_USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36'
PW_CHROMIUM_HEADLESS_DEFAULT_SEC_CH_UA = '"Not=A?Brand";v="8", "Chromium";v="124", "Google Chrome";v="124"'
PW_CHROMIUM_HEADLESS_DEFAULT_SEC_CH_UA_MOBILE = '?0'
PW_CHROMIUM_HEADLESS_DEFAULT_SEC_CH_UA_PLATFORM = '"macOS"'

# The Gecko revision token must be written as `rv:<version>` (with a colon), as in real Firefox user agents.
PW_FIREFOX_HEADLESS_DEFAULT_USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:125.0) Gecko/20100101 Firefox/125.0'
PW_WEBKIT_HEADLESS_DEFAULT_USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.0 Safari/605.1.15'

# Random 1000 user agents from Apify fingerprint dataset.
USER_AGENT_POOL = [
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36',
Expand Down
74 changes: 74 additions & 0 deletions src/crawlee/fingerprint_suite/_fingerprint_injector.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
from __future__ import annotations

from typing import TYPE_CHECKING

from crawlee.fingerprint_suite._consts import (
PW_CHROMIUM_HEADLESS_DEFAULT_SEC_CH_UA,
PW_CHROMIUM_HEADLESS_DEFAULT_SEC_CH_UA_MOBILE,
PW_CHROMIUM_HEADLESS_DEFAULT_SEC_CH_UA_PLATFORM,
PW_CHROMIUM_HEADLESS_DEFAULT_USER_AGENT,
PW_FIREFOX_HEADLESS_DEFAULT_USER_AGENT,
PW_WEBKIT_HEADLESS_DEFAULT_USER_AGENT,
)
from crawlee.fingerprint_suite._header_generator import HeaderGenerator

if TYPE_CHECKING:
from collections.abc import Mapping

from playwright.async_api import Request, Route

from crawlee.browsers._types import BrowserType


class FingerprintInjector:
    """Injects a fingerprint into the headers of intercepted Playwright requests.

    The injected headers consist of the common headers produced by the header generator (without a random
    user agent) and a fixed set of user-agent headers selected by browser type.
    """

    # Shared default instance, used when the caller does not supply a generator.
    _DEFAULT_HEADER_GENERATOR = HeaderGenerator()

    def __init__(
        self,
        header_generator: HeaderGenerator = _DEFAULT_HEADER_GENERATOR,
    ) -> None:
        """Create a new instance.

        Args:
            header_generator: The generator used to obtain the common headers.
        """
        self._header_generator = header_generator

    async def inject_fingerprint(
        self,
        route: Route,
        request: Request,
        browser_type: BrowserType,
    ) -> None:
        """Continue the intercepted request with the fingerprint headers merged in.

        Precedence, from lowest to highest: the request's own headers, the common headers, and the
        browser-specific user-agent headers.

        Args:
            route: The intercepted route, continued with the modified headers.
            request: The intercepted request whose headers are used as the base.
            browser_type: The browser type used to select the user-agent headers.

        Raises:
            ValueError: If `browser_type` is not a supported browser type.
        """
        common_headers = self._header_generator.get_common_headers(include_random_user_agent=False)
        user_agent_headers = self._get_user_agent_headers(browser_type=browser_type)

        headers = self._merge_headers(request.headers, common_headers, user_agent_headers)

        # Continue the request with modified headers
        await route.continue_(headers=headers)

    @staticmethod
    def _merge_headers(*header_sets: Mapping[str, str]) -> dict[str, str]:
        """Merge header mappings case-insensitively; later mappings override earlier ones.

        Playwright reports request header names lower-cased, while the injected headers use title case.
        A plain dict merge would therefore keep both 'user-agent' and 'User-Agent' instead of overriding.
        The name spelling of the last mapping that defines a header is the one that is kept.
        """
        merged: dict[str, tuple[str, str]] = {}

        for header_set in header_sets:
            for name, value in header_set.items():
                merged[name.lower()] = (name, value)

        return dict(merged.values())

    def _get_user_agent_headers(
        self,
        browser_type: BrowserType,
    ) -> Mapping[str, str]:
        """Return the user-agent related headers for the given browser type.

        Args:
            browser_type: The browser type to get the headers for.

        Returns:
            For 'chromium', the User-Agent plus the Sec-Ch-Ua* headers; for 'firefox' and 'webkit',
            the User-Agent only.

        Raises:
            ValueError: If `browser_type` is not one of 'chromium', 'firefox' or 'webkit'.
        """
        headers: dict[str, str] = {}

        if browser_type == 'chromium':
            headers['User-Agent'] = PW_CHROMIUM_HEADLESS_DEFAULT_USER_AGENT
            headers['Sec-Ch-Ua'] = PW_CHROMIUM_HEADLESS_DEFAULT_SEC_CH_UA
            headers['Sec-Ch-Ua-Mobile'] = PW_CHROMIUM_HEADLESS_DEFAULT_SEC_CH_UA_MOBILE
            headers['Sec-Ch-Ua-Platform'] = PW_CHROMIUM_HEADLESS_DEFAULT_SEC_CH_UA_PLATFORM

        elif browser_type == 'firefox':
            headers['User-Agent'] = PW_FIREFOX_HEADLESS_DEFAULT_USER_AGENT

        elif browser_type == 'webkit':
            headers['User-Agent'] = PW_WEBKIT_HEADLESS_DEFAULT_USER_AGENT

        else:
            raise ValueError(f'Unsupported browser type: {browser_type}')

        return headers
17 changes: 14 additions & 3 deletions src/crawlee/fingerprint_suite/_header_generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,17 +12,28 @@
class HeaderGenerator:
"""Generates common headers for HTTP requests."""

def get_common_headers(self) -> Mapping[str, str]:
def get_common_headers(
self,
*,
include_random_user_agent: bool = True,
) -> Mapping[str, str]:
"""Get common headers for HTTP requests.
We do not modify the 'Accept-Encoding', 'Connection' and other headers. They should be included and handled
by the HTTP client.
Args:
include_random_user_agent: Whether to include a random User-Agent header.
Returns:
Dictionary containing common headers.
"""
return {
headers = {
'Accept': COMMON_ACCEPT,
'Accept-Language': COMMON_ACCEPT_LANGUAGE,
'User-Agent': random.choice(USER_AGENT_POOL),
}

if include_random_user_agent:
headers['User-Agent'] = random.choice(USER_AGENT_POOL)

return headers
6 changes: 3 additions & 3 deletions src/crawlee/playwright_crawler/_playwright_crawler.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from __future__ import annotations

import logging
from typing import TYPE_CHECKING, Literal
from typing import TYPE_CHECKING

from pydantic import ValidationError
from typing_extensions import Unpack
Expand All @@ -20,7 +20,7 @@
from collections.abc import AsyncGenerator

from crawlee._types import AddRequestsKwargs, BasicCrawlingContext

from crawlee.browsers._types import BrowserType

class PlaywrightCrawler(BasicCrawler[PlaywrightCrawlingContext]):
"""A crawler that leverages the [Playwright](https://playwright.dev/python/) browser automation library.
Expand All @@ -45,7 +45,7 @@ class PlaywrightCrawler(BasicCrawler[PlaywrightCrawlingContext]):
def __init__(
self,
browser_pool: BrowserPool | None = None,
browser_type: Literal['chromium', 'firefox', 'webkit'] | None = None,
browser_type: BrowserType | None = None,
headless: bool | None = None,
**kwargs: Unpack[BasicCrawlerOptions[PlaywrightCrawlingContext]],
) -> None:
Expand Down

0 comments on commit ccd37e6

Please sign in to comment.