Changes from all commits (30 commits)

c00cfa2  chore: update lxml version (mziv, Oct 13, 2025)
35ea6ad  less restrictive dep update (mziv, Oct 13, 2025)
456f9e3  fix: reprocess cached html with crawler run config (anna-xing, Nov 19, 2025)
4412df1  cleanup (anna-xing, Nov 19, 2025)
1b99071  early return (anna-xing, Nov 19, 2025)
60cf0e3  restructure (anna-xing, Nov 19, 2025)
d6064f3  Merge pull request #1 from CoProcure/anna/sc-31444/postprocess-cached… (anna-xing, Nov 19, 2025)
0bd2915  fix: handle cases when redirected_url is none (ghmeier, Nov 19, 2025)
54132a3  Merge pull request #2 from CoProcure/ghmeier/fix-non-redirect (ghmeier, Nov 19, 2025)
8a847ac  fix: make base directory env variable work (anna-xing, Nov 20, 2025)
1a6fe72  clean up imports (anna-xing, Nov 20, 2025)
6cba694  cleanup (anna-xing, Nov 20, 2025)
62e6f39  Merge pull request #3 from CoProcure/anna/sc-31444/custom-base-dir (anna-xing, Nov 20, 2025)
c4b0bc4  fix: normalize url and make tests runnable (ghmeier, Nov 20, 2025)
064a356  fix: correct url parsing for images and test (ghmeier, Nov 20, 2025)
e2f21c9  chore: a letter (ghmeier, Nov 20, 2025)
8fae6ff  chore: add ruff (ghmeier, Nov 20, 2025)
20c6b18  Merge pull request #4 from CoProcure/ghmeier/fix-base-url (ghmeier, Nov 20, 2025)
4fa609a  chore: update comment about cache_mode default (anna-xing, Nov 21, 2025)
6bd611b  Merge pull request #5 from CoProcure/anna/cache-mode-comment (anna-xing, Nov 21, 2025)
6dfa25f  feat: use CacheClient for caching crawl results (anna-xing, Nov 24, 2025)
c0b66d1  fix circular imports (anna-xing, Nov 24, 2025)
2c650b3  Merge pull request #6 from CoProcure/anna/sc-31491/abstract-cache-client (anna-xing, Nov 24, 2025)
9647f09  chore: update tests for robots parser (anna-xing, Nov 25, 2025)
d96f8b4  further consolidation of test files (anna-xing, Nov 25, 2025)
8bccabe  Merge pull request #7 from CoProcure/anna/robots-parser-caching-test (anna-xing, Nov 25, 2025)
d98bd6d  feat: use CacheClient for URL seeder (anna-xing, Nov 25, 2025)
07301de  Merge pull request #8 from CoProcure/anna/sc-31491/cache-url-seeder (anna-xing, Nov 25, 2025)
5b85912  chore: re-raise run_urls exception (#9) (anna-xing, Dec 3, 2025)
069a910  chore: lower default TTL to 2 hours (#10) (anna-xing, Dec 11, 2025)
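
The caching commits above (#6 abstract-cache-client, #8 cache-url-seeder, #10 lower default TTL) describe moving crawl-result and URL-seeder caching behind a shared cache client with a time-to-live that now defaults to 2 hours. The repository's actual CacheClient is not shown on this page, so the following is only a minimal sketch of that idea; the get/set methods and the in-memory store here are assumptions, not the real implementation.

    # Hypothetical sketch of a TTL-based cache client, as suggested by the
    # commit messages above; not the repository's actual CacheClient.
    import time
    from typing import Any

    DEFAULT_TTL_SECONDS = 2 * 60 * 60  # commits mention a 2-hour default TTL

    class CacheClient:
        """Minimal in-memory cache with per-entry expiry (illustrative only)."""

        def __init__(self, ttl: int = DEFAULT_TTL_SECONDS):
            self.ttl = ttl
            self._store: dict[str, tuple[float, Any]] = {}

        def set(self, key: str, value: Any) -> None:
            # Store the value together with the moment it expires.
            self._store[key] = (time.monotonic() + self.ttl, value)

        def get(self, key: str) -> Any | None:
            entry = self._store.get(key)
            if entry is None:
                return None
            expires_at, value = entry
            if time.monotonic() > expires_at:
                # Entry outlived its TTL; drop it and report a miss.
                del self._store[key]
                return None
            return value

Both the crawl-result path and the URL seeder could then share one such client instead of each managing its own storage, which is the consolidation the commit messages point to.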
55 changes: 55 additions & 0 deletions .ruff.toml
@@ -0,0 +1,55 @@
target-version = "py313"

[lint]
# Specific lint rules can be found at https://docs.astral.sh/ruff/rules
select = [
# Pyflakes
"F",
# isort
"I",
"T20",
# ruff's default subset of codestyle rules that don't overlap with formatting.
"E4",
"E7",
"E9",
"PERF",
"SLF",
"NPY201",
# Enforce consistent rules when using "from __future__ import annotations"
"FA",
"UP",
# Enforce consistent, common imports, like `import pandas as pd` along with custom
# imports like `import brochure.models as app`.
"ICN",
# Enforce consistent return statements.
"RET"
]

[lint.per-file-ignores]
# Ignore unused imports and import * for init files
"__init__.py" = ["F401", "F403"]
# Ignore print statements for commands and private access
"brochure/management/commands/*" = ["T20", "SLF"]
"settings.py" = ["T20"]
# Ignore perf linting in tests.
"tests/*" = ["PERF", "SLF"]

[lint.extend-per-file-ignores]
"brochure/models/__init__.py" = ["I"]
"brochure/migrations/*" = ["I"]

[lint.isort]
known-first-party = ["brochure"]
known-third-party = ["newrelic"]
section-order = ["future","pytest","standard-library","django","third-party","first-party","local-folder"]

[lint.isort.sections]
"django" = ["django"]
"pytest" = ["pytest"]

[lint.flake8-self]
# Ignore a few accesses of private django internals.
ignore-names = ["_prefetched_objects_cache", "_meta"]

[lint.flake8-import-conventions.extend-aliases]
"brochure.models" = "app"
162 changes: 81 additions & 81 deletions crawl4ai/__init__.py
@@ -1,110 +1,110 @@
# __init__.py
import warnings

from .async_webcrawler import AsyncWebCrawler, CacheMode
# MODIFIED: Add SeedingConfig and VirtualScrollConfig here
from .async_configs import BrowserConfig, CrawlerRunConfig, HTTPCrawlerConfig, LLMConfig, ProxyConfig, GeolocationConfig, SeedingConfig, VirtualScrollConfig, LinkPreviewConfig, MatchMode
# Adaptive Crawler
from .adaptive_crawler import (
AdaptiveConfig,
AdaptiveCrawler,
CrawlState,
CrawlStrategy,
StatisticalStrategy,
)

from .content_scraping_strategy import (
ContentScrapingStrategy,
LXMLWebScrapingStrategy,
WebScrapingStrategy, # Backward compatibility alias
# MODIFIED: Add SeedingConfig and VirtualScrollConfig here
from .async_configs import (
BrowserConfig,
CrawlerRunConfig,
GeolocationConfig,
HTTPCrawlerConfig,
LinkPreviewConfig,
LLMConfig,
MatchMode,
ProxyConfig,
SeedingConfig,
VirtualScrollConfig,
)
from .async_dispatcher import (
BaseDispatcher,
MemoryAdaptiveDispatcher,
RateLimiter,
SemaphoreDispatcher,
)
from .async_logger import (
AsyncLoggerBase,
AsyncLogger,
AsyncLoggerBase,
)
from .proxy_strategy import (
ProxyRotationStrategy,
RoundRobinProxyStrategy,
)
from .extraction_strategy import (
ExtractionStrategy,
LLMExtractionStrategy,
CosineStrategy,
JsonCssExtractionStrategy,
JsonXPathExtractionStrategy,
JsonLxmlExtractionStrategy,
RegexExtractionStrategy
)

# NEW: Import AsyncUrlSeeder
from .async_url_seeder import AsyncUrlSeeder
from .async_webcrawler import AsyncWebCrawler, CacheMode

# Browser Adapters
from .browser_adapter import BrowserAdapter, PlaywrightAdapter, UndetectedAdapter
from .browser_profiler import BrowserProfiler
from .chunking_strategy import ChunkingStrategy, RegexChunking
from .markdown_generation_strategy import DefaultMarkdownGenerator
from .table_extraction import (
TableExtractionStrategy,
DefaultTableExtraction,
NoTableExtraction,
LLMTableExtraction,
)
from .components.crawler_monitor import CrawlerMonitor
from .content_filter_strategy import (
PruningContentFilter,
BM25ContentFilter,
LLMContentFilter,
PruningContentFilter,
RelevantContentFilter,
)
from .models import CrawlResult, MarkdownGenerationResult, DisplayMode
from .components.crawler_monitor import CrawlerMonitor
from .link_preview import LinkPreview
from .async_dispatcher import (
MemoryAdaptiveDispatcher,
SemaphoreDispatcher,
RateLimiter,
BaseDispatcher,
from .content_scraping_strategy import (
ContentScrapingStrategy,
LXMLWebScrapingStrategy,
WebScrapingStrategy, # Backward compatibility alias
)
from .docker_client import Crawl4aiDockerClient
from .hub import CrawlerHub
from .browser_profiler import BrowserProfiler
from .deep_crawling import (
DeepCrawlStrategy,
BestFirstCrawlingStrategy,
BFSDeepCrawlStrategy,
FilterChain,
URLPatternFilter,
DomainFilter,
ContentTypeFilter,
URLFilter,
FilterStats,
SEOFilter,
KeywordRelevanceScorer,
URLScorer,
CompositeScorer,
ContentTypeFilter,
DeepCrawlDecorator,
DeepCrawlStrategy,
DFSDeepCrawlStrategy,
DomainAuthorityScorer,
DomainFilter,
FilterChain,
FilterStats,
FreshnessScorer,
KeywordRelevanceScorer,
PathDepthScorer,
BestFirstCrawlingStrategy,
DFSDeepCrawlStrategy,
DeepCrawlDecorator,
)
# NEW: Import AsyncUrlSeeder
from .async_url_seeder import AsyncUrlSeeder
# Adaptive Crawler
from .adaptive_crawler import (
AdaptiveCrawler,
AdaptiveConfig,
CrawlState,
CrawlStrategy,
StatisticalStrategy
SEOFilter,
URLFilter,
URLPatternFilter,
URLScorer,
)

# C4A Script Language Support
from .script import (
compile as c4a_compile,
validate as c4a_validate,
compile_file as c4a_compile_file,
CompilationResult,
ValidationResult,
ErrorDetail
from .docker_client import Crawl4aiDockerClient
from .extraction_strategy import (
CosineStrategy,
ExtractionStrategy,
JsonCssExtractionStrategy,
JsonLxmlExtractionStrategy,
JsonXPathExtractionStrategy,
LLMExtractionStrategy,
RegexExtractionStrategy,
)

# Browser Adapters
from .browser_adapter import (
BrowserAdapter,
PlaywrightAdapter,
UndetectedAdapter
from .hub import CrawlerHub
from .link_preview import LinkPreview
from .markdown_generation_strategy import DefaultMarkdownGenerator
from .models import CrawlResult, DisplayMode, MarkdownGenerationResult
from .proxy_strategy import (
ProxyRotationStrategy,
RoundRobinProxyStrategy,
)
from .script import CompilationResult, ErrorDetail, ValidationResult

from .utils import (
start_colab_display_server,
setup_colab_environment
# C4A Script Language Support
from .script import compile as c4a_compile
from .script import compile_file as c4a_compile_file
from .script import validate as c4a_validate
from .table_extraction import (
DefaultTableExtraction,
LLMTableExtraction,
NoTableExtraction,
TableExtractionStrategy,
)
from .utils import setup_colab_environment, start_colab_display_server

__all__ = [
"AsyncLoggerBase",
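
The __init__.py change above is an import reordering driven by the new .ruff.toml isort rules, so the same public names (AsyncWebCrawler, CrawlerRunConfig, CacheMode, and the rest) remain importable from the package root. As a hedged sketch of how those names are typically used, the snippet below wires CrawlerRunConfig and CacheMode into an AsyncWebCrawler run; the arun call, the result fields, and the example URL are not shown in this diff and should be read as assumptions.

    # Minimal usage sketch, assuming the crawl4ai public API named in the diff above.
    import asyncio

    from crawl4ai import AsyncWebCrawler, CacheMode, CrawlerRunConfig

    async def main() -> None:
        # cache_mode controls whether cached crawl results are reused; the caching
        # commits in this PR route those reads and writes through a cache client.
        config = CrawlerRunConfig(cache_mode=CacheMode.ENABLED)
        async with AsyncWebCrawler() as crawler:
            result = await crawler.arun(url="https://example.com", config=config)
            if result.success:
                print(result.markdown)  # markdown generated from the crawled page

    if __name__ == "__main__":
        asyncio.run(main())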