DownloadingTools(byparr_enabled: Optional[bool] = None, max_retries: int = URL_DOWNLOADER_MAX_RETRIES, timeout: int = URL_DOWNLOADER_TIMEOUT, user_agent_rotation: bool = True, enable_caching: bool = False, add_instructions: bool = True, **kwargs)
Bases: StrictToolkit
URL Content Downloader Tool v1.1
A production-ready universal file downloading toolkit with BYPARR integration, anti-bot bypass capabilities, and smart content processing for any file type.
Parameters:
| Name | Type | Description | Default |
| --- | --- | --- | --- |
| byparr_enabled | Optional[bool] | Whether to use BYPARR service (None = auto-detect) | None |
| max_retries | int | Maximum number of retry attempts | URL_DOWNLOADER_MAX_RETRIES |
| timeout | int | Request timeout in seconds | URL_DOWNLOADER_TIMEOUT |
| user_agent_rotation | bool | Whether to rotate user agents | True |
| enable_caching | bool | Whether to cache downloaded content | False |
| add_instructions | bool | Whether to add LLM usage instructions | True |
Source code in src/enhancedtoolkits/downloading.py
def __init__(  # pylint: disable=too-many-arguments,too-many-positional-arguments
    self,
    byparr_enabled: Optional[bool] = None,
    max_retries: int = URL_DOWNLOADER_MAX_RETRIES,
    timeout: int = URL_DOWNLOADER_TIMEOUT,
    user_agent_rotation: bool = True,
    enable_caching: bool = False,
    add_instructions: bool = True,
    **kwargs,
):
    """
    Initialize URL Content Downloader.

    Args:
        byparr_enabled: Whether to use BYPARR service (None = auto-detect)
        max_retries: Maximum number of retry attempts (clamped to 1..10)
        timeout: Request timeout in seconds (clamped to 5..300)
        user_agent_rotation: Whether to rotate user agents
        enable_caching: Whether to cache downloaded content
        add_instructions: Whether to add LLM usage instructions
    """
    # Clamp configuration values into sane operational ranges.
    self.max_retries = min(10, max(1, max_retries))
    self.timeout = min(300, max(5, timeout))
    self.user_agent_rotation = user_agent_rotation
    self.enable_caching = enable_caching
    self.add_instructions = add_instructions
    self.instructions = DownloadingTools.get_llm_usage_instructions()

    super().__init__(
        name="url_content_downloader",
        instructions=self.instructions,
        add_instructions=self.add_instructions,
        **kwargs,
    )

    # BYPARR configuration: an explicit flag wins; otherwise use the
    # environment-derived default.
    self.byparr_enabled = BYPARR_ENABLED if byparr_enabled is None else byparr_enabled

    # Shared HTTP client used by all download helpers.
    self.client = httpx.Client(
        timeout=httpx.Timeout(self.timeout),
        follow_redirects=True,
        headers=self._get_default_headers(),
    )

    # Simple cache for downloaded content. Always a dict to keep
    # typing/simple usage straightforward; guarded by ``self.enable_caching``.
    self.content_cache: Dict[str, str] = {}

    # Expose the public tool methods to the toolkit registry.
    for tool_method in (
        self.access_website_content,
        self.get_file_from_url,
        self.download_multiple_urls,
        self.get_url_metadata,
        self.check_url_accessibility,
    ):
        self.register(tool_method)

    log_info(
        f"URL Content Downloader initialized - BYPARR: {self.byparr_enabled}, "
        f"Max Retries: {self.max_retries}, Timeout: {self.timeout}s"
    )
|
access_website_content
access_website_content(url: str, output: str = 'auto') -> str
Access, download and parse Website content using URL with anti-bot bypass. Automatically detects content type and applies appropriate processing.
Parameters:
| Name | Type | Description | Default |
url | str | URL to download content from | required |
output | str | Output format ("auto", "markdown", "text", "html", or "binary") | 'auto' |
Returns:
| Type | Description |
str | Parsed content in the specified format |
Raises:
| Type | Description |
| --- | --- |
| URLDownloadError | If download fails |
| ContentParsingError | If content parsing fails |
Source code in src/enhancedtoolkits/downloading.py
def access_website_content(self, url: str, output: str = "auto") -> str:
    """
    Access, download and parse website content using a URL with anti-bot bypass.

    Content type is detected automatically and the appropriate processing is
    applied; this method is a thin alias for :meth:`get_file_from_url`.

    Args:
        url: URL to download content from
        output: Output format ("auto", "markdown", "text", "html", or "binary")

    Returns:
        Parsed content in the specified format

    Raises:
        URLDownloadError: If download fails
        ContentParsingError: If content parsing fails
    """
    # Both entry points share one implementation; delegate unchanged.
    return self.get_file_from_url(url, output)
|
get_file_from_url(url: str, output: str = 'auto') -> str
Download any file from a URL with smart content processing. Uses MarkItDown for HTML content, handles binary files appropriately.
Parameters:
| Name | Type | Description | Default |
url | str | URL to download file from | required |
output | str | Output format ("auto", "markdown", "text", "html", or "binary") | 'auto' |
Returns:
| Type | Description |
str | Processed content or file information |
Raises:
| Type | Description |
| --- | --- |
| URLDownloadError | If download fails |
| ContentParsingError | If content parsing fails |
Source code in src/enhancedtoolkits/downloading.py
def get_file_from_url(self, url: str, output: str = "auto") -> str:
    """
    Download any file from a URL with smart content processing.

    Uses MarkItDown for HTML content, handles binary files appropriately.

    Args:
        url: URL to download file from
        output: Output format ("auto", "markdown", "text", "html", or "binary")

    Returns:
        Processed content or file information

    Raises:
        URLDownloadError: If download fails
        ContentParsingError: If content parsing fails
    """
    try:
        # Normalize and validate inputs up front.
        safe_url = self._validate_url(url)
        fmt = self._validate_format(output)
        log_debug(f"Downloading file from: {safe_url}")

        # Serve from cache when enabled and the entry exists.
        cache_key = f"{safe_url}:{fmt}"
        if self.enable_caching and cache_key in self.content_cache:
            log_debug(f"Using cached content for: {safe_url}")
            return self.content_cache[cache_key]

        payload = None
        mime = None

        # Try BYPARR first if enabled; any failure falls through to the
        # direct fetch below.
        if self.byparr_enabled:
            try:
                byparr_result = self._fetch_content_with_byparr(safe_url)
                if byparr_result:
                    payload = byparr_result
                    # BYPARR typically returns HTML.
                    mime = "text/html"
                    log_info(
                        f"Successfully fetched content via BYPARR: {safe_url}"
                    )
            except (httpx.HTTPError, ValueError, KeyError, TypeError) as exc:
                log_warning(
                    f"BYPARR failed for {url}: {exc}; falling back to direct fetch"
                )

        # Fallback to direct fetch with anti-bot bypass when BYPARR was
        # disabled, failed, or returned nothing.
        if not payload:
            payload, mime = self._fetch_file_with_antibot(safe_url)
            log_info(
                f"Successfully fetched file via direct fetch: {safe_url}"
            )

        # Convert the raw payload into the requested output format.
        processed = self._process_file_content(
            payload,
            mime or "application/octet-stream",
            fmt,
            safe_url,
        )

        # Cache the processed content for future calls.
        if self.enable_caching:
            self.content_cache[cache_key] = processed

        log_info(
            f"File download completed: {len(str(processed))} characters/bytes"
        )
        return processed
    except (URLDownloadError, ContentParsingError):
        raise
    except Exception as exc:  # pylint: disable=broad-exception-caught
        # This is a toolkit boundary: wrap any unexpected failure in a
        # stable, user-facing error type.
        log_error(f"Unexpected error downloading {url}: {exc}")
        raise URLDownloadError(
            f"Failed to download file from {url}: {exc}"
        ) from exc
|
download_multiple_urls(urls: List[str], output: str = 'auto', **kwargs) -> str
Download content from multiple URLs.
Parameters:
| Name | Type | Description | Default |
urls | List[str] | List of URLs to download. | required |
output | str | Output format for all URLs. | 'auto' |
**kwargs | | Backwards-compatibility for older callers. - format: Alias for output. | {} |
Returns:
| Type | Description |
str | JSON string containing results for all URLs. |
Source code in src/enhancedtoolkits/downloading.py
def download_multiple_urls(
    self, urls: List[str], output: str = "auto", **kwargs
) -> str:
    """Download content from multiple URLs.

    Args:
        urls: List of URLs to download.
        output: Output format for all URLs.
        **kwargs: Backwards-compatibility for older callers.
            - format: Alias for ``output``.

    Returns:
        JSON string containing results for all URLs.
    """
    if not urls:
        raise URLDownloadError("URL list cannot be empty")

    # Cap batch size so a single call stays bounded.
    if len(urls) > 10:
        log_warning(
            f"Large URL list ({len(urls)} URLs), limiting to first 10"
        )
        urls = urls[:10]

    # Legacy alias: an explicit non-None ``format`` overrides ``output``.
    legacy_format = kwargs.get("format")
    if legacy_format is not None:
        output = legacy_format

    results = []
    total = len(urls)
    for position, target in enumerate(urls, start=1):
        try:
            body = self.access_website_content(target, output)
        except (URLDownloadError, ContentParsingError) as exc:
            results.append(
                {
                    "url": target,
                    "success": False,
                    "content": None,
                    "error": str(exc),
                }
            )
            log_warning(
                f"Failed to download {position}/{total}: {target} - {exc}"
            )
        else:
            results.append(
                {
                    "url": target,
                    "success": True,
                    "content": body,
                    "error": None,
                }
            )
            log_debug(
                f"Successfully downloaded {position}/{total}: {target}"
            )
    return json.dumps(results, indent=2, ensure_ascii=False)
|
get_url_metadata(url: str) -> str
Extract metadata from a URL without downloading full content.
Parameters:
| Name | Type | Description | Default |
url | str | URL to extract metadata from | required |
Returns:
| Type | Description |
str | JSON string containing URL metadata |
Source code in src/enhancedtoolkits/downloading.py
def get_url_metadata(self, url: str) -> str:
    """
    Extract metadata from a URL without downloading full content.

    Issues a HEAD request only, so no response body is transferred.

    Args:
        url: URL to extract metadata from

    Returns:
        JSON string containing URL metadata; on any failure the JSON
        reports ``accessible: false`` plus the error message instead.
    """
    try:
        safe_url = self._validate_url(url)
        # HEAD request fetches headers only.
        response = self.client.head(safe_url)
        response.raise_for_status()
        headers = response.headers
        metadata = {
            "url": safe_url,
            "status_code": response.status_code,
            "content_type": headers.get("content-type", ""),
            "content_length": headers.get("content-length"),
            "last_modified": headers.get("last-modified"),
            "server": headers.get("server", ""),
            "accessible": True,
        }
        return json.dumps(metadata, indent=2)
    except Exception as exc:  # pylint: disable=broad-exception-caught
        # Boundary method: report failures as data rather than raising.
        log_error(f"Error getting metadata for {url}: {exc}")
        return json.dumps(
            {"url": url, "accessible": False, "error": str(exc)},
            indent=2,
        )
|
check_url_accessibility(url: str) -> str
Check if a URL is accessible without downloading content.
Parameters:
| Name | Type | Description | Default |
| --- | --- | --- | --- |
| url | str | URL to check | required |
Returns:
| Type | Description |
str | JSON string with accessibility status |
Source code in src/enhancedtoolkits/downloading.py
def check_url_accessibility(self, url: str) -> str:
    """
    Check if a URL is accessible without downloading content.

    Args:
        url: URL to check

    Returns:
        JSON string with accessibility status
    """
    try:
        safe_url = self._validate_url(url)
        # HEAD request keeps the probe lightweight (no body transfer).
        response = self.client.head(safe_url)
        response.raise_for_status()
        elapsed_ms = int(response.elapsed.total_seconds() * 1000)
        result = {
            "url": safe_url,
            "accessible": True,
            "status_code": response.status_code,
            "response_time_ms": elapsed_ms,
        }
    except Exception as exc:  # pylint: disable=broad-exception-caught
        # Any failure (validation, DNS, HTTP status) is reported as data.
        result = {
            "url": url,
            "accessible": False,
            "error": str(exc),
            "response_time_ms": None,
        }
    return json.dumps(result, indent=2)
|
get_llm_usage_instructions() -> str
Return precise, structured instructions for LLM tool calling.
Source code in src/enhancedtoolkits/downloading.py
@staticmethod
def get_llm_usage_instructions() -> str:
    """Return precise, structured instructions for LLM tool calling."""
    # NOTE: this string is handed verbatim to the LLM at toolkit init; keep
    # the tags and tool names in sync with the registered methods.
    instructions = """
<content_downloader_tools_instructions>
URL content downloader (extract text/markdown from HTML/docs; else return minimal metadata)
GOAL
- Given a URL, fetch content safely and return either readable text/markdown or minimal metadata.
OUTPUT
- Always returns a string.
- output=text/markdown/html: extracted content.
- output=binary: JSON metadata; for PDFs/office docs it may return extracted markdown.
TOOLS
- get_file_from_url(url, output="auto")     # default
- access_website_content(url, output="auto") # alias
- download_multiple_urls(urls, output="auto") # max 10
- get_url_metadata(url)                      # HEAD only
- check_url_accessibility(url)               # HEAD + timing
OUTPUT OPTIONS
- auto | markdown | text | html | binary
CONTEXT-SIZE RULES (IMPORTANT)
- Prefer get_url_metadata() first when unsure (avoid downloading huge/binary files).
- Prefer output="text" for summarization; use markdown only if formatting matters.
- Do NOT paste full extracted pages/PDFs into the final answer; summarize + quote short excerpts.
ERRORS
- URLDownloadError (validation/fetch)
- ContentParsingError (processing)
</content_downloader_tools_instructions>
"""
    return instructions
|