Downloader Tools

enhancedtoolkits.downloading.DownloadingTools

DownloadingTools(byparr_enabled: Optional[bool] = None, max_retries: int = URL_DOWNLOADER_MAX_RETRIES, timeout: int = URL_DOWNLOADER_TIMEOUT, user_agent_rotation: bool = True, enable_caching: bool = False, add_instructions: bool = True, **kwargs)

Bases: StrictToolkit

URL Content Downloader Tool v1.1

A production-ready universal file downloading toolkit with BYPARR integration, anti-bot bypass capabilities, and smart content processing for any file type.

Parameters:

| Name | Type | Description | Default |
|------|------|-------------|---------|
| `byparr_enabled` | `Optional[bool]` | Whether to use the BYPARR service (`None` = auto-detect) | `None` |
| `max_retries` | `int` | Maximum number of retry attempts | `URL_DOWNLOADER_MAX_RETRIES` |
| `timeout` | `int` | Request timeout in seconds | `URL_DOWNLOADER_TIMEOUT` |
| `user_agent_rotation` | `bool` | Whether to rotate user agents | `True` |
| `enable_caching` | `bool` | Whether to cache downloaded content | `False` |
| `add_instructions` | `bool` | Whether to add LLM usage instructions | `True` |

Source code in src/enhancedtoolkits/downloading.py
def __init__(  # pylint: disable=too-many-arguments,too-many-positional-arguments
    self,
    byparr_enabled: Optional[bool] = None,
    max_retries: int = URL_DOWNLOADER_MAX_RETRIES,
    timeout: int = URL_DOWNLOADER_TIMEOUT,
    user_agent_rotation: bool = True,
    enable_caching: bool = False,
    add_instructions: bool = True,
    **kwargs,
):
    """
    Initialize URL Content Downloader.

    Args:
        byparr_enabled: Whether to use BYPARR service (None = auto-detect)
        max_retries: Maximum number of retry attempts
        timeout: Request timeout in seconds
        user_agent_rotation: Whether to rotate user agents
        enable_caching: Whether to cache downloaded content
        add_instructions: Whether to add LLM usage instructions
    """
    # Configuration
    self.max_retries = max(1, min(10, max_retries))
    self.timeout = max(5, min(300, timeout))
    self.user_agent_rotation = user_agent_rotation
    self.enable_caching = enable_caching
    self.add_instructions = add_instructions
    self.instructions = DownloadingTools.get_llm_usage_instructions()

    super().__init__(
        name="url_content_downloader",
        instructions=self.instructions,
        add_instructions=self.add_instructions,
        **kwargs,
    )

    # BYPARR configuration
    if byparr_enabled is not None:
        self.byparr_enabled = byparr_enabled
    else:
        self.byparr_enabled = BYPARR_ENABLED

    # HTTP client configuration
    self.client = httpx.Client(
        timeout=httpx.Timeout(self.timeout),
        follow_redirects=True,
        headers=self._get_default_headers(),
    )

    # Simple cache for downloaded content
    # Always a dict to keep typing/simple usage straightforward; guarded by
    # ``self.enable_caching``.
    self.content_cache: Dict[str, str] = {}

    # Register methods
    self.register(self.access_website_content)
    self.register(self.get_file_from_url)
    self.register(self.download_multiple_urls)
    self.register(self.get_url_metadata)
    self.register(self.check_url_accessibility)

    log_info(
        f"URL Content Downloader initialized - BYPARR: {self.byparr_enabled}, "
        f"Max Retries: {self.max_retries}, Timeout: {self.timeout}s"
    )
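
For orientation, a minimal construction sketch, assuming the class is imported from `enhancedtoolkits.downloading` as documented above; the explicit `max_retries` and `timeout` values are illustrative stand-ins for the real defaults (`URL_DOWNLOADER_MAX_RETRIES` and `URL_DOWNLOADER_TIMEOUT`):

```python
from enhancedtoolkits.downloading import DownloadingTools

# byparr_enabled=None falls back to the module-level BYPARR_ENABLED setting.
downloader = DownloadingTools(
    byparr_enabled=None,
    max_retries=3,        # clamped internally to the 1-10 range
    timeout=30,           # clamped internally to the 5-300 second range
    user_agent_rotation=True,
    enable_caching=True,  # keep a simple in-memory cache of processed content
)
```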

Functions

access_website_content

access_website_content(url: str, output: str = 'auto') -> str

Access, download, and parse website content from a URL with anti-bot bypass. Automatically detects the content type and applies appropriate processing.

Parameters:

| Name | Type | Description | Default |
|------|------|-------------|---------|
| `url` | `str` | URL to download content from | *required* |
| `output` | `str` | Output format (`"auto"`, `"markdown"`, `"text"`, `"html"`, or `"binary"`) | `'auto'` |

Returns:

| Type | Description |
|------|-------------|
| `str` | Parsed content in the specified format |

Raises:

| Type | Description |
|------|-------------|
| `URLDownloadError` | If download fails |
| `ContentParsingError` | If content parsing fails |

Source code in src/enhancedtoolkits/downloading.py
def access_website_content(self, url: str, output: str = "auto") -> str:
    """
    Access, download, and parse website content from a URL with anti-bot bypass.
    Automatically detects the content type and applies appropriate processing.

    Args:
        url: URL to download content from
        output: Output format ("auto", "markdown", "text", "html", or "binary")

    Returns:
        Parsed content in the specified format

    Raises:
        URLDownloadError: If download fails
        ContentParsingError: If content parsing fails
    """
    return self.get_file_from_url(url, output)
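
A short sketch of the alias in use; as the source shows, this method delegates directly to `get_file_from_url`, so the two calls below are interchangeable (the URL is a placeholder):

```python
from enhancedtoolkits.downloading import DownloadingTools

downloader = DownloadingTools()

# Both calls return the same string: access_website_content() is a thin alias.
page_md = downloader.access_website_content("https://example.com", output="markdown")
page_md_alias = downloader.get_file_from_url("https://example.com", output="markdown")
```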

get_file_from_url

get_file_from_url(url: str, output: str = 'auto') -> str

Download any file from a URL with smart content processing. Uses MarkItDown for HTML content and handles binary files appropriately.

Parameters:

| Name | Type | Description | Default |
|------|------|-------------|---------|
| `url` | `str` | URL to download the file from | *required* |
| `output` | `str` | Output format (`"auto"`, `"markdown"`, `"text"`, `"html"`, or `"binary"`) | `'auto'` |

Returns:

| Type | Description |
|------|-------------|
| `str` | Processed content or file information |

Raises:

| Type | Description |
|------|-------------|
| `URLDownloadError` | If download fails |
| `ContentParsingError` | If content parsing fails |

Source code in src/enhancedtoolkits/downloading.py
def get_file_from_url(self, url: str, output: str = "auto") -> str:
    """
    Download any file from a URL with smart content processing.
    Uses MarkItDown for HTML content and handles binary files appropriately.

    Args:
        url: URL to download file from
        output: Output format ("auto", "markdown", "text", "html", or "binary")

    Returns:
        Processed content or file information

    Raises:
        URLDownloadError: If download fails
        ContentParsingError: If content parsing fails
    """
    try:
        # Validate inputs
        validated_url = self._validate_url(url)
        validated_format = self._validate_format(output)

        log_debug(f"Downloading file from: {validated_url}")

        # Check cache first
        cache_key = f"{validated_url}:{validated_format}"
        if self.enable_caching and cache_key in self.content_cache:
            log_debug(f"Using cached content for: {validated_url}")
            return self.content_cache[cache_key]

        # Try BYPARR first if enabled
        response_data = None
        content_type = None

        if self.byparr_enabled:
            try:
                byparr_result = self._fetch_content_with_byparr(
                    validated_url
                )
                if byparr_result:
                    response_data = byparr_result
                    # BYPARR typically returns HTML.
                    content_type = "text/html"
                    log_info(
                        f"Successfully fetched content via BYPARR: {validated_url}"
                    )
            except (
                httpx.HTTPError,
                ValueError,
                KeyError,
                TypeError,
            ) as exc:
                log_warning(
                    f"BYPARR failed for {url}: {exc}; falling back to direct fetch"
                )

        # Fallback to direct fetch with anti-bot bypass
        if not response_data:
            response_data, content_type = self._fetch_file_with_antibot(
                validated_url
            )
            log_info(
                f"Successfully fetched file via direct fetch: {validated_url}"
            )

        # Process content based on type and format
        processed_content = self._process_file_content(
            response_data,
            content_type or "application/octet-stream",
            validated_format,
            validated_url,
        )

        # Cache the processed content
        if self.enable_caching:
            self.content_cache[cache_key] = processed_content

        log_info(
            f"File download completed: {len(str(processed_content))} characters/bytes"
        )
        return processed_content

    except (URLDownloadError, ContentParsingError):
        raise
    except Exception as exc:  # pylint: disable=broad-exception-caught
        # This is a toolkit boundary: we want to wrap any unexpected failure
        # into a stable, user-facing error type.
        log_error(f"Unexpected error downloading {url}: {exc}")
        raise URLDownloadError(
            f"Failed to download file from {url}: {exc}"
        ) from exc
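
A hedged example of the main entry point with error handling; note the import path for the two exception types is an assumption (the source above raises them, but their exact export location is not shown here):

```python
from enhancedtoolkits.downloading import (  # exception import path assumed
    ContentParsingError,
    DownloadingTools,
    URLDownloadError,
)

downloader = DownloadingTools(enable_caching=True)

try:
    # output="auto" lets the toolkit choose processing from the Content-Type;
    # repeat calls with the same (url, output) pair hit the in-memory cache.
    content = downloader.get_file_from_url("https://example.com/report.pdf")
except (URLDownloadError, ContentParsingError) as exc:
    content = f"Download failed: {exc}"
```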

download_multiple_urls

download_multiple_urls(urls: List[str], output: str = 'auto', **kwargs) -> str

Download content from multiple URLs.

Parameters:

| Name | Type | Description | Default |
|------|------|-------------|---------|
| `urls` | `List[str]` | List of URLs to download. | *required* |
| `output` | `str` | Output format for all URLs. | `'auto'` |
| `**kwargs` | | Backwards compatibility for older callers; `format` is accepted as an alias for `output`. | `{}` |

Returns:

| Type | Description |
|------|-------------|
| `str` | JSON string containing results for all URLs. |

Source code in src/enhancedtoolkits/downloading.py
def download_multiple_urls(
    self, urls: List[str], output: str = "auto", **kwargs
) -> str:
    """Download content from multiple URLs.

    Args:
        urls: List of URLs to download.
        output: Output format for all URLs.
        **kwargs: Backwards-compatibility for older callers.
            - format: Alias for ``output``.

    Returns:
        JSON string containing results for all URLs.
    """
    if not urls:
        raise URLDownloadError("URL list cannot be empty")

    if len(urls) > 10:
        log_warning(
            f"Large URL list ({len(urls)} URLs), limiting to first 10"
        )
        urls = urls[:10]

    if "format" in kwargs and kwargs["format"] is not None:
        output = kwargs["format"]

    results = []
    for i, url in enumerate(urls):
        try:
            content = self.access_website_content(url, output)
            results.append(
                {
                    "url": url,
                    "success": True,
                    "content": content,
                    "error": None,
                }
            )
            log_debug(
                f"Successfully downloaded {i + 1}/{len(urls)}: {url}"
            )
        except (URLDownloadError, ContentParsingError) as exc:
            results.append(
                {
                    "url": url,
                    "success": False,
                    "content": None,
                    "error": str(exc),
                }
            )
            log_warning(
                f"Failed to download {i + 1}/{len(urls)}: {url} - {exc}"
            )

    return json.dumps(results, indent=2, ensure_ascii=False)
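
Since the batch method returns a JSON string rather than Python objects, callers typically decode it with `json.loads`. A minimal sketch (the URLs are placeholders; lists longer than 10 entries are truncated to the first 10):

```python
import json

from enhancedtoolkits.downloading import DownloadingTools

downloader = DownloadingTools()

raw = downloader.download_multiple_urls(
    ["https://example.com", "https://example.org"],
    output="text",
)

# Each entry carries url/success/content/error, per the source above.
for entry in json.loads(raw):
    if entry["success"]:
        print(f"{entry['url']}: {len(entry['content'])} characters")
    else:
        print(f"{entry['url']}: failed ({entry['error']})")
```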

get_url_metadata

get_url_metadata(url: str) -> str

Extract metadata from a URL without downloading full content.

Parameters:

| Name | Type | Description | Default |
|------|------|-------------|---------|
| `url` | `str` | URL to extract metadata from | *required* |

Returns:

| Type | Description |
|------|-------------|
| `str` | JSON string containing URL metadata |

Source code in src/enhancedtoolkits/downloading.py
def get_url_metadata(self, url: str) -> str:
    """
    Extract metadata from a URL without downloading full content.

    Args:
        url: URL to extract metadata from

    Returns:
        JSON string containing URL metadata
    """
    try:
        validated_url = self._validate_url(url)

        # Make HEAD request to get metadata
        response = self.client.head(validated_url)
        response.raise_for_status()

        metadata = {
            "url": validated_url,
            "status_code": response.status_code,
            "content_type": response.headers.get("content-type", ""),
            "content_length": response.headers.get("content-length"),
            "last_modified": response.headers.get("last-modified"),
            "server": response.headers.get("server", ""),
            "accessible": True,
        }

        return json.dumps(metadata, indent=2)

    except Exception as exc:  # pylint: disable=broad-exception-caught
        log_error(f"Error getting metadata for {url}: {exc}")
        return json.dumps(
            {"url": url, "accessible": False, "error": str(exc)},
            indent=2,
        )
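
One common use is a cheap pre-flight check before committing to a large download. A sketch assuming the metadata keys shown in the source above (`content_length` may be `None` when the server omits the header):

```python
import json

from enhancedtoolkits.downloading import DownloadingTools

downloader = DownloadingTools()

meta = json.loads(downloader.get_url_metadata("https://example.com/archive.zip"))
if meta["accessible"] and meta.get("content_length"):
    size_mb = int(meta["content_length"]) / 1_000_000
    print(f"{meta['content_type']}, about {size_mb:.1f} MB")
```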

check_url_accessibility

check_url_accessibility(url: str) -> str

Check if a URL is accessible without downloading content.

Parameters:

| Name | Type | Description | Default |
|------|------|-------------|---------|
| `url` | `str` | URL to check | *required* |

Returns:

| Type | Description |
|------|-------------|
| `str` | JSON string with accessibility status |

Source code in src/enhancedtoolkits/downloading.py
def check_url_accessibility(self, url: str) -> str:
    """
    Check if a URL is accessible without downloading content.

    Args:
        url: URL to check

    Returns:
        JSON string with accessibility status
    """
    try:
        validated_url = self._validate_url(url)

        response = self.client.head(validated_url)
        response.raise_for_status()

        result = {
            "url": validated_url,
            "accessible": True,
            "status_code": response.status_code,
            "response_time_ms": int(
                response.elapsed.total_seconds() * 1000
            ),
        }

    except Exception as exc:  # pylint: disable=broad-exception-caught
        result = {
            "url": url,
            "accessible": False,
            "error": str(exc),
            "response_time_ms": None,
        }

    return json.dumps(result, indent=2)
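
A minimal sketch of the health check; the keys mirror the `result` dictionary built in the source above:

```python
import json

from enhancedtoolkits.downloading import DownloadingTools

downloader = DownloadingTools()

status = json.loads(downloader.check_url_accessibility("https://example.com"))
if status["accessible"]:
    print(f"OK ({status['status_code']}) in {status['response_time_ms']} ms")
else:
    print(f"Unreachable: {status['error']}")
```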

get_llm_usage_instructions staticmethod

get_llm_usage_instructions() -> str

Return precise, structured instructions for LLM tool calling.

Source code in src/enhancedtoolkits/downloading.py
@staticmethod
def get_llm_usage_instructions() -> str:
    """Return precise, structured instructions for LLM tool calling."""
    instructions = """
<content_downloader_tools_instructions>
URL content downloader (extract text/markdown from HTML/docs; else return minimal metadata)

GOAL
- Given a URL, fetch content safely and return either readable text/markdown or minimal metadata.

OUTPUT
- Always returns a string.
- output=text/markdown/html: extracted content.
- output=binary: JSON metadata; for PDFs/office docs it may return extracted markdown.

TOOLS
- get_file_from_url(url, output="auto")  # default
- access_website_content(url, output="auto")  # alias
- download_multiple_urls(urls, output="auto")  # max 10
- get_url_metadata(url)  # HEAD only
- check_url_accessibility(url)  # HEAD + timing

OUTPUT OPTIONS
- auto | markdown | text | html | binary

CONTEXT-SIZE RULES (IMPORTANT)
- Prefer get_url_metadata() first when unsure (avoid downloading huge/binary files).
- Prefer output="text" for summarization; use markdown only if formatting matters.
- Do NOT paste full extracted pages/PDFs into the final answer; summarize + quote short excerpts.

ERRORS
- URLDownloadError (validation/fetch)
- ContentParsingError (processing)

</content_downloader_tools_instructions>
"""
    return instructions