# Source code for extral.connectors.file.utils

"""Utility functions for file connectors."""

import logging
import tempfile
import urllib.request
from contextlib import contextmanager
from pathlib import Path
from typing import Generator, Optional, Union
from urllib.error import HTTPError, URLError

logger = logging.getLogger(__name__)


@contextmanager
def get_file_handle(
    path: str, mode: str = "r"
) -> Generator[Union[Path, tempfile._TemporaryFileWrapper], None, None]:
    """
    Yield a filesystem path for a local file or an HTTP/HTTPS URL.

    For HTTP/HTTPS URLs the content is streamed into a temporary file and
    the path of that file is yielded. The temporary file is removed when the
    context exits and also when the download itself fails. For local files
    the path is yielded as-is and the file is left untouched.

    Args:
        path: Local file path or HTTP/HTTPS URL
        mode: Accepted for backward compatibility; it is not used, because a
            path (never an open file object) is yielded in both cases

    Yields:
        Path of the local file, or Path of the downloaded temporary copy

    Raises:
        HTTPError: If HTTP download fails
        URLError: If URL is invalid or network error
        FileNotFoundError: If local file doesn't exist
    """
    if not path.startswith(("http://", "https://")):
        # Local file path
        file_path = Path(path)
        if not file_path.exists():
            raise FileNotFoundError(f"File not found: {path}")
        yield file_path
        return

    logger.info(f"Downloading file from {path}")
    chunk_size = 8192
    progress_step = chunk_size * 1000
    tmp_path: Optional[Path] = None
    try:
        # Only the download is guarded by the HTTP/URL handlers, so errors
        # raised by the consumer of the context are not reported as
        # download failures.
        try:
            with tempfile.NamedTemporaryFile(mode="wb", delete=False) as tmp_file:
                # Remember the name before any network I/O: delete=False
                # means nobody else removes the file if the download fails.
                tmp_path = Path(tmp_file.name)
                with urllib.request.urlopen(path) as response:
                    # Stream the download to handle large files
                    total_size = 0
                    next_report = progress_step
                    for chunk in iter(lambda: response.read(chunk_size), b""):
                        tmp_file.write(chunk)
                        total_size += len(chunk)
                        # Threshold comparison instead of an exact-multiple
                        # test: short reads would otherwise skip the report.
                        if total_size >= next_report:
                            logger.debug(
                                f"Downloaded {total_size / 1024 / 1024:.1f} MB"
                            )
                            next_report += progress_step
        except HTTPError as e:
            logger.error(f"HTTP error downloading {path}: {e}")
            raise
        except URLError as e:
            logger.error(f"URL error downloading {path}: {e}")
            raise

        logger.info(
            f"Downloaded {total_size / 1024 / 1024:.1f} MB to {tmp_path}"
        )
        yield tmp_path
    finally:
        # Clean up temporary file (also after a failed download)
        if tmp_path is not None:
            try:
                tmp_path.unlink()
            except FileNotFoundError:
                # The consumer already removed or moved the file.
                pass
            else:
                logger.debug(f"Cleaned up temporary file {tmp_path}")
def estimate_file_size(path: str) -> Optional[int]:
    """
    Estimate the size of a file from local path or HTTP headers.

    This is a best-effort helper: any failure to obtain the size is logged
    as a warning and reported as None instead of being raised.

    Args:
        path: Local file path or HTTP/HTTPS URL

    Returns:
        File size in bytes, or None if cannot be determined. For URLs the
        value is taken from the ``Content-Length`` response header; None is
        returned (without a warning) when the server does not send one.
    """
    # Local import: only needed to recognise protocol-level failures below.
    from http.client import HTTPException

    if path.startswith(("http://", "https://")):
        try:
            with urllib.request.urlopen(path) as response:
                content_length = response.headers.get("Content-Length")
                if content_length:
                    return int(content_length)
        except (OSError, HTTPException, ValueError):
            # OSError covers HTTPError/URLError as well as timeouts and
            # connection resets; HTTPException covers malformed responses
            # and URLs rejected by http.client (e.g. a space in the path);
            # ValueError covers a malformed URL or a non-numeric header.
            logger.warning(f"Could not determine size for {path}")
        return None

    try:
        return Path(path).stat().st_size
    except OSError:
        logger.warning(f"Could not determine size for {path}")
        return None