"""Base class and shared HTML parsing utilities for archive scraper plugins.""" import re from typing import Any from models import CandidateRecord from ..rate_limiter import RateLimiter YEAR_RE = re.compile(r"\b(1[0-9]{3}|20[012][0-9])\b") AUTHOR_PREFIX_PAT = re.compile(r"^(\S+\s+(?:[А-ЯЁA-Z]\.){1,3}\s*)(.+)", re.DOTALL) def cls_inner_texts(html: str, cls_frag: str, min_len: int = 3, max_len: int = 80) -> list[str]: """Extract text content from elements whose class contains cls_frag. Strips inner HTML tags and normalises whitespace, so elements like ``Name I.N.`` work correctly. Args: html: Raw HTML string to search. cls_frag: Substring that must appear in the class attribute value. min_len: Minimum length of extracted text to keep. max_len: Maximum length of extracted text to keep. Returns: Up to three non-empty text strings in document order. """ raw = re.findall(rf'class=["\'][^"\']*{re.escape(cls_frag)}[^"\']*["\'][^>]*>(.*?)]+>", "", m) text = re.sub(r"\s+", " ", text).strip() if min_len <= len(text) <= max_len: out.append(text) if len(out) == 3: break return out def img_alts(html: str, min_len: int = 5, max_len: int = 120) -> list[str]: """Extract non-empty alt attributes from tags, normalising whitespace. Args: html: Raw HTML string to search. min_len: Minimum character length to include. max_len: Maximum character length to include. Returns: Up to three non-empty, whitespace-normalised alt strings. """ alts = re.findall(r']+alt=[\'"]([^\'"]+)[\'"]', html) out: list[str] = [] for a in alts: text = re.sub(r"\s+", " ", a).strip() if min_len <= len(text) <= max_len: out.append(text) if len(out) == 3: break return out class HtmlScraperPlugin: """Base class for HTML-scraping archive plugins. Handles common initialisation; subclasses implement search() with site-specific hardcoded logic. The config dict is accepted for registry compatibility but is not used by the base class; all scraping details are hardcoded in the subclass. """ category = "archive_searchers" def __init__( self, plugin_id: str, name: str, rate_limiter: RateLimiter, rate_limit_seconds: float, auto_queue: bool, timeout: int, config: dict[str, Any], ): self.plugin_id = plugin_id self.name = name self._rl = rate_limiter self.rate_limit_seconds = rate_limit_seconds self.auto_queue = auto_queue self.timeout = timeout def search(self, query: str) -> list[CandidateRecord]: """Search for books matching query. Args: query: Free-text search string. Returns: Up to three CandidateRecord dicts with source, title, author, year, isbn, and publisher fields. """ raise NotImplementedError