Replace config-driven HtmlScraperPlugin with specific archive classes

Each archive scraper now has its own class with hardcoded URL and parsing logic; config only carries auto_queue, timeout, and rate_limit_seconds.

- html_scraper: refactor to base class with public shared utilities (YEAR_RE, AUTHOR_PREFIX_PAT, cls_inner_texts, img_alts)
- rusneb.py (new): RusnebPlugin extracts year per list item rather than globally, eliminating wrong page-level dates
- alib.py (new): AlibPlugin extracts year from within each <p><b> entry rather than globally, fixing nonsensical year values
- shpl.py (new): ShplPlugin retains the dead ШПИЛ endpoint with hardcoded params; config type updated from html_scraper to shpl
- config: remove config: subsections from rusneb, alib_web, shpl entries; update type fields to rusneb, alib_web, shpl respectively
- plugins/__init__.py: register new specific types, remove html_scraper
- tests: use specific plugin classes; assert all CandidateRecord fields (source, title, author, year, isbn, publisher) with appropriate constraints

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-10 00:03:17 +03:00
parent b8f82607f9
commit fd32be729f
7 changed files with 261 additions and 227 deletions
--- a/src/plugins/__init__.py
+++ b/src/plugins/__init__.py
@@ -41,16 +41,20 @@ _type_to_class: dict[str, Any] = {}  # populated lazily on first call

 def _archive_classes() -> dict[str, Any]:
    if not _type_to_class:
-        from .archives.html_scraper import HtmlScraperPlugin
+        from .archives.alib import AlibPlugin
        from .archives.openlibrary import OpenLibraryPlugin
        from .archives.rsl import RSLPlugin
+        from .archives.rusneb import RusnebPlugin
+        from .archives.shpl import ShplPlugin
        from .archives.sru_catalog import SRUCatalogPlugin

        _type_to_class.update(
            {
                "openlibrary": OpenLibraryPlugin,
                "rsl": RSLPlugin,
-                "html_scraper": HtmlScraperPlugin,
+                "rusneb": RusnebPlugin,
+                "alib_web": AlibPlugin,
+                "shpl": ShplPlugin,
                "sru_catalog": SRUCatalogPlugin,
            }
        )
--- a/src/plugins/archives/alib.py
+++ b/src/plugins/archives/alib.py
@@ -0,0 +1,70 @@
+"""Alib (alib.ru) archive search plugin."""
+
+import re
+from urllib.parse import quote
+
+import httpx
+
+from models import CandidateRecord
+
+from .html_scraper import AUTHOR_PREFIX_PAT, YEAR_RE, HtmlScraperPlugin
+
+_URL = "https://www.alib.ru/find3.php4"
+_DOMAIN = "www.alib.ru"
+_ENCODING = "cp1251"
+_EXTRA_PARAMS: dict[str, str] = {"f": "5", "s": "0"}
+
+# Book entries appear as <p><b>Author Title Year Publisher…</b>
+_ENTRY_RE = re.compile(r"<p><b>([^<]{5,200})</b>")
+
+
+class AlibPlugin(HtmlScraperPlugin):
+    """Archive searcher for alib.ru.
+
+    Sends the query encoded as cp1251, decodes the response with the same
+    encoding, and builds one record from each of the first three
+    ``<p><b>...</b>`` entries.  A leading "Surname I.N." prefix is split off as
+    the author when AUTHOR_PREFIX_PAT matches; otherwise the whole entry becomes
+    the title.  The year is looked up inside each entry, never page-wide.
+    """
+
+    # Fallback year matcher bounded by non-digits instead of \b.  Python's \b
+    # treats Cyrillic letters as word characters, so YEAR_RE cannot match a year
+    # glued to a letter, as in "1985г.".  It is consulted only when YEAR_RE
+    # finds nothing, so entries that already produced a year are unaffected.
+    _GLUED_YEAR_RE = re.compile(r"(?<![0-9])(?:1[0-9]{3}|20[012][0-9])(?![0-9])")
+
+    # The fixed parameters never change, so their query-string form is built once.
+    _FIXED_QS = "".join(f"&{k}={quote(str(v).encode(_ENCODING, 'replace'))}" for k, v in _EXTRA_PARAMS.items())
+
+    def search(self, query: str) -> list[CandidateRecord]:
+        """Search Alib for books matching query.
+
+        Args:
+            query: Free-text search string.
+
+        Returns:
+            Up to three CandidateRecord dicts with source, title, author, year,
+            isbn, and publisher fields; isbn and publisher are always empty.
+            An empty list is returned when the server answers with an HTTP
+            status of 400 or above, so error pages are never parsed as results.
+        """
+        # Imported locally so this block does not depend on a file-level import.
+        from html import unescape
+
+        self._rl.wait_and_record(_DOMAIN, self.rate_limit_seconds)
+        # The query string is assembled by hand because the site expects
+        # cp1251 percent-encoding rather than UTF-8.
+        q_enc = quote(query.encode(_ENCODING, "replace"))
+        r = httpx.get(
+            f"{_URL}?tfind={q_enc}{self._FIXED_QS}",
+            timeout=self.timeout,
+            headers={"User-Agent": "Mozilla/5.0"},
+        )
+        if r.status_code >= 400:
+            return []
+        page = r.content.decode(_ENCODING, errors="replace")
+
+        out: list[CandidateRecord] = []
+        for entry in _ENTRY_RE.findall(page)[:3]:
+            # Decode entities such as &quot; and &amp; before splitting, so they
+            # do not end up verbatim in the title or author.
+            text = unescape(entry).strip()
+            year_m = YEAR_RE.search(text) or self._GLUED_YEAR_RE.search(text)
+            prefix_m = AUTHOR_PREFIX_PAT.match(text)
+            if prefix_m:
+                author = prefix_m.group(1).strip()
+                title = prefix_m.group(2).strip()
+            else:
+                author = ""
+                title = text
+            out.append(
+                CandidateRecord(
+                    source=self.plugin_id,
+                    title=title,
+                    author=author,
+                    year=year_m.group(0) if year_m else "",
+                    isbn="",
+                    publisher="",
+                )
+            )
+        return out
--- a/src/plugins/archives/html_scraper.py
+++ b/src/plugins/archives/html_scraper.py
@@ -1,27 +1,17 @@
-"""Config-driven HTML scraper for archive sites (rusneb, alib, shpl, etc.)."""
+"""Base class and shared HTML parsing utilities for archive scraper plugins."""

 import re
 from typing import Any
-from urllib.parse import quote, urlparse
-
-import httpx

 from models import CandidateRecord

 from ..rate_limiter import RateLimiter

-_YEAR_RE = re.compile(r"\b(1[0-9]{3}|20[012][0-9])\b")
-
-# Matches "Surname I.N. " or "Surname I. " at the start of an entry.
-_AUTHOR_PREFIX_PAT = re.compile(r"^(\S+\s+(?:[А-ЯЁA-Z]\.){1,3}\s*)(.+)", re.DOTALL)
+YEAR_RE = re.compile(r"\b(1[0-9]{3}|20[012][0-9])\b")
+AUTHOR_PREFIX_PAT = re.compile(r"^(\S+\s+(?:[А-ЯЁA-Z]\.){1,3}\s*)(.+)", re.DOTALL)


-def _cls_re(cls_frag: str, min_len: int = 3, max_len: int = 120) -> re.Pattern[str]:
-    # Support both single and double-quoted class attributes.
-    return re.compile(rf'class=["\'][^"\']*{re.escape(cls_frag)}[^"\']*["\'][^>]*>([^<]{{{min_len},{max_len}}})<')
-
-
-def _cls_inner_texts(html: str, cls_frag: str, min_len: int = 3, max_len: int = 80) -> list[str]:
+def cls_inner_texts(html: str, cls_frag: str, min_len: int = 3, max_len: int = 80) -> list[str]:
    """Extract text content from elements whose class contains cls_frag.

    Strips inner HTML tags and normalises whitespace, so elements like
@@ -48,7 +38,7 @@ def _cls_inner_texts(html: str, cls_frag: str, min_len: int = 3, max_len: int =
    return out


-def _img_alts(html: str, min_len: int = 5, max_len: int = 120) -> list[str]:
+def img_alts(html: str, min_len: int = 5, max_len: int = 120) -> list[str]:
    """Extract non-empty alt attributes from <img> tags, normalising whitespace.

    Args:
@@ -71,19 +61,12 @@ def _img_alts(html: str, min_len: int = 5, max_len: int = 120) -> list[str]:


 class HtmlScraperPlugin:
-    """Config-driven HTML scraper.
+    """Base class for HTML-scraping archive plugins.

-    Supported config keys:
-      url               — search URL
-      search_param      — query param name
-      extra_params      — dict of fixed extra query parameters
-      encoding          — character encoding for query and response (e.g. "cp1251")
-      title_class       — CSS class fragment for title elements (class-based strategy)
-      author_class      — CSS class fragment for author elements
-      link_href_pattern — href regex to find title <a> links (link strategy)
-      brief_class       — CSS class for brief record rows (brief strategy)
-      img_alt           — truthy: extract titles from <img alt> attributes (rusneb strategy)
-      bold_text         — truthy: extract author/title from <p><b>…</b> blocks (alib strategy)
+    Handles common initialisation; subclasses implement search() with
+    site-specific hardcoded logic.  The config dict is accepted for
+    registry compatibility but is not used by the base class; all scraping
+    details are hardcoded in the subclass.
    """

    category = "archive_searchers"
@@ -104,163 +87,15 @@ class HtmlScraperPlugin:
        self.rate_limit_seconds = rate_limit_seconds
        self.auto_queue = auto_queue
        self.timeout = timeout
-        self.config = config
-        self._domain: str = urlparse(str(config.get("url") or "")).netloc or plugin_id

    def search(self, query: str) -> list[CandidateRecord]:
        """Search for books matching query.

        Args:
-            query: Free-text search string (author, title, keywords).
+            query: Free-text search string.

        Returns:
            Up to three CandidateRecord dicts with source, title, author, year,
            isbn, and publisher fields.
        """
-        cfg = self.config
-        self._rl.wait_and_record(self._domain, self.rate_limit_seconds)
-
-        encoding = str(cfg.get("encoding") or "")
-        if encoding:
-            # Encode query and extra params in the site's native encoding.
-            q_enc = quote(query.encode(encoding, "replace"))
-            ep: dict[str, Any] = dict(cfg.get("extra_params") or {})
-            ep_parts = [f"{k}={quote(str(v).encode(encoding, 'replace'))}" for k, v in ep.items()]
-            raw_qs = "&".join([f'{cfg["search_param"]}={q_enc}'] + ep_parts)
-            r = httpx.get(
-                f'{cfg["url"]}?{raw_qs}',
-                timeout=self.timeout,
-                headers={"User-Agent": "Mozilla/5.0"},
-            )
-            html = r.content.decode(encoding, errors="replace")
-        else:
-            params: dict[str, Any] = dict(cfg.get("extra_params") or {})
-            params[cfg["search_param"]] = query
-            r = httpx.get(
-                cfg["url"],
-                params=params,
-                timeout=self.timeout,
-                headers={"User-Agent": "Mozilla/5.0"},
-            )
-            html = r.text
-
-        years = _YEAR_RE.findall(html)
-
-        if cfg.get("bold_text"):
-            return self._parse_bold_text(html, years)
-        if cfg.get("img_alt"):
-            return self._parse_img_alt(html, years, cfg)
-        if "link_href_pattern" in cfg:
-            return self._parse_link(html, years, cfg)
-        if "brief_class" in cfg:
-            return self._parse_brief(html, years, cfg)
-        return self._parse_class(html, years, cfg)
-
-    def _parse_bold_text(self, html: str, years: list[str]) -> list[CandidateRecord]:
-        """Extract records from ``<p><b>text</b>`` entries (Alib-style).
-
-        The bold text is expected to begin with ``Surname I.N. Title…``; the
-        author prefix is split off with ``_AUTHOR_PREFIX_PAT`` if possible.
-
-        Args:
-            html: Decoded HTML response.
-            years: Year strings found in the full HTML (used positionally).
-
-        Returns:
-            Up to three CandidateRecord dicts.
-        """
-        entries = re.findall(r"<p><b>([^<]{5,200})</b>", html)[:3]
-        out: list[CandidateRecord] = []
-        for i, entry in enumerate(entries):
-            text = entry.strip()
-            m = _AUTHOR_PREFIX_PAT.match(text)
-            if m:
-                author = m.group(1).strip()
-                title = m.group(2).strip()
-            else:
-                author = ""
-                title = text
-            out.append(
-                CandidateRecord(
-                    source=self.plugin_id,
-                    title=title,
-                    author=author,
-                    year=years[i] if i < len(years) else "",
-                    isbn="",
-                    publisher="",
-                )
-            )
-        return out
-
-    def _parse_img_alt(self, html: str, years: list[str], cfg: dict[str, Any]) -> list[CandidateRecord]:
-        """Extract records using ``<img alt>`` for titles and a CSS class for authors.
-
-        Used for sites like rusneb.ru where thumbnail alt attributes carry the
-        book title and a separate span contains the author.
-
-        Args:
-            html: Decoded HTML response.
-            years: Year strings found in the full HTML (used positionally).
-            cfg: Plugin config dict (reads ``author_class``).
-
-        Returns:
-            Up to three CandidateRecord dicts.
-        """
-        titles = _img_alts(html)
-        authors = _cls_inner_texts(html, cfg.get("author_class", "author"), 3, 80)
-        return [
-            CandidateRecord(
-                source=self.plugin_id,
-                title=title,
-                author=authors[i] if i < len(authors) else "",
-                year=years[i] if i < len(years) else "",
-                isbn="",
-                publisher="",
-            )
-            for i, title in enumerate(titles)
-        ]
-
-    def _parse_class(self, html: str, years: list[str], cfg: dict[str, Any]) -> list[CandidateRecord]:
-        titles = _cls_re(cfg.get("title_class", "title")).findall(html)[:3]
-        authors = _cls_re(cfg.get("author_class", "author"), 3, 80).findall(html)[:3]
-        return [
-            CandidateRecord(
-                source=self.plugin_id,
-                title=title.strip(),
-                author=authors[i].strip() if i < len(authors) else "",
-                year=years[i] if i < len(years) else "",
-                isbn="",
-                publisher="",
-            )
-            for i, title in enumerate(titles)
-        ]
-
-    def _parse_link(self, html: str, years: list[str], cfg: dict[str, Any]) -> list[CandidateRecord]:
-        href_pat = cfg.get("link_href_pattern", r"")
-        titles = re.findall(rf'<a[^>]+href="[^"]*{href_pat}[^"]*"[^>]*>([^<]{{3,120}})</a>', html)[:3]
-        authors = _cls_re(cfg.get("author_class", "author"), 3, 80).findall(html)[:3]
-        return [
-            CandidateRecord(
-                source=self.plugin_id,
-                title=title.strip(),
-                author=authors[i].strip() if i < len(authors) else "",
-                year=years[i] if i < len(years) else "",
-                isbn="",
-                publisher="",
-            )
-            for i, title in enumerate(titles)
-        ]
-
-    def _parse_brief(self, html: str, years: list[str], cfg: dict[str, Any]) -> list[CandidateRecord]:
-        titles = _cls_re(cfg.get("brief_class", "brief"), 3, 120).findall(html)[:3]
-        return [
-            CandidateRecord(
-                source=self.plugin_id,
-                title=t.strip(),
-                author="",
-                year=years[i] if i < len(years) else "",
-                isbn="",
-                publisher="",
-            )
-            for i, t in enumerate(titles)
-        ]
+        raise NotImplementedError
--- a/src/plugins/archives/rusneb.py
+++ b/src/plugins/archives/rusneb.py
@@ -0,0 +1,64 @@
+"""НЭБ (rusneb.ru) archive search plugin."""
+
+import re
+
+import httpx
+
+from models import CandidateRecord
+
+from .html_scraper import HtmlScraperPlugin, YEAR_RE, cls_inner_texts, img_alts
+
+_URL = "https://rusneb.ru/search/"
+_DOMAIN = "rusneb.ru"
+_AUTHOR_CLASS = "search-list__item_subtext"
+
+# Each search result is a <li> whose class list contains search-list__item not
+# immediately followed by an underscore, so suffixed names such as search-list__item_subtext are skipped.
+_ITEM_RE = re.compile(
+    r'<li[^>]*class=["\'][^"\']*search-list__item(?!_)[^"\']*["\'][^>]*>(.*?)</li>',
+    re.DOTALL,
+)
+
+
+class RusnebPlugin(HtmlScraperPlugin):
+    """Archive searcher for rusneb.ru (НЭБ — Национальная электронная библиотека).
+
+    Walks the ``<li>`` search result items matched by _ITEM_RE.  For each item
+    the title is the first ``<img alt>`` value and the author is the first
+    element whose class contains ``search-list__item_subtext``.  Items without
+    an alt text are skipped.  Years are extracted per list item to avoid
+    picking up unrelated page-level dates.
+    """
+
+    # Removes tags so the year search sees visible text only.
+    _TAG_RE = re.compile(r"<[^>]*>")
+
+    def search(self, query: str) -> list[CandidateRecord]:
+        """Search НЭБ for books matching query.
+
+        Args:
+            query: Free-text search string.
+
+        Returns:
+            Up to three CandidateRecord dicts with source, title, author, year,
+            isbn, and publisher fields; isbn and publisher are always empty.
+            An empty list is returned when the server answers with an HTTP
+            status of 400 or above, so error pages are never parsed as results.
+        """
+        self._rl.wait_and_record(_DOMAIN, self.rate_limit_seconds)
+        r = httpx.get(_URL, params={"q": query}, timeout=self.timeout, headers={"User-Agent": "Mozilla/5.0"})
+        if r.status_code >= 400:
+            return []
+
+        out: list[CandidateRecord] = []
+        for item_html in _ITEM_RE.findall(r.text):
+            alts = img_alts(item_html)
+            if not alts:
+                continue
+            authors = cls_inner_texts(item_html, _AUTHOR_CLASS, 3, 80)
+            # Look at visible text first: digits inside attribute values (image
+            # paths, ids) can look like a year.  The raw markup stays as a
+            # fallback so an item that yielded a year before still yields one.
+            visible = self._TAG_RE.sub(" ", item_html)
+            year_m = YEAR_RE.search(visible) or YEAR_RE.search(item_html)
+            out.append(
+                CandidateRecord(
+                    source=self.plugin_id,
+                    title=alts[0],
+                    author=authors[0] if authors else "",
+                    year=year_m.group(0) if year_m else "",
+                    isbn="",
+                    publisher="",
+                )
+            )
+            if len(out) == 3:
+                break
+        return out
--- a/src/plugins/archives/shpl.py
+++ b/src/plugins/archives/shpl.py
@@ -0,0 +1,63 @@
+"""ШПИЛ archive search plugin.
+
+Note: the IRBIS64 CGI endpoint currently returns HTTP 404 and this plugin
+produces no results.  The class is retained so the configuration entry can
+be re-enabled if the endpoint is restored.
+"""
+
+import re
+
+import httpx
+
+from models import CandidateRecord
+
+from .html_scraper import YEAR_RE, HtmlScraperPlugin
+
+_URL = "https://www.shpl.ru/cgi-bin/irbis64/cgiirbis_64.exe"
+_DOMAIN = "www.shpl.ru"
+_EXTRA_PARAMS: dict[str, str] = {
+    "C21COM": "S",
+    "I21DBN": "BIBL",
+    "P21DBN": "BIBL",
+    "S21FMT": "briefWebRus",
+    "Z21ID": "",
+}
+
+_BRIEF_RE = re.compile(r'class=["\']brief["\'][^>]*>([^<]{3,120})<')
+
+
+class ShplPlugin(HtmlScraperPlugin):
+    """Archive searcher for shpl.ru (ШПИЛ — Государственная публичная историческая библиотека).
+
+    Extracts brief record entries from elements with class ``brief``.
+    The remote IRBIS64 CGI endpoint is currently offline (HTTP 404).
+    """
+
+    def search(self, query: str) -> list[CandidateRecord]:
+        """Search ШПИЛ for books matching query.
+
+        Args:
+            query: Free-text search string.
+
+        Returns:
+            Up to three CandidateRecord dicts with source, title, author, year,
+            isbn, and publisher fields; author, isbn, and publisher are always
+            empty.  An empty list is returned when the server answers with an
+            HTTP status of 400 or above, so error pages are never parsed.
+        """
+        self._rl.wait_and_record(_DOMAIN, self.rate_limit_seconds)
+        params: dict[str, str] = dict(_EXTRA_PARAMS)
+        params["S21ALL"] = query
+        r = httpx.get(_URL, params=params, timeout=self.timeout, headers={"User-Agent": "Mozilla/5.0"})
+        if r.status_code >= 400:
+            return []
+
+        out: list[CandidateRecord] = []
+        for brief in _BRIEF_RE.findall(r.text)[:3]:
+            # Take the year from the entry's own text.  Pairing the n-th year
+            # found anywhere on the page with the n-th entry attaches
+            # unrelated dates to records.
+            year_m = YEAR_RE.search(brief)
+            out.append(
+                CandidateRecord(
+                    source=self.plugin_id,
+                    title=brief.strip(),
+                    author="",
+                    year=year_m.group(0) if year_m else "",
+                    isbn="",
+                    publisher="",
+                )
+            )
+        return out