Fix archive plugins for НЭБ and Alib; add network integration tests

- html_scraper: add img_alt strategy (НЭБ titles from <img alt>), bold_text strategy (Alib entries from <p><b>), Windows-1251 encoding support, _cls_inner_texts() helper that strips inner HTML tags
- rsl: rewrite to POST SearchFilterForm[search] with CSRF token and CQL title:(words) AND author:(word) query format
- config: update rusneb (img_alt + correct author_class) and alib_web (encoding + bold_text) to match fixed plugin strategies
- tests: add tests/test_archives.py with network-marked tests for all six archive plugins; НЛР and ШПИЛ marked xfail (endpoints return HTTP 404)
- presubmit: exclude network tests from default run (-m "not network")

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-09 22:59:19 +03:00
parent ce03046e51
commit b8f82607f9
6 changed files with 458 additions and 42 deletions
--- a/src/plugins/archives/rsl.py
+++ b/src/plugins/archives/rsl.py
@@ -1,5 +1,17 @@
-"""RSL (Russian State Library) AJAX JSON search API plugin (search.rsl.ru)."""
+"""RSL (Russian State Library) search plugin (search.rsl.ru).

+The search API requires a POST to ``/site/ajax-search?language=ru`` with
+form-encoded body containing ``SearchFilterForm[search]`` and a CSRF token
+obtained from the main search page.  Query syntax is CQL:
+``title:(<title words>) AND author:(<author words>)``.
+
+Results come back as an HTML fragment in the ``content`` key of a JSON
+envelope; individual records are identified by the CSS classes
+``rsl-item-nocover-title`` (author) and ``rsl-item-nocover-descr`` (title).
+Both fields contain ``<b>`` highlight tags that are stripped before returning.
+"""
+
+import re
 from typing import Any

 import httpx
@@ -9,9 +21,27 @@ from models import CandidateRecord
 from ..rate_limiter import RateLimiter

 _DOMAIN = "search.rsl.ru"
+_SEARCH_URL = "https://search.rsl.ru/site/ajax-search"
+_BASE_URL = "https://search.rsl.ru/ru/search"
+_YEAR_RE = re.compile(r"\b(1[0-9]{3}|20[012][0-9])\b")
+
+
+def _strip_tags(html_frag: str) -> str:
+    """Strip HTML tags and decode basic entities from a fragment."""
+    text = re.sub(r"<[^>]+>", "", html_frag)
+    text = text.replace("&quot;", '"').replace("&amp;", "&").replace("&lt;", "<").replace("&gt;", ">")
+    return re.sub(r"\s+", " ", text).strip()


 class RSLPlugin:
+    """Archive searcher for search.rsl.ru.
+
+    Formats the query as CQL ``title:(title_words) AND author:(author_word)``
+    by treating the first whitespace-delimited token as the author surname and
+    the remainder as title keywords.  When only one token is present, a plain
+    ``title:(token) OR author:(token)`` query is used instead.
+    """
+
    category = "archive_searchers"

    def __init__(
@@ -32,28 +62,79 @@ class RSLPlugin:
        self.timeout = timeout

    def search(self, query: str) -> list[CandidateRecord]:
+        """Search RSL for books matching query.
+
+        Args:
+            query: Free-text string; the first token is treated as the author
+                surname and remaining tokens as title keywords.
+
+        Returns:
+            Up to three CandidateRecord dicts extracted from the RSL HTML
+            response, with ``<b>`` highlight tags stripped.
+        """
        self._rl.wait_and_record(_DOMAIN, self.rate_limit_seconds)
-        r = httpx.get(
-            "https://search.rsl.ru/site/ajax-search",
-            params={"language": "ru", "q": query, "page": 1, "perPage": 5},
+
+        cql = self._build_cql(query)
+        client = httpx.Client()
+
+        # Fetch the main page to obtain a valid CSRF token.
+        r0 = client.get(_BASE_URL, timeout=self.timeout, headers={"User-Agent": "Mozilla/5.0"})
+        csrf_match = re.search(r'name="_csrf"\s+value="([^"]+)"', r0.text)
+        csrf = csrf_match.group(1) if csrf_match else ""
+
+        r = client.post(
+            _SEARCH_URL,
+            params={"language": "ru"},
+            data={"SearchFilterForm[search]": cql, "_csrf": csrf},
            timeout=self.timeout,
-            headers={"Accept": "application/json"},
+            headers={
+                "Accept": "application/json",
+                "X-Requested-With": "XMLHttpRequest",
+                "Referer": _BASE_URL,
+                "User-Agent": "Mozilla/5.0",
+            },
        )
        data: dict[str, Any] = r.json()
-        records: list[dict[str, Any]] = data.get("records") or data.get("items") or data.get("data") or []
+        content = str(data.get("content") or "")
+
+        raw_titles = re.findall(r'rsl-item-nocover-descr[^"]*">(.*?)</div>', content)[:3]
+        raw_authors = re.findall(r'rsl-item-nocover-title[^"]*">(.*?)</div>', content)[:3]
+        years = _YEAR_RE.findall(content)[:3]
+
        out: list[CandidateRecord] = []
-        for rec in records[:3]:
-            title = (str(rec.get("title") or rec.get("name") or "")).strip()
+        for i, raw_title in enumerate(raw_titles):
+            title = _strip_tags(raw_title)
            if not title:
                continue
+            author = _strip_tags(raw_authors[i]) if i < len(raw_authors) else ""
            out.append(
                CandidateRecord(
                    source=self.plugin_id,
                    title=title,
-                    author=(str(rec.get("author") or rec.get("authors") or "")).strip(),
-                    year=str(rec.get("year") or rec.get("pubyear") or "").strip(),
-                    isbn=(str(rec.get("isbn") or "")).strip(),
-                    publisher=(str(rec.get("publisher") or "")).strip(),
+                    author=author,
+                    year=years[i] if i < len(years) else "",
+                    isbn="",
+                    publisher="",
                )
            )
        return out
+
+    @staticmethod
+    def _build_cql(query: str) -> str:
+        """Build a CQL query string for the RSL search API.
+
+        Args:
+            query: Raw query string, typically ``"Author Title keywords"``.
+
+        Returns:
+            CQL string in the form ``title:(…) AND author:(…)`` when the query
+            contains multiple tokens, or ``title:(…) OR author:(…)`` for a
+            single token.
+        """
+        tokens = query.split()
+        if len(tokens) > 1:
+            author_part = tokens[0]
+            title_part = " ".join(tokens[1:])
+            return f"title:({title_part}) AND author:({author_part})"
+        token = tokens[0] if tokens else query
+        return f"title:({token}) OR author:({token})"