Fix archive plugins for НЭБ and Alib; add network integration tests

- html_scraper: add img_alt strategy (НЭБ titles from <img alt>), bold_text
  strategy (Alib entries from <p><b>), Windows-1251 encoding support,
  _cls_inner_texts() helper that strips inner HTML tags
- rsl: rewrite to POST SearchFilterForm[search] with CSRF token and CQL
  title:(words) AND author:(word) query format
- config: update rusneb (img_alt + correct author_class) and alib_web
  (encoding + bold_text) to match fixed plugin strategies
- tests: add tests/test_archives.py with network-marked tests for all six
  archive plugins; НЛР and ШПИЛ marked xfail (endpoints return HTTP 404)
- presubmit: exclude network tests from default run (-m "not network")

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-03-09 22:59:19 +03:00
parent ce03046e51
commit b8f82607f9
6 changed files with 458 additions and 42 deletions

View File

@@ -2,7 +2,7 @@
import re
from typing import Any
from urllib.parse import urlparse
from urllib.parse import quote, urlparse
import httpx
@@ -12,21 +12,78 @@ from ..rate_limiter import RateLimiter
# Matches a standalone four-digit year in the range 1000–1999 or 2000–2029.
_YEAR_RE = re.compile(r"\b(1[0-9]{3}|20[012][0-9])\b")
# Matches "Surname I.N. " or "Surname I. " at the start of an entry.
# Group 1 is the author prefix (one non-space word followed by one to three
# dotted capital initials, Cyrillic or Latin); group 2 is the rest of the entry.
_AUTHOR_PREFIX_PAT = re.compile(r"^(\S+\s+(?:[А-ЯЁA-Z]\.){1,3}\s*)(.+)", re.DOTALL)
def _cls_re(cls_frag: str, min_len: int = 3, max_len: int = 120) -> re.Pattern[str]:
return re.compile(rf'class="[^"]*{re.escape(cls_frag)}[^"]*"[^>]*>([^<]{{{min_len},{max_len}}})<')
# Support both single and double-quoted class attributes.
return re.compile(rf'class=["\'][^"\']*{re.escape(cls_frag)}[^"\']*["\'][^>]*>([^<]{{{min_len},{max_len}}})<')
def _cls_inner_texts(html: str, cls_frag: str, min_len: int = 3, max_len: int = 80) -> list[str]:
"""Extract text content from elements whose class contains cls_frag.
Strips inner HTML tags and normalises whitespace, so elements like
``<span class=''><b>Name</b> I.N.</span>`` work correctly.
Args:
html: Raw HTML string to search.
cls_frag: Substring that must appear in the class attribute value.
min_len: Minimum length of extracted text to keep.
max_len: Maximum length of extracted text to keep.
Returns:
Up to three non-empty text strings in document order.
"""
raw = re.findall(rf'class=["\'][^"\']*{re.escape(cls_frag)}[^"\']*["\'][^>]*>(.*?)</', html, re.DOTALL)
out: list[str] = []
for m in raw:
text = re.sub(r"<[^>]+>", "", m)
text = re.sub(r"\s+", " ", text).strip()
if min_len <= len(text) <= max_len:
out.append(text)
if len(out) == 3:
break
return out
def _img_alts(html: str, min_len: int = 5, max_len: int = 120) -> list[str]:
"""Extract non-empty alt attributes from <img> tags, normalising whitespace.
Args:
html: Raw HTML string to search.
min_len: Minimum character length to include.
max_len: Maximum character length to include.
Returns:
Up to three non-empty, whitespace-normalised alt strings.
"""
alts = re.findall(r'<img[^>]+alt=[\'"]([^\'"]+)[\'"]', html)
out: list[str] = []
for a in alts:
text = re.sub(r"\s+", " ", a).strip()
if min_len <= len(text) <= max_len:
out.append(text)
if len(out) == 3:
break
return out
class HtmlScraperPlugin:
"""
Config-driven HTML scraper. Supported config keys:
url — search URL
search_param — query param name
extra_params — dict of fixed extra query parameters
title_class — CSS class fragment for title elements (class-based strategy)
author_class — CSS class fragment for author elements
link_href_pattern — href regex to find title <a> links (link strategy, e.g. alib)
brief_class — CSS class for brief record rows (brief strategy, e.g. shpl)
"""Config-driven HTML scraper.
Supported config keys:
url — search URL
search_param — query param name
extra_params — dict of fixed extra query parameters
encoding — character encoding for query and response (e.g. "cp1251")
title_class — CSS class fragment for title elements (class-based strategy)
author_class — CSS class fragment for author elements
link_href_pattern — href regex to find title <a> links (link strategy)
brief_class — CSS class for brief record rows (brief strategy)
img_alt — truthy: extract titles from <img alt> attributes (rusneb strategy)
bold_text — truthy: extract author/title from <p><b>…</b> blocks (alib strategy)
"""
category = "archive_searchers"
@@ -51,30 +108,118 @@ class HtmlScraperPlugin:
self._domain: str = urlparse(str(config.get("url") or "")).netloc or plugin_id
def search(self, query: str) -> list[CandidateRecord]:
    """Search for books matching query.

    Exactly one HTTP GET is issued.  When the config has an ``encoding``
    key, the query string is percent-encoded by hand from bytes in that
    encoding and the response body is decoded with the same encoding;
    otherwise parameter encoding and response decoding are left to httpx.

    The parsing strategy is chosen from the config in this order:
    ``bold_text``, ``img_alt``, ``link_href_pattern``, ``brief_class``,
    and finally the ``title_class``/``author_class`` strategy.

    Args:
        query: Free-text search string (author, title, keywords).

    Returns:
        Up to three CandidateRecord dicts with source, title, author, year,
        isbn, and publisher fields.
    """
    cfg = self.config
    self._rl.wait_and_record(self._domain, self.rate_limit_seconds)

    headers = {"User-Agent": "Mozilla/5.0"}
    extra: dict[str, Any] = dict(cfg.get("extra_params") or {})
    encoding = str(cfg.get("encoding") or "")
    if encoding:
        # Encode query and extra params in the site's native encoding; the
        # query string is assembled manually so the bytes on the wire are in
        # that encoding.  The search parameter goes first, then the extras.
        q_enc = quote(query.encode(encoding, "replace"))
        ep_parts = [f"{k}={quote(str(v).encode(encoding, 'replace'))}" for k, v in extra.items()]
        raw_qs = "&".join([f'{cfg["search_param"]}={q_enc}'] + ep_parts)
        r = httpx.get(
            f'{cfg["url"]}?{raw_qs}',
            timeout=self.timeout,
            headers=headers,
        )
        html = r.content.decode(encoding, errors="replace")
    else:
        params: dict[str, Any] = extra
        params[cfg["search_param"]] = query
        r = httpx.get(
            cfg["url"],
            params=params,
            timeout=self.timeout,
            headers=headers,
        )
        html = r.text

    # Years are collected from the whole page and assigned to records by position.
    years = _YEAR_RE.findall(html)

    # Strategy: bold_text (alib-style <p><b>…</b> entries)
    if cfg.get("bold_text"):
        return self._parse_bold_text(html, years)
    # Strategy: img_alt (rusneb-style titles in <img alt>)
    if cfg.get("img_alt"):
        return self._parse_img_alt(html, years, cfg)
    # Strategy: link_href_pattern
    if "link_href_pattern" in cfg:
        return self._parse_link(html, years, cfg)
    # Strategy: brief_class (shpl-style)
    if "brief_class" in cfg:
        return self._parse_brief(html, years, cfg)
    # Strategy: title_class + author_class
    return self._parse_class(html, years, cfg)
def _parse_bold_text(self, html: str, years: list[str]) -> list[CandidateRecord]:
    """Build records from ``<p><b>text</b>`` entries (Alib-style).

    Each bold entry is expected to read ``Surname I.N. Title…``.  When
    ``_AUTHOR_PREFIX_PAT`` matches, its first group becomes the author and
    its second group the title; otherwise the whole entry is the title and
    the author is left empty.

    Args:
        html: Decoded HTML response.
        years: Year strings found in the full HTML; the n-th year is paired
            with the n-th entry.

    Returns:
        Up to three CandidateRecord dicts.
    """
    records: list[CandidateRecord] = []
    bold_entries = re.findall(r"<p><b>([^<]{5,200})</b>", html)
    for idx, raw in enumerate(bold_entries[:3]):
        cleaned = raw.strip()
        prefix = _AUTHOR_PREFIX_PAT.match(cleaned)
        if prefix is None:
            author, title = "", cleaned
        else:
            author, title = prefix.group(1).strip(), prefix.group(2).strip()
        year = years[idx] if idx < len(years) else ""
        records.append(
            CandidateRecord(
                source=self.plugin_id,
                title=title,
                author=author,
                year=year,
                isbn="",
                publisher="",
            )
        )
    return records
def _parse_img_alt(self, html: str, years: list[str], cfg: dict[str, Any]) -> list[CandidateRecord]:
    """Build records from ``<img alt>`` titles and class-selected authors.

    Titles come from ``_img_alts``; authors come from ``_cls_inner_texts``
    using the ``author_class`` config value (default ``"author"``).  Authors
    and years are paired with titles by position; a missing author or year
    becomes an empty string.

    Args:
        html: Decoded HTML response.
        years: Year strings found in the full HTML (used positionally).
        cfg: Plugin config dict (reads ``author_class``).

    Returns:
        One CandidateRecord per extracted title (at most three).
    """

    def _at(values: list[str], pos: int) -> str:
        # Positional lookup that falls back to "" past the end of the list.
        return values[pos] if pos < len(values) else ""

    titles = _img_alts(html)
    authors = _cls_inner_texts(html, cfg.get("author_class", "author"), 3, 80)
    records: list[CandidateRecord] = []
    for pos, book_title in enumerate(titles):
        records.append(
            CandidateRecord(
                source=self.plugin_id,
                title=book_title,
                author=_at(authors, pos),
                year=_at(years, pos),
                isbn="",
                publisher="",
            )
        )
    return records
def _parse_class(self, html: str, years: list[str], cfg: dict[str, Any]) -> list[CandidateRecord]:
titles = _cls_re(cfg.get("title_class", "title")).findall(html)[:3]
authors = _cls_re(cfg.get("author_class", "author"), 3, 80).findall(html)[:3]

View File

@@ -1,5 +1,17 @@
"""RSL (Russian State Library) AJAX JSON search API plugin (search.rsl.ru)."""
"""RSL (Russian State Library) search plugin (search.rsl.ru).
The search API requires a POST to ``/site/ajax-search?language=ru`` with
form-encoded body containing ``SearchFilterForm[search]`` and a CSRF token
obtained from the main search page. Query syntax is CQL:
``title:(<title words>) AND author:(<author words>)``.
Results come back as an HTML fragment in the ``content`` key of a JSON
envelope; individual records are identified by the CSS classes
``rsl-item-nocover-title`` (author) and ``rsl-item-nocover-descr`` (title).
Both fields contain ``<b>`` highlight tags that are stripped before returning.
"""
import re
from typing import Any
import httpx
@@ -9,9 +21,27 @@ from models import CandidateRecord
from ..rate_limiter import RateLimiter
# Key passed to the rate limiter for every search made by this plugin.
_DOMAIN = "search.rsl.ru"
# AJAX endpoint that receives the POSTed search form.
_SEARCH_URL = "https://search.rsl.ru/site/ajax-search"
# Search page fetched first to read the CSRF token; also sent as the Referer.
_BASE_URL = "https://search.rsl.ru/ru/search"
# Matches a standalone four-digit year in the range 1000–1999 or 2000–2029.
_YEAR_RE = re.compile(r"\b(1[0-9]{3}|20[012][0-9])\b")
def _strip_tags(html_frag: str) -> str:
"""Strip HTML tags and decode basic entities from a fragment."""
text = re.sub(r"<[^>]+>", "", html_frag)
text = text.replace("&quot;", '"').replace("&amp;", "&").replace("&lt;", "<").replace("&gt;", ">")
return re.sub(r"\s+", " ", text).strip()
class RSLPlugin:
"""Archive searcher for search.rsl.ru.
Formats the query as CQL ``title:(title_words) AND author:(author_word)``
by treating the first whitespace-delimited token as the author surname and
the remainder as title keywords. When only one token is present, a plain
``title:(token) OR author:(token)`` query is used instead.
"""
category = "archive_searchers"
def __init__(
@@ -32,28 +62,79 @@ class RSLPlugin:
self.timeout = timeout
def search(self, query: str) -> list[CandidateRecord]:
    """Search RSL for books matching query.

    Two requests share one HTTP client: a GET of the search page to read
    the ``_csrf`` form token, then a POST of the CQL query to the AJAX
    endpoint.  The client is closed when the requests are done.  If no
    token is found on the page, an empty token is sent.

    The JSON response's ``content`` key is treated as an HTML fragment;
    titles, authors and years are extracted from it with regular
    expressions and paired by position.

    Args:
        query: Free-text string; the first token is treated as the author
            surname and remaining tokens as title keywords.

    Returns:
        Up to three CandidateRecord dicts extracted from the RSL HTML
        response, with ``<b>`` highlight tags stripped.  Entries whose
        title is empty after stripping are skipped.
    """
    self._rl.wait_and_record(_DOMAIN, self.rate_limit_seconds)
    cql = self._build_cql(query)

    # The context manager releases the client's connections even when a
    # request or the JSON decoding raises.
    with httpx.Client() as client:
        # Fetch the main page to obtain a valid CSRF token.
        r0 = client.get(_BASE_URL, timeout=self.timeout, headers={"User-Agent": "Mozilla/5.0"})
        csrf_match = re.search(r'name="_csrf"\s+value="([^"]+)"', r0.text)
        csrf = csrf_match.group(1) if csrf_match else ""

        r = client.post(
            _SEARCH_URL,
            params={"language": "ru"},
            data={"SearchFilterForm[search]": cql, "_csrf": csrf},
            timeout=self.timeout,
            headers={
                "Accept": "application/json",
                "X-Requested-With": "XMLHttpRequest",
                "Referer": _BASE_URL,
                "User-Agent": "Mozilla/5.0",
            },
        )
        data: dict[str, Any] = r.json()

    content = str(data.get("content") or "")

    raw_titles = re.findall(r'rsl-item-nocover-descr[^"]*">(.*?)</div>', content)[:3]
    raw_authors = re.findall(r'rsl-item-nocover-title[^"]*">(.*?)</div>', content)[:3]
    years = _YEAR_RE.findall(content)[:3]

    out: list[CandidateRecord] = []
    for i, raw_title in enumerate(raw_titles):
        title = _strip_tags(raw_title)
        if not title:
            continue
        author = _strip_tags(raw_authors[i]) if i < len(raw_authors) else ""
        out.append(
            CandidateRecord(
                source=self.plugin_id,
                title=title,
                author=author,
                year=years[i] if i < len(years) else "",
                isbn="",
                publisher="",
            )
        )
    return out
@staticmethod
def _build_cql(query: str) -> str:
"""Build a CQL query string for the RSL search API.
Args:
query: Raw query string, typically ``"Author Title keywords"``.
Returns:
CQL string in the form ``title:(…) AND author:(…)`` when the query
contains multiple tokens, or ``title:(…) OR author:(…)`` for a
single token.
"""
tokens = query.split()
if len(tokens) > 1:
author_part = tokens[0]
title_part = " ".join(tokens[1:])
return f"title:({title_part}) AND author:({author_part})"
token = tokens[0] if tokens else query
return f"title:({token}) OR author:({token})"