Replace config-driven HtmlScraperPlugin with specific archive classes

Each archive scraper now has its own class with hardcoded URL and parsing
logic; config only carries auto_queue, timeout, and rate_limit_seconds.

- html_scraper: refactor to base class with public shared utilities
  (YEAR_RE, AUTHOR_PREFIX_PAT, cls_inner_texts, img_alts)
- rusneb.py (new): RusnebPlugin extracts year per list item rather than
  globally, eliminating wrong page-level dates
- alib.py (new): AlibPlugin extracts year from within each <p><b> entry
  rather than globally, fixing nonsensical year values
- shpl.py (new): ShplPlugin retains the dead ШПИЛ endpoint with hardcoded
  params; config type updated from html_scraper to shpl
- config: remove config: subsections from rusneb, alib_web, shpl entries;
  update type fields to rusneb, alib_web, shpl respectively
- plugins/__init__.py: register new specific types, remove html_scraper
- tests: use specific plugin classes; assert all CandidateRecord fields
  (source, title, author, year, isbn, publisher) with appropriate constraints

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-03-10 00:03:17 +03:00
parent b8f82607f9
commit fd32be729f
7 changed files with 261 additions and 227 deletions

View File

@@ -41,16 +41,20 @@ _type_to_class: dict[str, Any] = {} # populated lazily on first call
def _archive_classes() -> dict[str, Any]:
if not _type_to_class:
from .archives.html_scraper import HtmlScraperPlugin
from .archives.alib import AlibPlugin
from .archives.openlibrary import OpenLibraryPlugin
from .archives.rsl import RSLPlugin
from .archives.rusneb import RusnebPlugin
from .archives.shpl import ShplPlugin
from .archives.sru_catalog import SRUCatalogPlugin
_type_to_class.update(
{
"openlibrary": OpenLibraryPlugin,
"rsl": RSLPlugin,
"html_scraper": HtmlScraperPlugin,
"rusneb": RusnebPlugin,
"alib_web": AlibPlugin,
"shpl": ShplPlugin,
"sru_catalog": SRUCatalogPlugin,
}
)

View File

@@ -0,0 +1,70 @@
"""Alib (alib.ru) archive search plugin."""
import re
from urllib.parse import quote
import httpx
from models import CandidateRecord
from .html_scraper import AUTHOR_PREFIX_PAT, YEAR_RE, HtmlScraperPlugin
# Search endpoint requested by AlibPlugin.search().
_URL = "https://www.alib.ru/find3.php4"
# Key passed to self._rl.wait_and_record() before each request.
_DOMAIN = "www.alib.ru"
# Windows-1251: used both to percent-encode the query string and to decode
# the response body.
_ENCODING = "cp1251"
# Fixed query parameters appended after the ``tfind`` search term.
# NOTE(review): the meaning of "f" and "s" is not visible here — confirm
# against the site's search form.
_EXTRA_PARAMS: dict[str, str] = {"f": "5", "s": "0"}
# Book entries appear as <p><b>Author Title Year Publisher…</b>
# The single group captures 5-200 characters of tag-free text inside <b>.
_ENTRY_RE = re.compile(r"<p><b>([^<]{5,200})</b>")
class AlibPlugin(HtmlScraperPlugin):
    """Archive searcher for alib.ru.

    The search page is requested and decoded as Windows-1251. Every
    ``<p><b>Author Title Year...</b>`` entry becomes one record: a leading
    "Surname I.N." prefix (Cyrillic or Latin initials) is split off as the
    author, and the year is looked up inside the entry text itself instead
    of anywhere else on the page.
    """

    def search(self, query: str) -> list[CandidateRecord]:
        """Search Alib for books matching query.

        Args:
            query: Free-text search string.

        Returns:
            Up to three CandidateRecord dicts with source, title, author, year,
            isbn, and publisher fields (isbn and publisher are always empty).
        """
        self._rl.wait_and_record(_DOMAIN, self.rate_limit_seconds)

        # The query string is assembled by hand so that every value is
        # percent-encoded from cp1251 bytes rather than UTF-8.
        pairs = [f"tfind={quote(query.encode(_ENCODING, 'replace'))}"]
        for key, value in _EXTRA_PARAMS.items():
            pairs.append(f"{key}={quote(str(value).encode(_ENCODING, 'replace'))}")
        response = httpx.get(
            f"{_URL}?{'&'.join(pairs)}",
            timeout=self.timeout,
            headers={"User-Agent": "Mozilla/5.0"},
        )
        page = response.content.decode(_ENCODING, errors="replace")

        records: list[CandidateRecord] = []
        for raw_entry in _ENTRY_RE.findall(page)[:3]:
            text = raw_entry.strip()
            year_match = YEAR_RE.search(text)
            prefix_match = AUTHOR_PREFIX_PAT.match(text)
            if prefix_match is None:
                # No recognisable "Surname I.N." prefix: keep everything as title.
                author, title = "", text
            else:
                author = prefix_match.group(1).strip()
                title = prefix_match.group(2).strip()
            records.append(
                CandidateRecord(
                    source=self.plugin_id,
                    title=title,
                    author=author,
                    year=year_match.group(0) if year_match else "",
                    isbn="",
                    publisher="",
                )
            )
        return records

View File

@@ -1,27 +1,17 @@
"""Config-driven HTML scraper for archive sites (rusneb, alib, shpl, etc.)."""
"""Base class and shared HTML parsing utilities for archive scraper plugins."""
import re
from typing import Any
from urllib.parse import quote, urlparse
import httpx
from models import CandidateRecord
from ..rate_limiter import RateLimiter
# Four-digit year as a whole word: 1000-1999 or 2000-2029.
_YEAR_RE = re.compile(r"\b(1[0-9]{3}|20[012][0-9])\b")
# Matches "Surname I.N. " or "Surname I. " at the start of an entry.
# Group 1 is the author prefix (one non-space token followed by one to three
# dotted uppercase Cyrillic/Latin initials); group 2 is the rest of the text
# (DOTALL, so it may span several lines).
_AUTHOR_PREFIX_PAT = re.compile(r"^(\S+\s+(?:[А-ЯЁA-Z]\.){1,3}\s*)(.+)", re.DOTALL)
# Underscore-free names bound to the same two expressions.
YEAR_RE = re.compile(r"\b(1[0-9]{3}|20[012][0-9])\b")
AUTHOR_PREFIX_PAT = re.compile(r"^(\S+\s+(?:[А-ЯЁA-Z]\.){1,3}\s*)(.+)", re.DOTALL)
def _cls_re(cls_frag: str, min_len: int = 3, max_len: int = 120) -> re.Pattern[str]:
    """Compile a regex that captures the text of elements whose class contains cls_frag.

    Args:
        cls_frag: Literal fragment of the class attribute value; it is
            escaped, so regex metacharacters in it match themselves.
        min_len: Minimum length of the captured text.
        max_len: Maximum length of the captured text.

    Returns:
        A compiled pattern with one group: the tag-free text that directly
        follows the opening tag and is terminated by the next ``<``.
    """
    # Support both single and double-quoted class attributes.
    quote_mark = r'["\']'
    non_quote_run = r'[^"\']*'
    length_bounds = f"{{{min_len},{max_len}}}"
    pattern = (
        "class=" + quote_mark + non_quote_run
        + re.escape(cls_frag)
        + non_quote_run + quote_mark
        + "[^>]*>([^<]" + length_bounds + ")<"
    )
    return re.compile(pattern)
def _cls_inner_texts(html: str, cls_frag: str, min_len: int = 3, max_len: int = 80) -> list[str]:
def cls_inner_texts(html: str, cls_frag: str, min_len: int = 3, max_len: int = 80) -> list[str]:
"""Extract text content from elements whose class contains cls_frag.
Strips inner HTML tags and normalises whitespace, so elements like
@@ -48,7 +38,7 @@ def _cls_inner_texts(html: str, cls_frag: str, min_len: int = 3, max_len: int =
return out
def _img_alts(html: str, min_len: int = 5, max_len: int = 120) -> list[str]:
def img_alts(html: str, min_len: int = 5, max_len: int = 120) -> list[str]:
"""Extract non-empty alt attributes from <img> tags, normalising whitespace.
Args:
@@ -71,19 +61,12 @@ def _img_alts(html: str, min_len: int = 5, max_len: int = 120) -> list[str]:
class HtmlScraperPlugin:
"""Config-driven HTML scraper.
"""Base class for HTML-scraping archive plugins.
Supported config keys:
url — search URL
search_param — query param name
extra_params — dict of fixed extra query parameters
encoding — character encoding for query and response (e.g. "cp1251")
title_class — CSS class fragment for title elements (class-based strategy)
author_class — CSS class fragment for author elements
link_href_pattern — href regex to find title <a> links (link strategy)
brief_class — CSS class for brief record rows (brief strategy)
img_alt — truthy: extract titles from <img alt> attributes (rusneb strategy)
bold_text — truthy: extract author/title from <p><b>…</b> blocks (alib strategy)
Handles common initialisation; subclasses implement search() with
site-specific hardcoded logic. The config dict is accepted for
registry compatibility but is not used by the base class; all scraping
details are hardcoded in the subclass.
"""
category = "archive_searchers"
@@ -104,163 +87,15 @@ class HtmlScraperPlugin:
self.rate_limit_seconds = rate_limit_seconds
self.auto_queue = auto_queue
self.timeout = timeout
self.config = config
self._domain: str = urlparse(str(config.get("url") or "")).netloc or plugin_id
def search(self, query: str) -> list[CandidateRecord]:
"""Search for books matching query.
Args:
query: Free-text search string (author, title, keywords).
query: Free-text search string.
Returns:
Up to three CandidateRecord dicts with source, title, author, year,
isbn, and publisher fields.
"""
cfg = self.config
self._rl.wait_and_record(self._domain, self.rate_limit_seconds)
encoding = str(cfg.get("encoding") or "")
if encoding:
# Encode query and extra params in the site's native encoding.
q_enc = quote(query.encode(encoding, "replace"))
ep: dict[str, Any] = dict(cfg.get("extra_params") or {})
ep_parts = [f"{k}={quote(str(v).encode(encoding, 'replace'))}" for k, v in ep.items()]
raw_qs = "&".join([f'{cfg["search_param"]}={q_enc}'] + ep_parts)
r = httpx.get(
f'{cfg["url"]}?{raw_qs}',
timeout=self.timeout,
headers={"User-Agent": "Mozilla/5.0"},
)
html = r.content.decode(encoding, errors="replace")
else:
params: dict[str, Any] = dict(cfg.get("extra_params") or {})
params[cfg["search_param"]] = query
r = httpx.get(
cfg["url"],
params=params,
timeout=self.timeout,
headers={"User-Agent": "Mozilla/5.0"},
)
html = r.text
years = _YEAR_RE.findall(html)
if cfg.get("bold_text"):
return self._parse_bold_text(html, years)
if cfg.get("img_alt"):
return self._parse_img_alt(html, years, cfg)
if "link_href_pattern" in cfg:
return self._parse_link(html, years, cfg)
if "brief_class" in cfg:
return self._parse_brief(html, years, cfg)
return self._parse_class(html, years, cfg)
def _parse_bold_text(self, html: str, years: list[str]) -> list[CandidateRecord]:
    """Build records from ``<p><b>text</b>`` entries (Alib-style).

    Each bold run is expected to start with ``Surname I.N. Title…``. When
    ``_AUTHOR_PREFIX_PAT`` matches, its first group becomes the author and
    its second the title; otherwise the whole text is used as the title.

    Args:
        html: Decoded HTML response.
        years: Year strings found in the full HTML; the i-th year is paired
            with the i-th entry.

    Returns:
        Up to three CandidateRecord dicts.
    """
    records: list[CandidateRecord] = []
    bold_runs = re.findall(r"<p><b>([^<]{5,200})</b>", html)
    for pos, raw in enumerate(bold_runs[:3]):
        cleaned = raw.strip()
        prefix = _AUTHOR_PREFIX_PAT.match(cleaned)
        if prefix is None:
            author, title = "", cleaned
        else:
            author, title = prefix.group(1).strip(), prefix.group(2).strip()
        # Years are matched purely by position; missing ones become "".
        year = years[pos] if pos < len(years) else ""
        records.append(
            CandidateRecord(
                source=self.plugin_id,
                title=title,
                author=author,
                year=year,
                isbn="",
                publisher="",
            )
        )
    return records
def _parse_img_alt(self, html: str, years: list[str], cfg: dict[str, Any]) -> list[CandidateRecord]:
    """Build records from ``<img alt>`` titles and class-matched author texts.

    Intended for sites such as rusneb.ru, where the thumbnail's alt text
    carries the book title and a separate element carries the author.

    Args:
        html: Decoded HTML response.
        years: Year strings found in the full HTML; paired with titles by
            position.
        cfg: Plugin config dict; ``author_class`` (default ``"author"``)
            selects the author elements.

    Returns:
        One CandidateRecord per title returned by ``_img_alts``; author and
        year fall back to "" when no value exists at that position.
    """
    titles = _img_alts(html)
    author_cls = cfg.get("author_class", "author")
    authors = _cls_inner_texts(html, author_cls, 3, 80)

    records = []
    for pos, title in enumerate(titles):
        author = authors[pos] if pos < len(authors) else ""
        year = years[pos] if pos < len(years) else ""
        records.append(
            CandidateRecord(
                source=self.plugin_id,
                title=title,
                author=author,
                year=year,
                isbn="",
                publisher="",
            )
        )
    return records
def _parse_class(self, html: str, years: list[str], cfg: dict[str, Any]) -> list[CandidateRecord]:
    """Build records from class-matched title and author elements.

    Args:
        html: Decoded HTML response.
        years: Year strings found in the full HTML; paired with titles by
            position.
        cfg: Plugin config dict; reads ``title_class`` (default ``"title"``)
            and ``author_class`` (default ``"author"``).

    Returns:
        Up to three CandidateRecord dicts, one per matched title.
    """
    title_hits = _cls_re(cfg.get("title_class", "title")).findall(html)[:3]
    author_hits = _cls_re(cfg.get("author_class", "author"), 3, 80).findall(html)[:3]

    records = []
    for pos, raw_title in enumerate(title_hits):
        # Authors and years are aligned with titles by index only.
        author = author_hits[pos].strip() if pos < len(author_hits) else ""
        year = years[pos] if pos < len(years) else ""
        records.append(
            CandidateRecord(
                source=self.plugin_id,
                title=raw_title.strip(),
                author=author,
                year=year,
                isbn="",
                publisher="",
            )
        )
    return records
def _parse_link(self, html: str, years: list[str], cfg: dict[str, Any]) -> list[CandidateRecord]:
    """Build records from ``<a>`` links whose href matches a configured pattern.

    Args:
        html: Decoded HTML response.
        years: Year strings found in the full HTML; paired with titles by
            position.
        cfg: Plugin config dict; ``link_href_pattern`` is inserted verbatim
            into the link regex, and ``author_class`` (default ``"author"``)
            selects the author elements.

    Returns:
        Up to three CandidateRecord dicts, one per matched link text.
    """
    href_pat = cfg.get("link_href_pattern", r"")
    link_texts = re.findall(rf'<a[^>]+href="[^"]*{href_pat}[^"]*"[^>]*>([^<]{{3,120}})</a>', html)
    link_texts = link_texts[:3]
    author_re = _cls_re(cfg.get("author_class", "author"), 3, 80)
    author_hits = author_re.findall(html)[:3]

    records = []
    for pos, link_text in enumerate(link_texts):
        author = author_hits[pos].strip() if pos < len(author_hits) else ""
        year = years[pos] if pos < len(years) else ""
        records.append(
            CandidateRecord(
                source=self.plugin_id,
                title=link_text.strip(),
                author=author,
                year=year,
                isbn="",
                publisher="",
            )
        )
    return records
def _parse_brief(self, html: str, years: list[str], cfg: dict[str, Any]) -> list[CandidateRecord]:
    """Build author-less records from brief-record elements.

    Args:
        html: Decoded HTML response.
        years: Year strings found in the full HTML; paired with entries by
            position.
        cfg: Plugin config dict; ``brief_class`` (default ``"brief"``)
            selects the elements whose text becomes the title.

    Returns:
        Up to three CandidateRecord dicts with an empty author field.
    """
    brief_re = _cls_re(cfg.get("brief_class", "brief"), 3, 120)
    records = []
    for pos, raw in enumerate(brief_re.findall(html)[:3]):
        year = years[pos] if pos < len(years) else ""
        records.append(
            CandidateRecord(
                source=self.plugin_id,
                title=raw.strip(),
                author="",
                year=year,
                isbn="",
                publisher="",
            )
        )
    return records
raise NotImplementedError

View File

@@ -0,0 +1,64 @@
"""НЭБ (rusneb.ru) archive search plugin."""
import re
import httpx
from models import CandidateRecord
from .html_scraper import HtmlScraperPlugin, YEAR_RE, cls_inner_texts, img_alts
# Search endpoint; RusnebPlugin.search() sends the query as the ``q`` parameter.
_URL = "https://rusneb.ru/search/"
# Key passed to self._rl.wait_and_record() before each request.
_DOMAIN = "rusneb.ru"
# Class fragment used to locate the author text inside one result item.
_AUTHOR_CLASS = "search-list__item_subtext"
# Each search result is a <li> whose class contains search-list__item but not a BEM
# child element suffix (which would begin with underscore, e.g. __item_subtext).
# The lazy group with DOTALL captures the item's inner HTML up to the first
# closing </li> that follows the opening tag.
_ITEM_RE = re.compile(
    r'<li[^>]*class=["\'][^"\']*search-list__item(?!_)[^"\']*["\'][^>]*>(.*?)</li>',
    re.DOTALL,
)
class RusnebPlugin(HtmlScraperPlugin):
    """Archive searcher for rusneb.ru (NEB, the National Electronic Library).

    Works one search-result list item at a time: the title comes from the
    first ``<img alt>`` in the item, the author from the first
    ``.search-list__item_subtext`` element, and the year from the item's own
    HTML, so unrelated page-level dates are never picked up.
    """

    def search(self, query: str) -> list[CandidateRecord]:
        """Search NEB for books matching query.

        Args:
            query: Free-text search string.

        Returns:
            Up to three CandidateRecord dicts with source, title, author, year,
            isbn, and publisher fields (isbn and publisher are always empty).
        """
        self._rl.wait_and_record(_DOMAIN, self.rate_limit_seconds)
        response = httpx.get(
            _URL,
            params={"q": query},
            timeout=self.timeout,
            headers={"User-Agent": "Mozilla/5.0"},
        )
        page = response.text

        records: list[CandidateRecord] = []
        for fragment in _ITEM_RE.findall(page):
            titles = img_alts(fragment)
            if titles:
                author_texts = cls_inner_texts(fragment, _AUTHOR_CLASS, 3, 80)
                year_match = YEAR_RE.search(fragment)
                records.append(
                    CandidateRecord(
                        source=self.plugin_id,
                        title=titles[0],
                        author=author_texts[0] if author_texts else "",
                        year=year_match.group(0) if year_match else "",
                        isbn="",
                        publisher="",
                    )
                )
                # Items without an alt text are skipped and do not count
                # towards the three-record cap.
                if len(records) == 3:
                    break
        return records

View File

@@ -0,0 +1,63 @@
"""ШПИЛ archive search plugin.
Note: the IRBIS64 CGI endpoint currently returns HTTP 404 and this plugin
produces no results. The class is retained so the configuration entry can
be re-enabled if the endpoint is restored.
"""
import re
import httpx
from models import CandidateRecord
from .html_scraper import YEAR_RE, HtmlScraperPlugin
# IRBIS64 CGI endpoint queried by ShplPlugin.search().
_URL = "https://www.shpl.ru/cgi-bin/irbis64/cgiirbis_64.exe"
# Key passed to self._rl.wait_and_record() before each request.
_DOMAIN = "www.shpl.ru"
# Fixed query parameters sent with every search; the query itself is added
# separately under "S21ALL".
# NOTE(review): the meaning of the individual IRBIS parameters is not visible
# here — confirm against the IRBIS64 web gateway documentation.
_EXTRA_PARAMS: dict[str, str] = {
    "C21COM": "S",
    "I21DBN": "BIBL",
    "P21DBN": "BIBL",
    "S21FMT": "briefWebRus",
    "Z21ID": "",
}
# Captures 3-120 characters of tag-free text directly after an opening tag
# whose class attribute is exactly ``brief`` (either quote style).
_BRIEF_RE = re.compile(r'class=["\']brief["\'][^>]*>([^<]{3,120})<')
class ShplPlugin(HtmlScraperPlugin):
    """Archive searcher for shpl.ru (State Public Historical Library of Russia).

    Extracts brief record entries from elements with class ``brief``.
    The remote IRBIS64 CGI endpoint is currently offline (HTTP 404), so in
    practice no entries are found and the result list is empty.
    """

    def search(self, query: str) -> list[CandidateRecord]:
        """Search the library catalogue for books matching query.

        Args:
            query: Free-text search string.

        Returns:
            Up to three CandidateRecord dicts with source, title, author, year,
            isbn, and publisher fields. ``year`` is the first year found
            inside the entry's own text, or "" when the entry has none;
            author, isbn and publisher are always empty.
        """
        self._rl.wait_and_record(_DOMAIN, self.rate_limit_seconds)
        params: dict[str, str] = {**_EXTRA_PARAMS, "S21ALL": query}
        r = httpx.get(_URL, params=params, timeout=self.timeout, headers={"User-Agent": "Mozilla/5.0"})

        out: list[CandidateRecord] = []
        for raw in _BRIEF_RE.findall(r.text)[:3]:
            title = raw.strip()
            # Take the year from this entry only. Pairing the i-th year found
            # anywhere on the page with the i-th entry attaches unrelated
            # page-level dates to records.
            year_m = YEAR_RE.search(title)
            out.append(
                CandidateRecord(
                    source=self.plugin_id,
                    title=title,
                    author="",
                    year=year_m.group(0) if year_m else "",
                    isbn="",
                    publisher="",
                )
            )
        return out