Replace config-driven HtmlScraperPlugin with specific archive classes
Each archive scraper now has its own class with hardcoded URL and parsing logic; config only carries auto_queue, timeout, and rate_limit_seconds.

- html_scraper: refactor to base class with public shared utilities (YEAR_RE, AUTHOR_PREFIX_PAT, cls_inner_texts, img_alts)
- rusneb.py (new): RusnebPlugin extracts year per list item rather than globally, eliminating wrong page-level dates
- alib.py (new): AlibPlugin extracts year from within each <p><b> entry rather than globally, fixing nonsensical year values
- shpl.py (new): ShplPlugin retains the dead ШПИЛ endpoint with hardcoded params; config type updated from html_scraper to shpl
- config: remove config: subsections from rusneb, alib_web, shpl entries; update type fields to rusneb, alib_web, shpl respectively
- plugins/__init__.py: register new specific types, remove html_scraper
- tests: use specific plugin classes; assert all CandidateRecord fields (source, title, author, year, isbn, publisher) with appropriate constraints

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
63
src/plugins/archives/shpl.py
Normal file
63
src/plugins/archives/shpl.py
Normal file
@@ -0,0 +1,63 @@
|
||||
"""ШПИЛ archive search plugin.
|
||||
|
||||
Note: the IRBIS64 CGI endpoint currently returns HTTP 404 and this plugin
|
||||
produces no results. The class is retained so the configuration entry can
|
||||
be re-enabled if the endpoint is restored.
|
||||
"""
|
||||
|
||||
import re
|
||||
|
||||
import httpx
|
||||
|
||||
from models import CandidateRecord
|
||||
|
||||
from .html_scraper import YEAR_RE, HtmlScraperPlugin
|
||||
|
||||
_URL = "https://www.shpl.ru/cgi-bin/irbis64/cgiirbis_64.exe"
|
||||
_DOMAIN = "www.shpl.ru"
|
||||
_EXTRA_PARAMS: dict[str, str] = {
|
||||
"C21COM": "S",
|
||||
"I21DBN": "BIBL",
|
||||
"P21DBN": "BIBL",
|
||||
"S21FMT": "briefWebRus",
|
||||
"Z21ID": "",
|
||||
}
|
||||
|
||||
_BRIEF_RE = re.compile(r'class=["\']brief["\'][^>]*>([^<]{3,120})<')
|
||||
|
||||
|
||||
class ShplPlugin(HtmlScraperPlugin):
    """Archive searcher for shpl.ru (ШПИЛ — Государственная публичная историческая библиотека).

    Extracts brief record entries from elements with class ``brief``.
    The remote IRBIS64 CGI endpoint is currently offline (HTTP 404).
    """

    def search(self, query: str) -> list[CandidateRecord]:
        """Search ШПИЛ for books matching query.

        Args:
            query: Free-text search string.

        Returns:
            Up to three CandidateRecord dicts with source, title, author, year,
            isbn, and publisher fields. Empty when the endpoint does not answer
            with HTTP 200 (its current state).
        """
        self._rl.wait_and_record(_DOMAIN, self.rate_limit_seconds)
        params: dict[str, str] = dict(_EXTRA_PARAMS)
        params["S21ALL"] = query
        r = httpx.get(_URL, params=params, timeout=self.timeout, headers={"User-Agent": "Mozilla/5.0"})
        # The endpoint currently returns 404; parsing the error page could
        # fabricate records from unrelated markup, so bail out early instead
        # of scraping whatever body came back.
        if r.status_code != 200:
            return []
        html = r.text
        # NOTE(review): years are collected page-wide and paired with brief
        # entries by position, which can misattribute dates (the page-level
        # date problem the rusneb/alib scrapers fix per item). Harmless while
        # the endpoint is dead; revisit if it is restored.
        years = YEAR_RE.findall(html)
        titles = _BRIEF_RE.findall(html)[:3]
        return [
            CandidateRecord(
                source=self.plugin_id,
                title=title.strip(),
                author="",
                year=years[i] if i < len(years) else "",
                isbn="",
                publisher="",
            )
            for i, title in enumerate(titles)
        ]
|
||||
Reference in New Issue
Block a user