Each archive scraper now has its own class with hardcoded URL and parsing logic; config only carries auto_queue, timeout, and rate_limit_seconds. - html_scraper: refactor to base class with public shared utilities (YEAR_RE, AUTHOR_PREFIX_PAT, cls_inner_texts, img_alts) - rusneb.py (new): RusnebPlugin extracts year per list item rather than globally, eliminating wrong page-level dates - alib.py (new): AlibPlugin extracts year from within each <p><b> entry rather than globally, fixing nonsensical year values - shpl.py (new): ShplPlugin retains the dead ШПИЛ endpoint with hardcoded params; config type updated from html_scraper to shpl - config: remove config: subsections from rusneb, alib_web, shpl entries; update type fields to rusneb, alib_web, shpl respectively - plugins/__init__.py: register new specific types, remove html_scraper - tests: use specific plugin classes; assert all CandidateRecord fields (source, title, author, year, isbn, publisher) with appropriate constraints Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
64 lines
1.9 KiB
Python
64 lines
1.9 KiB
Python
"""ШПИЛ archive search plugin.
|
|
|
|
Note: the IRBIS64 CGI endpoint currently returns HTTP 404 and this plugin
|
|
produces no results. The class is retained so the configuration entry can
|
|
be re-enabled if the endpoint is restored.
|
|
"""
|
|
|
|
import re
|
|
|
|
import httpx
|
|
|
|
from models import CandidateRecord
|
|
|
|
from .html_scraper import YEAR_RE, HtmlScraperPlugin
|
|
|
|
# IRBIS64 CGI search endpoint of the Historical Public Library (currently 404).
_URL = "https://www.shpl.ru/cgi-bin/irbis64/cgiirbis_64.exe"

# Domain key passed to the shared rate limiter (one bucket per host).
_DOMAIN = "www.shpl.ru"

# Fixed IRBIS64 CGI parameters; the free-text query is added at request time
# under the "S21ALL" key (see ShplPlugin.search).
_EXTRA_PARAMS: dict[str, str] = {
    "C21COM": "S",
    "I21DBN": "BIBL",
    "P21DBN": "BIBL",
    "S21FMT": "briefWebRus",
    "Z21ID": "",
}

# Captures the tag-free inner text (3-120 chars) of elements whose class
# attribute is "brief" — the brief-record title lines in search results.
_BRIEF_RE = re.compile(r'class=["\']brief["\'][^>]*>([^<]{3,120})<')
|
|
|
|
|
|
class ShplPlugin(HtmlScraperPlugin):
    """Archive searcher for shpl.ru (ШПИЛ — Государственная публичная историческая библиотека).

    Extracts brief record entries from elements with class ``brief``.
    The remote IRBIS64 CGI endpoint is currently offline (HTTP 404).
    """

    def search(self, query: str) -> list[CandidateRecord]:
        """Search ШПИЛ for books matching query.

        Args:
            query: Free-text search string.

        Returns:
            Up to three CandidateRecord dicts with source, title, author, year,
            isbn, and publisher fields. Empty when the endpoint responds with
            a non-200 status (it is currently offline).
        """
        # Respect the configured per-domain rate limit before hitting the host.
        self._rl.wait_and_record(_DOMAIN, self.rate_limit_seconds)
        params: dict[str, str] = dict(_EXTRA_PARAMS)
        params["S21ALL"] = query
        r = httpx.get(
            _URL,
            params=params,
            timeout=self.timeout,
            headers={"User-Agent": "Mozilla/5.0"},
        )
        # The endpoint currently returns HTTP 404. Previously the error page
        # was parsed anyway, so stray YEAR_RE matches in it could be paired
        # with titles; bail out explicitly on any non-OK response instead.
        if r.status_code != 200:
            return []
        html = r.text
        # Years are collected page-wide and matched to titles by position.
        # NOTE(review): this is best-effort — the brief listing does not nest
        # a year inside each entry, so one missing year shifts the rest.
        years = YEAR_RE.findall(html)
        titles = _BRIEF_RE.findall(html)[:3]
        return [
            CandidateRecord(
                source=self.plugin_id,
                title=title.strip(),
                author="",
                year=years[i] if i < len(years) else "",
                isbn="",
                publisher="",
            )
            for i, title in enumerate(titles)
        ]
|