Each archive scraper now has its own class with hardcoded URL and parsing logic; config only carries auto_queue, timeout, and rate_limit_seconds.

- html_scraper: refactor to base class with public shared utilities (YEAR_RE, AUTHOR_PREFIX_PAT, cls_inner_texts, img_alts)
- rusneb.py (new): RusnebPlugin extracts year per list item rather than globally, eliminating wrong page-level dates
- alib.py (new): AlibPlugin extracts year from within each <p><b> entry rather than globally, fixing nonsensical year values
- shpl.py (new): ShplPlugin retains the dead ШПИЛ endpoint with hardcoded params; config type updated from html_scraper to shpl
- config: remove config: subsections from rusneb, alib_web, shpl entries; update type fields to rusneb, alib_web, shpl respectively
- plugins/__init__.py: register new specific types, remove html_scraper
- tests: use specific plugin classes; assert all CandidateRecord fields (source, title, author, year, isbn, publisher) with appropriate constraints

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
102 lines · 3.1 KiB · Python
"""Base class and shared HTML parsing utilities for archive scraper plugins."""
|
||
|
||
import re
|
||
from typing import Any
|
||
|
||
from models import CandidateRecord
|
||
|
||
from ..rate_limiter import RateLimiter
|
||
|
||
# Four-digit year, word-bounded: 1000–1999 or 2000–2029.
YEAR_RE = re.compile(r"\b(1[0-9]{3}|20[012][0-9])\b")

# "Surname I.N. Rest of title" — group 1 captures the surname plus 1–3
# initials (Cyrillic or Latin, dot-terminated, contiguous — no spaces
# between initials), group 2 captures the remainder.
AUTHOR_PREFIX_PAT = re.compile(r"^(\S+\s+(?:[А-ЯЁA-Z]\.){1,3}\s*)(.+)", re.DOTALL)
|
||
|
||
|
||
def cls_inner_texts(html: str, cls_frag: str, min_len: int = 3, max_len: int = 80) -> list[str]:
    """Extract text content from elements whose class contains cls_frag.

    Matches each element's own closing tag, so nested inline markup such as
    ``<span class='…'><b>Name</b> I.N.</span>`` yields the full inner text
    ("Name I.N.") with tags stripped and whitespace normalised.

    Args:
        html: Raw HTML string to search.
        cls_frag: Substring that must appear in the class attribute value.
        min_len: Minimum length of extracted text to keep.
        max_len: Maximum length of extracted text to keep.

    Returns:
        Up to three non-empty text strings in document order.
    """
    # Capture the tag name and lazily match to *this* element's closing tag.
    # The previous pattern stopped at the first `</`, which truncated nested
    # markup like `<b>Name</b> I.N.` to just "Name".
    pattern = (
        rf'<(\w+)[^>]*class=["\'][^"\']*{re.escape(cls_frag)}[^"\']*["\'][^>]*>'
        rf"(.*?)</\1\s*>"
    )
    out: list[str] = []
    for _tag, inner in re.findall(pattern, html, re.DOTALL):
        text = re.sub(r"<[^>]+>", "", inner)      # strip nested tags
        text = re.sub(r"\s+", " ", text).strip()  # collapse whitespace
        if min_len <= len(text) <= max_len:
            out.append(text)
            if len(out) == 3:
                break
    return out
|
||
|
||
|
||
def img_alts(html: str, min_len: int = 5, max_len: int = 120) -> list[str]:
    """Collect alt-attribute texts from <img> tags.

    Each alt value has its whitespace collapsed to single spaces; only
    values whose length falls within [min_len, max_len] are kept.

    Args:
        html: Raw HTML string to search.
        min_len: Minimum character length to include.
        max_len: Maximum character length to include.

    Returns:
        Up to three non-empty, whitespace-normalised alt strings in
        document order.
    """
    candidates = re.findall(r'<img[^>]+alt=[\'"]([^\'"]+)[\'"]', html)
    normalised = (re.sub(r"\s+", " ", raw).strip() for raw in candidates)
    kept = [text for text in normalised if min_len <= len(text) <= max_len]
    return kept[:3]
|
||
|
||
|
||
class HtmlScraperPlugin:
    """Base class for HTML-scraping archive plugins.

    Performs the constructor wiring shared by all scraper subclasses;
    each subclass supplies its own search() with site-specific hardcoded
    logic.  The config dict is accepted purely for registry compatibility
    and is ignored here — all scraping details live in the subclass.
    """

    # Registry category under which all scraper plugins are grouped.
    category = "archive_searchers"

    def __init__(
        self,
        plugin_id: str,
        name: str,
        rate_limiter: RateLimiter,
        rate_limit_seconds: float,
        auto_queue: bool,
        timeout: int,
        config: dict[str, Any],
    ):
        # Identity / display fields.
        self.plugin_id = plugin_id
        self.name = name
        # Throttling: shared limiter plus this plugin's minimum interval.
        self._rl = rate_limiter
        self.rate_limit_seconds = rate_limit_seconds
        # Behavioural flags.
        self.auto_queue = auto_queue
        self.timeout = timeout

    def search(self, query: str) -> list[CandidateRecord]:
        """Search for books matching query.

        Subclasses must override this with their site-specific scraper.

        Args:
            query: Free-text search string.

        Returns:
            Up to three CandidateRecord dicts with source, title, author,
            year, isbn, and publisher fields.

        Raises:
            NotImplementedError: Always, on the base class.
        """
        raise NotImplementedError
|