Initial commit

Photo-based book cataloger with AI identification.
Room → Cabinet → Shelf → Book hierarchy; FastAPI + SQLite backend;
vanilla JS SPA; OpenAI-compatible plugin system for boundary
detection, text recognition, and archive search.
This commit is contained in:
2026-03-09 14:17:13 +03:00
commit 084d1aebd5
64 changed files with 8605 additions and 0 deletions

View File

@@ -0,0 +1,121 @@
"""Config-driven HTML scraper for archive sites (rusneb, alib, shpl, etc.)."""
import re
from typing import Any
from urllib.parse import urlparse
import httpx
from models import CandidateRecord
from ..rate_limiter import RateLimiter
# Matches a standalone four-digit year (word-bounded): 1000-1999 or 2000-2029.
# The single capture group means ``findall`` returns the year strings themselves.
_YEAR_RE = re.compile(r"\b(1[0-9]{3}|20[012][0-9])\b")
def _cls_re(cls_frag: str, min_len: int = 3, max_len: int = 120) -> re.Pattern[str]:
return re.compile(rf'class="[^"]*{re.escape(cls_frag)}[^"]*"[^>]*>([^<]{{{min_len},{max_len}}})<')
class HtmlScraperPlugin:
    """
    Config-driven HTML scraper.

    The result page is parsed with regular expressions. One of three
    extraction strategies is chosen from the keys present in ``config``,
    checked in this order:

    1. ``link_href_pattern`` — titles are the text of ``<a>`` links whose
       ``href`` matches the pattern (link strategy, e.g. alib).
    2. ``brief_class`` — titles are the text of elements with that class
       (brief strategy, e.g. shpl); no author is extracted.
    3. otherwise — ``title_class`` / ``author_class`` (class-based strategy).

    Titles, authors and years are paired purely by position: the i-th title
    match is combined with the i-th author match and the i-th year-like number
    found anywhere in the page. Captures that are blank after stripping are
    skipped, so a candidate never has an empty title.

    Supported config keys:
        url               — search URL
        search_param      — query param name
        extra_params      — dict of fixed extra query parameters
        title_class       — CSS class fragment for title elements (class-based strategy)
        author_class      — CSS class fragment for author elements
        link_href_pattern — href regex to find title <a> links (link strategy, e.g. alib)
        brief_class       — CSS class for brief record rows (brief strategy, e.g. shpl)
    """

    category = "archive_searchers"

    # Upper bound on the number of candidates returned by one search.
    _MAX_RESULTS = 3

    def __init__(
        self,
        plugin_id: str,
        name: str,
        rate_limiter: RateLimiter,
        rate_limit_seconds: float,
        auto_queue: bool,
        timeout: int,
        config: dict[str, Any],
    ):
        """Store the plugin settings and derive the rate-limit key.

        Args:
            plugin_id: Identifier of this plugin; used as ``source`` on every
                returned record and as the rate-limit key when the configured
                URL has no network location.
            name: Display name; stored on the instance.
            rate_limiter: Object whose ``wait_and_record(key, seconds)`` is
                called before each HTTP request.
            rate_limit_seconds: Interval passed to the rate limiter.
            auto_queue: Stored on the instance; not used by this class itself.
            timeout: Timeout passed to ``httpx.get``.
            config: Scraper configuration (see the class docstring for keys).
        """
        self.plugin_id = plugin_id
        self.name = name
        self._rl = rate_limiter
        self.rate_limit_seconds = rate_limit_seconds
        self.auto_queue = auto_queue
        self.timeout = timeout
        self.config = config
        # Rate limiting is keyed by the host of the search URL; fall back to
        # the plugin id when the URL is missing or has no netloc.
        self._domain: str = urlparse(str(config.get("url") or "")).netloc or plugin_id

    def search(self, query: str) -> list[CandidateRecord]:
        """Fetch the configured search page for *query* and extract candidates.

        Args:
            query: Search text, sent as the ``search_param`` query parameter.

        Returns:
            At most ``_MAX_RESULTS`` candidate records, possibly none.

        Raises:
            KeyError: If ``search_param`` or ``url`` is missing from the config.
            Any exception raised by ``httpx.get`` propagates unchanged.
        """
        cfg = self.config
        self._rl.wait_and_record(self._domain, self.rate_limit_seconds)

        params: dict[str, Any] = dict(cfg.get("extra_params") or {})
        params[cfg["search_param"]] = query
        response = httpx.get(
            cfg["url"],
            params=params,
            timeout=self.timeout,
            headers={"User-Agent": "Mozilla/5.0"},
        )
        # NOTE(review): the HTTP status is not checked, so an error page is
        # parsed like a normal result page — confirm this is intended.
        html = response.text
        years = _YEAR_RE.findall(html)

        # Strategy: link_href_pattern (alib-style)
        if "link_href_pattern" in cfg:
            return self._parse_link(html, years, cfg)
        # Strategy: brief_class (shpl-style)
        if "brief_class" in cfg:
            return self._parse_brief(html, years, cfg)
        # Strategy: title_class + author_class (rusneb-style)
        return self._parse_class(html, years, cfg)

    def _build_records(
        self,
        titles: list[str],
        authors: list[str],
        years: list[str],
    ) -> list[CandidateRecord]:
        """Pair raw title/author/year matches by position into candidate records.

        Titles that are empty after stripping are skipped: the text pattern
        also matches pure whitespace (e.g. the indentation before a nested
        tag), and such captures carry no information. Positional pairing uses
        the index of the raw title match, so skipping a blank title does not
        shift the author/year of the titles that follow it.
        """
        records: list[CandidateRecord] = []
        for i, raw_title in enumerate(titles):
            title = raw_title.strip()
            if not title:
                continue
            records.append(
                CandidateRecord(
                    source=self.plugin_id,
                    title=title,
                    author=authors[i].strip() if i < len(authors) else "",
                    year=years[i] if i < len(years) else "",
                    isbn="",
                    publisher="",
                )
            )
            if len(records) == self._MAX_RESULTS:
                break
        return records

    def _parse_class(self, html: str, years: list[str], cfg: dict[str, Any]) -> list[CandidateRecord]:
        """Class-based strategy: titles and authors are located by CSS class fragment."""
        titles = _cls_re(cfg.get("title_class", "title")).findall(html)
        authors = _cls_re(cfg.get("author_class", "author"), 3, 80).findall(html)
        return self._build_records(titles, authors, years)

    def _parse_link(self, html: str, years: list[str], cfg: dict[str, Any]) -> list[CandidateRecord]:
        """Link strategy: titles are the text of ``<a>`` tags whose href matches the configured regex."""
        # The configured value is inserted as a regex fragment, not escaped.
        href_pat = cfg.get("link_href_pattern", r"")
        titles = re.findall(rf'<a[^>]+href="[^"]*{href_pat}[^"]*"[^>]*>([^<]{{3,120}})</a>', html)
        authors = _cls_re(cfg.get("author_class", "author"), 3, 80).findall(html)
        return self._build_records(titles, authors, years)

    def _parse_brief(self, html: str, years: list[str], cfg: dict[str, Any]) -> list[CandidateRecord]:
        """Brief strategy: titles come from elements with ``brief_class``; authors are left empty."""
        titles = _cls_re(cfg.get("brief_class", "brief"), 3, 120).findall(html)
        return self._build_records(titles, [], years)