Files
bookshelf/config/functions.default.yaml
Petr Polezhaev fd32be729f Replace config-driven HtmlScraperPlugin with specific archive classes
Each archive scraper now has its own class with hardcoded URL and parsing
logic; config only carries auto_queue, timeout, and rate_limit_seconds.

- html_scraper: refactor to base class with public shared utilities
  (YEAR_RE, AUTHOR_PREFIX_PAT, cls_inner_texts, img_alts)
- rusneb.py (new): RusnebPlugin extracts year per list item rather than
  globally, eliminating wrong page-level dates
- alib.py (new): AlibPlugin extracts year from within each <p><b> entry
  rather than globally, fixing nonsensical year values
- shpl.py (new): ShplPlugin retains the dead ШПИЛ endpoint with hardcoded
  params; config type updated from html_scraper to shpl
- config: remove config: subsections from rusneb, alib_web, shpl entries;
  update type fields to rusneb, alib_web, shpl respectively
- plugins/__init__.py: register new specific types, remove html_scraper
- tests: use specific plugin classes; assert all CandidateRecord fields
  (source, title, author, year, isbn, publisher) with appropriate constraints

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-10 00:03:17 +03:00

89 lines
2.4 KiB
YAML

# Function configurations — each category is a dict keyed by plugin_id (not a list).
# AI functions reference a model from models.*.yaml.
# Archive functions specify a type and an optional config dict.
# Keys within each category serve as plugin_id and must be unique across all categories.
# Override individual functions in functions.user.yaml.
functions:
  # Layout: category -> plugin_id -> settings (two-space indent per level).
  # NOTE(review): `timeout` is presumably in seconds, like
  # `rate_limit_seconds` — confirm against the plugin loader.

  # ── Boundary detection: image → {boundaries: [...], confidence: 0.x}
  # ai_shelf_boundaries / ai_book_boundaries stored as {functionId: [fractions]} per entity.
  boundary_detectors:
    shelves:  # key = plugin_id = target; runs on cabinet images
      model: vl_detect_shelves
      max_image_px: 1600
      auto_queue: false
      rate_limit_seconds: 0
      timeout: 30
    books:  # key = plugin_id = target; runs on shelf images
      model: vl_detect_books
      max_image_px: 1600
      auto_queue: false
      rate_limit_seconds: 0
      timeout: 30

  # ── Text recognition: spine image → {raw_text, title, author, year, publisher, other}
  text_recognizers:
    recognize:
      model: vl_recognize
      max_image_px: 1600
      auto_queue: true
      rate_limit_seconds: 0
      timeout: 30

  # ── Book identification: raw_text → {title, author, year, isbn, publisher, confidence}
  book_identifiers:
    identify:
      model: ai_identify
      confidence_threshold: 0.8
      auto_queue: false
      rate_limit_seconds: 0
      timeout: 30

  # ── Archive searchers: query → [{source, title, author, year, isbn, publisher}, ...]
  # Each entry carries a display name and a plugin type; only `nlr` still
  # supplies a nested `config` mapping.
  archive_searchers:
    openlibrary:
      name: "OpenLibrary"
      type: openlibrary
      auto_queue: true
      rate_limit_seconds: 5
      timeout: 8
    rsl:
      name: "РГБ"
      type: rsl
      auto_queue: true
      rate_limit_seconds: 5
      timeout: 8
    rusneb:
      name: "НЭБ"
      type: rusneb
      auto_queue: true
      rate_limit_seconds: 5
      timeout: 8
    alib_web:
      name: "Alib (web)"
      type: alib_web
      auto_queue: false
      rate_limit_seconds: 5
      timeout: 8
    nlr:
      name: "НЛР"
      type: sru_catalog
      auto_queue: false
      rate_limit_seconds: 5
      timeout: 8
      # Type-specific settings for the sru_catalog plugin.
      config:
        url: "http://www.nlr.ru/search/query"
        query_prefix: "title="
    shpl:
      # Endpoint currently returns HTTP 404; retained for future re-enablement.
      name: "ШПИЛ"
      type: shpl
      auto_queue: false
      rate_limit_seconds: 5
      timeout: 8