- html_scraper: add img_alt strategy (НЭБ titles from <img alt>), bold_text strategy (Alib entries from <p><b>), Windows-1251 encoding support, _cls_inner_texts() helper that strips inner HTML tags - rsl: rewrite to POST SearchFilterForm[search] with CSRF token and CQL title:(words) AND author:(word) query format - config: update rusneb (img_alt + correct author_class) and alib_web (encoding + bold_text) to match fixed plugin strategies - tests: add tests/test_archives.py with network-marked tests for all six archive plugins; НЛР and ШПИЛ marked xfail (endpoints return HTTP 404) - presubmit: exclude network tests from default run (-m "not network") Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
190 lines
7.0 KiB
Python
190 lines
7.0 KiB
Python
"""Network integration tests for archive searcher plugins.
|
|
|
|
Each test queries a live external service for "War and Peace" by Tolstoy,
|
|
a book universally catalogued in all supported archives.
|
|
|
|
Run with: pytest tests/ -m network
|
|
Skip with: pytest tests/ -m "not network" (default in presubmit)
|
|
"""
|
|
|
|
import pytest
|
|
|
|
from models import CandidateRecord
|
|
from plugins.archives.html_scraper import HtmlScraperPlugin
|
|
from plugins.archives.openlibrary import OpenLibraryPlugin
|
|
from plugins.archives.rsl import RSLPlugin
|
|
from plugins.archives.sru_catalog import SRUCatalogPlugin
|
|
from plugins.rate_limiter import RateLimiter
|
|
|
|
# Module-level mark: pytest applies ``network`` to every test in this file,
# so a run with ``-m "not network"`` deselects all of them.
pytestmark = pytest.mark.network
|
|
|
|
# Single RateLimiter instance passed as ``rate_limiter=`` to every plugin below.
_RL = RateLimiter()
|
|
# Passed as ``timeout=`` to every plugin below; presumably seconds — TODO confirm.
_TIMEOUT = 15
|
|
|
|
|
|
def _titles(results: list[CandidateRecord]) -> list[str]:
    """Collect the ``title`` value of each record, preserving input order."""
    collected: list[str] = []
    for record in results:
        collected.append(record["title"])
    return collected
|
|
|
|
|
|
def _authors(results: list[CandidateRecord]) -> list[str]:
    """Collect the ``author`` value of each record, preserving input order."""
    collected: list[str] = []
    for record in results:
        collected.append(record["author"])
    return collected
|
|
|
|
|
|
def _has_title(results: list[CandidateRecord], fragment: str) -> bool:
    """Report whether ``fragment`` occurs in at least one record's title.

    Both the fragment and each title are lower-cased before the substring
    check, so the match is case-insensitive. An empty ``results`` list
    yields ``False``.
    """
    needle = fragment.lower()
    for record in results:
        if needle in record["title"].lower():
            return True
    return False
|
|
|
|
|
|
def _has_author(results: list[CandidateRecord], fragment: str) -> bool:
    """Report whether ``fragment`` occurs in at least one record's author.

    Both the fragment and each author string are lower-cased before the
    substring check, so the match is case-insensitive. An empty ``results``
    list yields ``False``.
    """
    needle = fragment.lower()
    for record in results:
        if needle in record["author"].lower():
            return True
    return False
|
|
|
|
|
|
# ── OpenLibrary ───────────────────────────────────────────────────────────────
|
|
|
|
|
|
def test_openlibrary_war_and_peace() -> None:
    """Search OpenLibrary for "War and Peace Tolstoy" and validate the hits.

    Checks that results exist, that every record is tagged with the
    ``openlibrary`` source, and that the expected title and author appear.
    """
    searcher = OpenLibraryPlugin(
        plugin_id="openlibrary",
        name="OpenLibrary",
        rate_limiter=_RL,
        rate_limit_seconds=0,
        auto_queue=True,
        timeout=_TIMEOUT,
        config={},
    )
    hits = searcher.search("War and Peace Tolstoy")

    assert hits, "OpenLibrary returned no results"
    for hit in hits:
        assert hit["source"] == "openlibrary"
    assert _has_title(hits, "war and peace"), f"titles={_titles(hits)}"

    # OpenLibrary stores authors in their original language, so either the
    # Latin or the Cyrillic spelling is acceptable.
    author_found = _has_author(hits, "tolstoy") or _has_author(hits, "толст")
    assert author_found, f"authors={_authors(hits)}"
|
|
|
|
|
|
# ── RSL (РГБ) ─────────────────────────────────────────────────────────────────
|
|
|
|
|
|
def test_rsl_voina_i_mir() -> None:
    """Search the RSL plugin for "Толстой Война и мир" and validate the hits.

    Checks that results exist, that every record is tagged with the ``rsl``
    source, and that at least one title contains "война".
    """
    searcher = RSLPlugin(
        plugin_id="rsl",
        name="РГБ",
        rate_limiter=_RL,
        rate_limit_seconds=0,
        auto_queue=True,
        timeout=_TIMEOUT,
        config={},
    )
    hits = searcher.search("Толстой Война и мир")

    assert hits, "RSL returned no results"
    for hit in hits:
        assert hit["source"] == "rsl"
    assert _has_title(hits, "война"), f"titles={_titles(hits)}"
|
|
|
|
|
|
# ── НЭБ (rusneb) ─────────────────────────────────────────────────────────────
|
|
|
|
|
|
def test_rusneb_voina_i_mir() -> None:
    """Search rusneb.ru through the HTML scraper and validate the hits.

    Checks that results exist, that every record is tagged with the
    ``rusneb`` source, and that the expected title and author appear.
    """
    scraper_config = {
        "url": "https://rusneb.ru/search/",
        "search_param": "q",
        "img_alt": True,
        "author_class": "search-list__item_subtext",
    }
    scraper = HtmlScraperPlugin(
        plugin_id="rusneb",
        name="НЭБ",
        rate_limiter=_RL,
        rate_limit_seconds=0,
        auto_queue=True,
        timeout=_TIMEOUT,
        config=scraper_config,
    )
    hits = scraper.search("Война и мир Толстой")

    assert hits, "НЭБ returned no results"
    for hit in hits:
        assert hit["source"] == "rusneb"
    assert _has_title(hits, "война"), f"titles={_titles(hits)}"
    assert _has_author(hits, "толст"), f"authors={_authors(hits)}"
|
|
|
|
|
|
# ── Alib ─────────────────────────────────────────────────────────────────────
|
|
|
|
|
|
def test_alib_voina_i_mir() -> None:
    """Search alib.ru through the HTML scraper and validate the hits.

    The scraper is configured with ``cp1251`` encoding and the ``bold_text``
    option. Checks that results exist, that every record is tagged with the
    ``alib_web`` source, and that the expected title and author appear.
    """
    scraper_config = {
        "url": "https://www.alib.ru/find3.php4",
        "search_param": "tfind",
        "extra_params": {"f": "5", "s": "0"},
        "encoding": "cp1251",
        "bold_text": True,
    }
    scraper = HtmlScraperPlugin(
        plugin_id="alib_web",
        name="Alib (web)",
        rate_limiter=_RL,
        rate_limit_seconds=0,
        auto_queue=False,
        timeout=_TIMEOUT,
        config=scraper_config,
    )
    hits = scraper.search("Война и мир Толстой")

    assert hits, "Alib returned no results"
    for hit in hits:
        assert hit["source"] == "alib_web"
    assert _has_title(hits, "война"), f"titles={_titles(hits)}"
    assert _has_author(hits, "толст"), f"authors={_authors(hits)}"
|
|
|
|
|
|
# ── НЛР (SRU) ────────────────────────────────────────────────────────────────
|
|
# The NLR SRU endpoint (www.nlr.ru/search/query) no longer exists (HTTP 404).
|
|
|
|
|
|
@pytest.mark.xfail(
    reason="nlr.ru SRU endpoint no longer available (HTTP 404)",
    strict=False,
)
def test_nlr_voina_i_mir() -> None:
    """Search the NLR SRU catalogue for "Война и мир" and validate the hits.

    Marked as a non-strict expected failure; see the ``xfail`` reason.
    Checks that results exist, that every record is tagged with the ``nlr``
    source, and that at least one title contains "война".
    """
    catalog_config = {
        "url": "http://www.nlr.ru/search/query",
        "query_prefix": "title=",
    }
    catalog = SRUCatalogPlugin(
        plugin_id="nlr",
        name="НЛР",
        rate_limiter=_RL,
        rate_limit_seconds=0,
        auto_queue=False,
        timeout=_TIMEOUT,
        config=catalog_config,
    )
    hits = catalog.search("Война и мир")

    assert hits, "НЛР returned no results"
    for hit in hits:
        assert hit["source"] == "nlr"
    assert _has_title(hits, "война"), f"titles={_titles(hits)}"
|
|
|
|
|
|
# ── ШПИЛ ─────────────────────────────────────────────────────────────────────
|
|
# The ШПИЛ IRBIS64 CGI endpoint no longer exists (HTTP 404).
|
|
|
|
|
|
@pytest.mark.xfail(
    reason="shpl.ru IRBIS64 CGI endpoint no longer available (HTTP 404)",
    strict=False,
)
def test_shpl_voina_i_mir() -> None:
    """Search the shpl.ru IRBIS64 CGI through the HTML scraper and validate the hits.

    Marked as a non-strict expected failure; see the ``xfail`` reason.
    Checks that results exist, that every record is tagged with the ``shpl``
    source, and that at least one title contains "война".
    """
    irbis_params = {
        "C21COM": "S",
        "I21DBN": "BIBL",
        "P21DBN": "BIBL",
        "S21FMT": "briefWebRus",
        "Z21ID": "",
    }
    scraper_config = {
        "url": "https://www.shpl.ru/cgi-bin/irbis64/cgiirbis_64.exe",
        "search_param": "S21ALL",
        "extra_params": irbis_params,
        "brief_class": "brief",
    }
    scraper = HtmlScraperPlugin(
        plugin_id="shpl",
        name="ШПИЛ",
        rate_limiter=_RL,
        rate_limit_seconds=0,
        auto_queue=False,
        timeout=_TIMEOUT,
        config=scraper_config,
    )
    hits = scraper.search("Война и мир")

    assert hits, "ШПИЛ returned no results"
    for hit in hits:
        assert hit["source"] == "shpl"
    assert _has_title(hits, "война"), f"titles={_titles(hits)}"
|