From bf50b760dad3089c136f269352c487bfe8f2fec1 Mon Sep 17 00:00:00 2001 From: arg3t Date: Sat, 18 Apr 2026 17:48:04 +0200 Subject: [PATCH] Add radius-based location filter MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Subscribers can now restrict broadcasts to homes within N km of a chosen center point. Scraping logic is untouched; geocoding happens at insert time via the PDOK Locatieserver (best-effort, cached) and broadcast() reads the coords back to apply a haversine check. Touches: - hestia.homes gains lat/lon/geocode_confidence; hestia.subscribers gains filter_center_lat/lon/filter_radius_km; new hestia.geocode_cache table (idempotent migration in misc/geocoding_migration.sql) - hestia_utils.geocode: PDOK lookup with unit-suffix normalization, cache writeback, haversine helper - db.add_home geocodes lazily (NULL coords on miss — home is still inserted) - scraper.broadcast applies radius check, skips the check when the home lacks coords - bot.py /filter location subcommand (raw coords or place-name lookup) - web UI dashboard gains a radius fieldset + /api/geocode endpoint sharing the same geocode_cache Tests: unit coverage in tests/test_geocode.py plus a new e2e suite in tests/e2e/ that spins up Postgres 16 in Docker and exercises the full add_home → broadcast pipeline against real DB (PDOK + telegram mocked). 🤖 Coded agentically with Claude Code. 
Co-Authored-By: Claude Opus 4.7 --- hestia/bot.py | 93 +++++++- hestia/hestia_utils/db.py | 35 ++- hestia/hestia_utils/geocode.py | 159 ++++++++++++++ hestia/hestia_utils/parser.py | 4 +- hestia/hestia_utils/strings.py | 34 +++ hestia/scraper.py | 25 ++- misc/geocoding_migration.sql | 28 +++ misc/hestia.ddl | 24 ++- tests/e2e/conftest.py | 189 +++++++++++++++++ tests/e2e/test_radius_e2e.py | 378 +++++++++++++++++++++++++++++++++ tests/test_db.py | 50 ++++- tests/test_geocode.py | 134 ++++++++++++ tests/test_scraper.py | 62 ++++++ tests/test_strings.py | 2 + web/hestia_web/app.py | 118 +++++++++- web/static/dashboard.js | 57 +++++ web/templates/dashboard.html | 31 +++ 17 files changed, 1411 insertions(+), 12 deletions(-) create mode 100644 hestia/hestia_utils/geocode.py create mode 100644 misc/geocoding_migration.sql create mode 100644 tests/e2e/conftest.py create mode 100644 tests/e2e/test_radius_e2e.py create mode 100644 tests/test_geocode.py diff --git a/hestia/bot.py b/hestia/bot.py index 3c81b1e..99ff187 100644 --- a/hestia/bot.py +++ b/hestia/bot.py @@ -6,6 +6,7 @@ from telegram.ext import filters, MessageHandler, ApplicationBuilder, CommandHandler, CallbackQueryHandler, ContextTypes import hestia_utils.db as db +import hestia_utils.geocode as geocode import hestia_utils.meta as meta import hestia_utils.secrets as secrets import hestia_utils.strings as strings @@ -272,7 +273,24 @@ async def filter(update: telegram.Update, context: ContextTypes.DEFAULT_TYPE) -> for c in sub["filter_cities"]: cities_str += f"{c.title()}, " - message = strings.get("filter", update.effective_chat.id, [sub['filter_min_price'], sub['filter_max_price'], sub['filter_min_sqm'], cities_str[:-2]]) + if sub.get("filter_radius_km") is not None: + location_str = strings.get( + "filter_location_value", + update.effective_chat.id, + [ + f"{float(sub['filter_center_lat']):.5f}", + f"{float(sub['filter_center_lon']):.5f}", + f"{float(sub['filter_radius_km']):g}", + ], + ) + else: + location_str 
= strings.get("filter_location_none", update.effective_chat.id) + + message = strings.get( + "filter", + update.effective_chat.id, + [sub['filter_min_price'], sub['filter_max_price'], sub['filter_min_sqm'], cities_str[:-2], location_str], + ) # Set minprice filter elif len(cmd) == 3 and cmd[1] in ["minprice", "min"]: @@ -380,9 +398,80 @@ async def filter(update: telegram.Update, context: ContextTypes.DEFAULT_TYPE) -> if len(sub_filter_cities) == 0: message += strings.get("filter_city_empty", update.effective_chat.id) + + # Show location filter + elif len(cmd) == 2 and cmd[1] == "location": + if sub.get("filter_radius_km") is not None: + message = strings.get( + "filter_location_value", + update.effective_chat.id, + [ + f"{float(sub['filter_center_lat']):.5f}", + f"{float(sub['filter_center_lon']):.5f}", + f"{float(sub['filter_radius_km']):g}", + ], + ) + else: + message = strings.get("filter_location_none", update.effective_chat.id) + + # Clear location filter + elif len(cmd) == 3 and cmd[1] == "location" and cmd[2] in ["clear", "off", "disable"]: + db.clear_filter_location(update.effective_chat) + message = strings.get("filter_location_cleared", update.effective_chat.id) + + # Set location by raw lat/lon: /filter location + elif len(cmd) == 5 and cmd[1] == "location": + try: + radius_km = float(cmd[2]) + lat = float(cmd[3]) + lon = float(cmd[4]) + except ValueError: + await context.bot.send_message(update.effective_chat.id, strings.get("filter_location_invalid", update.effective_chat.id)) + return + if not (-90.0 <= lat <= 90.0) or not (-180.0 <= lon <= 180.0) or radius_km <= 0: + await context.bot.send_message(update.effective_chat.id, strings.get("filter_location_invalid", update.effective_chat.id)) + return + db.set_filter_location(update.effective_chat, lat, lon, radius_km) + message = strings.get( + "filter_location_set", + update.effective_chat.id, + [f"{lat:.5f}", f"{lon:.5f}", f"{radius_km:g}"], + ) + + # Set location by place name: /filter location + elif 
len(cmd) >= 4 and cmd[1] == "location": + try: + radius_km = float(cmd[2]) + except ValueError: + await context.bot.send_message(update.effective_chat.id, strings.get("filter_location_invalid", update.effective_chat.id)) + return + if radius_km <= 0: + await context.bot.send_message(update.effective_chat.id, strings.get("filter_location_invalid", update.effective_chat.id)) + return + # Use the raw text so we don't distort city names with the earlier .lower(). + raw_tokens = update.message.text.split(' ')[3:] + place = ' '.join(t.replace(';', '').replace('"', '').replace("'", '') for t in raw_tokens).strip() + if not place: + await context.bot.send_message(update.effective_chat.id, strings.get("filter_location_invalid", update.effective_chat.id)) + return + geo = geocode.geocode(place, "") + if geo is None: + await context.bot.send_message( + update.effective_chat.id, + strings.get("filter_location_not_found", update.effective_chat.id, [place]), + ) + return + lat, lon, _ = geo + db.set_filter_location(update.effective_chat, lat, lon, radius_km) + message = strings.get( + "filter_location_set_place", + update.effective_chat.id, + [place, f"{lat:.5f}", f"{lon:.5f}", f"{radius_km:g}"], + ) + else: message = strings.get("filter_invalid_command", update.effective_chat.id) - + await context.bot.send_message(update.effective_chat.id, message, parse_mode="Markdown") diff --git a/hestia/hestia_utils/db.py b/hestia/hestia_utils/db.py index ee15e7a..f7accbd 100644 --- a/hestia/hestia_utils/db.py +++ b/hestia/hestia_utils/db.py @@ -100,7 +100,20 @@ def _write(query: str, params: list[str] = []) -> None: if conn: conn.close() def add_home(url: str, address: str, city: str, price: int, agency: str, date_added: str, sqm: int = -1) -> None: - _write("INSERT INTO hestia.homes (url, address, city, price, agency, date_added, sqm) VALUES (%s, %s, %s, %s, %s, %s, %s)", [url, address, city, str(price), agency, date_added, str(sqm)]) + # Import here to avoid circular import (geocode 
imports db). + from hestia_utils import geocode as _geocode + try: + geo = _geocode.geocode(address, city) + except Exception as e: + logging.warning(f"Geocoding failed for {address!r}, {city!r}: {repr(e)}") + geo = None + lat, lon, confidence = (None, None, None) + if geo is not None: + lat, lon, confidence = geo + _write( + "INSERT INTO hestia.homes (url, address, city, price, agency, date_added, sqm, lat, lon, geocode_confidence) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)", + [url, address, city, str(price), agency, date_added, str(sqm), lat, lon, confidence], + ) def add_user(telegram_id: int) -> None: # Use an explicit column list so this stays valid when new columns are added to hestia.subscribers. _write("INSERT INTO hestia.subscribers (telegram_enabled, telegram_id) VALUES (true, %s)", [str(telegram_id)]) @@ -219,13 +232,23 @@ def set_filter_agencies(telegram_chat: Chat, agencies: set[str]) -> None: _write("UPDATE hestia.subscribers SET filter_agencies = %s WHERE telegram_id = %s", [str(list(agencies)).replace("'", '"'), str(telegram_chat.id)]) def set_filter_minsqm(telegram_chat: Chat, min_sqm: int) -> None: _write("UPDATE hestia.subscribers SET filter_min_sqm = %s WHERE telegram_id = %s", [str(min_sqm), str(telegram_chat.id)]) +def set_filter_location(telegram_chat: Chat, lat: float, lon: float, radius_km: float) -> None: + _write( + "UPDATE hestia.subscribers SET filter_center_lat = %s, filter_center_lon = %s, filter_radius_km = %s WHERE telegram_id = %s", + [lat, lon, radius_km, str(telegram_chat.id)], + ) +def clear_filter_location(telegram_chat: Chat) -> None: + _write( + "UPDATE hestia.subscribers SET filter_center_lat = NULL, filter_center_lon = NULL, filter_radius_km = NULL WHERE telegram_id = %s", + [str(telegram_chat.id)], + ) def set_user_lang(telegram_chat: Chat, lang: Literal["en", "nl"]) -> None: _write("UPDATE hestia.subscribers SET lang = %s WHERE telegram_id = %s", [lang, str(telegram_chat.id)]) LANG_CACHE[telegram_chat.id] = lang 
-FILTER_COLUMNS = ["filter_min_price", "filter_max_price", "filter_cities", "filter_agencies", "filter_min_sqm"] +FILTER_COLUMNS = ["filter_min_price", "filter_max_price", "filter_cities", "filter_agencies", "filter_min_sqm", "filter_center_lat", "filter_center_lon", "filter_radius_km"] def _load_filter_defaults(cur) -> dict: @@ -308,7 +331,10 @@ def link_account(telegram_id: int, code: str) -> Literal["success", "invalid_cod filter_max_price = %s, filter_cities = %s, filter_agencies = %s, - filter_min_sqm = %s + filter_min_sqm = %s, + filter_center_lat = %s, + filter_center_lon = %s, + filter_radius_km = %s WHERE telegram_id = %s """, [ @@ -317,6 +343,9 @@ def link_account(telegram_id: int, code: str) -> Literal["success", "invalid_cod json.dumps(web_sub["filter_cities"]), json.dumps(web_sub["filter_agencies"]), web_sub["filter_min_sqm"], + web_sub.get("filter_center_lat"), + web_sub.get("filter_center_lon"), + web_sub.get("filter_radius_km"), str(telegram_id), ], ) diff --git a/hestia/hestia_utils/geocode.py b/hestia/hestia_utils/geocode.py new file mode 100644 index 0000000..d6428bd --- /dev/null +++ b/hestia/hestia_utils/geocode.py @@ -0,0 +1,159 @@ +"""Address geocoding for Dutch addresses via PDOK Locatieserver. + +Results are cached in hestia.geocode_cache to avoid repeat PDOK calls for the +same address. Scrapers remain untouched; geocoding happens at DB-insert time +(see hestia_utils.db.add_home) and is best-effort β€” a miss stores NULL coords +so broadcast() can skip the radius check rather than dropping the home. 
+""" + +import logging +import math +import re +from typing import Optional, Tuple + +import requests + +import hestia_utils.db as db + + +PDOK_URL = "https://api.pdok.nl/bzk/locatieserver/search/v3_1/free" +PDOK_TIMEOUT = 5 +MIN_SCORE = 7.0 +USER_AGENT = "hestia-geocoder/1.0 (+https://hestia.bot)" + +_UNIT_SUFFIX_RES = [ + re.compile(r"\s+\d+(?:hg|bg|vg)\s*$", re.IGNORECASE), # "3hg", "2bg" + re.compile(r"\s+[A-Z]\d+\s*$"), # "B2" + re.compile(r"\s+(?:I{1,3}|IV|V|VI{1,3})\s*$"), # Roman numerals for floor + re.compile(r"\s+bis\s*$", re.IGNORECASE), # NL addition +] +_POINT_RE = re.compile(r"POINT\s*\(\s*([-\d.]+)\s+([-\d.]+)\s*\)") + + +def normalize_address(address: str) -> str: + """Strip common unit/floor suffixes that PDOK doesn't know about. + + Only strips if the remainder still contains a house number digit, so we + don't accidentally destroy the number itself. + """ + if not address: + return "" + cleaned = re.sub(r"\s+", " ", address.strip()) + # One pass is enough for the suffixes we recognize. + for pattern in _UNIT_SUFFIX_RES: + candidate = pattern.sub("", cleaned).strip() + if candidate != cleaned and re.search(r"\d", candidate): + cleaned = candidate + break + return cleaned + + +def _parse_point(point_str: str) -> Optional[Tuple[float, float]]: + """PDOK returns centroide_ll as 'POINT(lon lat)'. Returns (lat, lon).""" + if not point_str: + return None + m = _POINT_RE.search(point_str) + if not m: + return None + lon, lat = float(m.group(1)), float(m.group(2)) + return (lat, lon) + + +def _pdok_lookup(address: str, city: str, fq: str = "type:adres") -> Optional[Tuple[float, float, float]]: + """Hit PDOK Locatieserver. 
Returns (lat, lon, score) or None.""" + query = f"{address} {city}".strip() + if not query: + return None + try: + r = requests.get( + PDOK_URL, + params={"q": query, "fq": fq, "rows": 1}, + headers={"User-Agent": USER_AGENT, "Accept": "application/json"}, + timeout=PDOK_TIMEOUT, + ) + except requests.RequestException as e: + logging.warning(f"PDOK request failed for {query!r}: {repr(e)}") + return None + + if r.status_code != 200: + logging.warning(f"PDOK returned {r.status_code} for {query!r}") + return None + + try: + docs = r.json().get("response", {}).get("docs", []) + except ValueError: + logging.warning(f"PDOK returned non-JSON body for {query!r}") + return None + + if not docs: + return None + + top = docs[0] + score = float(top.get("score", 0.0)) + coords = _parse_point(top.get("centroide_ll", "")) + if coords is None: + return None + lat, lon = coords + return (lat, lon, score) + + +def geocode(address: str, city: str) -> Optional[Tuple[float, float, float]]: + """Resolve (address, city) to (lat, lon, confidence). + + Returns None if no usable result. Uses hestia.geocode_cache so repeat + lookups are free. Confidence is PDOK's relevance score for the top hit + (higher = more confident); 0.0 indicates a low-confidence fallback. 
+ """ + if not address: + return None + city = city or "" + + cached = db.fetch_one( + "SELECT lat, lon, confidence FROM hestia.geocode_cache WHERE address = %s AND city = %s", + [address, city], + ) + if cached: + if cached["lat"] is None or cached["lon"] is None: + return None + return (cached["lat"], cached["lon"], cached.get("confidence") or 0.0) + + normalized = normalize_address(address) + result = _pdok_lookup(normalized, city, fq="type:adres") + if result is None or result[2] < MIN_SCORE: + fallback = _pdok_lookup(normalized, city, fq="type:weergavenaam") + if fallback is not None and (result is None or fallback[2] > result[2]): + result = (fallback[0], fallback[1], 0.0) + + if result is None: + _store_cache(address, city, None, None, None) + return None + + lat, lon, score = result + _store_cache(address, city, lat, lon, score) + return (lat, lon, score) + + +def _store_cache(address: str, city: str, lat, lon, confidence) -> None: + db._write( + """ + INSERT INTO hestia.geocode_cache (address, city, lat, lon, confidence, fetched_at) + VALUES (%s, %s, %s, %s, %s, now()) + ON CONFLICT (address, city) DO UPDATE SET + lat = EXCLUDED.lat, + lon = EXCLUDED.lon, + confidence = EXCLUDED.confidence, + fetched_at = EXCLUDED.fetched_at + """, + [address, city, lat, lon, confidence], + ) + + +def haversine_km(lat1: float, lon1: float, lat2: float, lon2: float) -> float: + """Great-circle distance in kilometers between two WGS84 points.""" + r = 6371.0088 + phi1 = math.radians(lat1) + phi2 = math.radians(lat2) + dphi = math.radians(lat2 - lat1) + dlambda = math.radians(lon2 - lon1) + a = math.sin(dphi / 2) ** 2 + math.cos(phi1) * math.cos(phi2) * math.sin(dlambda / 2) ** 2 + return 2 * r * math.asin(math.sqrt(a)) diff --git a/hestia/hestia_utils/parser.py b/hestia/hestia_utils/parser.py index 3872927..aad59fd 100644 --- a/hestia/hestia_utils/parser.py +++ b/hestia/hestia_utils/parser.py @@ -9,13 +9,15 @@ class Home: - def __init__(self, address: str = '', city: str = 
'', url: str = '', agency: str = '', price: int = -1, sqm: int = -1): + def __init__(self, address: str = '', city: str = '', url: str = '', agency: str = '', price: int = -1, sqm: int = -1, lat: float = None, lon: float = None): self.address = address self.city = city self.url = url self.agency = agency self.price = price self.sqm = sqm + self.lat = lat + self.lon = lon def __repr__(self) -> str: return str(self) diff --git a/hestia/hestia_utils/strings.py b/hestia/hestia_utils/strings.py index a309af6..4db2aab 100644 --- a/hestia/hestia_utils/strings.py +++ b/hestia/hestia_utils/strings.py @@ -51,6 +51,7 @@ Max. price: {} Min. size: {} m\u00b2 Cities: {} +Location: {} *To change your filters, you can say:* `/filter minprice 1200` @@ -58,6 +59,8 @@ `/filter minsqm 40` `/filter city add Amsterdam` `/filter city remove Den Haag` +`/filter location 5 Amsterdam Damrak` +`/filter location clear` I will only send you homes in cities that you've included in your filter. Say `/filter city` to see the list of possible cities. Additionally, you can disable updates from certain agencies/websites. Say `/filter agency` to select your preferences.""", @@ -66,6 +69,7 @@ Max. prijs: {} Min. oppervlakte: {} m\u00b2 Steden: {} +Locatie: {} *Om je filters aan te passen, zeg je bijvoorbeeld:* `/filter minprice 1200` @@ -73,6 +77,8 @@ `/filter minsqm 40` `/filter city add Amsterdam` `/filter city remove Den Haag` +`/filter location 5 Amsterdam Damrak` +`/filter location clear` Ik stuur alleen meldingen voor woningen in steden die je in je filter hebt opgenomen. Zeg `/filter city` om de lijst met mogelijke steden te zien. Daarnaast kun je updates van bepaalde makelaars/websites uitschakelen. Zeg `/filter agency` om deze te selecteren.""" @@ -129,6 +135,34 @@ "en": "\n\nYour city filter is now empty, you will not receive messages about any homes.", "nl": "\n\nJe filter voor steden is nu leeg, je ontvangt geen meldingen voor woningen." 
}, + "filter_location_none": { + "en": "not set", + "nl": "niet ingesteld" + }, + "filter_location_value": { + "en": "{} km around ({}, {})", + "nl": "{} km rond ({}, {})" + }, + "filter_location_set": { + "en": "Location filter set: {} km around ({}, {})", + "nl": "Locatiefilter ingesteld: {} km rond ({}, {})" + }, + "filter_location_set_place": { + "en": "Location filter set to {}: {} km around ({}, {})", + "nl": "Locatiefilter ingesteld op {}: {} km rond ({}, {})" + }, + "filter_location_cleared": { + "en": "Location filter cleared. You will receive homes regardless of location (subject to your other filters).", + "nl": "Locatiefilter verwijderd. Je ontvangt weer woningen ongeacht locatie (op basis van je overige filters)." + }, + "filter_location_invalid": { + "en": "Invalid location command. Use: `/filter location ` or `/filter location ` or `/filter location clear`", + "nl": "Ongeldig locatie commando. Gebruik: `/filter location ` of `/filter location ` of `/filter location clear`" + }, + "filter_location_not_found": { + "en": "Could not find coordinates for: {}. Try a more specific address (street + city), or use `/filter location ` directly.", + "nl": "Kon geen coΓΆrdinaten vinden voor: {}. Probeer een specifieker adres (straat + stad), of gebruik `/filter location ` direct." 
+ }, "filter_invalid_command": { "en": "Invalid filter command, say /filter to see options", "nl": "Ongeldig filter commando, zeg /filter om de opties te zien" diff --git a/hestia/scraper.py b/hestia/scraper.py index 7fed8f5..b977270 100644 --- a/hestia/scraper.py +++ b/hestia/scraper.py @@ -26,6 +26,7 @@ HAS_IKWILHUREN_SCRAPER = False import hestia_utils.db as db +import hestia_utils.geocode as geocode import hestia_utils.meta as meta import hestia_utils.secrets as secrets import hestia_utils.apns as apns @@ -206,11 +207,33 @@ async def broadcast(homes: list[Home]) -> None: subs = db.fetch_all("SELECT * FROM hestia.subscribers WHERE telegram_enabled = true OR apns_token IS NOT NULL") for home in homes: + # Populate lat/lon from the geocode cache (add_home just wrote it). + # This is a cheap DB read; no PDOK call unless the row is missing entirely. + if home.lat is None or home.lon is None: + try: + geo = geocode.geocode(home.address, home.city) + except Exception as e: + logger.warning(f"Geocode lookup failed during broadcast for {home.address!r}, {home.city!r}: {repr(e)}") + geo = None + if geo is not None: + home.lat, home.lon, _ = geo + for sub in subs: # Apply filters price_ok = (home.price >= sub["filter_min_price"] and home.price <= sub["filter_max_price"]) sqm_ok = (sub["filter_min_sqm"] == 0) or (home.sqm == -1) or (home.sqm >= sub["filter_min_sqm"]) - if price_ok and sqm_ok and home.city.lower() in sub["filter_cities"] and home.agency in sub["filter_agencies"]: + # Radius filter: skip if sub has no location set, or if the home has no coords. + # We intentionally do not drop homes without coords β€” geocoding is best-effort. 
+ radius_ok = True + if sub.get("filter_radius_km") is not None and home.lat is not None and home.lon is not None: + distance_km = geocode.haversine_km( + float(sub["filter_center_lat"]), + float(sub["filter_center_lon"]), + float(home.lat), + float(home.lon), + ) + radius_ok = distance_km <= float(sub["filter_radius_km"]) + if price_ok and sqm_ok and radius_ok and home.city.lower() in sub["filter_cities"] and home.agency in sub["filter_agencies"]: display_address = apns.DEDUP_SUFFIX_RE.sub("", home.address) message = f"{meta.HOUSE_EMOJI} {display_address}, {home.city}\n" message += f"{meta.EURO_EMOJI} €{home.price}/m\n" diff --git a/misc/geocoding_migration.sql b/misc/geocoding_migration.sql new file mode 100644 index 0000000..878460f --- /dev/null +++ b/misc/geocoding_migration.sql @@ -0,0 +1,28 @@ +-- Adds geocoding columns and a radius filter to Hestia. +-- Safe to run multiple times: uses IF NOT EXISTS where possible. + +-- Coordinates and confidence on scraped homes. NULL = not yet geocoded or lookup failed. +ALTER TABLE hestia.homes + ADD COLUMN IF NOT EXISTS lat double precision NULL, + ADD COLUMN IF NOT EXISTS lon double precision NULL, + ADD COLUMN IF NOT EXISTS geocode_confidence real NULL; + +CREATE INDEX IF NOT EXISTS homes_latlon_idx + ON hestia.homes USING btree (lat, lon); + +-- Subscriber-level radius filter. NULL radius = filter disabled. +ALTER TABLE hestia.subscribers + ADD COLUMN IF NOT EXISTS filter_center_lat double precision NULL, + ADD COLUMN IF NOT EXISTS filter_center_lon double precision NULL, + ADD COLUMN IF NOT EXISTS filter_radius_km real NULL; + +-- Geocode cache keyed on (address, city). Avoids hammering PDOK for repeat addresses. 
+CREATE TABLE IF NOT EXISTS hestia.geocode_cache ( + address varchar NOT NULL, + city varchar NOT NULL, + lat double precision NULL, + lon double precision NULL, + confidence real NULL, + fetched_at timestamptz DEFAULT CURRENT_TIMESTAMP NOT NULL, + CONSTRAINT geocode_cache_pkey PRIMARY KEY (address, city) +); diff --git a/misc/hestia.ddl b/misc/hestia.ddl index 80f7fc1..c5524ad 100644 --- a/misc/hestia.ddl +++ b/misc/hestia.ddl @@ -32,7 +32,26 @@ CREATE TABLE hestia.homes ( price int4 DEFAULT '-1'::integer NOT NULL, sqm int4 DEFAULT '-1'::integer NOT NULL, agency varchar NULL, - date_added timestamp NOT NULL + date_added timestamp NOT NULL, + lat double precision NULL, + lon double precision NULL, + geocode_confidence real NULL +); +CREATE INDEX homes_latlon_idx ON hestia.homes USING btree (lat, lon); + + +-- hestia.geocode_cache definition + +-- DROP TABLE hestia.geocode_cache; + +CREATE TABLE hestia.geocode_cache ( + address varchar NOT NULL, + city varchar NOT NULL, + lat double precision NULL, + lon double precision NULL, + confidence real NULL, + fetched_at timestamptz DEFAULT CURRENT_TIMESTAMP NOT NULL, + CONSTRAINT geocode_cache_pkey PRIMARY KEY (address, city) ); @@ -127,6 +146,9 @@ CREATE TABLE hestia.subscribers ( email_address varchar NULL, device_id varchar(36) NULL, apns_token text NULL, + filter_center_lat double precision NULL, + filter_center_lon double precision NULL, + filter_radius_km real NULL, CONSTRAINT subscribers_device_id_key UNIQUE (device_id) ); CREATE INDEX idx_subscribers_email_address ON hestia.subscribers USING btree (email_address); diff --git a/tests/e2e/conftest.py b/tests/e2e/conftest.py new file mode 100644 index 0000000..56a0cf7 --- /dev/null +++ b/tests/e2e/conftest.py @@ -0,0 +1,189 @@ +"""Session fixtures for end-to-end tests. + +Spins up a throwaway Postgres 16 container, applies the production DDL, +and rewires hestia_utils.secrets.DB so every call that goes through +db.get_connection() lands in that container. 
Only PDOK and telegram +send_message need to be mocked in the tests themselves. + +If Docker isn't available or the image can't run, the whole e2e suite +is skipped rather than failing. +""" +from __future__ import annotations + +import shutil +import socket +import subprocess +import time +from pathlib import Path +from unittest.mock import AsyncMock, MagicMock + +import psycopg2 +import pytest + +import hestia_utils.secrets as secrets + +IMAGE = "postgres:16" +CONTAINER_NAME = "hestia-e2e-pg" +DB_NAME = "hestia_e2e" +DB_USER = "hestia" +DB_PASSWORD = "hestia_e2e_pw" +DDL_PATH = Path(__file__).resolve().parents[2] / "misc" / "hestia.ddl" + + +def _docker_available() -> bool: + if shutil.which("docker") is None: + return False + r = subprocess.run(["docker", "info"], capture_output=True) + return r.returncode == 0 + + +def _free_port() -> int: + with socket.socket() as s: + s.bind(("127.0.0.1", 0)) + return s.getsockname()[1] + + +def _wait_ready(host: str, port: int, timeout: float = 40.0) -> None: + deadline = time.time() + timeout + last = None + while time.time() < deadline: + try: + conn = psycopg2.connect( + host=host, port=port, user=DB_USER, + password=DB_PASSWORD, database=DB_NAME, + connect_timeout=2, + ) + conn.close() + return + except psycopg2.OperationalError as e: + last = e + time.sleep(0.5) + raise RuntimeError(f"Postgres never became ready: {last!r}") + + +@pytest.fixture(scope="session") +def _pg_container(): + if not _docker_available(): + pytest.skip("Docker daemon unavailable β€” skipping e2e suite") + + # Clean up a stale container from an aborted earlier run, if any. + subprocess.run(["docker", "rm", "-f", CONTAINER_NAME], capture_output=True) + + port = _free_port() + # --network host sidesteps the host's iptables userland-proxy path, which + # can be broken on kernels missing xt_tcp / nf_conntrack modules. We pick + # a free high port and tell Postgres itself to bind there (`-c port=...`). 
+ run = subprocess.run( + [ + "docker", "run", "-d", "--rm", + "--name", CONTAINER_NAME, + "--network", "host", + "-e", f"POSTGRES_DB={DB_NAME}", + "-e", f"POSTGRES_USER={DB_USER}", + "-e", f"POSTGRES_PASSWORD={DB_PASSWORD}", + IMAGE, + "-c", f"port={port}", + ], + capture_output=True, text=True, + ) + if run.returncode != 0: + pytest.skip(f"Could not start postgres container: {run.stderr.strip()}") + + try: + _wait_ready("127.0.0.1", port) + conn = psycopg2.connect( + host="127.0.0.1", port=port, user=DB_USER, + password=DB_PASSWORD, database=DB_NAME, + ) + try: + with conn.cursor() as cur: + # The production DDL says `AUTHORIZATION postgres`; in our test + # container the superuser is DB_USER, and the postgres role + # doesn't exist. Create it so AUTHORIZATION resolves. + cur.execute( + "DO $$ BEGIN " + "IF NOT EXISTS (SELECT 1 FROM pg_roles WHERE rolname='postgres') " + "THEN CREATE ROLE postgres; END IF; END $$;" + ) + cur.execute(DDL_PATH.read_text()) + # Seed the meta row that db.get_dev_mode/get_scraper_halted expect. 
+ cur.execute( + "INSERT INTO hestia.meta " + "(id, devmode_enabled, scraper_halted, workdir) " + "VALUES ('default', false, false, '/tmp/')" + ) + conn.commit() + finally: + conn.close() + + yield { + "host": "127.0.0.1", + "port": str(port), + "database": DB_NAME, + "user": DB_USER, + "password": DB_PASSWORD, + } + finally: + subprocess.run(["docker", "rm", "-f", CONTAINER_NAME], capture_output=True) + + +@pytest.fixture(scope="session") +def pg(_pg_container): + """Real DB connection info; also redirects hestia.secrets.DB for the session.""" + saved = dict(secrets.DB) + secrets.DB.clear() + secrets.DB.update(_pg_container) + yield _pg_container + secrets.DB.clear() + secrets.DB.update(saved) + + +@pytest.fixture(autouse=True) +def _reset_state(pg): + """Blank every table + in-process cache between tests.""" + import hestia_utils.db as db + + db.LANG_CACHE.clear() + + conn = psycopg2.connect( + host=pg["host"], port=pg["port"], user=pg["user"], + password=pg["password"], database=pg["database"], + ) + try: + with conn.cursor() as cur: + cur.execute( + "TRUNCATE hestia.homes, hestia.subscribers, hestia.geocode_cache, " + "hestia.link_codes, hestia.magic_tokens, hestia.preview_cache, " + "hestia.targets, hestia.error_rollups RESTART IDENTITY" + ) + conn.commit() + finally: + conn.close() + + # Bust scraper.py's lru_cache for agency pretty-name lookups, if scraper imported. + try: + import scraper + scraper._get_agency_pretty_name.cache_clear() + except Exception: + pass + + yield + + +@pytest.fixture +def mock_bot(): + """Swap meta.BOT with a MagicMock whose send_message is awaitable. + + Returns the mock so tests can inspect call_args_list. meta.BOT is restored + at teardown. 
+ """ + import hestia_utils.meta as meta + + original = meta.BOT + bot = MagicMock() + bot.send_message = AsyncMock() + meta.BOT = bot + try: + yield bot + finally: + meta.BOT = original diff --git a/tests/e2e/test_radius_e2e.py b/tests/e2e/test_radius_e2e.py new file mode 100644 index 0000000..9b4a27a --- /dev/null +++ b/tests/e2e/test_radius_e2e.py @@ -0,0 +1,378 @@ +"""End-to-end verification of the radius filter. + +No mocks on the DB layer: every fetch/write hits the throwaway Postgres +container from conftest.py. We only stub: + - PDOK HTTP (hestia_utils.geocode.requests.get) + - Telegram send (meta.BOT.send_message, via the mock_bot fixture) + +The tests cover the full pipeline: + add_home β†’ PDOK call β†’ geocode_cache writeback β†’ homes.lat/lon persisted + ↓ + broadcast(homes) β†’ reads cache if coords missing β†’ haversine check + ↓ + meta.BOT.send_message called only for subs whose radius matches +""" +from __future__ import annotations + +import asyncio +import json +from datetime import datetime +from unittest.mock import MagicMock, patch + +import psycopg2 +import pytest +from psycopg2.extras import RealDictCursor + +import hestia_utils.db as db +from hestia_utils.parser import Home + + +AMSTERDAM = (52.3676, 4.9041) # Dam square +ROTTERDAM = (51.9225, 4.4792) # ~57 km from Amsterdam +UTRECHT = (52.0907, 5.1214) # ~35 km from Amsterdam +DEN_HAAG = (52.0705, 4.3007) # ~50 km from Amsterdam + + +# ---------- PDOK response helpers --------------------------------------------- + +def _pdok_hit(lat: float, lon: float, score: float = 9.5): + r = MagicMock() + r.status_code = 200 + r.json.return_value = { + "response": { + "docs": [{"score": score, "centroide_ll": f"POINT({lon} {lat})"}] + } + } + return r + + +def _pdok_empty(): + r = MagicMock() + r.status_code = 200 + r.json.return_value = {"response": {"docs": []}} + return r + + +def _pdok_router(mapping: dict): + """Build a requests.get side_effect that matches on the query string. 
+ + mapping: substring β†’ (lat, lon) tuple or None (means empty response). + """ + def _side_effect(url, params=None, **kwargs): + q = (params or {}).get("q", "") or "" + for needle, target in mapping.items(): + if needle.lower() in q.lower(): + if target is None: + return _pdok_empty() + return _pdok_hit(*target) + return _pdok_empty() + return _side_effect + + +# ---------- DB helpers -------------------------------------------------------- + +def _pg_conn(pg): + return psycopg2.connect( + host=pg["host"], port=pg["port"], user=pg["user"], + password=pg["password"], database=pg["database"], + ) + + +def _insert_subscriber( + pg, + *, + telegram_id, + cities=("amsterdam", "rotterdam", "utrecht", "den haag"), + filter_radius_km=None, + filter_center=(None, None), + min_price=0, + max_price=10000, +): + conn = _pg_conn(pg) + try: + with conn.cursor() as cur: + cur.execute( + """ + INSERT INTO hestia.subscribers ( + telegram_enabled, telegram_id, + filter_min_price, filter_max_price, + filter_cities, filter_agencies, filter_min_sqm, + filter_center_lat, filter_center_lon, filter_radius_km + ) VALUES (true, %s, %s, %s, %s, %s, 0, %s, %s, %s) + RETURNING id + """, + [ + str(telegram_id), + min_price, max_price, + json.dumps(list(cities)), + json.dumps(["funda"]), + filter_center[0], filter_center[1], filter_radius_km, + ], + ) + sub_id = cur.fetchone()[0] + conn.commit() + return sub_id + finally: + conn.close() + + +def _seed_target(pg, agency="funda"): + """scraper._get_agency_pretty_name reads hestia.targets; give it a row.""" + conn = _pg_conn(pg) + try: + with conn.cursor() as cur: + cur.execute( + "INSERT INTO hestia.targets " + "(agency, queryurl, method, user_info, post_data, headers, enabled) " + "VALUES (%s, %s, 'GET', %s::jsonb, '{}'::jsonb, '{}'::json, true)", + [agency, "http://example.test", json.dumps({"agency": "Funda"})], + ) + conn.commit() + finally: + conn.close() + + +def _fetch_home(pg, url): + conn = _pg_conn(pg) + try: + with 
conn.cursor(cursor_factory=RealDictCursor) as cur: + cur.execute("SELECT * FROM hestia.homes WHERE url = %s", [url]) + return cur.fetchone() + finally: + conn.close() + + +def _fetch_cache(pg, address, city): + conn = _pg_conn(pg) + try: + with conn.cursor(cursor_factory=RealDictCursor) as cur: + cur.execute( + "SELECT * FROM hestia.geocode_cache WHERE address = %s AND city = %s", + [address, city], + ) + return cur.fetchone() + finally: + conn.close() + + +# ---------- Ingestion E2E ----------------------------------------------------- + +class TestAddHomeE2E: + """db.add_home β†’ geocode β†’ homes row with coords + cache populated.""" + + def test_add_home_persists_coords_and_cache(self, pg): + with patch( + "hestia_utils.geocode.requests.get", + return_value=_pdok_hit(*AMSTERDAM), + ): + db.add_home( + "http://example.com/a", "Kerkstraat 1", "Amsterdam", + 1500, "funda", datetime.now().isoformat(), 75, + ) + + home = _fetch_home(pg, "http://example.com/a") + assert home is not None + assert home["lat"] == pytest.approx(AMSTERDAM[0]) + assert home["lon"] == pytest.approx(AMSTERDAM[1]) + assert home["geocode_confidence"] == pytest.approx(9.5) + + cached = _fetch_cache(pg, "Kerkstraat 1", "Amsterdam") + assert cached is not None + assert cached["lat"] == pytest.approx(AMSTERDAM[0]) + assert cached["confidence"] == pytest.approx(9.5) + + def test_add_home_stores_null_on_pdok_miss(self, pg): + """A geocoding miss must not drop the home β€” it inserts with NULL coords.""" + with patch( + "hestia_utils.geocode.requests.get", + return_value=_pdok_empty(), + ): + db.add_home( + "http://example.com/b", "Onbekende Straat 1", "Nergensland", + 1500, "funda", datetime.now().isoformat(), + ) + + home = _fetch_home(pg, "http://example.com/b") + assert home is not None + assert home["lat"] is None + assert home["lon"] is None + + # A null-cache entry must be stored so future lookups short-circuit. 
+ cached = _fetch_cache(pg, "Onbekende Straat 1", "Nergensland") + assert cached is not None + assert cached["lat"] is None + + def test_cache_prevents_second_pdok_call(self, pg): + """Inserting the same address twice should hit PDOK once.""" + with patch( + "hestia_utils.geocode.requests.get", + return_value=_pdok_hit(*AMSTERDAM), + ) as mock_get: + db.add_home("http://example.com/c1", "Damstraat 5", "Amsterdam", + 1200, "funda", datetime.now().isoformat()) + db.add_home("http://example.com/c2", "Damstraat 5", "Amsterdam", + 1250, "funda", datetime.now().isoformat()) + assert mock_get.call_count == 1 + + +# ---------- Broadcast E2E ----------------------------------------------------- + +class TestBroadcastRadiusE2E: + """broadcast() must send listings only to subs whose radius matches.""" + + def _run(self, coro): + return asyncio.new_event_loop().run_until_complete(coro) + + def test_radius_filter_end_to_end(self, pg, mock_bot): + """Near home reaches only the sub with the small radius. + Far home reaches only the no-radius sub. 
City and agency filters + are wide open so radius is the only differentiator.""" + _seed_target(pg) + + sub_near_only = 111 + sub_open = 222 + _insert_subscriber( + pg, telegram_id=sub_near_only, + filter_radius_km=5.0, filter_center=AMSTERDAM, + ) + _insert_subscriber(pg, telegram_id=sub_open) + + pdok = _pdok_router({ + "Kerkstraat 1": AMSTERDAM, # ~0 km from center + "Coolsingel 1": ROTTERDAM, # ~57 km + "Domplein 1": UTRECHT, # ~35 km + }) + + with patch("hestia_utils.geocode.requests.get", side_effect=pdok): + db.add_home("http://x/near", "Kerkstraat 1", "Amsterdam", + 1500, "funda", datetime.now().isoformat()) + db.add_home("http://x/far-rotterdam", "Coolsingel 1", "Rotterdam", + 1500, "funda", datetime.now().isoformat()) + db.add_home("http://x/far-utrecht", "Domplein 1", "Utrecht", + 1500, "funda", datetime.now().isoformat()) + + homes = [ + Home(address="Kerkstraat 1", city="Amsterdam", + url="http://x/near", agency="funda", price=1500, sqm=-1), + Home(address="Coolsingel 1", city="Rotterdam", + url="http://x/far-rotterdam", agency="funda", price=1500, sqm=-1), + Home(address="Domplein 1", city="Utrecht", + url="http://x/far-utrecht", agency="funda", price=1500, sqm=-1), + ] + + import scraper + # No PDOK calls expected here β€” broadcast must hit the cache. 
+ with patch("hestia_utils.geocode.requests.get", + side_effect=AssertionError("broadcast should not call PDOK")): + self._run(scraper.broadcast(homes)) + + sent_by_chat: dict[str, list[str]] = {} + for call in mock_bot.send_message.call_args_list: + kwargs = call.kwargs + sent_by_chat.setdefault(kwargs["chat_id"], []).append(kwargs["text"]) + + near_only = sent_by_chat.get(str(sub_near_only), []) + open_sub = sent_by_chat.get(str(sub_open), []) + + assert len(near_only) == 1, f"radius sub should get 1 home, got {near_only}" + assert "Kerkstraat 1" in near_only[0] + + assert len(open_sub) == 3, ( + f"no-radius sub should get all 3 homes, got {open_sub}" + ) + assert any("Kerkstraat 1" in m for m in open_sub) + assert any("Coolsingel 1" in m for m in open_sub) + assert any("Domplein 1" in m for m in open_sub) + + def test_home_without_coords_not_dropped_for_radius_sub(self, pg, mock_bot): + """If geocoding failed for a home, broadcast must still deliver it to + a radius-filtered sub (skip the check rather than silently drop).""" + _seed_target(pg) + + sub_near_only = 333 + _insert_subscriber( + pg, telegram_id=sub_near_only, + filter_radius_km=5.0, filter_center=AMSTERDAM, + ) + + with patch("hestia_utils.geocode.requests.get", + return_value=_pdok_empty()): + db.add_home("http://x/noco", "Mystery Ln 9", "Amsterdam", + 1500, "funda", datetime.now().isoformat()) + + home_row = _fetch_home(pg, "http://x/noco") + assert home_row["lat"] is None + + homes = [Home(address="Mystery Ln 9", city="Amsterdam", + url="http://x/noco", agency="funda", price=1500, sqm=-1)] + + import scraper + with patch("hestia_utils.geocode.requests.get", + return_value=_pdok_empty()): + self._run(scraper.broadcast(homes)) + + sent = [c.kwargs for c in mock_bot.send_message.call_args_list] + assert len(sent) == 1, "home without coords must not be dropped" + assert sent[0]["chat_id"] == str(sub_near_only) + assert "Mystery Ln 9" in sent[0]["text"] + + def 
test_set_filter_location_applied_against_live_db(self, pg, mock_bot): + """db.set_filter_location writes the tuple; broadcast reads it back.""" + _seed_target(pg) + + telegram_id = 444 + _insert_subscriber(pg, telegram_id=telegram_id) + chat = MagicMock() + chat.id = telegram_id + db.set_filter_location(chat, AMSTERDAM[0], AMSTERDAM[1], 5.0) + + # Sanity: the row we just wrote is what broadcast will see. + conn = _pg_conn(pg) + try: + with conn.cursor(cursor_factory=RealDictCursor) as cur: + cur.execute( + "SELECT filter_center_lat, filter_center_lon, filter_radius_km " + "FROM hestia.subscribers WHERE telegram_id = %s", + [str(telegram_id)], + ) + row = cur.fetchone() + finally: + conn.close() + assert row["filter_center_lat"] == pytest.approx(AMSTERDAM[0]) + assert row["filter_radius_km"] == pytest.approx(5.0) + + pdok = _pdok_router({ + "Kerkstraat 1": AMSTERDAM, + "Coolsingel 1": ROTTERDAM, + }) + + with patch("hestia_utils.geocode.requests.get", side_effect=pdok): + db.add_home("http://s/near", "Kerkstraat 1", "Amsterdam", + 1500, "funda", datetime.now().isoformat()) + db.add_home("http://s/far", "Coolsingel 1", "Rotterdam", + 1500, "funda", datetime.now().isoformat()) + + homes = [ + Home(address="Kerkstraat 1", city="Amsterdam", + url="http://s/near", agency="funda", price=1500, sqm=-1), + Home(address="Coolsingel 1", city="Rotterdam", + url="http://s/far", agency="funda", price=1500, sqm=-1), + ] + + import scraper + with patch("hestia_utils.geocode.requests.get", side_effect=pdok): + self._run(scraper.broadcast(homes)) + + sent = [c.kwargs for c in mock_bot.send_message.call_args_list + if c.kwargs.get("chat_id") == str(telegram_id)] + assert len(sent) == 1 + assert "Kerkstraat 1" in sent[0]["text"] + + # And clearing the filter re-opens the subscriber to far listings. 
+ db.clear_filter_location(chat) + mock_bot.send_message.reset_mock() + with patch("hestia_utils.geocode.requests.get", side_effect=pdok): + self._run(scraper.broadcast(homes)) + sent_after_clear = [c.kwargs for c in mock_bot.send_message.call_args_list + if c.kwargs.get("chat_id") == str(telegram_id)] + assert len(sent_after_clear) == 2 diff --git a/tests/test_db.py b/tests/test_db.py index bf9584c..71b4c94 100644 --- a/tests/test_db.py +++ b/tests/test_db.py @@ -146,8 +146,9 @@ def test_returns_en_for_invalid_lang(self, mock_fetch): class TestWriteActions: + @patch('hestia_utils.geocode.geocode', return_value=None) @patch('hestia_utils.db._write') - def test_add_home(self, mock_write): + def test_add_home(self, mock_write, _mock_geo): db.add_home("http://example.com", "Kerkstraat 1", "Amsterdam", 1500, "funda", "2024-01-01", 75) mock_write.assert_called_once() args = mock_write.call_args[0] @@ -155,13 +156,32 @@ def test_add_home(self, mock_write): assert "http://example.com" in args[1] assert "75" in args[1] + @patch('hestia_utils.geocode.geocode', return_value=None) @patch('hestia_utils.db._write') - def test_add_home_default_sqm(self, mock_write): + def test_add_home_default_sqm(self, mock_write, _mock_geo): db.add_home("http://example.com", "Kerkstraat 1", "Amsterdam", 1500, "funda", "2024-01-01") mock_write.assert_called_once() args = mock_write.call_args[0] assert "-1" in args[1] + @patch('hestia_utils.geocode.geocode', return_value=(52.3676, 4.9041, 9.5)) + @patch('hestia_utils.db._write') + def test_add_home_stores_coords(self, mock_write, _mock_geo): + db.add_home("http://example.com", "Kerkstraat 1", "Amsterdam", 1500, "funda", "2024-01-01", 75) + args = mock_write.call_args[0] + assert 52.3676 in args[1] + assert 4.9041 in args[1] + assert 9.5 in args[1] + + @patch('hestia_utils.geocode.geocode', return_value=None) + @patch('hestia_utils.db._write') + def test_add_home_handles_geocode_failure(self, mock_write, _mock_geo): + """A geocoding miss should 
insert the home with NULL coords, not skip it.""" + db.add_home("http://example.com", "Obscure Address", "Nowhere", 1500, "funda", "2024-01-01") + mock_write.assert_called_once() + args = mock_write.call_args[0] + assert None in args[1] # lat/lon/confidence are NULL + @patch('hestia_utils.db._write') def test_add_user(self, mock_write): db.add_user(12345) @@ -200,6 +220,32 @@ def test_resume_scraper(self, mock_write): mock_write.assert_called_once() assert "scraper_halted = false" in mock_write.call_args[0][0] + @patch('hestia_utils.db._write') + def test_set_filter_location(self, mock_write): + chat = MagicMock() + chat.id = 12345 + db.set_filter_location(chat, 52.3676, 4.9041, 5.0) + mock_write.assert_called_once() + query, params = mock_write.call_args[0] + assert "filter_center_lat" in query + assert "filter_center_lon" in query + assert "filter_radius_km" in query + assert params[0] == 52.3676 + assert params[1] == 4.9041 + assert params[2] == 5.0 + assert params[3] == "12345" + + @patch('hestia_utils.db._write') + def test_clear_filter_location(self, mock_write): + chat = MagicMock() + chat.id = 12345 + db.clear_filter_location(chat) + mock_write.assert_called_once() + query = mock_write.call_args[0][0] + assert "filter_center_lat = NULL" in query + assert "filter_center_lon = NULL" in query + assert "filter_radius_km = NULL" in query + class TestLinkAccount: @patch('hestia_utils.db.get_connection') diff --git a/tests/test_geocode.py b/tests/test_geocode.py new file mode 100644 index 0000000..68629b8 --- /dev/null +++ b/tests/test_geocode.py @@ -0,0 +1,134 @@ +from unittest.mock import patch, MagicMock + +import pytest + + +@pytest.fixture +def geocode_module(): + import hestia_utils.geocode as geocode + return geocode + + +class TestNormalizeAddress: + def test_strips_unit_suffix(self, geocode_module): + assert geocode_module.normalize_address("Damstraat 12 3hg") == "Damstraat 12" + assert geocode_module.normalize_address("Damstraat 12 B2") == "Damstraat 12" 
+ + def test_collapses_whitespace(self, geocode_module): + assert geocode_module.normalize_address(" Damstraat 12 ") == "Damstraat 12" + + def test_handles_plain_address(self, geocode_module): + assert geocode_module.normalize_address("Kerkstraat 10") == "Kerkstraat 10" + + def test_empty(self, geocode_module): + assert geocode_module.normalize_address("") == "" + + +class TestHaversine: + def test_same_point_is_zero(self, geocode_module): + assert geocode_module.haversine_km(52.3676, 4.9041, 52.3676, 4.9041) == pytest.approx(0.0, abs=1e-6) + + def test_amsterdam_rotterdam_approx(self, geocode_module): + # Amsterdam Centraal to Rotterdam Centraal ~ 57km + d = geocode_module.haversine_km(52.3791, 4.9003, 51.9244, 4.4695) + assert 55 < d < 60 + + def test_symmetric(self, geocode_module): + a = geocode_module.haversine_km(52.0, 4.0, 51.0, 5.0) + b = geocode_module.haversine_km(51.0, 5.0, 52.0, 4.0) + assert a == pytest.approx(b) + + +class TestParsePoint: + def test_parses_valid_point(self, geocode_module): + assert geocode_module._parse_point("POINT(4.9041 52.3676)") == (52.3676, 4.9041) + + def test_none_on_garbage(self, geocode_module): + assert geocode_module._parse_point("") is None + assert geocode_module._parse_point("garbage") is None + + +class TestGeocode: + @patch("hestia_utils.geocode.requests.get") + @patch("hestia_utils.geocode.db") + def test_hits_cache_before_pdok(self, mock_db, mock_get, geocode_module): + mock_db.fetch_one.return_value = {"lat": 52.37, "lon": 4.90, "confidence": 9.5} + result = geocode_module.geocode("Damstraat 1", "Amsterdam") + assert result == (52.37, 4.90, 9.5) + mock_get.assert_not_called() + + @patch("hestia_utils.geocode.requests.get") + @patch("hestia_utils.geocode.db") + def test_cached_null_means_known_miss(self, mock_db, mock_get, geocode_module): + mock_db.fetch_one.return_value = {"lat": None, "lon": None, "confidence": None} + assert geocode_module.geocode("Unknown", "Nowhere") is None + mock_get.assert_not_called() + + 
@patch("hestia_utils.geocode.requests.get") + @patch("hestia_utils.geocode.db") + def test_pdok_success_writes_cache(self, mock_db, mock_get, geocode_module): + mock_db.fetch_one.return_value = {} + mock_response = MagicMock() + mock_response.status_code = 200 + mock_response.json.return_value = { + "response": { + "docs": [{"score": 9.5, "centroide_ll": "POINT(4.9041 52.3676)"}] + } + } + mock_get.return_value = mock_response + + result = geocode_module.geocode("Damstraat 1", "Amsterdam") + assert result == (52.3676, 4.9041, 9.5) + mock_db._write.assert_called_once() + # Cache write parameters: (address, city, lat, lon, confidence) + args = mock_db._write.call_args[0][1] + assert args[:2] == ["Damstraat 1", "Amsterdam"] + assert args[2] == 52.3676 + assert args[3] == 4.9041 + + @patch("hestia_utils.geocode.requests.get") + @patch("hestia_utils.geocode.db") + def test_pdok_empty_caches_null(self, mock_db, mock_get, geocode_module): + mock_db.fetch_one.return_value = {} + mock_response = MagicMock() + mock_response.status_code = 200 + mock_response.json.return_value = {"response": {"docs": []}} + mock_get.return_value = mock_response + + # Both lookups (adres + weergavenaam fallback) return empty β†’ None. + assert geocode_module.geocode("Nowhere", "Atlantis") is None + assert mock_db._write.called + args = mock_db._write.call_args[0][1] + # lat/lon/confidence are all NULL for a known-miss. 
+ assert args[2] is None + assert args[3] is None + assert args[4] is None + + @patch("hestia_utils.geocode.requests.get") + @patch("hestia_utils.geocode.db") + def test_low_score_triggers_fallback(self, mock_db, mock_get, geocode_module): + mock_db.fetch_one.return_value = {} + + first = MagicMock() + first.status_code = 200 + first.json.return_value = { + "response": {"docs": [{"score": 2.0, "centroide_ll": "POINT(1.0 1.0)"}]} + } + second = MagicMock() + second.status_code = 200 + second.json.return_value = { + "response": {"docs": [{"score": 5.0, "centroide_ll": "POINT(4.9 52.3)"}]} + } + mock_get.side_effect = [first, second] + + result = geocode_module.geocode("Amsterdam", "") + # Fallback takes over because its score (5.0) beat the low type:adres hit (2.0). + assert result == (52.3, 4.9, 0.0) + + @patch("hestia_utils.geocode.requests.get") + @patch("hestia_utils.geocode.db") + def test_network_error_returns_none(self, mock_db, mock_get, geocode_module): + import requests as real_requests + mock_db.fetch_one.return_value = {} + mock_get.side_effect = real_requests.RequestException("boom") + assert geocode_module.geocode("x", "y") is None diff --git a/tests/test_scraper.py b/tests/test_scraper.py index e9d9938..0e3b648 100644 --- a/tests/test_scraper.py +++ b/tests/test_scraper.py @@ -400,3 +400,65 @@ async def test_apns_invalid_token_is_cleared( mock_db.clear_apns_token.assert_called_once_with(3) assert SCRAPER_METRICS["apns:success"] == 0 assert SCRAPER_METRICS["apns:failure"] == 1 + + @pytest.mark.asyncio + @patch('scraper.geocode.geocode') + @patch('scraper.meta') + @patch('scraper.db') + async def test_applies_radius_filter(self, mock_db, mock_meta, mock_geocode_fn): + from scraper import broadcast + + mock_db.get_dev_mode.return_value = False + # Subscriber is centered on Amsterdam Centraal with a 5km radius. 
+ mock_db.fetch_all.side_effect = [ + [{ + "telegram_id": 111, "telegram_enabled": True, "apns_token": None, + "filter_min_price": 0, "filter_max_price": 9999, + "filter_cities": ["amsterdam", "rotterdam"], "filter_agencies": ["rebo"], + "filter_min_sqm": 0, + "filter_center_lat": 52.3791, "filter_center_lon": 4.9003, + "filter_radius_km": 5.0, + }], + [{"agency": "rebo", "user_info": {"agency": "Rebo"}}], + ] + mock_meta.BOT.send_message = AsyncMock() + + # Homes already have coords attached; broadcast shouldn't re-geocode them. + home_near = Home(address="Damrak 1", city="Amsterdam", url="http://a.com", agency="rebo", price=1200, lat=52.3740, lon=4.8984) + home_far = Home(address="Centraal", city="Rotterdam", url="http://b.com", agency="rebo", price=1200, lat=51.9244, lon=4.4695) + + await broadcast([home_near, home_far]) + + assert mock_meta.BOT.send_message.call_count == 1 + # broadcast shouldn't geocode homes that already have lat/lon set. + mock_geocode_fn.assert_not_called() + + @pytest.mark.asyncio + @patch('scraper.geocode.geocode') + @patch('scraper.meta') + @patch('scraper.db') + async def test_radius_filter_skipped_when_home_has_no_coords(self, mock_db, mock_meta, mock_geocode_fn): + """A home without coords (geocode failed) should still broadcast, not be dropped.""" + from scraper import broadcast + + mock_db.get_dev_mode.return_value = False + mock_db.fetch_all.side_effect = [ + [{ + "telegram_id": 111, "telegram_enabled": True, "apns_token": None, + "filter_min_price": 0, "filter_max_price": 9999, + "filter_cities": ["amsterdam"], "filter_agencies": ["rebo"], + "filter_min_sqm": 0, + "filter_center_lat": 52.0, "filter_center_lon": 4.0, + "filter_radius_km": 1.0, + }], + [{"agency": "rebo", "user_info": {"agency": "Rebo"}}], + ] + mock_meta.BOT.send_message = AsyncMock() + mock_geocode_fn.return_value = None # geocode miss + + home = Home(address="Obscure Address 99", city="Amsterdam", url="http://a.com", agency="rebo", price=1200) + + await 
broadcast([home]) + + # Sub has a radius, but home has no coords β†’ radius check skipped, home is broadcast. + assert mock_meta.BOT.send_message.call_count == 1 diff --git a/tests/test_strings.py b/tests/test_strings.py index 339618a..4f9b050 100644 --- a/tests/test_strings.py +++ b/tests/test_strings.py @@ -59,6 +59,8 @@ def test_all_keys_return_nonempty_english(self, mock_lang): "filter_city_invalid", "filter_city_already_in", "filter_city_added", "filter_city_not_in", "filter_city_removed", "filter_invalid_number", + "filter_location_value", "filter_location_set", + "filter_location_set_place", "filter_location_not_found", "donate", "faq", "website_info"} for key in _STRINGS: if key not in param_keys: diff --git a/web/hestia_web/app.py b/web/hestia_web/app.py index 736382a..2a3f9e3 100644 --- a/web/hestia_web/app.py +++ b/web/hestia_web/app.py @@ -16,6 +16,7 @@ from threading import Lock from html.parser import HTMLParser import urllib.error +import urllib.parse import urllib.request from urllib.parse import urlparse, urljoin from urllib.request import Request, urlopen @@ -1069,6 +1070,9 @@ def update_filters(): min_price = request.form.get("min_price", "").strip() or None max_price = request.form.get("max_price", "").strip() or None min_sqm = request.form.get("min_sqm", "").strip() or None + radius_km_raw = request.form.get("filter_radius_km", "").strip() or None + center_lat_raw = request.form.get("filter_center_lat", "").strip() or None + center_lon_raw = request.form.get("filter_center_lon", "").strip() or None filter_cities = psycopg2.extras.Json([c.lower() for c in request.form.getlist("filter_cities")]) submitted_agencies = request.form.getlist("filter_agencies") @@ -1096,6 +1100,18 @@ def update_filters(): # DB column is NOT NULL; treat missing/invalid as "no sqm filter". min_sqm = 0 + # Location filter: only apply if all three are present AND valid, otherwise clear it. 
+ try: + radius_km = float(radius_km_raw) if radius_km_raw is not None else None + center_lat = float(center_lat_raw) if center_lat_raw is not None else None + center_lon = float(center_lon_raw) if center_lon_raw is not None else None + except (ValueError, TypeError): + radius_km = center_lat = center_lon = None + if radius_km is None or center_lat is None or center_lon is None: + radius_km = center_lat = center_lon = None + elif not (0 < radius_km <= 500 and -90 <= center_lat <= 90 and -180 <= center_lon <= 180): + radius_km = center_lat = center_lon = None + try: with get_db() as conn: with conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as cur: @@ -1131,10 +1147,13 @@ def update_filters(): filter_max_price = %s, filter_min_sqm = %s, filter_cities = %s, - filter_agencies = %s + filter_agencies = %s, + filter_center_lat = %s, + filter_center_lon = %s, + filter_radius_km = %s WHERE email_address = %s """, - (notifications_enabled, min_price, max_price, min_sqm, filter_cities, filter_agencies, request.email), + (notifications_enabled, min_price, max_price, min_sqm, filter_cities, filter_agencies, center_lat, center_lon, radius_km, request.email), ) except psycopg2.Error as e: @@ -1156,6 +1175,101 @@ def update_filters(): return redirect(url_for("dashboard")) +PDOK_GEOCODE_URL = "https://api.pdok.nl/bzk/locatieserver/search/v3_1/free" +_PDOK_POINT_RE = re.compile(r"POINT\s*\(\s*([-\d.]+)\s+([-\d.]+)\s*\)") + + +def _pdok_geocode(query: str): + """Server-side geocode via PDOK. Returns dict with lat/lon/score or None. + + Uses hestia.geocode_cache for repeat lookups so the UI "Find" button is + cheap and doesn't leak user searches to PDOK on every keystroke. 
+ """ + query = (query or "").strip() + if not query: + return None + + try: + with get_db() as conn: + with conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as cur: + cur.execute( + "SELECT lat, lon, confidence FROM hestia.geocode_cache WHERE address = %s AND city = %s", + (query, ""), + ) + cached = cur.fetchone() + except psycopg2.Error: + cached = None + + if cached is not None: + if cached["lat"] is None or cached["lon"] is None: + return None + return {"lat": cached["lat"], "lon": cached["lon"], "score": cached.get("confidence") or 0.0} + + params = urllib.parse.urlencode({"q": query, "fq": "type:adres", "rows": 1}) + url = f"{PDOK_GEOCODE_URL}?{params}" + try: + req = Request(url, headers={"User-Agent": "hestia-geocoder/1.0 (+https://hestia.bot)", "Accept": "application/json"}) + with urlopen(req, timeout=5) as resp: + if resp.status != 200: + return None + import json as _json + body = _json.loads(resp.read().decode("utf-8")) + except (urllib.error.URLError, TimeoutError, ValueError): + return None + + docs = body.get("response", {}).get("docs", []) + if not docs: + _cache_geocode(query, None, None, None) + return None + top = docs[0] + m = _PDOK_POINT_RE.search(top.get("centroide_ll", "") or "") + if not m: + _cache_geocode(query, None, None, None) + return None + lon = float(m.group(1)) + lat = float(m.group(2)) + score = float(top.get("score", 0.0)) + _cache_geocode(query, lat, lon, score) + return {"lat": lat, "lon": lon, "score": score} + + +def _cache_geocode(query, lat, lon, score): + try: + with get_db() as conn: + with conn.cursor() as cur: + cur.execute( + """ + INSERT INTO hestia.geocode_cache (address, city, lat, lon, confidence, fetched_at) + VALUES (%s, %s, %s, %s, %s, now()) + ON CONFLICT (address, city) DO UPDATE SET + lat = EXCLUDED.lat, + lon = EXCLUDED.lon, + confidence = EXCLUDED.confidence, + fetched_at = EXCLUDED.fetched_at + """, + (query, "", lat, lon, score), + ) + except psycopg2.Error: + pass + + 
+@app.route("/api/geocode", methods=["POST"]) +@limiter.limit("60 per hour") +@login_required +def api_geocode(): + """Look up coordinates for a place name. Used by the dashboard location filter.""" + csrf_token = request.form.get("csrf_token", "") or request.headers.get("X-CSRF-Token", "") + if not validate_csrf_token(csrf_token): + return jsonify({"error": "Invalid CSRF token"}), 403 + query = (request.form.get("q") or (request.get_json(silent=True) or {}).get("q") or "").strip() + if not query or len(query) > 200: + return jsonify({"error": "Invalid query"}), 400 + result = _pdok_geocode(query) + if result is None: + return jsonify({"ok": False, "error": "not_found"}), 404 + return jsonify({"ok": True, **result}) + + @app.route("/api/homes") @limiter.limit("150 per hour") @api_subscriber_required diff --git a/web/static/dashboard.js b/web/static/dashboard.js index fc620ab..19a9267 100644 --- a/web/static/dashboard.js +++ b/web/static/dashboard.js @@ -1415,3 +1415,60 @@ if (telegramModal) { regenerateBtn.addEventListener('click', regenerateLinkCode); } } + + +// ===================================================================== +// Location radius filter β€” place name lookup +// ===================================================================== +(function() { + var btn = document.getElementById("location-place-lookup"); + var input = document.getElementById("location-place-input"); + if (!btn || !input) return; + var errEl = document.getElementById("location-place-error"); + var latEl = document.getElementById("filter_center_lat"); + var lonEl = document.getElementById("filter_center_lon"); + + function showError(msg) { + if (!errEl) return; + errEl.textContent = msg; + errEl.style.display = ""; + } + function clearError() { + if (!errEl) return; + errEl.textContent = ""; + errEl.style.display = "none"; + } + + function lookup() { + var query = (input.value || "").trim(); + if (!query) return; + clearError(); + btn.disabled = true; + var csrfInput = 
document.querySelector("input[name=\"csrf_token\"]"); + var body = new URLSearchParams(); + body.append("csrf_token", csrfInput ? csrfInput.value : ""); + body.append("q", query); + fetch("/api/geocode", { method: "POST", body: body, headers: { "Accept": "application/json" } }) + .then(function(r) { return r.json().then(function(d) { return { ok: r.ok, data: d }; }); }) + .then(function(res) { + btn.disabled = false; + if (!res.ok || !res.data || res.data.ok === false) { + showError("Could not find that place. Try a more specific address."); + return; + } + if (latEl) latEl.value = res.data.lat.toFixed(5); + if (lonEl) lonEl.value = res.data.lon.toFixed(5); + if (latEl) latEl.dispatchEvent(new Event("change", { bubbles: true })); + if (lonEl) lonEl.dispatchEvent(new Event("change", { bubbles: true })); + }) + .catch(function() { + btn.disabled = false; + showError("Lookup failed. Please try again."); + }); + } + + btn.addEventListener("click", lookup); + input.addEventListener("keydown", function(e) { + if (e.key === "Enter") { e.preventDefault(); lookup(); } + }); +})(); diff --git a/web/templates/dashboard.html b/web/templates/dashboard.html index 42215ac..9c16e9c 100644 --- a/web/templates/dashboard.html +++ b/web/templates/dashboard.html @@ -101,6 +101,37 @@

Your Hestia homes

{% endfor %} +
+ Location radius +
+ +
+
+ + +
+
+ + +
+ +
Cities