From 0de916e5904c06a494b586eb591c75a8f7a52122 Mon Sep 17 00:00:00 2001 From: Cullen Watson Date: Mon, 6 Jan 2025 05:28:36 -0600 Subject: [PATCH] enh:tax history --- README.md | 10 ++- homeharvest/core/scrapers/models.py | 2 + homeharvest/core/scrapers/realtor/__init__.py | 63 +++++++++++++++---- homeharvest/utils.py | 9 ++- pyproject.toml | 2 +- 5 files changed, 71 insertions(+), 15 deletions(-) diff --git a/README.md b/README.md index e68027e..603594d 100644 --- a/README.md +++ b/README.md @@ -38,9 +38,9 @@ filename = f"HomeHarvest_{current_timestamp}.csv" properties = scrape_property( location="San Diego, CA", listing_type="sold", # or (for_sale, for_rent, pending) - property_type=['single_family','multi_family'], past_days=30, # sold in last 30 days - listed in last 30 days if (for_sale, for_rent) + # property_type=['single_family','multi_family'], # date_from="2023-05-01", # alternative to past_days # date_to="2023-05-28", # foreclosure=True @@ -154,6 +154,14 @@ Property │ ├── new_construction │ └── hoa_fee +├── Tax Information: +│ ├── year +│ ├── tax +│ ├── assessment +│ │ ├── building +│ │ ├── land +│ │ └── total + ├── Location Details: │ ├── latitude │ ├── longitude diff --git a/homeharvest/core/scrapers/models.py b/homeharvest/core/scrapers/models.py index 67c4a68..622c165 100644 --- a/homeharvest/core/scrapers/models.py +++ b/homeharvest/core/scrapers/models.py @@ -176,5 +176,7 @@ class Property: nearby_schools: list[str] = None assessed_value: int | None = None estimated_value: int | None = None + tax: int | None = None + tax_history: list[dict] | None = None advertisers: Advertisers | None = None diff --git a/homeharvest/core/scrapers/realtor/__init__.py b/homeharvest/core/scrapers/realtor/__init__.py index e45c2db..9a58ad1 100644 --- a/homeharvest/core/scrapers/realtor/__init__.py +++ b/homeharvest/core/scrapers/realtor/__init__.py @@ -13,7 +13,12 @@ from json import JSONDecodeError from typing import Dict, Union, Optional -from tenacity import retry, retry_if_exception_type, wait_exponential, stop_after_attempt +from tenacity import ( + retry, + retry_if_exception_type, + wait_exponential, + stop_after_attempt, +) from .. import Scraper from ..models import ( @@ -202,27 +207,29 @@ def process_property(self, result: dict, query_name: str) -> Property | None: property_url=result["href"], property_id=property_id, listing_id=result.get("listing_id"), - status="PENDING" if is_pending else "CONTINGENT" if is_contingent else result["status"].upper(), + status=("PENDING" if is_pending else "CONTINGENT" if is_contingent else result["status"].upper()), list_price=result["list_price"], list_price_min=result["list_price_min"], list_price_max=result["list_price_max"], - list_date=result["list_date"].split("T")[0] if result.get("list_date") else None, + list_date=(result["list_date"].split("T")[0] if result.get("list_date") else None), prc_sqft=result.get("price_per_sqft"), last_sold_date=result.get("last_sold_date"), new_construction=result["flags"].get("is_new_construction") is True, - hoa_fee=result["hoa"]["fee"] if result.get("hoa") and isinstance(result["hoa"], dict) else None, - latitude=result["location"]["address"]["coordinate"].get("lat") if able_to_get_lat_long else None, - longitude=result["location"]["address"]["coordinate"].get("lon") if able_to_get_lat_long else None, + hoa_fee=(result["hoa"]["fee"] if result.get("hoa") and isinstance(result["hoa"], dict) else None), + latitude=(result["location"]["address"]["coordinate"].get("lat") if able_to_get_lat_long else None), + longitude=(result["location"]["address"]["coordinate"].get("lon") if able_to_get_lat_long else None), address=self._parse_address(result, search_type="general_search"), description=self._parse_description(result), neighborhoods=self._parse_neighborhoods(result), - county=result["location"]["county"].get("name") if result["location"]["county"] else None, - fips_code=result["location"]["county"].get("fips_code") if result["location"]["county"] else None, + county=(result["location"]["county"].get("name") if result["location"]["county"] else None), + fips_code=(result["location"]["county"].get("fips_code") if result["location"]["county"] else None), days_on_mls=self.calculate_days_on_mls(result), nearby_schools=prop_details.get("schools"), assessed_value=prop_details.get("assessed_value"), estimated_value=estimated_value if estimated_value else None, advertisers=advertisers, + tax=prop_details.get("tax"), + tax_history=prop_details.get("tax_history"), ) return realty_property @@ -447,7 +454,11 @@ def search(self): variables=search_variables | {"offset": i}, search_type=search_type, ) - for i in range(self.DEFAULT_PAGE_SIZE, min(total, self.limit), self.DEFAULT_PAGE_SIZE) + for i in range( + self.DEFAULT_PAGE_SIZE, + min(total, self.limit), + self.DEFAULT_PAGE_SIZE, + ) ] for future in as_completed(futures): @@ -469,15 +480,45 @@ def get_key(data: dict, keys: list): def process_extra_property_details(self, result: dict) -> dict: schools = self.get_key(result, ["nearbySchools", "schools"]) assessed_value = self.get_key(result, ["taxHistory", 0, "assessment", "total"]) + tax_history = self.get_key(result, ["taxHistory"]) schools = [school["district"]["name"] for school in schools if school["district"].get("name")] + + # Process tax history + latest_tax = None + processed_tax_history = None + if tax_history and isinstance(tax_history, list): + tax_history = sorted(tax_history, key=lambda x: x.get("year", 0), reverse=True) + + if tax_history and "tax" in tax_history[0]: + latest_tax = tax_history[0]["tax"] + + processed_tax_history = [] + for entry in tax_history: + if "year" in entry and "tax" in entry: + processed_entry = { + "year": entry["year"], + "tax": entry["tax"], + } + if "assessment" in entry and isinstance(entry["assessment"], dict): + processed_entry["assessment"] = { + "building": entry["assessment"].get("building"), + "land": entry["assessment"].get("land"), + "total": entry["assessment"].get("total"), + } + processed_tax_history.append(processed_entry) + return { "schools": schools if schools else None, "assessed_value": assessed_value if assessed_value else None, + "tax": latest_tax, + "tax_history": processed_tax_history, } @retry( - retry=retry_if_exception_type(JSONDecodeError), wait=wait_exponential(min=4, max=10), stop=stop_after_attempt(3) + retry=retry_if_exception_type(JSONDecodeError), + wait=wait_exponential(min=4, max=10), + stop=stop_after_attempt(3), ) def get_prop_details(self, property_id: str) -> dict: if not self.extra_property_data: @@ -570,7 +611,7 @@ def _parse_description(result: dict) -> Description | None: return Description( primary_photo=primary_photo, alt_photos=RealtorScraper.process_alt_photos(result.get("photos", [])), - style=PropertyType.__getitem__(style) if style and style in PropertyType.__members__ else None, + style=(PropertyType.__getitem__(style) if style and style in PropertyType.__members__ else None), beds=description_data.get("beds"), baths_full=description_data.get("baths_full"), baths_half=description_data.get("baths_half"), diff --git a/homeharvest/utils.py b/homeharvest/utils.py index e754190..ee4a078 100644 --- a/homeharvest/utils.py +++ b/homeharvest/utils.py @@ -33,6 +33,8 @@ "last_sold_date", "assessed_value", "estimated_value", + "tax", + "tax_history", "new_construction", "lot_sqft", "price_per_sqft", @@ -115,8 +117,11 @@ def process_result(result: Property) -> pd.DataFrame: if description: prop_data["primary_photo"] = description.primary_photo prop_data["alt_photos"] = ", ".join(description.alt_photos) if description.alt_photos else None - prop_data["style"] = description.style if isinstance(description.style, - str) else description.style.value if description.style else None + prop_data["style"] = ( + description.style + if isinstance(description.style, str) + else description.style.value if description.style else None + ) prop_data["beds"] = description.beds prop_data["full_baths"] = description.baths_full prop_data["half_baths"] = description.baths_half diff --git a/pyproject.toml b/pyproject.toml index 56d5bca..d797a3b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "homeharvest" -version = "0.4.4" +version = "0.4.5" description = "Real estate scraping library" authors = ["Zachary Hampton ", "Cullen Watson "] homepage = "https://github.com/Bunsly/HomeHarvest"