enh: property type (#102)

Bunsly · Nov 3, 2024 · 8e04f6b · 8e04f6b
1 parent 1f717bd
commit 8e04f6b
Show file tree

Hide file tree

Showing 11 changed files with 274 additions and 241 deletions.
diff --git a/README.md b/README.md
@@ -68,13 +68,24 @@ print(properties.head())
 ```
 Required
 ├── location (str): The address in various formats - this could be just a zip code, a full address, or city/state, etc.
-└── listing_type (option): Choose the type of listing.
+├── listing_type (option): Choose the type of listing.
     - 'for_rent'
     - 'for_sale'
     - 'sold'
-    - 'pending'
+    - 'pending' (for pending/contingent sales)
 
 Optional
+├── property_type (list): Choose the type of properties.
+    - 'single_family'
+    - 'multi_family'
+    - 'condos'
+    - 'condo_townhome_rowhome_coop'
+    - 'condo_townhome'
+    - 'townhomes'
+    - 'duplex_triplex'
+    - 'farm'
+    - 'land'
+    - 'mobile'
 ├── radius (decimal): Radius in miles to find comparable properties based on individual addresses.
 │    Example: 5.5 (fetches properties within a 5.5-mile radius if location is set to a specific address; otherwise, ignored)
 │
@@ -94,7 +105,7 @@ Optional
 │
 ├── extra_property_data (True/False): Increases requests by O(n). If set, this fetches additional property data for general searches (e.g. schools, tax appraisals etc.)
 │
-├── exclude_pending (True/False): If set, excludes pending properties from the results unless listing_type is 'pending'
+├── exclude_pending (True/False): If set, excludes 'pending' properties from the 'for_sale' results unless listing_type is 'pending'
 │
 └── limit (integer): Limit the number of properties to fetch. Max & default is 10000.
 ```

diff --git a/examples/HomeHarvest_Demo.ipynb b/examples/HomeHarvest_Demo.ipynb
diff --git a/examples/HomeHarvest_Demo.py b/examples/HomeHarvest_Demo.py
diff --git a/examples/price_of_land.py b/examples/price_of_land.py
@@ -0,0 +1,104 @@
+"""
+This script scrapes sold and pending land listings from the past year for a list of zip codes and saves the data to individual Excel files.
+It adds two columns to the data, 'lot_acres' and 'ppa' (price per acre), so the user can analyze the average price of land in a zip code.
+"""
+
+import os
+import pandas as pd
+from homeharvest import scrape_property
+
+
+_SQFT_PER_ACRE = 43560
+
+
+def _price_per_acre(row):
+    """Return the whole-number price per acre for one listing row, or None if it cannot be computed."""
+    acres = row["lot_acres"]
+    if pd.isnull(acres) or acres <= 0:
+        return None
+
+    # The sold price is only used for rows whose status is SOLD; every other row is priced off list price.
+    if pd.notnull(row["sold_price"]) and row["status"] == "SOLD":
+        price = row["sold_price"]
+    else:
+        price = row["list_price"]
+
+    # The chosen price can still be missing (e.g. a non-SOLD row without a list price);
+    # int() of a missing value would raise, so report "no value" instead.
+    if pd.isnull(price):
+        return None
+    return int(price / acres)
+
+
+def get_property_details(zip: str, listing_type):
+    """
+    Scrape land listings from the past 365 days for one zip code and add acreage / price-per-acre columns.
+
+    :param zip: Zip code passed to scrape_property as the search location.
+        (The name shadows the builtin but is kept so existing keyword callers still work.)
+    :param listing_type: Listing type forwarded to scrape_property (this script uses "sold" and "pending").
+    :return: If the scrape returned no rows, the empty result unchanged. Otherwise a DataFrame limited to
+        rows with no 'sqft' value and to the selected columns, including the derived 'lot_acres' and
+        'ppa' (nullable Int64) columns.
+    """
+    properties = scrape_property(location=zip, listing_type=listing_type, property_type=["land"], past_days=365)
+    if properties.empty:
+        return properties
+
+    properties["lot_acres"] = properties["lot_sqft"].apply(
+        lambda sqft: sqft / _SQFT_PER_ACRE if pd.notnull(sqft) else None
+    )
+
+    # Keep only rows without a 'sqft' value. Copy so the column assignment below
+    # writes to an independent frame rather than a slice of the scraped one.
+    properties = properties[properties["sqft"].isnull()].copy()
+
+    if properties.empty:
+        # apply(axis=1) on a zero-row frame can hand back a DataFrame instead of a Series,
+        # which cannot be assigned to a single column; create the empty column directly.
+        properties["ppa"] = pd.Series(dtype="Int64")
+    else:
+        properties["ppa"] = properties.apply(_price_per_acre, axis=1).astype("Int64")
+
+    selected_columns = [
+        "property_url",
+        "property_id",
+        "style",
+        "status",
+        "street",
+        "city",
+        "state",
+        "zip_code",
+        "county",
+        "list_date",
+        "last_sold_date",
+        "list_price",
+        "sold_price",
+        "lot_sqft",
+        "lot_acres",
+        "ppa",
+    ]
+    return properties[selected_columns]
+
+
+def output_to_excel(zip_code, sold_df, pending_df):
+    """
+    Write the sold and pending frames for one zip code to Excel files.
+
+    Files are written under "<current working directory>/zips/<zip_code>/", which is created if missing.
+
+    :param zip_code: Zip code string used for the folder name and the file name prefix.
+    :param sold_df: DataFrame saved as "<zip_code>_sold.xlsx".
+    :param pending_df: DataFrame saved as "<zip_code>_pending.xlsx".
+    """
+    zip_folder = os.path.join(os.getcwd(), "zips", zip_code)
+    os.makedirs(zip_folder, exist_ok=True)
+
+    # Sold is written first, then pending; neither file includes the index.
+    targets = (
+        (sold_df, f"{zip_code}_sold.xlsx"),
+        (pending_df, f"{zip_code}_pending.xlsx"),
+    )
+    for frame, filename in targets:
+        frame.to_excel(os.path.join(zip_folder, filename), index=False)
+
+
+# Zip codes to scrape. Kept as strings (not ints) so a leading zero would survive,
+# and as a list (not a one-shot map iterator) so it can be iterated more than once.
+zip_codes = [
+    "22920",
+    "77024",
+    "78028",
+    "24553",
+    "22967",
+    "22971",
+    "22922",
+    "22958",
+    "22969",
+    "22949",
+    "22938",
+    "24599",
+    "24562",
+    "22976",
+    "24464",
+    "22964",
+    "24581",
+]
+
+# Collect every per-zip frame and concatenate once at the end instead of
+# re-copying a growing frame on each iteration.
+frames = []
+for zip_code in zip_codes:  # not named `zip`, which would rebind the builtin at module scope
+    sold_df = get_property_details(zip_code, "sold")
+    pending_df = get_property_details(zip_code, "pending")
+    frames.extend([sold_df, pending_df])
+    output_to_excel(zip_code, sold_df, pending_df)
+
+# pd.concat rejects an empty list, so fall back to an empty frame if no zip codes were configured.
+combined_df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
+
+combined_file = os.path.join(os.getcwd(), "zips", "combined.xlsx")
+combined_df.to_excel(combined_file, index=False)
diff --git a/homeharvest/__init__.py b/homeharvest/__init__.py
@@ -3,12 +3,13 @@
 from .core.scrapers import ScraperInput
 from .utils import process_result, ordered_properties, validate_input, validate_dates, validate_limit
 from .core.scrapers.realtor import RealtorScraper
-from .core.scrapers.models import ListingType
+from .core.scrapers.models import ListingType, SearchPropertyType
 
 
 def scrape_property(
     location: str,
     listing_type: str = "for_sale",
+    property_type: list[str] | None = None,
     radius: float = None,
     mls_only: bool = False,
     past_days: int = None,
@@ -24,6 +25,7 @@ def scrape_property(
     Scrape properties from Realtor.com based on a given location and listing type.
     :param location: Location to search (e.g. "Dallas, TX", "85281", "2530 Al Lipscomb Way")
     :param listing_type: Listing Type (for_sale, for_rent, sold, pending)
+    :param property_type: Property Type (single_family, multi_family, condos, condo_townhome_rowhome_coop, condo_townhome, townhomes, duplex_triplex, farm, land, mobile)
     :param radius: Get properties within _ (e.g. 1.0) miles. Only applicable for individual addresses.
     :param mls_only: If set, fetches only listings with MLS IDs.
     :param proxy: Proxy to use for scraping
@@ -41,6 +43,7 @@ def scrape_property(
     scraper_input = ScraperInput(
         location=location,
         listing_type=ListingType[listing_type.upper()],
+        property_type=[SearchPropertyType[prop.upper()] for prop in property_type] if property_type else None,
         proxy=proxy,
         radius=radius,
         mls_only=mls_only,
@@ -63,4 +66,6 @@ def scrape_property(
     with warnings.catch_warnings():
         warnings.simplefilter("ignore", category=FutureWarning)
 
-        return pd.concat(properties_dfs, ignore_index=True, axis=0)[ordered_properties].replace({"None": pd.NA, None: pd.NA, "": pd.NA})
+        return pd.concat(properties_dfs, ignore_index=True, axis=0)[ordered_properties].replace(
+            {"None": pd.NA, None: pd.NA, "": pd.NA}
+        )