bluebonnet-data · KathrynPanger · Jun 20, 2023 · Jun 20, 2023 · Jun 20, 2023 · Jun 20, 2023
diff --git a/.gitignore b/.gitignore
@@ -6,6 +6,7 @@ __pycache__
 .vscode
 .tox
 .python-version
+.DS_Store
 
 # Sublime environment
 *.sublime-project
@@ -34,3 +35,6 @@ user
 
 # Shh, secrets
 census_api_key.txt
+
+#api key
+config.py
diff --git a/.idea/.gitignore b/.idea/.gitignore
diff --git a/.idea/censusExplorer.iml b/.idea/censusExplorer.iml
diff --git a/.idea/inspectionProfiles/profiles_settings.xml b/.idea/inspectionProfiles/profiles_settings.xml
diff --git a/.idea/misc.xml b/.idea/misc.xml
diff --git a/.idea/modules.xml b/.idea/modules.xml
diff --git a/.idea/vcs.xml b/.idea/vcs.xml
diff --git a/README.md b/README.md
@@ -38,14 +38,14 @@ For our working example, we'll use median household income (which is coded in th
 We can simply download the census data like so:
 
 ```python
->>> from bbd import census
->>> data = census.get_acs(
->>>    geography=census.Geography.CD,
->>>    variables="group(DP03),NAME",
->>>    year=2018,
->>>    state="co",
->>>    dataset=census.DataSets.ACS5_PROFILE,
->>> )
+>>> from bbd import census
+>>> data = census.get_acs(
+...     geography=census.Geography.CD,
+...     variables="group(DP03),NAME",
+...     year=2018,
+...     state="co",
+...     dataset=census.DataSet.ACS5_PROFILE,
+... )
 ```
 
     https://api.census.gov/data/2018/acs/acs1/profile?get=group(DP03),NAME&for=congressional%20district:*&in=state:08

diff --git a/examples/co_income.py b/examples/co_income.py
@@ -26,7 +26,7 @@
     geography=census.Geography.TRACT,
     variables=["NAME", "DP03_0062E"],
     year=2018,
-    dataset=census.DataSets.ACS5_PROFILE,
+    dataset=census.DataSet.ACS5_PROFILE,
     state="co",
     county="069",  # Larimer County
     cache=True,

diff --git a/examples/get_acs_example.py b/examples/get_acs_example.py
@@ -14,7 +14,7 @@
     geography=census.Geography.STATE,
     variables="NAME,B03003_001E",
     year=2018,
-    dataset=census.DataSets.ACS5_DETAIL,
+    dataset=census.DataSet.ACS5,
 )
 
 pprint(data)
diff --git a/examples/tx_hispanic_or_latino.py b/examples/tx_hispanic_or_latino.py
@@ -36,7 +36,7 @@
     geography=census.Geography.BLOCKGROUP,
     variables=["NAME", "B03003_001E", "B03003_002E", "B03003_003E"],
     year=2018,
-    dataset=census.DataSets.ACS5_DETAIL,
+    dataset=census.DataSet.ACS5,
     state="tx",
     county="201",  # Harris County
     cache=True,

diff --git a/examples/tx_zip_code_by_race.py b/examples/tx_zip_code_by_race.py
@@ -56,7 +56,7 @@
     geography=census.Geography.ZCTA,
     variables=list(variables.keys()),
     year=2018,
-    dataset=census.DataSets.ACS5_DETAIL,
+    dataset=census.DataSet.ACS5,
     # state="tx",
     # county="201": "Harris County
     cache=True,

diff --git a/src/bbd/__init__.py b/src/bbd/__init__.py
@@ -1,8 +1,16 @@
 __version__ = "0.0.7"
 
-# TODO add relevant imports...
-from .working_directory import working_directory
-from .geocoder import geocoder
+# Import every public submodule so it is reachable as an attribute of `bbd`.
+# NOTE(review): `working_directory` and `geocoder` used to be the objects
+# defined inside those modules; they are now the modules themselves.
+# Confirm that no caller still relies on the old names.
+from . import working_directory
 
+from . import geocoder
+from . import census
+from . import fec
+from . import gis
+from . import models
+from . import elections
 
-__all__ = [working_directory, geocoder]
+# `__all__` must contain names (strings), not the objects themselves;
+# otherwise `from bbd import *` raises TypeError.
+__all__ = [
+    "working_directory",
+    "geocoder",
+    "census",
+    "fec",
+    "gis",
+    "models",
+    "elections",
+]
diff --git a/src/bbd/census/__init__.py b/src/bbd/census/__init__.py
@@ -1,17 +1,18 @@
+from .census import Census
 from .get_shapefile import get_shapefile
-from .geography import Geography
-from .datasets import DataSets
+from .dataset import DataSet
 from .load import load_json_file, load_json_str
 from .get_acs import get_acs, construct_api_call
-from .api_key import api_key
+from .api_key import api_key, _ApiKey
 
-__all__ = [
-    get_shapefile,
-    Geography,
-    DataSets,
-    load_json_file,
-    load_json_str,
-    get_acs,
-    construct_api_call,
-    api_key,
-]
+# NOTE(review): `Geography` is no longer exported from this package, but the
+# README and the example scripts still reference `census.Geography`.
+# Confirm whether it should be re-exported from its new location.
+
+# `__all__` must contain names (strings), not the objects themselves;
+# otherwise `from bbd.census import *` raises TypeError.
+__all__ = [
+    "get_shapefile",
+    "DataSet",
+    "load_json_file",
+    "load_json_str",
+    "get_acs",
+    "construct_api_call",
+    "api_key",
+    "_ApiKey",
+    "Census",
+]
diff --git a/src/bbd/census/api_key.py b/src/bbd/census/api_key.py
@@ -7,17 +7,17 @@ def key(self):
         if self._key is None:
             raise ValueError("Census api key has not been set!")
         else:
-            return self._key
+             return self._key
 
     @key.setter
     def key(self, key_value: str):
+        """Store the census API key after checking that it is a string.
+
+        Raises:
+            TypeError: if ``key_value`` is not a ``str``.
+
+        Leading and trailing whitespace is stripped before the key is stored.
+        """
         if not isinstance(key_value, str):
-            raise ValueError(
+            raise TypeError(
                 f"Cannot set census api key to {key_value} of type {type(key_value)}. "
                 "Value should be a 'str'"
             )
         else:
-            self._key = key_value
+            self._key = key_value.strip()
 
 
 api_key = _ApiKey()
diff --git a/src/bbd/census/census.py b/src/bbd/census/census.py
@@ -0,0 +1,117 @@
+from __future__ import annotations
+import pandas as pd
+from dataclasses import dataclass, field
+from typing import Optional, OrderedDict
+from bbd.census.census_table import CensusTable
+from bbd.models import geography
+import urllib.parse
+import requests
+from sklearn.feature_extraction.text import CountVectorizer
+from sklearn.metrics.pairwise import cosine_similarity
+pd.set_option('display.max_columns', None)
+
+@dataclass
+class Census:
+    """Client for one Census API dataset, fixed to a year and a geography.
+
+    The instance caches the variable catalogue (``census_tables`` and
+    ``available_variables``) after the first lookup and keeps every query
+    result in ``results``.
+    """
+
+    # Either a plain key string or an object exposing the key as ``.key``.
+    api_key: _ApiKey
+    # Ordered mapping of geography level -> value. The first entry becomes the
+    # ``for=`` clause of the request; every later entry becomes an ``in=`` clause.
+    geography_values: OrderedDict[geography.Geography, str]
+    year: str | int
+    # Enum member whose ``.value`` is the dataset path segment of the URL.
+    dataset: DataSet
+    # Every CensusResult returned by ``get_acs``, in call order.
+    results: list[CensusResult] = field(default_factory=list)
+    # Cached dataframe of all available variables (one row per table).
+    available_variables: pd.DataFrame = field(default_factory=pd.DataFrame)
+    # Cached mapping of table/group id -> CensusTable.
+    census_tables: dict[str, CensusTable] = field(default_factory=dict)
+
+    def _build_url(self, variables: list[str]) -> str:
+        """Return the full request URL for ``variables``.
+
+        Args:
+            variables: variable names to request; joined with commas.
+
+        Returns:
+            The URL including the geography clauses and the API key.
+        """
+        base_url = "https://api.census.gov/data"
+
+        # Accept both a raw key string and an object carrying the key in
+        # ``.key``; interpolating the object itself would put its repr in the URL.
+        key = getattr(self.api_key, "key", self.api_key)
+        variable_list = ",".join(variables)
+
+        # A request takes one ``for`` clause (the geography being fetched);
+        # all remaining levels narrow it down through ``in`` clauses.
+        geo_clauses = []
+        for position, (level, value) in enumerate(self.geography_values.items()):
+            prefix = "for" if position == 0 else "in"
+            geo_clauses.append(f"&{prefix}={urllib.parse.quote(level.value)}:{value}")
+        geo_url = "".join(geo_clauses)
+
+        return f"{base_url}/{self.year}/{self.dataset.value}?get={variable_list}{geo_url}&key={key}"
+
+    def _make_query(self, variables):
+        """Send a GET request for ``variables`` and return the raw response."""
+        url = self._build_url(variables)
+        return requests.get(url)
+
+    def get_acs(self, variables) -> CensusResult:
+        """Query the API for ``variables``, record the result and return it."""
+        response = self._make_query(variables)
+        result = CensusResult(response=response, variables=variables)
+        self.results.append(result)
+        return result
+
+    def _proportion_match(self, search_string: str, match_string: str):
+        """Return the cosine similarity of the two strings' word counts.
+
+        Both strings are lower-cased before being vectorized.
+        """
+        cv = CountVectorizer()
+        count_matrix = cv.fit_transform([search_string.lower(), match_string.lower()])
+        # Entry [0][1] is the similarity between the first and second document.
+        return cosine_similarity(count_matrix)[0][1]
+
+    def _get_all_vars(self):
+        """Return the table catalogue, downloading it on first use.
+
+        Returns:
+            Mapping of group id -> CensusTable. Variables lacking a
+            ``concept``, ``label`` or ``group`` entry are skipped.
+        """
+        if not self.census_tables:
+            url = f"https://api.census.gov/data/{self.year}/{self.dataset.value}/variables.json"
+            variable_info = requests.get(url).json()["variables"]
+            names_to_tables = {}
+            for attribute_id, details in variable_info.items():
+                if "concept" in details and "label" in details and "group" in details:
+                    group = details["group"]
+                    attribute = (attribute_id, details["label"])
+                    if group not in names_to_tables:
+                        names_to_tables[group] = CensusTable(variable_id=group,
+                                                             variable_description=details["concept"],
+                                                             attributes=[attribute])
+                    else:
+                        names_to_tables[group].attributes.append(attribute)
+            self.census_tables = names_to_tables
+        return self.census_tables
+
+    def _datafame_all_variables(self):
+        """Return the cached one-row-per-table dataframe, building it on first use."""
+        if len(self.available_variables) == 0:
+            tables = list(self._get_all_vars().values())
+            df = pd.DataFrame()
+            df["variable_id"] = [table.variable_id for table in tables]
+            df["variable_description"] = [table.variable_description for table in tables]
+            df["attributes"] = [table.attributes for table in tables]
+            df["attribute_names"] = df["attributes"].apply(lambda attrs: [attr[0] for attr in attrs])
+            self.available_variables = df
+        return self.available_variables
+
+    def search_variables(self, search_string: Optional[str] = None, number_of_results: Optional[int] = None):
+        """Return available tables, optionally ranked against ``search_string``.
+
+        Args:
+            search_string: when given, rows are sorted by descending
+                similarity between it and each table description.
+            number_of_results: number of rows to return; the pandas default
+                for ``head()`` is used when omitted.
+        """
+        df = self._datafame_all_variables()
+        if search_string is not None:
+            scores = df["variable_description"].apply(lambda text: self._proportion_match(search_string, text))
+            # Build a new frame so the cached catalogue is not modified.
+            df = (
+                df[["variable_id", "variable_description", "attribute_names"]]
+                .assign(match_proportion=scores)
+                .sort_values(by="match_proportion", ascending=False)
+            )
+        if number_of_results is not None:
+            return df.head(number_of_results)
+        return df.head()
+
+class CensusResult():
+    """Result of one census query: the raw response plus its decoded JSON body."""
+
+    def __init__(self, response: requests.Response, variables: list[str]):
+        """Keep the response and the requested variables, and decode the body.
+
+        Args:
+            response: the HTTP response returned for the query.
+            variables: the variable names that were requested.
+        """
+        self.response = response
+        self.variables = variables
+        # Decoded eagerly; whatever ``response.json()`` raises propagates.
+        self.data = response.json()
+
diff --git a/src/bbd/census/census_table.py b/src/bbd/census/census_table.py
@@ -0,0 +1,11 @@
+from dataclasses import dataclass, field
+
+
+@dataclass
+class CensusTable():
+    """A census table (variable group) together with the attributes in it."""
+
+    # Identifier of the table.
+    variable_id: str
+    # Text description of the table.
+    variable_description: str
+    # Pairs of strings, one per attribute; presumably (attribute id, label) --
+    # verify against the code that builds these tables.
+    attributes: list[tuple[str, str]]
+
+    def fetch_dataframe(self):
+        """Placeholder with no implementation yet; currently returns None."""
+        pass
diff --git a/src/bbd/census/datasets.py → src/bbd/census/dataset.py b/src/bbd/census/datasets.py → src/bbd/census/dataset.py
@@ -1,7 +1,10 @@
-class DataSets:
+import enum
+
+class DataSet(enum.Enum):
     """Datasets available in the census API"""
 
-    ACS5_DETAIL = "acs/acs5"
+    # Each member's value is a slash-separated dataset path.
+    # NOTE(review): this member was previously named ACS5_DETAIL; confirm no
+    # caller outside this change still uses the old name.
+    ACS5 = "acs/acs5"
     ACS5_SUBJECT = "acs/acs5/subject"
     ACS5_PROFILE = "acs/acs5/profile"
     ACS5_CPROFILE = "acs/acs5/cprofile"
+    ACS1 = "acs/acs1"
diff --git a/src/bbd/census/geography.py b/src/bbd/census/geography.py