Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ __pycache__
.vscode
.tox
.python-version
.DS_Store

# Sublime environment
*.sublime-project
Expand Down Expand Up @@ -34,3 +35,6 @@ user

# Shh, secrets
census_api_key.txt

#api key
config.py
8 changes: 8 additions & 0 deletions .idea/.gitignore

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

12 changes: 12 additions & 0 deletions .idea/censusExplorer.iml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

6 changes: 6 additions & 0 deletions .idea/inspectionProfiles/profiles_settings.xml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 4 additions & 0 deletions .idea/misc.xml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

8 changes: 8 additions & 0 deletions .idea/modules.xml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

6 changes: 6 additions & 0 deletions .idea/vcs.xml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

16 changes: 8 additions & 8 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -38,14 +38,14 @@ For our working example, we'll use median household income (which is coded in th
We can simply download the census data like so:

```python
>>> from bbd import census
>>> data = census.get_acs(
>>> geography=census.Geography.CD,
>>> variables="group(DP03),NAME",
>>> year=2018,
>>> state="co",
>>> dataset=census.DataSets.ACS5_PROFILE,
>>> )
>>> from bbd import census
>>> data = census.get_acs(
...     geography=census.Geography.CD,
...     variables="group(DP03),NAME",
...     year=2018,
...     state="co",
...     dataset=census.DataSet.ACS5_PROFILE,
... )
```

https://api.census.gov/data/2018/acs/acs1/profile?get=group(DP03),NAME&for=congressional%20district:*&in=state:08
Expand Down
2 changes: 1 addition & 1 deletion examples/co_income.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@
geography=census.Geography.TRACT,
variables=["NAME", "DP03_0062E"],
year=2018,
dataset=census.DataSets.ACS5_PROFILE,
dataset=census.DataSet.ACS5_PROFILE,
state="co",
county="069", # Larimer County
cache=True,
Expand Down
2 changes: 1 addition & 1 deletion examples/get_acs_example.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
geography=census.Geography.STATE,
variables="NAME,B03003_001E",
year=2018,
dataset=census.DataSets.ACS5_DETAIL,
dataset=census.DataSet.ACS5,
)

pprint(data)
2 changes: 1 addition & 1 deletion examples/tx_hispanic_or_latino.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@
geography=census.Geography.BLOCKGROUP,
variables=["NAME", "B03003_001E", "B03003_002E", "B03003_003E"],
year=2018,
dataset=census.DataSets.ACS5_DETAIL,
dataset=census.DataSet.ACS5,
state="tx",
county="201", # Harris County
cache=True,
Expand Down
2 changes: 1 addition & 1 deletion examples/tx_zip_code_by_race.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@
geography=census.Geography.ZCTA,
variables=list(variables.keys()),
year=2018,
dataset=census.DataSets.ACS5_DETAIL,
dataset=census.DataSet.ACS5,
# state="tx",
# county="201": "Harris County
cache=True,
Expand Down
16 changes: 12 additions & 4 deletions src/bbd/__init__.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,16 @@
__version__ = "0.0.7"

# TODO add relevant imports...
from .working_directory import working_directory
from .geocoder import geocoder
from . import working_directory
# import bbd.geocoder as geocoder
# import bbd.census as census
# import bbd.fec as fec
# import bbd.gis as gis

from . import geocoder
from . import census
from . import fec
from . import gis
from . import models
from . import elections

__all__ = [working_directory, geocoder]
# __all__ must contain the exported names as strings; listing the module
# objects themselves makes `from bbd import *` raise TypeError.
__all__ = [
    "working_directory",
    "geocoder",
    "census",
    "fec",
    "gis",
    "models",
    "elections",
]
11 changes: 6 additions & 5 deletions src/bbd/census/__init__.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,18 @@
from .census import Census
from .get_shapefile import get_shapefile
from .geography import Geography
from .datasets import DataSets
from .dataset import DataSet
from .load import load_json_file, load_json_str
from .get_acs import get_acs, construct_api_call
from .api_key import api_key
from .api_key import api_key, _ApiKey

__all__ = [
get_shapefile,
Geography,
DataSets,
DataSet,
load_json_file,
load_json_str,
get_acs,
construct_api_call,
api_key,
_ApiKey,
Census,
]
6 changes: 3 additions & 3 deletions src/bbd/census/api_key.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,17 +7,17 @@ def key(self):
if self._key is None:
raise ValueError("Census api key has not been set!")
else:
return self._key
return self._key

@key.setter
def key(self, key_value: str):
if not isinstance(key_value, str):
raise ValueError(
raise TypeError(
f"Cannot set census api key to {key_value} of type {type(key_value)}. "
"Value should be a 'str'"
)
else:
self._key = key_value
self._key = key_value.strip()


api_key = _ApiKey()
117 changes: 117 additions & 0 deletions src/bbd/census/census.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,117 @@
from __future__ import annotations
import pandas as pd
from dataclasses import dataclass, field
from typing import Optional, OrderedDict
from bbd.census.census_table import CensusTable
from bbd.models import geography
import urllib.parse
import requests
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
pd.set_option('display.max_columns', None)

@dataclass
class Census:
    """Client for a single Census API dataset, year and geography selection.

    Attributes:
        api_key: The Census API key, either as a plain string or as an object
            exposing the key string through a ``key`` attribute.
        geography_values: Ordered mapping of geography level to the value
            requested for it (e.g. ``"*"`` or a FIPS code). The first entry is
            sent as the ``for=`` predicate, every later entry as ``in=``.
        year: Year of the dataset to query.
        dataset: Enum member whose ``value`` is the dataset path segment
            (e.g. ``"acs/acs5"``).
    """

    api_key: _ApiKey
    geography_values: OrderedDict[geography.Geography, str]
    year: str | int
    dataset: dataset.Dataset
    # Every CensusResult returned by get_acs, in call order.
    results: list[CensusResult] = field(default_factory=list)
    # Cached one-row-per-table summary built by _datafame_all_variables.
    available_variables: pd.DataFrame = field(default_factory=pd.DataFrame)
    # Cached mapping of table (group) id -> CensusTable, filled by _get_all_vars.
    census_tables: dict[str, CensusTable] = field(default_factory=dict)

    def _build_url(self, variables: list[str]) -> str:
        """Return the full query URL for the given variable names."""
        base_url = "https://api.census.gov/data"

        # Accept both a raw key string and a holder object with a `.key`
        # attribute; interpolating the holder itself would embed its repr.
        key = getattr(self.api_key, "key", self.api_key)

        # Only the first geography is the `for` target; the remaining ones
        # narrow it down and must be sent as `in` predicates.
        geo_parts = []
        for index, (geo, value) in enumerate(self.geography_values.items()):
            prefix = "for" if index == 0 else "in"
            geo_parts.append(f"&{prefix}={urllib.parse.quote(geo.value)}:{value}")
        geo_url = "".join(geo_parts)

        variable_list = ",".join(variables)
        return (
            f"{base_url}/{self.year}/{self.dataset.value}"
            f"?get={variable_list}{geo_url}&key={key}"
        )

    def _make_query(self, variables):
        """Send the GET request for ``variables`` and return the raw response."""
        return requests.get(self._build_url(variables))

    def get_acs(self, variables) -> CensusResult:
        """Query the API for ``variables``; record and return the result."""
        response = self._make_query(variables)
        result = CensusResult(response=response, variables=variables)
        self.results.append(result)
        return result

    def _proportion_match(self, search_string: str, match_string: str):
        """Return the cosine similarity of the two strings' word counts."""
        search_string = search_string.lower()
        match_string = match_string.lower()
        count_matrix = CountVectorizer().fit_transform([search_string, match_string])
        # Row 0 / column 1 of the pairwise matrix is search vs. match.
        return cosine_similarity(count_matrix)[0][1]

    def _get_all_vars(self):
        """Return the table-id -> CensusTable mapping, fetching it on first use."""
        if not self.census_tables:
            url = (
                f"https://api.census.gov/data/{self.year}/"
                f"{self.dataset.value}/variables.json"
            )
            payload = requests.get(url).json()
            names_to_tables = {}
            for name, info in payload["variables"].items():
                # Entries lacking any of these keys cannot be attached to a
                # table, so they are skipped.
                if "concept" in info and "label" in info and "group" in info:
                    group = info["group"]
                    attribute = (name, info["label"])
                    if group not in names_to_tables:
                        names_to_tables[group] = CensusTable(
                            variable_id=group,
                            variable_description=info["concept"],
                            attributes=[attribute],
                        )
                    else:
                        names_to_tables[group].attributes.append(attribute)
            self.census_tables = names_to_tables
        return self.census_tables

    def _datafame_all_variables(self):
        """Return the cached table summary dataframe, building it on first use."""
        if len(self.available_variables) == 0:
            tables = list(self._get_all_vars().values())
            df = pd.DataFrame()
            df["variable_id"] = [table.variable_id for table in tables]
            df["variable_description"] = [
                table.variable_description for table in tables
            ]
            df["attributes"] = [table.attributes for table in tables]
            df["attribute_names"] = df["attributes"].apply(
                lambda attributes: [attribute[0] for attribute in attributes]
            )
            self.available_variables = df
        return self.available_variables

    def search_variables(
        self,
        search_string: Optional[str] = None,
        number_of_results: Optional[int] = None,
    ) -> pd.DataFrame:
        """Return available tables, optionally ranked against ``search_string``.

        Args:
            search_string: Text compared with each table description. When
                given, the result gains a ``match_proportion`` column and is
                sorted by it in descending order.
            number_of_results: Maximum number of rows to return; defaults to 5.
        """
        df = self._datafame_all_variables()
        if search_string is not None:
            scores = df["variable_description"].apply(
                lambda description: self._proportion_match(search_string, description)
            )
            # Build a new frame rather than writing the score column into the
            # cached dataframe, so later calls are not affected by this search.
            df = df[["variable_id", "variable_description", "attribute_names"]].assign(
                match_proportion=scores
            )
            df = df.sort_values(by="match_proportion", ascending=False)
        limit = 5 if number_of_results is None else number_of_results
        return df.head(limit)

class CensusResult:
    """Hold the HTTP response of one Census API query and its decoded body.

    Attributes:
        response: The response object returned by the request.
        variables: The variable names that were requested.
        data: The value returned by ``response.json()``.
    """

    def __init__(self, response: requests.Response, variables: list[str]):
        self.response = response
        self.variables = variables
        # Decoded eagerly, so a body that is not valid JSON raises here.
        self.data = response.json()

11 changes: 11 additions & 0 deletions src/bbd/census/census_table.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
from dataclasses import dataclass, field


@dataclass
class CensusTable:
    """One Census table (variable group) together with its attributes."""

    # Identifier of the table, i.e. the group name.
    variable_id: str
    # Description text of the table.
    variable_description: str
    # (attribute name, label) pairs belonging to this table.
    attributes: list[tuple[str, str]]

    def fetch_dataframe(self):
        """Placeholder: not implemented yet, returns ``None``."""
        return None
7 changes: 5 additions & 2 deletions src/bbd/census/datasets.py → src/bbd/census/dataset.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,10 @@
class DataSets:
import enum

class DataSet(enum.Enum):
"""Datasets available in the census API"""

ACS5_DETAIL = "acs/acs5"
ACS5 = "acs/acs5"
ACS5_SUBJECT = "acs/acs5/subject"
ACS5_PROFILE = "acs/acs5/profile"
ACS5_CPROFILE = "acs/acs5/cprofile"
ACS1 = "acs/acs1"
10 changes: 0 additions & 10 deletions src/bbd/census/geography.py

This file was deleted.

Loading