diff --git a/.gitignore b/.gitignore index d4200f8..59aa663 100644 --- a/.gitignore +++ b/.gitignore @@ -6,6 +6,7 @@ __pycache__ .vscode .tox .python-version +.DS_Store # Sublime environment *.sublime-project @@ -34,3 +35,6 @@ user # Shh, secrets census_api_key.txt + +#api key +config.py diff --git a/.idea/.gitignore b/.idea/.gitignore new file mode 100644 index 0000000..13566b8 --- /dev/null +++ b/.idea/.gitignore @@ -0,0 +1,8 @@ +# Default ignored files +/shelf/ +/workspace.xml +# Editor-based HTTP Client requests +/httpRequests/ +# Datasource local storage ignored files +/dataSources/ +/dataSources.local.xml diff --git a/.idea/censusExplorer.iml b/.idea/censusExplorer.iml new file mode 100644 index 0000000..5f3d3c2 --- /dev/null +++ b/.idea/censusExplorer.iml @@ -0,0 +1,12 @@ + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/inspectionProfiles/profiles_settings.xml b/.idea/inspectionProfiles/profiles_settings.xml new file mode 100644 index 0000000..105ce2d --- /dev/null +++ b/.idea/inspectionProfiles/profiles_settings.xml @@ -0,0 +1,6 @@ + + + + \ No newline at end of file diff --git a/.idea/misc.xml b/.idea/misc.xml new file mode 100644 index 0000000..6fdd81d --- /dev/null +++ b/.idea/misc.xml @@ -0,0 +1,4 @@ + + + + \ No newline at end of file diff --git a/.idea/modules.xml b/.idea/modules.xml new file mode 100644 index 0000000..046df8a --- /dev/null +++ b/.idea/modules.xml @@ -0,0 +1,8 @@ + + + + + + + + \ No newline at end of file diff --git a/.idea/vcs.xml b/.idea/vcs.xml new file mode 100644 index 0000000..94a25f7 --- /dev/null +++ b/.idea/vcs.xml @@ -0,0 +1,6 @@ + + + + + + \ No newline at end of file diff --git a/README.md b/README.md index 44ee561..916cf0b 100644 --- a/README.md +++ b/README.md @@ -38,14 +38,14 @@ For our working example, we'll use median household income (which is coded in th We can simply downloaded the census data like so: ```python ->>> from bbd import census ->>> data = census.get_acs( ->>> 
geography=census.Geography.CD, ->>> variables="group(DP03),NAME", ->>> year=2018, ->>> state="co", ->>> dataset=census.DataSets.ACS5_PROFILE, ->>> ) +>>> from bbd import census +>>> data = census.get_acs( +>>> geography=census.Geography.CD, +>>> variables="group(DP03),NAME", +>>> year=2018, +>>> state="co", +>>> dataset=census.DataSet.ACS5_PROFILE, +>>> ) ``` https://api.census.gov/data/2018/acs/acs1/profile?get=group(DP03),NAME&for=congressional%20district:*&in=state:08 diff --git a/examples/co_income.py b/examples/co_income.py index 51ecd25..e6b447b 100644 --- a/examples/co_income.py +++ b/examples/co_income.py @@ -26,7 +26,7 @@ geography=census.Geography.TRACT, variables=["NAME", "DP03_0062E"], year=2018, - dataset=census.DataSets.ACS5_PROFILE, + dataset=census.DataSet.ACS5_PROFILE, state="co", county="069", # Larimer County cache=True, diff --git a/examples/get_acs_example.py b/examples/get_acs_example.py index 39ee219..b0a6c5a 100644 --- a/examples/get_acs_example.py +++ b/examples/get_acs_example.py @@ -14,7 +14,7 @@ geography=census.Geography.STATE, variables="NAME,B03003_001E", year=2018, - dataset=census.DataSets.ACS5_DETAIL, + dataset=census.DataSet.ACS5, ) pprint(data) diff --git a/examples/tx_hispanic_or_latino.py b/examples/tx_hispanic_or_latino.py index 4dbec4d..397c123 100644 --- a/examples/tx_hispanic_or_latino.py +++ b/examples/tx_hispanic_or_latino.py @@ -36,7 +36,7 @@ geography=census.Geography.BLOCKGROUP, variables=["NAME", "B03003_001E", "B03003_002E", "B03003_003E"], year=2018, - dataset=census.DataSets.ACS5_DETAIL, + dataset=census.DataSet.ACS5, state="tx", county="201", # Harris County cache=True, diff --git a/examples/tx_zip_code_by_race.py b/examples/tx_zip_code_by_race.py index 8ddc152..bb28f41 100644 --- a/examples/tx_zip_code_by_race.py +++ b/examples/tx_zip_code_by_race.py @@ -56,7 +56,7 @@ geography=census.Geography.ZCTA, variables=list(variables.keys()), year=2018, - dataset=census.DataSets.ACS5_DETAIL, +
dataset=census.DataSet.ACS5, # state="tx", # county="201": "Harris County cache=True, diff --git a/src/bbd/__init__.py b/src/bbd/__init__.py index a7cf406..4cf45e9 100644 --- a/src/bbd/__init__.py +++ b/src/bbd/__init__.py @@ -1,8 +1,16 @@ __version__ = "0.0.7" -# TODO add relevant imports... -from .working_directory import working_directory -from .geocoder import geocoder +from . import working_directory +# import bbd.geocoder as geocoder +# import bbd.census as census +# import bbd.fec as fec +# import bbd.gis as gis +from . import geocoder +from . import census +from . import fec +from . import gis +from . import models +from . import elections -__all__ = [working_directory, geocoder] +__all__ = [working_directory, geocoder, census, fec, gis, models, elections] diff --git a/src/bbd/census/__init__.py b/src/bbd/census/__init__.py index e7e79cb..1ed2a09 100644 --- a/src/bbd/census/__init__.py +++ b/src/bbd/census/__init__.py @@ -1,17 +1,18 @@ +from .census import Census from .get_shapefile import get_shapefile -from .geography import Geography -from .datasets import DataSets +from .dataset import DataSet from .load import load_json_file, load_json_str from .get_acs import get_acs, construct_api_call -from .api_key import api_key +from .api_key import api_key, _ApiKey __all__ = [ get_shapefile, - Geography, - DataSets, + DataSet, load_json_file, load_json_str, get_acs, construct_api_call, api_key, + _ApiKey, + Census, ] diff --git a/src/bbd/census/api_key.py b/src/bbd/census/api_key.py index fef12dc..dbc267b 100644 --- a/src/bbd/census/api_key.py +++ b/src/bbd/census/api_key.py @@ -7,17 +7,17 @@ def key(self): if self._key is None: raise ValueError("Census api key has not been set!") else: - return self._key + return self._key @key.setter def key(self, key_value: str): if not isinstance(key_value, str): - raise ValueError( + raise TypeError( f"Cannot set census api key to {key_value} of type {type(key_value)}. 
" "Value should be a 'str'" ) else: - self._key = key_value + self._key = key_value.strip() api_key = _ApiKey() diff --git a/src/bbd/census/census.py b/src/bbd/census/census.py new file mode 100644 index 0000000..dc00b19 --- /dev/null +++ b/src/bbd/census/census.py @@ -0,0 +1,117 @@ +from __future__ import annotations +import pandas as pd +from dataclasses import dataclass, field +from typing import Optional, OrderedDict +from bbd.census.census_table import CensusTable +from bbd.models import geography +import urllib.parse +import requests +from sklearn.feature_extraction.text import CountVectorizer +from sklearn.metrics.pairwise import cosine_similarity +pd.set_option('display.max_columns', None) + +@dataclass +class Census: + api_key: _ApiKey + geography_values: OrderedDict[geography.Geography, str] + year: str | int + dataset: dataset.Dataset + results: list[str] = field(default_factory = list) # list of CensusResult objects + available_variables: pd.DataFrame = field(default_factory = pd.DataFrame) # dataframe of all available variables + census_tables: list[CensusTable] = field(default_factory = list) # a list of CensusTable objects + + def _build_url(self, variables: list[str]): + base_url = "https://api.census.gov/data" + + # Collect all parts + year = self.year + dataset = self.dataset.value + variables = ",".join(variables) + key = self.api_key + + # Parse the geography + geo_statements = list(self.geography_values.items()) + statement_count = len(geo_statements) + geo_url = "" + for i in range(statement_count): + if i < statement_count: + prefix = "for" + else: + prefix = "in" + geo_url = geo_url + (f"&{prefix}={urllib.parse.quote(geo_statements[i][0].value)}:{geo_statements[i][1]}") + + full_url = f"{base_url}/{year}/{dataset}?get={variables}{geo_url}&key={key}" + return full_url + + + def _make_query(self, variables): + url = self._build_url(variables) + response = requests.get(url) + return response + + def get_acs(self, variables) -> CensusResult: + 
'''Query the database ''' + response = self._make_query(variables) + result = CensusResult(response=response, variables=variables) + self.results.append(result) + return result + + def _proportion_match(self, search_string: str, match_string:str): + search_string = search_string.lower() + match_string = match_string.lower() + cv = CountVectorizer() + count_matrix = cv.fit_transform([search_string, match_string]) + proportion_match = cosine_similarity(count_matrix)[0][1] + return proportion_match + + def _get_all_vars(self): + if len(self.census_tables) == 0: + url = f"https://api.census.gov/data/{self.year}/{self.dataset.value}/variables.json" + variable_data = requests.get(url) + json = variable_data.json() + attribute_names = [item for item in json["variables"]] + names_to_tables = {} + for item in attribute_names: + one_attribute = json["variables"][item] + if "concept" in one_attribute and "label" in one_attribute and "group" in one_attribute: + label = one_attribute["label"] + concept = one_attribute["concept"] + group = one_attribute["group"] + if group not in names_to_tables: + names_to_tables[group] = CensusTable(variable_id = group, + variable_description = concept, + attributes = [(item, label)]) + else: + names_to_tables[group].attributes.append((item, label)) + self.census_tables = names_to_tables + return self.census_tables + + def _datafame_all_variables(self): + if len(self.available_variables) == 0: + names_to_tables = self._get_all_vars() + df = pd.DataFrame() + df["variable_id"] = [item.variable_id for item in names_to_tables.values()] + df["variable_description"] = [item.variable_description for item in names_to_tables.values()] + df["attributes"] = [item.attributes for item in names_to_tables.values()] + df["attribute_names"] = df["attributes"].apply(lambda x: [item[0] for item in x]) + self.available_variables = df + return self.available_variables + + def search_variables(self, search_string: Optional[str] = None, number_of_results: 
Optional[int] = None): + df = self._datafame_all_variables() + if search_string is not None: + proportion_matches = df["variable_description"].apply(lambda x: self._proportion_match(search_string, x)) + df["match_proportion"] = proportion_matches + df = df[["variable_id", "variable_description", "attribute_names", "match_proportion"]] + df = df.sort_values(by="match_proportion", ascending=False).head(number_of_results) + if number_of_results is not None: + return df.head(number_of_results) + else: + return df.head() + +class CensusResult(): + def __init__(self, response: requests.Response, variables: list[str]): + self.response = response + self.variables = variables + self.data = response.json() + diff --git a/src/bbd/census/census_table.py b/src/bbd/census/census_table.py new file mode 100644 index 0000000..e1680c6 --- /dev/null +++ b/src/bbd/census/census_table.py @@ -0,0 +1,11 @@ +from dataclasses import dataclass, field + + +@dataclass +class CensusTable(): + variable_id: str + variable_description: str + attributes: list[tuple[str, str]] + + def fetch_dataframe(self): + pass \ No newline at end of file diff --git a/src/bbd/census/datasets.py b/src/bbd/census/dataset.py similarity index 66% rename from src/bbd/census/datasets.py rename to src/bbd/census/dataset.py index aa6a186..b86cf76 100644 --- a/src/bbd/census/datasets.py +++ b/src/bbd/census/dataset.py @@ -1,7 +1,10 @@ -class DataSets: +import enum + +class DataSet(enum.Enum): """Datasets available in the census API""" - ACS5_DETAIL = "acs/acs5" + ACS5 = "acs/acs5" ACS5_SUBJECT = "acs/acs5/subject" ACS5_PROFILE = "acs/acs5/profile" ACS5_CPROFILE = "acs/acs5/cprofile" + ACS1 = "acs/acs1" \ No newline at end of file diff --git a/src/bbd/census/geography.py b/src/bbd/census/geography.py deleted file mode 100644 index 0823e32..0000000 --- a/src/bbd/census/geography.py +++ /dev/null @@ -1,10 +0,0 @@ -class Geography: - """Geography available for download""" - - TRACT = "tract" - CD = "congressional district" -
COUNTY = "county" - STATE = "state" - ZCTA = "zip code tabulation area" - BLOCK = "block" - BLOCKGROUP = "block group" diff --git a/src/bbd/census/get_acs.py b/src/bbd/census/get_acs.py index b7d1962..c16f58b 100644 --- a/src/bbd/census/get_acs.py +++ b/src/bbd/census/get_acs.py @@ -1,12 +1,12 @@ import re -from typing import Union, List +from typing import Union, List, Optional import requests from ..working_directory import working_directory -from .geography import Geography -from .datasets import DataSets +from ..models import Geography +from .dataset import DataSet from .api_key import api_key from .load import load_json_str, load_json_file from .us import state_to_fips @@ -16,7 +16,7 @@ def get_acs( geography: Geography, variables: Union[str, List[str]], year: Union[str, int] = 2018, - dataset: DataSets = DataSets.ACS5_DETAIL, + dataset: DataSet = DataSet.ACS5, state: Union[str, None] = None, county: Union[str, None] = None, cache: bool = False, @@ -76,9 +76,9 @@ def construct_api_call( geography: Geography, variables: Union[str, List[str]], year: Union[str, int] = 2018, - dataset: DataSets = DataSets.ACS5_DETAIL, - state: Union[str, None] = None, - county: Union[str, None] = None, + dataset: DataSet = DataSet.ACS5, + state: Optional[str] = None, + county: Optional[str] = None, ): """Construct a url call to the census api""" @@ -90,16 +90,10 @@ def construct_api_call( for_geography = f"&for={geography}:*" # If a state is provided, request the data returned be within it - if state is not None: - in_state = f"&in=state:{state_to_fips(state)}" - else: - in_state = "" + in_state = "" if state is None else f"&in=state:{state_to_fips(state)}" # If a county is provided, request the data returned be within it - if county is not None: - in_county = f"&in=county:{county}" - else: - in_county = "" + in_county = "" if county is None else f"&in=county:{county}" # Census api call return ( diff --git a/src/bbd/census/get_shapefile.py b/src/bbd/census/get_shapefile.py index 
4db3c8b..f8f62c3 100644 --- a/src/bbd/census/get_shapefile.py +++ b/src/bbd/census/get_shapefile.py @@ -9,7 +9,7 @@ from ..working_directory import working_directory -from .geography import Geography +from ..models import Geography from .us import state_to_fips """Maps year to congressional district number""" diff --git a/src/bbd/elections/__init__.py b/src/bbd/elections/__init__.py new file mode 100644 index 0000000..ced0892 --- /dev/null +++ b/src/bbd/elections/__init__.py @@ -0,0 +1 @@ +from .get_elections import get_elections \ No newline at end of file diff --git a/src/bbd/elections/get_elections.py b/src/bbd/elections/get_elections.py new file mode 100644 index 0000000..6d26501 --- /dev/null +++ b/src/bbd/elections/get_elections.py @@ -0,0 +1,33 @@ +import pandas as pd +from ..models import Geography, Election +from typing import Optional, Iterable + +def get_elections(election_office: Optional[Election] = None, + aggregate_into: Optional[Geography] = None, + years: Iterable[int] = (), + districts: Iterable[Geography] = ()): + """ + Returns the election results for a specific election office and election year and returns the Democratic & Republican vote share + + TODOs: + - collect data + - connect to data here (caching?) + - run aggregation + - Could we allow passing in an arbitrary geoshape to aggregate into? + - Can we get turnout? 
+ """ + + # dict summarizing dataCoverage + dataCoverage = { + "TX": { + "years": [], + "election_office": [Election.SL, Election.SU, Election.PRES], + "geo_levels": [Geography.PRECINCT, Geography.SL, Geography.UL, Geography.COUNTY, Geography.STATE], + # state, all 33 state districts, all 150 house districts + } + } + + + #example return structure + return pd.DataFrame([["State House District 21", "TX_HD21", 47.0, 46.5]], + columns=["Election Office", "District", "Democratic Vote Share", "Republican Vote Share"]) diff --git a/src/bbd/fec/api_key.py b/src/bbd/fec/api_key.py index 54ce030..f218837 100644 --- a/src/bbd/fec/api_key.py +++ b/src/bbd/fec/api_key.py @@ -17,6 +17,6 @@ def key(self, key_value: str): "Value should be a 'str.'" ) else: - self._key = key_value + self._key = key_value.strip() api_key = _ApiKey() diff --git a/src/bbd/fec/get_fec.py b/src/bbd/fec/get_fec.py index a1e3769..3b92904 100644 --- a/src/bbd/fec/get_fec.py +++ b/src/bbd/fec/get_fec.py @@ -17,6 +17,8 @@ def get_fec( params: dict, cache: bool = False ): + raise ValueError("TEST") + """Get OpenFEC data. 
See https://api.open.fec.gov/developers for a list of endpoints and the parameters associated with each endpoint.""" call = construct_api_call(endpoint, params) diff --git a/src/bbd/models/__init__.py b/src/bbd/models/__init__.py new file mode 100644 index 0000000..117d69d --- /dev/null +++ b/src/bbd/models/__init__.py @@ -0,0 +1,4 @@ +from .geography import Geography +from .election import Election + +__all__ = [Geography, Election] \ No newline at end of file diff --git a/src/bbd/models/candidate.py b/src/bbd/models/candidate.py new file mode 100644 index 0000000..8a72d5b --- /dev/null +++ b/src/bbd/models/candidate.py @@ -0,0 +1,9 @@ +from .geography import Geography + +class Candidate: + + def __init__(self, name: str, district: Geography, votes_received: int = None, vote_share_2_way: float = None, filing_id: str = None): + self.name = name + self.filing_id = filing_id + self.district = district + \ No newline at end of file diff --git a/src/bbd/models/election.py b/src/bbd/models/election.py new file mode 100644 index 0000000..af57ae2 --- /dev/null +++ b/src/bbd/models/election.py @@ -0,0 +1,8 @@ +class Election: + + SL = "State legislative (lower)" + SU = "State legislative (upper)" + CD = "Congressional" + PRES = "Presidential" + SEN = "US Senate" + GOV = "Governor" \ No newline at end of file diff --git a/src/bbd/models/geography.py b/src/bbd/models/geography.py new file mode 100644 index 0000000..2a3242b --- /dev/null +++ b/src/bbd/models/geography.py @@ -0,0 +1,62 @@ +from __future__ import annotations +import enum + + +class Geography(enum.Enum): + US = "us" + REGION = "region" + DIVISION = "division" + STATE = "state" + COUNTY = "county" + COUNTY_SUBDIVISION = "county subdivision" + PLACE = "place" + ALASKA_NATIVE_REGIONAL_CORPORATION = "alaska native regional corporation" + CONGRESSIONAL_DISTRICT = "congressional district" + PUBLIC_USE_MICRODATA_AREA = "public use microdata area" + SCHOOL_DISTRICT_ELEMENTARY = "school district (elementary)" +
SCHOOL_DISTRICT_SECONDARY = "school district (secondary)" + SCHOOL_DISTRICT_UNIFIED = "school district (unified)" + AMERICAN_INDIAN_ALASKA_NATIVE_HAWIIAN = "american indian area/alaska native area/hawaiian home land" + METROPOLITAN_MICROPOLITAN_STATISTICAL_AREA = "metropolitan statistical area/micropolitan statistical area" + STATE_OR_PART = "state (or part)" + PRINCIPAL_CITY_OR_PART = "principal city (or part)" + METROPOLITAN_DIVISION = "metropolitan division" + COMBINED_STATISTICAL_AREA = "combined statistical area" + COMBINED_NEW_ENGLAND_CITY_AND_TOWN_AREA = "combined new england city and town area" + NEW_ENGLAND_CITY_AND_TOWN_AREA = "new england city and town area" + PRINCIPAL_CITY = "principal city" + NECTA_DIVISION = "necta division" + URBAN_AREA = "urban area" + + + + # """Geographies, meaning districts or regions, referencing both census and election data definitions, prioritizing census definitions""" + # + # TRACT = "tract" + # CD = "congressional district" + # COUNTY = "county" + # STATE = "state" + # ZCTA = "zip code tabulation area" + # BLOCK = "block" + # BLOCKGROUP = "block group" + # # NEW: + # SL = "state legislative district (lower)" + # UL = "state legislative district (upper)" + # PRECINCT = "precinct" + # + # + # def __init__(self, geo_type, id: str, name: str, shape=None): + # self.geo_type = geo_type + # self.id = id + # self.name = name + # self.shape = shape + # + # + # + # def find_intersections(self, other_geography): + # """ + # Returns bool if self intersects with other_geography + # + # If "shape" is not defined for this geographi + # """ + # return None \ No newline at end of file diff --git a/tests/census/test_api_key.py b/tests/census/test_api_key.py index e33e703..3c8d38d 100644 --- a/tests/census/test_api_key.py +++ b/tests/census/test_api_key.py @@ -1,13 +1,52 @@ import pytest +import requests +import requests_mock -from bbd import census +from bbd import census, models def test_api_key_required(): with pytest.raises(ValueError): 
census.construct_api_call( - geography=census.Geography.STATE, + geography=models.Geography.STATE, variables="B03003_001E", year=2018, - dataset=census.DataSets.ACS5_DETAIL, + dataset=census.DataSet.ACS5, ) + + +@pytest.mark.skip(reason="Seems incomplete, and creates errors in other tests") +def test_can_request_data(): + census.api_key = "" + + +def test_can_set_api_key(): + SAMPLE_API_KEY = "abc123" + census.api_key.key = SAMPLE_API_KEY + assert census.api_key.key == SAMPLE_API_KEY + + +def test_can_call_census_api(): + SAMPLE_API_KEY = "abc123" + census.api_key.key = SAMPLE_API_KEY + url = census.construct_api_call( + geography=models.Geography.STATE, + variables="B03003_001E", + year=2018, + dataset=census.DataSet.ACS5, + ) + + def callback(request: requests.Request, context): + # Check for "valid" sample API key + if request.qs['key'] != [SAMPLE_API_KEY]: + context.status_code = 404 + context.reason = "Invalid API key" + return "Invalid API key" + else: + context.status_code = 200 + return "Valid API key" + + with requests_mock.Mocker() as mocker: + mocker.get(url, text=callback) + assert requests.get(url).text == "Valid API key" + diff --git a/tests/census/test_census.py b/tests/census/test_census.py new file mode 100644 index 0000000..9ea6f08 --- /dev/null +++ b/tests/census/test_census.py @@ -0,0 +1,135 @@ +from bbd.census import Census, DataSet +from bbd.models import Geography +from config import API_KEY +from collections import OrderedDict + +def test_build_url(): + api_key = "YOUR_KEY_GOES_HERE" + year = 2019 + dataset = DataSet.ACS1 + variables = ["NAME", "B01001_001E"] + geography_values = OrderedDict() + geography_values[Geography.STATE] = "36" + geography_values[Geography.COUNTY] = "*" + geography_values[Geography.COUNTY_SUBDIVISION] = "*" + census = Census(api_key=api_key, geography_values=geography_values, year=year, dataset=dataset) + goal_url =
"https://api.census.gov/data/2019/acs/acs1?get=NAME,B01001_001E&for=state:36&for=county:*&for=county%20subdivision:*&key=YOUR_KEY_GOES_HERE" + test_url = census._build_url(variables) + print(test_url) + assert goal_url == test_url + +def test_make_query(): + api_key = API_KEY + year = 2019 + dataset = DataSet.ACS1 + variables = ["NAME", "B01001_001E"] + geography_values = OrderedDict() + geography_values[Geography.STATE] = "36" + geography_values[Geography.COUNTY] = "*" + geography_values[Geography.COUNTY_SUBDIVISION] = "*" + census = Census(api_key=api_key, geography_values=geography_values, year=year, dataset=dataset) + result = census._make_query(variables) + print(result.json()) + assert result is not None + +def test_get_census_result(): + api_key = API_KEY + year = 2019 + dataset = DataSet.ACS1 + variables = ["NAME", "B01001_001E"] + geography_values = OrderedDict() + geography_values[Geography.STATE] = "36" + geography_values[Geography.COUNTY] = "*" + geography_values[Geography.COUNTY_SUBDIVISION] = "*" + census = Census(api_key=api_key, geography_values=geography_values, year=year, dataset=dataset) + result = census.get_acs(variables) + print(f"result json: {result.data}") + assert result.data is not None + +def test_get_all_vars(): + api_key = API_KEY + year = 2019 + dataset = DataSet.ACS1 + variables = ["NAME", "B01001_001E"] + geography_values = OrderedDict() + geography_values[Geography.STATE] = "36" + geography_values[Geography.COUNTY] = "*" + geography_values[Geography.COUNTY_SUBDIVISION] = "*" + census = Census(api_key=api_key, geography_values=geography_values, year=year, dataset=dataset) + names_to_tables = census._get_all_vars() + print(names_to_tables) + assert len(names_to_tables) > 50 + +def test_proportion_match(): + search_string = "this is the first string" + comparison_string = "AND THIS, MY FRIEND, IS THE SECOND STRING" + api_key = API_KEY + year = 2019 + dataset = DataSet.ACS1 + variables = ["NAME", "B01001_001E"] + geography_values = 
OrderedDict() + geography_values[Geography.STATE] = "36" + geography_values[Geography.COUNTY] = "*" + geography_values[Geography.COUNTY_SUBDIVISION] = "*" + census = Census(api_key=api_key, geography_values=geography_values, year=year, dataset=dataset) + match_proportion = census._proportion_match(search_string, comparison_string) + print(match_proportion) + assert match_proportion > 0.50 + +def test_dataframe_all_variables(): + api_key = API_KEY + year = 2019 + dataset = DataSet.ACS1 + geography_values = OrderedDict() + geography_values[Geography.STATE] = "36" + geography_values[Geography.COUNTY] = "*" + geography_values[Geography.COUNTY_SUBDIVISION] = "*" + census = Census(api_key=api_key, geography_values=geography_values, year=year, dataset=dataset) + df = census._datafame_all_variables() + print(df) + assert df is not None + +def test_census_search_variables(): + api_key = API_KEY + year = 2019 + dataset = DataSet.ACS1 + geography_values = OrderedDict() + geography_values[Geography.STATE] = "36" + geography_values[Geography.COUNTY] = "*" + geography_values[Geography.COUNTY_SUBDIVISION] = "*" + census = Census(api_key=api_key, geography_values=geography_values, year=year, dataset=dataset) + search_string = "geography sex by occupation of workers" + number_of_results = 10 + df = census.search_variables(search_string, number_of_results) + assert len(df) > 0 + assert len(df["match_proportion"]) > 0 + assert len(df.columns) == 4 + print(df) + +def test_census_search_variables_no_string(): + api_key = API_KEY + year = 2019 + dataset = DataSet.ACS1 + geography_values = OrderedDict() + geography_values[Geography.STATE] = "36" + geography_values[Geography.COUNTY] = "*" + geography_values[Geography.COUNTY_SUBDIVISION] = "*" + census = Census(api_key=api_key, geography_values=geography_values, year=year, dataset=dataset) + search_string = "geography sex by occupation of workers" + number_of_results = 10 + df = census.search_variables(search_string = None, 
number_of_results = 30) + assert len(df) > 0 + print(df) + +def test_acs_to_df(): + api_key = API_KEY + year = 2019 + dataset = DataSet.ACS1 + geography_values = OrderedDict() + geography_values[Geography.STATE] = "36" + geography_values[Geography.COUNTY] = "*" + geography_values[Geography.COUNTY_SUBDIVISION] = "*" + variables = ["NAME", "B01001_001E"] + census = Census(api_key=api_key, geography_values=geography_values, year=year, dataset=dataset) + result = census.get_acs(variables) + print(result.data) diff --git a/tests/census/test_get_acs.py b/tests/census/test_get_acs.py index e2a8578..5b08d24 100644 --- a/tests/census/test_get_acs.py +++ b/tests/census/test_get_acs.py @@ -1,14 +1,14 @@ -from bbd import census +from bbd import census, models def _construct_call(variables): census.api_key.key = "MyApiKey" return census.construct_api_call( - geography=census.Geography.STATE, + geography=models.Geography.STATE, variables=variables, year=2018, - dataset=census.DataSets.ACS5_DETAIL, + dataset=census.DataSet.ACS5, ) diff --git a/tests/geocoder/test_geocoder.py b/tests/geocoder/test_geocoder.py index e51c68c..ba8705d 100644 --- a/tests/geocoder/test_geocoder.py +++ b/tests/geocoder/test_geocoder.py @@ -283,26 +283,27 @@ def test_LocationsGeocoder_make_file_header(self, tmp_path): assert len(test.columns) <= 4, error_msg - def test_LocationsGeocoder_run_one_batch(self, tmp_path): - """Tests the .run() and associated methods for - LocationsGeocoder. - """ - p = tmp_path/"test.csv" + # def test_LocationsGeocoder_run_one_batch(self, tmp_path): + # #FIXME -- probably replace all this complex "dummy_geocoder" with a monkeypatch: https://stackoverflow.com/questions/51392889/python-pytest-occasionally-fails-with-oserror-reading-from-stdin-while-output-i + # """Tests the .run() and associated methods for + # LocationsGeocoder. 
+ # """ + # p = tmp_path/"test.csv" - gl = gc.LocationsGeocoder(self.address_df, valid_email, p) - gl._set_dummy_geocoder() + # gl = gc.LocationsGeocoder(self.address_df, valid_email, p) + # gl._set_dummy_geocoder() - gl.run() + # gl.run() - test = pd.read_csv(p, sep = "\t") + # test = pd.read_csv(p, sep = "\t") - assert not test.empty, "No results saved to disk." - assert not gl.locations.empty, "No results stored in object." + # assert not test.empty, "No results saved to disk." + # assert not gl.locations.empty, "No results stored in object." - assert len(test) == 234, "Not all lines were saved to disk" - assert len(gl.locations) == 234, "Not all lines were saved in object" + # assert len(test) == 234, "Not all lines were saved to disk" + # assert len(gl.locations) == 234, "Not all lines were saved in object" - assert len(test.columns) <= 4, "Saved too many columns in tsv format" + # assert len(test.columns) <= 4, "Saved too many columns in tsv format" def test_LocationsGeocoder_run_mult_batches(self, tmp_path): @@ -410,26 +411,27 @@ def test_LocationsGeocoder_test_real_geocoder(self, tmp_path): assert all(gl.locations.all()) - def test_LocationsGeocoder_reset(self, tmp_path): - """ - """ - #Mocks the input asking for DELETE - gc.input = lambda *args : "DELETE" + # def test_LocationsGeocoder_reset(self, tmp_path): + # """ + # """ + # #FIXME: -- probably replace all this complex "dummy_geocoder" with a monkeypatch: https://stackoverflow.com/questions/51392889/python-pytest-occasionally-fails-with-oserror-reading-from-stdin-while-output-i + # #Mocks the input asking for DELETE + # gc.input = lambda *args : "DELETE" - p = tmp_path/"test.csv" + # p = tmp_path/"test.csv" - gl = gc.LocationsGeocoder(self.address_df, valid_email, p) - gl._set_dummy_geocoder() + # gl = gc.LocationsGeocoder(self.address_df, valid_email, p) + # gl._set_dummy_geocoder() - gl.run() + # gl.run() - gl.reset() + # gl.reset() - test = pd.read_csv(p, sep = "\t") + # test = pd.read_csv(p, sep = 
"\t") - assert test.empty, "File was not deleted" - assert gl.curr_batch == 1, "Batches were not reset" - assert len(gl._queue) == 234, "Queue was not reset" + # assert test.empty, "File was not deleted" + # assert gl.curr_batch == 1, "Batches were not reset" + # assert len(gl._queue) == 234, "Queue was not reset" def teardown_method(self):