refactor creation of citycodes table: only use cities from world city data; manual translation of some city and country names to short names to allow join with world city data; auto creation of city codes that could not be identified
danielphilippi committed Mar 28, 2021
1 parent 1cf5946 commit 65da367
Showing 4 changed files with 235 additions and 27 deletions.
30 changes: 17 additions & 13 deletions datamanager/dataloader.py
@@ -3,22 +3,13 @@
 import os
 
 from definitions import *
-from datamanager.processing import prepare_unloc, prepare_ccodes, check_ccode_loccode
+from datamanager.processing import prepare_unloc, prepare_ccodes, check_ccode_loccode, prepare_world_cities,\
+    enrich_cities
 
 
 rawdatainfo = pd.read_csv(joinpath(DATAMGR_PATH, 'overview_rawdata.csv'), sep=';')
 
-# city codes
-if not os.path.isfile(UNLOC_FILE_CLEAN_ABS):
-    city_codes = prepare_unloc(unloc_files=UNLOC_FILES_ABS)
-    with open(UNLOC_FILE_CLEAN_ABS, 'wb') as f:
-        pickle.dump(city_codes, f)
-else:
-    with open(UNLOC_FILE_CLEAN_ABS, 'rb') as f:
-        city_codes = pickle.load(f)
-print(city_codes.shape)
-
-# ccodes
+# country codes
 if not os.path.isfile(CCODE_FILE_CLEAN_ABS):
     country_codes = prepare_ccodes(ccode_file=CCODE_FILE_ABS)
     with open(CCODE_FILE_CLEAN_ABS, 'wb') as f:
@@ -29,4 +20,17 @@
 print(country_codes.shape)
 
 
-check_ccode_loccode(country_codes, city_codes)
+# city codes
+if not os.path.isfile(UNLOC_FILE_CLEAN_ABS):
+    city_codes = prepare_unloc(unloc_files=UNLOC_FILES_ABS)
+    check_ccode_loccode(country_codes, city_codes)
+
+    cities = prepare_world_cities(world_city_file=WORLD_CITIES_FILE_ABS)
+    final_cities = enrich_cities(cities=cities, country_codes=country_codes, city_codes=city_codes)
+
+    with open(UNLOC_FILE_CLEAN_ABS, 'wb') as f:
+        pickle.dump(final_cities, f)
+else:
+    with open(UNLOC_FILE_CLEAN_ABS, 'rb') as f:
+        final_cities = pickle.load(f)
+print(final_cities.shape)
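
Both branches of the city-codes block above follow a compute-or-load pickle cache: build the table once, persist it, and reuse the pickle on later runs. A minimal sketch of the same pattern factored into a reusable helper (the helper name cached_pickle and the builder argument are illustrative, not part of this repo):

import os
import pickle

def cached_pickle(path, builder):
    # First run: build the object and cache it; later runs: load the pickle.
    if not os.path.isfile(path):
        obj = builder()
        with open(path, 'wb') as f:
            pickle.dump(obj, f)
    else:
        with open(path, 'rb') as f:
            obj = pickle.load(f)
    return obj

# e.g. final_cities = cached_pickle(UNLOC_FILE_CLEAN_ABS, build_final_cities)
# (build_final_cities is a hypothetical zero-argument builder function)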
5 changes: 2 additions & 3 deletions datamanager/overview_rawdata.csv
@@ -1,3 +1,2 @@
-id;name;filename;granularity;dims
-1;citytemp;city_temperature_2020.csv;city;AvgTemperature
-2;test;test;test;test
+id;dataset;filename;granularity;dims
+1;citytemp;city_temperature_2020.csv;city;AvgTemperature
216 changes: 207 additions & 9 deletions datamanager/processing.py
@@ -1,4 +1,5 @@
 import pandas as pd
+import string
 
 
 def filter_rows(df, condition, reason):
@@ -39,30 +40,67 @@ def clean_unloc(unloc):
 def prepare_unloc(unloc_files):
     if not isinstance(unloc_files, list):
         unloc_files = [unloc_files]
-    unloc_comb = pd.DataFrame()
+    city_codes = pd.DataFrame()
     for file_path in unloc_files:
         print(file_path)
         unloc_raw = pd.read_csv(file_path, encoding='ISO-8859-1', header=None)
         unloc_tmp = clean_unloc(unloc_raw)
-        unloc_comb = pd.concat([unloc_comb, unloc_tmp])
-    return unloc_comb
+        city_codes = pd.concat([city_codes, unloc_tmp])
+
+    # cleaning
+    city_codes = city_codes[['ccode', 'locname', 'loccode']]
+    city_codes = city_codes.rename(columns={'ccode': 'country_code', 'locname': 'city', 'loccode': 'city_code'})
+    city_codes = city_codes.groupby(['country_code', 'city']).city_code.first().reset_index()
+
+    return city_codes
 
 
 def prepare_ccodes(ccode_file):
-    ccodes = pd.read_csv(ccode_file, names=['cname', 'ccode'], header=0)
-    filter_cname = (~ccodes.cname.isin(['Namibia', 'Bouvet Island']))
-    ccodes = ccodes[filter_cname]
-    return ccodes
+    country = pd.read_csv(ccode_file, names=['cname', 'ccode'], header=0)
+    filter_cname = (~country.cname.isin(['Namibia', 'Bouvet Island']))
+    country = country[filter_cname]
+
+    country.rename(columns={'cname': 'country', 'ccode': 'country_code'}, inplace=True)
+
+    country.loc[country.country_code == 'KR', 'country'] = 'South Korea'
+    country.loc[country.country_code == 'RU', 'country'] = 'Russia'
+    country.loc[country.country_code == 'IR', 'country'] = 'Iran'
+    country.loc[country.country_code == 'VN', 'country'] = 'Vietnam'
+    country.loc[country.country_code == 'TZ', 'country'] = 'Tanzania'
+    country.loc[country.country_code == 'BO', 'country'] = 'Bolivia'
+    country.loc[country.country_code == 'TW', 'country'] = 'Taiwan'
+    country.loc[country.country_code == 'VE', 'country'] = 'Venezuela'
+    country.loc[country.country_code == 'SY', 'country'] = 'Syria'
+    country.loc[country.country_code == 'CZ', 'country'] = 'Czechia'
+    country.loc[country.country_code == 'LA', 'country'] = 'Laos'
+    country.loc[country.country_code == 'MD', 'country'] = 'Moldova'
+    country.loc[country.country_code == 'MK', 'country'] = 'Macedonia'
+    # country = country.append({'country': 'Namibia', 'country_code': 'NA'}, ignore_index=True)
+    country.loc[country.country_code == 'BN', 'country'] = 'Brunei'
+    country.loc[country.country_code == 'FM', 'country'] = 'Micronesia'
+    country.loc[country.country_code == 'RE', 'country'] = 'Reunion'
+    country.loc[country.country_code == 'VA', 'country'] = 'Vatican City'
+    country.loc[country.country_code == 'FK', 'country'] = 'Falkland Islands'
+
+    obs_per_country = country.groupby('country').size()
+    if (obs_per_country > 1).sum() != 0:
+        raise Exception('Country name not unique!')
+
+    return country
 
 
 def check_ccode_loccode(ccodes, unloc):
 
     unloc_check = pd.DataFrame({
-        'ccode': unloc.ccode.unique(),
+        'country_code': unloc.country_code.unique(),
         'country_city_codes': 1
     })
     # print(unloc_check.isna().sum())
-    ccode_loccode_check = ccodes.merge(unloc_check, how='outer', on='ccode')
+    ccode_loccode_check = ccodes.merge(unloc_check, how='outer', on='country_code')
 
     # ccode_loccode_check[ccode_loccode_check.isnull().any(axis=1)]
     # unloc_check[unloc_check.ccode.str.contains('NA')]
@@ -75,7 +113,167 @@ def check_ccode_loccode(ccodes, unloc):
     return 'ok'
 
 
+def filter_per_group(df, id_col, filter_col, filter_fct):
+    idx = df.groupby(id_col)[filter_col].transform(filter_fct) == df[filter_col]
+    return df[idx]
+
+
+def prepare_world_cities(world_city_file):
+    cities = pd.read_csv(world_city_file)
+    # (cities.city + '_' + cities.country).nunique() / cities.shape[0]
+
+    # cities['city_country'] = cities.city + '_' + cities.country
+
+    # rows_per_citycountry = cities.groupby('city_country').size()
+    # rows_per_citycountry[rows_per_citycountry > 1]
+
+    cities = filter_per_group(
+        df=cities,
+        id_col=['city', 'country'],
+        filter_col='population',
+        filter_fct=max)
+
+    # rename
+
+    # inspection
+    # country[country.country.str.contains('falk', case=False)]
+    # country.loc[country.ccode == 'KO']
+
+    cities.loc[cities.country == 'Korea, South', 'country'] = 'South Korea'
+    cities.loc[cities.country.str.contains('congo', case=False), 'country'] = 'Congo'
+    cities.loc[cities.country.str.contains('ivoire', case=False), 'country'] = "Côte d'Ivoire"
+    cities.loc[cities.country.str.contains('bosnia', case=False), 'country'] = "Bosnia and Herzegovina"
+    cities.loc[cities.country.str.contains('bahamas', case=False), 'country'] = "Bahamas"
+    cities.loc[cities.country.str.contains('verde', case=False), 'country'] = "Cape Verde"
+    cities.loc[cities.country.str.contains('tobago', case=False), 'country'] = "Trinidad and Tobago"
+    cities.loc[cities.country.str.contains('Gambia', case=False), 'country'] = "Gambia"
+    cities.loc[cities.country.str.contains('tome', case=False), 'country'] = "Sao Tome and Principe"
+    cities.loc[cities.country.str.contains('Grenadines', case=False), 'country'] = "Saint Vincent and the Grenadines"
+    cities.loc[cities.country.str.contains('Antigua', case=False), 'country'] = "Antigua and Barbuda"
+    cities.loc[cities.country.str.contains('Macau', case=False), 'country'] = "Macao"
+    cities.loc[cities.country.str.contains('Micronesia', case=False), 'country'] = "Micronesia"
+    cities.loc[cities.country.str.contains('Reunion', case=False), 'country'] = "Reunion"
+    cities.loc[cities.country.str.contains('Isle of Man', case=False), 'country'] = "Isle of Man"
+    cities.loc[cities.country.str.contains('falkland', case=False), 'country'] = "Falkland Islands"
+
+    return cities
+
+
+def merge_country_codes_on_cities(cities, country):
+    # check join country code in cities
+    cities_check = pd.DataFrame({
+        'country_name': cities.country.unique(),
+        'in_cities': 1
+    })
+
+    country_check = pd.DataFrame({
+        'country_name': country.country.unique(),
+        'in_countries': 1
+    })
+
+    check = cities_check.merge(country_check, how='left', on='country_name')
+    # check.in_cities.isna().sum()
+    # check.in_countries.isna().sum()
+    # check[check.in_countries.isna()]
+
+    cities = cities.merge(country, how='left', on='country')
+
+    cities_no_countrycode = cities[cities.country_code.isna()].country.unique()
+    print(cities.country.nunique())
+    print(cities.shape)
+    print(len(cities_no_countrycode))
+    if len(cities_no_countrycode) == 7:
+        cities = cities.loc[(~cities.country_code.isna()), ]
+    else:
+        raise Exception('Eliminating more countries than expected')
+    print(cities.shape)
+    print(cities.country.nunique())
+
+    print(f'\nNAs in cities:\n{cities.isna().sum()}')
+
+    # remove duplicate cities
+    cities = cities[cities.city != 'Darhan']
+    cities = cities[cities.city != 'Crato']
+
+    # check if city country unique
+    # (cities.country + cities.city).nunique()
+    obs_per_citycountry = cities.groupby(['country', 'city']).size()
+    if len(obs_per_citycountry[obs_per_citycountry > 1]) != 0:
+        raise Exception('city country not unique!')
+
+    return cities
+
+
+def merge_city_codes(cities, city_codes):
+    cities_check = cities[['country_code', 'city']].copy()
+    cities_check['in_cities'] = 1
+    cities_check = cities_check.merge(city_codes, how='left', on=['country_code', 'city'])
+
+    obs_per_citycountry = cities_check.groupby(['country_code', 'city']).size()
+    # obs_per_citycountry[obs_per_citycountry > 1]
+    print('cities_check', cities_check.shape)
+    print('cities', cities.shape)
+
+    # merge city codes
+    if len(obs_per_citycountry[obs_per_citycountry > 1]) == 0:
+        cities = cities.merge(city_codes, how='left', on=['country_code', 'city'])
+        print('cities', cities.shape)
+    else:
+        raise Exception('Cannot merge city_codes into cities. Keys ambiguous')
+
+    cities['cc_code'] = cities.country_code + '-' + cities.city_code
+
+    cities = cities[['cc_code', 'city', 'city_code', 'country', 'country_code', 'lat', 'lng', 'population']]
+
+    # create artificial city codes
+    print('creating artificial city codes')
+    cc_codes = cities[~cities.cc_code.isna()].cc_code.to_list()
+    for idx, row in cities[cities.city_code.isna()].iterrows():
+        city_name_tmp_clean = row.city.translate(str.maketrans('', '', string.punctuation)).replace(' ', '').upper()
+        city_code_tmp = city_name_tmp_clean[0:3]
+        cc_code_tmp = row.country_code + '-' + city_code_tmp
+        if cc_code_tmp not in cc_codes and len(city_name_tmp_clean) >= 3:
+            cc_codes.append(cc_code_tmp + '-X')
+            cities.loc[idx, 'cc_code'] = cc_code_tmp + '-X'
+            cities.loc[idx, 'city_code'] = city_code_tmp + '-X'
+        elif len(city_name_tmp_clean) >= 4:
+            city_code_tmp = city_name_tmp_clean[0:2] + city_name_tmp_clean[3]
+            cc_code_tmp = row.country_code + '-' + city_code_tmp
+            if cc_code_tmp not in cc_codes:
+                cc_codes.append(cc_code_tmp + '-X')
+                cities.loc[idx, 'cc_code'] = cc_code_tmp + '-X'
+                cities.loc[idx, 'city_code'] = city_code_tmp + '-X'
+
+    if cities.cc_code.isna().sum() <= 278:
+        cities = cities[~cities.cc_code.isna()]
+    else:
+        raise Exception('More missing cc_codes than expected!')
+
+    print('done')
+
+    return cities
+
+
+def enrich_cities(cities, country_codes, city_codes):
+
+    cities = merge_country_codes_on_cities(cities, country_codes)
+
+    cities = merge_city_codes(cities, city_codes)
+
+    return cities
+
+
 def clean_citytemp():
     print('clean_citytemp')
 
 
 cleaner = globals()['clean_citytemp']
 cleaner()
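
The new filter_per_group helper drives the deduplication in prepare_world_cities: groupby(...).transform broadcasts each group's aggregate back onto every row, so comparing it with the original column yields a boolean row mask. A minimal sketch with made-up data:

import pandas as pd

cities = pd.DataFrame({
    'city': ['Springfield', 'Springfield', 'Riverside'],
    'country': ['US', 'US', 'US'],
    'population': [100, 250, 50],
})

# transform(max) repeats each group's maximum on every row of that group, so the
# comparison keeps exactly the highest-population row per (city, country) pair.
idx = cities.groupby(['city', 'country'])['population'].transform(max) == cities['population']
print(cities[idx])  # Springfield (population 250) and Riverside survive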
11 changes: 9 additions & 2 deletions definitions.py
@@ -18,7 +18,7 @@
     os.mkdir(COMPUTED_DATA_PATH)
 
 # city codes
-UNLOC_PATH = joinpath(DATA_BASE_PATH, 'country_city_codes/')
+UNLOC_PATH = joinpath(RAWDATA_PATH, 'country_city_codes/')
 
 UNLOC_FILES_ABS = [
     joinpath(UNLOC_PATH, 'CodeListPart1.csv'),
@@ -29,7 +29,8 @@
 for file in UNLOC_FILES_ABS:
     if not os.path.isfile(file):
         raise Exception(f'{os.path.relpath(file)} is missing. Please download from'
-                        f' https://unece.org/trade/cefact/UNLOCODE-Download or define location.')
+                        f' https://unece.org/trade/cefact/UNLOCODE-Download or https://datahub.io/core/un-locode '
+                        f'or define location.')
 
 UNLOC_FILE_CLEAN = 'city_codes.pickle'
 UNLOC_FILE_CLEAN_ABS = joinpath(COMPUTED_DATA_PATH, UNLOC_FILE_CLEAN)
@@ -43,3 +44,9 @@

 CCODE_FILE_CLEAN_ABS = joinpath(COMPUTED_DATA_PATH, 'country_codes.pickle')
 
+# dimensions
+DIMS_BASE_PATH = joinpath(RAWDATA_PATH, 'dimensions/')
+
+
+# world cities
+WORLD_CITIES_FILE_ABS = joinpath(DIMS_BASE_PATH, 'worldcities_clean.csv')
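
Taken together, the path constants imply a raw-data layout along these lines (a sketch; the concrete directories behind RAWDATA_PATH and COMPUTED_DATA_PATH are defined earlier in definitions.py and not shown in this diff):

<RAWDATA_PATH>/
    country_city_codes/
        CodeListPart1.csv        # plus the remaining UN/LOCODE parts in UNLOC_FILES_ABS
    dimensions/
        worldcities_clean.csv    # WORLD_CITIES_FILE_ABS
<COMPUTED_DATA_PATH>/
    city_codes.pickle            # cached final cities table (UNLOC_FILE_CLEAN_ABS)
    country_codes.pickle         # CCODE_FILE_CLEAN_ABS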
