Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions Gemfile
Original file line number Diff line number Diff line change
@@ -1,10 +1,11 @@
source :rubygems
source 'https://rubygems.org'

gem 'rake'

group :test, :development do
gem 'cover_me'
gem 'simplecov', require: false
gem 'awesome_print'
gem 'pry'
end

group :test do
Expand Down
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ and constants handling code) from [Geocommons](http://geocommons.com/)'
[Geocoder::US 2.0](https://github.com/geocommons/geocoder) gem.

[![Build Status](https://secure.travis-ci.org/daveworth/Indirizzo.png)](http://travis-ci.org/daveworth/Indirizzo)
[![Gem Version](https://badge.fury.io/rb/Indirizzo.png)](http://badge.fury.io/rb/Indirizzo)

## Background

Expand Down
12 changes: 0 additions & 12 deletions Rakefile
Original file line number Diff line number Diff line change
Expand Up @@ -17,15 +17,3 @@ Rake::TestTask.new(:test) do |test|
end

task :default => :test

namespace :cover_me do
desc "Generates and opens code coverage report."
task :report do
require 'cover_me'
CoverMe.complete!
end
end

task :test do
Rake::Task['cover_me:report'].invoke
end
231 changes: 14 additions & 217 deletions lib/indirizzo/address.rb
Original file line number Diff line number Diff line change
@@ -1,18 +1,12 @@
require 'indirizzo/constants'
require 'indirizzo/parser'
require 'indirizzo/address_hash_extractor'
require 'indirizzo/match'
require 'indirizzo/city'
require 'indirizzo/street'
require 'indirizzo/helper'

module Indirizzo
# Defines the matching of parsed address tokens.
Match = {
# FIXME: shouldn't have to anchor :number and :zip at start/end
:number => /^(\d+\W|[a-z]+)?(\d+)([a-z]?)\b/io,
:street => /(?:\b(?:\d+\w*|[a-z'-]+)\s*)+/io,
:city => /(?:\b[a-z][a-z'-]+\s*)+/io,
:state => State.regexp,
:zip => /\b(\d{5})(?:-(\d{4}))?\b/o,
:at => /\s(at|@|and|&)\s/io,
:po_box => /\b[P|p]*(OST|ost)*\.*\s*[O|o|0]*(ffice|FFICE)*\.*\s*[B|b][O|o|0][X|x]\b/
}

# The Address class takes a US street address or place name and
# constructs a list of possible structured parses of the address
# string.
Expand Down Expand Up @@ -42,232 +36,35 @@ def initialize (text, options={})

# Removes any characters that aren't strictly part of an address string.
def clean (value)
value.strip \
.gsub(/[^a-z0-9 ,'&@\/-]+/io, "") \
.gsub(/\s+/o, " ")
Helper.clean(value)
end

def assign_text_to_address(text)
if !text[:address].nil?
@text = clean text[:address]
parse
else
@street = []
@prenum = text[:prenum]
@sufnum = text[:sufnum]
if !text[:street].nil?
@street = text[:street].scan(Match[:street])
end
@number = ""
if !@street.nil?
if text[:number].nil?
@street.map! { |single_street|
@number = single_street.scan(Match[:number])[0].reject{|n| n.nil? || n.empty?}.first.to_s
single_street.sub! @number, ""
single_street.sub! /^\s*,?\s*/o, ""
}
else
@number = text[:number].to_s
end
@street = expand_streets(@street) if @options[:expand_streets]
street_parts
end
@city = []
if !text[:city].nil?
@city.push(text[:city])
@text = text[:city].to_s
else
@city.push("")
end
if !text[:region].nil?
# @state = []
@state = text[:region]
if @state.length > 2
# full_state = @state.strip # special case: New York
@state = State[@state]
end
elsif !text[:state].nil?
@state = text[:state]
elsif !text[:country].nil?
@state = text[:country]
end

@zip = text[:postal_code]
@plus4 = text[:plus4]
if !@zip
@zip = @plus4 = ""
end
end
@text, @city, @street, @number, @prenum, @sufnum, @full_state, @state, @zip, @plus4, @country = AddressHashExtractor.extract(text, @options)
end

# Expands a token into a list of possible strings based on
# the Geocoder::US::Name_Abbr constant, and expands numerals and
# number words into their possible equivalents.
def expand_numbers (string)
if /\b\d+(?:st|nd|rd|th)?\b/o.match string
match = $&
num = $&.to_i
elsif Ordinals.regexp.match string
num = Ordinals[$&]
match = $&
elsif Cardinals.regexp.match string
num = Cardinals[$&]
match = $&
end
strings = []
if num and num < 100
[num.to_s, Ordinals[num], Cardinals[num]].each {|replace|
strings << string.sub(match, replace)
}
else
strings << string
end
strings
end

def parse_state(regex_match, text)
idx = text.rindex(regex_match)
@full_state = @state[0].strip # special case: New York
@state = State[@full_state]
@city = "Washington" if @state == "DC" && text[idx...idx+regex_match.length] =~ /washington\s+d\.?c\.?/i
text
NumberHelper.expand_numbers(string)
end

def parse
text = @text.clone

@zip = text.scan(Match[:zip]).last
if @zip
last_match = $&
zip_index = text.rindex(last_match)
zip_end_index = zip_index + last_match.length - 1
@zip, @plus4 = @zip.map {|s| s and s.strip }
else
@zip = @plus4 = ""
zip_index = text.length
zip_end_index = -1
end

@country = @text[zip_end_index+1..-1].sub(/^\s*,\s*/, '').strip
@country = nil if @country == text

@state = text.scan(Match[:state]).last
if @state
last_match = $&
state_index = text.rindex(last_match)
text = parse_state(last_match, text)
else
@full_state = ""
@state = ""
end

@number = text.scan(Match[:number]).first
# FIXME: 230 Fish And Game Rd, Hudson NY 12534
if @number # and not intersection?
last_match = $&
number_index = text.index(last_match)
number_end_index = number_index + last_match.length - 1
@prenum, @number, @sufnum = @number.map {|s| s and s.strip}
else
number_end_index = -1
@prenum = @number = @sufnum = ""
end

# FIXME: special case: Name_Abbr gets a bit aggressive
# about replacing St with Saint. exceptional case:
# Sault Ste. Marie

# FIXME: PO Box should geocode to ZIP
street_search_end_index = [state_index,zip_index,text.length].reject(&:nil?).min-1
@street = text[number_end_index+1..street_search_end_index].scan(Match[:street]).map { |s| s and s.strip }

@street = expand_streets(@street) if @options[:expand_streets]
# SPECIAL CASE: 1600 Pennsylvania 20050
@street << @full_state if @street.empty? and @state.downcase != @full_state.downcase

street_end_index = @street.map { |s| text.rindex(s) }.reject(&:nil?).min||0

if @city.nil? || @city.empty?
@city = text[street_end_index..street_search_end_index+1].scan(Match[:city])
if !@city.empty?
#@city = [@city[-1].strip]
@city = [@city.last.strip]
add = @city.map {|item| item.gsub(Name_Abbr.regexp) {|m| Name_Abbr[m]}}
@city |= add
@city.uniq! { |s| s.downcase }
else
@city = []
end

# SPECIAL CASE: no city, but a state with the same name. e.g. "New York"
@city << @full_state if @state.downcase != @full_state.downcase
end

@city, @street, @number, @prenum, @sufnum, @full_state, @state, @zip, @plus4, @country = Parser.new(@text, @options).parse
end

def expand_streets(street)
if !street.empty? && !street[0].nil?
street.map! {|s|s.strip}
add = street.map {|item| item.gsub(Name_Abbr.regexp) {|m| Name_Abbr[m]}}
street |= add
add = street.map {|item| item.gsub(Std_Abbr.regexp) {|m| Std_Abbr[m]}}
street |= add
street.map! {|item| expand_numbers(item)}
street.flatten!
street.uniq! { |s| s.downcase }
else
street = []
end
street
Street.expand(street)
end

def street_parts
strings = []
# Get all the substrings delimited by whitespace
@street.each {|string|
tokens = string.split(" ")
strings |= (0...tokens.length).map {|i|
(i...tokens.length).map {|j| tokens[i..j].join(" ")}}.flatten
}
strings = remove_noise_words(strings)

# Try a simpler case of adding the @number in case everything is an abbr.
strings += [@number] if strings.all? {|s| Std_Abbr.key? s or Name_Abbr.key? s}
strings.uniq
Street.parts(@street, @number)
end

def remove_noise_words(strings)
# Don't return strings that consist solely of abbreviations.
# NOTE: Is this a micro-optimization that has edge cases that will break?
# Answer: Yes, it breaks on simple things like "Prairie St" or "Front St"
prefix = Regexp.new("^" + Prefix_Type.regexp.source + "\s*", Regexp::IGNORECASE)
suffix = Regexp.new("\s*" + Suffix_Type.regexp.source + "$", Regexp::IGNORECASE)
predxn = Regexp.new("^" + Directional.regexp.source + "\s*", Regexp::IGNORECASE)
sufdxn = Regexp.new("\s*" + Directional.regexp.source + "$", Regexp::IGNORECASE)
good_strings = strings.map {|s|
s = s.clone
s.gsub!(predxn, "")
s.gsub!(sufdxn, "")
s.gsub!(prefix, "")
s.gsub!(suffix, "")
s
}
good_strings.reject! {|s| s.empty?}
strings = good_strings if !good_strings.empty? {|s| not Std_Abbr.key?(s) and not Name_Abbr.key?(s)}
strings
Helper.remove_noise_words(strings)
end

def city_parts
strings = []
@city.map do |string|
tokens = string.split(" ")
strings |= (0...tokens.length).to_a.reverse.map {|i|
(i...tokens.length).map {|j| tokens[i..j].join(" ")}}.flatten
end
# Don't return strings that consist solely of abbreviations.
# NOTE: Is this a micro-optimization that has edge cases that will break?
# Answer: Yes, it breaks on "Prairie"
strings.reject { |s| Std_Abbr.key?(s) }.uniq
City.city_parts(@city)
end

def city= (strings)
Expand Down
84 changes: 84 additions & 0 deletions lib/indirizzo/address_hash_extractor.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
module Indirizzo
  # Converts an address supplied as a Hash into the ordered list of fields
  # that the caller destructures:
  #
  #   text, city, street, number, prenum, sufnum,
  #   full_state, state, zip, plus4, country
  #
  # Two input shapes are supported:
  # * a hash with a free-form :address string, which is cleaned with
  #   Helper.clean and handed to Parser;
  # * a hash of discrete components (:street, :number, :prenum, :sufnum,
  #   :city, :region / :state / :country, :postal_code, :plus4), which are
  #   copied into the corresponding fields.
  class AddressHashExtractor
    # Builds an extractor for +address_hash+ and returns its #extract result.
    def self.extract(address_hash, options)
      new(address_hash, options).extract
    end

    # address_hash:: Hash keyed by symbols (see class comment for the keys read).
    # options::      Hash; only :expand_streets is consulted here, the whole
    #                hash is forwarded to Parser on the :address path.
    def initialize(address_hash, options={})
      @address_hash = address_hash
      @options = options
    end
    attr_accessor :address_hash

    # Returns an 11-element Array: text, city, street, number, prenum, sufnum,
    # full_state, state, zip, plus4, country.
    #
    # Fields that the component-hash path never assigns (full_state, country,
    # and text when no :city is given) are nil.
    def extract
      unless address_hash[:address].nil?
        @text = Helper.clean(address_hash[:address])
        # NOTE(review): Parser#parse is assumed to return the ten fields from
        # city through country *without* the text; the cleaned text is
        # prepended so both paths return the same 11-field layout. Confirm
        # against parser.rb.
        return [@text, *Parser.new(@text, @options).parse]
      end

      handle_hash
      [@text, @city, @street, @number, @prenum, @sufnum, @full_state, @state, @zip, @plus4, @country]
    end

    private

    # Populates every field from the discrete components of the hash.
    def handle_hash
      handle_street_and_numbers
      handle_city
      handle_state
      handle_zip
    end

    # Sets @street (Array of strings), @number, @prenum and @sufnum.
    # When no explicit :number is given, the number is pulled off the front of
    # each street string instead.
    def handle_street_and_numbers
      @prenum = address_hash[:prenum]
      @sufnum = address_hash[:sufnum]
      street_text = address_hash[:street]
      @street = street_text.nil? ? [] : street_text.scan(Match[:street])
      @number = ""

      if address_hash[:number].nil?
        @street.map! { |single_street| split_number_from(single_street) }
      else
        @number = address_hash[:number].to_s
      end

      @street = Street.expand(@street) if @options[:expand_streets]
    end

    # Records the leading number of +single_street+ in @number, strips it (and
    # any separating comma/whitespace) from the string in place, and returns
    # the string. A street without a number leaves @number as "" instead of
    # raising on the missing match.
    def split_number_from(single_street)
      captures = single_street.scan(Match[:number]).first || []
      @number = captures.reject { |n| n.nil? || n.empty? }.first.to_s
      single_street.sub!(@number, "")
      single_street.sub!(/^\s*,?\s*/o, "")
      single_street
    end

    # Sets @city to a one-element Array; when :city is present it also becomes
    # @text, otherwise the single element is "".
    def handle_city
      city = address_hash[:city]
      if city.nil?
        @city = [""]
      else
        @city = [city]
        @text = city.to_s
      end
    end

    # Sets @state from the first of :region, :state, :country that is present.
    # A :region longer than two characters is looked up through State.
    def handle_state
      if !address_hash[:region].nil?
        @state = address_hash[:region]
        @state = State[@state] if @state.length > 2
      elsif !address_hash[:state].nil?
        @state = address_hash[:state]
      elsif !address_hash[:country].nil?
        @state = address_hash[:country]
      end
    end

    # Sets @zip and @plus4; both become "" when no :postal_code is given.
    def handle_zip
      @zip = address_hash[:postal_code]
      @plus4 = address_hash[:plus4]
      @zip = @plus4 = "" unless @zip
    end
  end
end
Loading