diff --git a/Gemfile b/Gemfile index a6ea715..2e70ee3 100644 --- a/Gemfile +++ b/Gemfile @@ -1,10 +1,11 @@ -source :rubygems +source 'https://rubygems.org' gem 'rake' group :test, :development do - gem 'cover_me' + gem 'simplecov', require: false gem 'awesome_print' + gem 'pry' end group :test do diff --git a/README.md b/README.md index 604db04..a6c5b25 100644 --- a/README.md +++ b/README.md @@ -5,6 +5,7 @@ and constants handling code) from [Geocommons](http://geocommons.com/)' [Geocoder::US 2.0](https://github.com/geocommons/geocoder) gem. [![Build Status](https://secure.travis-ci.org/daveworth/Indirizzo.png)](http://travis-ci.org/daveworth/Indirizzo) +[![Gem Version](https://badge.fury.io/rb/Indirizzo.png)](http://badge.fury.io/rb/Indirizzo) ## Background diff --git a/Rakefile b/Rakefile index e134955..925d9de 100644 --- a/Rakefile +++ b/Rakefile @@ -17,15 +17,3 @@ Rake::TestTask.new(:test) do |test| end task :default => :test - -namespace :cover_me do - desc "Generates and opens code coverage report." - task :report do - require 'cover_me' - CoverMe.complete! - end -end - -task :test do - Rake::Task['cover_me:report'].invoke -end diff --git a/lib/indirizzo/address.rb b/lib/indirizzo/address.rb index a24359f..89d7996 100644 --- a/lib/indirizzo/address.rb +++ b/lib/indirizzo/address.rb @@ -1,18 +1,12 @@ require 'indirizzo/constants' +require 'indirizzo/parser' +require 'indirizzo/address_hash_extractor' +require 'indirizzo/match' +require 'indirizzo/city' +require 'indirizzo/street' +require 'indirizzo/helper' module Indirizzo - # Defines the matching of parsed address tokens. - Match = { - # FIXME: shouldn't have to anchor :number and :zip at start/end - :number => /^(\d+\W|[a-z]+)?(\d+)([a-z]?)\b/io, - :street => /(?:\b(?:\d+\w*|[a-z'-]+)\s*)+/io, - :city => /(?:\b[a-z][a-z'-]+\s*)+/io, - :state => State.regexp, - :zip => /\b(\d{5})(?:-(\d{4}))?\b/o, - :at => /\s(at|@|and|&)\s/io, - :po_box => /\b[P|p]*(OST|ost)*\.*\s*[O|o|0]*(ffice|FFICE)*\.*\s*[B|b][O|o|0][X|x]\b/ - } - # The Address class takes a US street address or place name and # constructs a list of possible structured parses of the address # string. @@ -42,232 +36,35 @@ def initialize (text, options={}) # Removes any characters that aren't strictly part of an address string. def clean (value) - value.strip \ - .gsub(/[^a-z0-9 ,'&@\/-]+/io, "") \ - .gsub(/\s+/o, " ") + Helper.clean(value) end def assign_text_to_address(text) - if !text[:address].nil? - @text = clean text[:address] - parse - else - @street = [] - @prenum = text[:prenum] - @sufnum = text[:sufnum] - if !text[:street].nil? - @street = text[:street].scan(Match[:street]) - end - @number = "" - if !@street.nil? - if text[:number].nil? - @street.map! { |single_street| - @number = single_street.scan(Match[:number])[0].reject{|n| n.nil? || n.empty?}.first.to_s - single_street.sub! @number, "" - single_street.sub! /^\s*,?\s*/o, "" - } - else - @number = text[:number].to_s - end - @street = expand_streets(@street) if @options[:expand_streets] - street_parts - end - @city = [] - if !text[:city].nil? - @city.push(text[:city]) - @text = text[:city].to_s - else - @city.push("") - end - if !text[:region].nil? - # @state = [] - @state = text[:region] - if @state.length > 2 - # full_state = @state.strip # special case: New York - @state = State[@state] - end - elsif !text[:state].nil? - @state = text[:state] - elsif !text[:country].nil? - @state = text[:country] - end - - @zip = text[:postal_code] - @plus4 = text[:plus4] - if !@zip - @zip = @plus4 = "" - end - end + @text, @city, @street, @number, @prenum, @sufnum, @full_state, @state, @zip, @plus4, @country = AddressHashExtractor.extract(text, @options) end - # Expands a token into a list of possible strings based on - # the Geocoder::US::Name_Abbr constant, and expands numerals and - # number words into their possible equivalents. def expand_numbers (string) - if /\b\d+(?:st|nd|rd|th)?\b/o.match string - match = $& - num = $&.to_i - elsif Ordinals.regexp.match string - num = Ordinals[$&] - match = $& - elsif Cardinals.regexp.match string - num = Cardinals[$&] - match = $& - end - strings = [] - if num and num < 100 - [num.to_s, Ordinals[num], Cardinals[num]].each {|replace| - strings << string.sub(match, replace) - } - else - strings << string - end - strings - end - - def parse_state(regex_match, text) - idx = text.rindex(regex_match) - @full_state = @state[0].strip # special case: New York - @state = State[@full_state] - @city = "Washington" if @state == "DC" && text[idx...idx+regex_match.length] =~ /washington\s+d\.?c\.?/i - text + NumberHelper.expand_numbers(string) end def parse - text = @text.clone - - @zip = text.scan(Match[:zip]).last - if @zip - last_match = $& - zip_index = text.rindex(last_match) - zip_end_index = zip_index + last_match.length - 1 - @zip, @plus4 = @zip.map {|s| s and s.strip } - else - @zip = @plus4 = "" - zip_index = text.length - zip_end_index = -1 - end - - @country = @text[zip_end_index+1..-1].sub(/^\s*,\s*/, '').strip - @country = nil if @country == text - - @state = text.scan(Match[:state]).last - if @state - last_match = $& - state_index = text.rindex(last_match) - text = parse_state(last_match, text) - else - @full_state = "" - @state = "" - end - - @number = text.scan(Match[:number]).first - # FIXME: 230 Fish And Game Rd, Hudson NY 12534 - if @number # and not intersection? - last_match = $& - number_index = text.index(last_match) - number_end_index = number_index + last_match.length - 1 - @prenum, @number, @sufnum = @number.map {|s| s and s.strip} - else - number_end_index = -1 - @prenum = @number = @sufnum = "" - end - - # FIXME: special case: Name_Abbr gets a bit aggressive - # about replacing St with Saint. exceptional case: - # Sault Ste. Marie - - # FIXME: PO Box should geocode to ZIP - street_search_end_index = [state_index,zip_index,text.length].reject(&:nil?).min-1 - @street = text[number_end_index+1..street_search_end_index].scan(Match[:street]).map { |s| s and s.strip } - - @street = expand_streets(@street) if @options[:expand_streets] - # SPECIAL CASE: 1600 Pennsylvania 20050 - @street << @full_state if @street.empty? and @state.downcase != @full_state.downcase - - street_end_index = @street.map { |s| text.rindex(s) }.reject(&:nil?).min||0 - - if @city.nil? || @city.empty? - @city = text[street_end_index..street_search_end_index+1].scan(Match[:city]) - if !@city.empty? - #@city = [@city[-1].strip] - @city = [@city.last.strip] - add = @city.map {|item| item.gsub(Name_Abbr.regexp) {|m| Name_Abbr[m]}} - @city |= add - @city.uniq! { |s| s.downcase } - else - @city = [] - end - - # SPECIAL CASE: no city, but a state with the same name. e.g. "New York" - @city << @full_state if @state.downcase != @full_state.downcase - end - + @city, @street, @number, @prenum, @sufnum, @full_state, @state, @zip, @plus4, @country = Parser.new(@text, @options).parse end def expand_streets(street) - if !street.empty? && !street[0].nil? - street.map! {|s|s.strip} - add = street.map {|item| item.gsub(Name_Abbr.regexp) {|m| Name_Abbr[m]}} - street |= add - add = street.map {|item| item.gsub(Std_Abbr.regexp) {|m| Std_Abbr[m]}} - street |= add - street.map! {|item| expand_numbers(item)} - street.flatten! - street.uniq! { |s| s.downcase } - else - street = [] - end - street + Street.expand(street) end def street_parts - strings = [] - # Get all the substrings delimited by whitespace - @street.each {|string| - tokens = string.split(" ") - strings |= (0...tokens.length).map {|i| - (i...tokens.length).map {|j| tokens[i..j].join(" ")}}.flatten - } - strings = remove_noise_words(strings) - - # Try a simpler case of adding the @number in case everything is an abbr. - strings += [@number] if strings.all? {|s| Std_Abbr.key? s or Name_Abbr.key? s} - strings.uniq + Street.parts(@street, @number) end def remove_noise_words(strings) - # Don't return strings that consist solely of abbreviations. - # NOTE: Is this a micro-optimization that has edge cases that will break? - # Answer: Yes, it breaks on simple things like "Prairie St" or "Front St" - prefix = Regexp.new("^" + Prefix_Type.regexp.source + "\s*", Regexp::IGNORECASE) - suffix = Regexp.new("\s*" + Suffix_Type.regexp.source + "$", Regexp::IGNORECASE) - predxn = Regexp.new("^" + Directional.regexp.source + "\s*", Regexp::IGNORECASE) - sufdxn = Regexp.new("\s*" + Directional.regexp.source + "$", Regexp::IGNORECASE) - good_strings = strings.map {|s| - s = s.clone - s.gsub!(predxn, "") - s.gsub!(sufdxn, "") - s.gsub!(prefix, "") - s.gsub!(suffix, "") - s - } - good_strings.reject! {|s| s.empty?} - strings = good_strings if !good_strings.empty? {|s| not Std_Abbr.key?(s) and not Name_Abbr.key?(s)} - strings + Helper.remove_noise_words(strings) end def city_parts - strings = [] - @city.map do |string| - tokens = string.split(" ") - strings |= (0...tokens.length).to_a.reverse.map {|i| - (i...tokens.length).map {|j| tokens[i..j].join(" ")}}.flatten - end - # Don't return strings that consist solely of abbreviations. - # NOTE: Is this a micro-optimization that has edge cases that will break? - # Answer: Yes, it breaks on "Prairie" - strings.reject { |s| Std_Abbr.key?(s) }.uniq + City.city_parts(@city) end def city= (strings) diff --git a/lib/indirizzo/address_hash_extractor.rb b/lib/indirizzo/address_hash_extractor.rb new file mode 100644 index 0000000..fbe4f69 --- /dev/null +++ b/lib/indirizzo/address_hash_extractor.rb @@ -0,0 +1,84 @@ +module Indirizzo + class AddressHashExtractor + def self.extract(address_hash, options) + AddressHashExtractor.new(address_hash, options).extract + end + + def initialize(address_hash, options={}) + @address_hash = address_hash + @options = options + end + attr_accessor :address_hash + + def extract + if !address_hash[:address].nil? + @text = Helper.clean address_hash[:address] + return Parser.new(@text, @options).parse + else + handle_hash + end + + return @text, @city, @street, @number, @prenum, @sufnum, @full_state, @state, @zip, @plus4, @country + end + + private + def handle_hash + handle_street_and_numbers + handle_city + handle_state + handle_zip + end + + def handle_street_and_numbers + @street = [] + @prenum = address_hash[:prenum] + @sufnum = address_hash[:sufnum] + if !address_hash[:street].nil? + @street = address_hash[:street].scan(Match[:street]) + end + @number = "" + if !@street.nil? + if address_hash[:number].nil? + @street.map! { |single_street| + @number = single_street.scan(Match[:number])[0].reject{|n| n.nil? || n.empty?}.first.to_s + single_street.sub! @number, "" + single_street.sub! /^\s*,?\s*/o, "" + } + else + @number = address_hash[:number].to_s + end + @street = Street.expand(@street) if @options[:expand_streets] + end + end + + def handle_city + @city = [] + if !address_hash[:city].nil? + @city.push(address_hash[:city]) + @text = address_hash[:city].to_s + else + @city.push("") + end + end + + def handle_state + if !address_hash[:region].nil? + @state = address_hash[:region] + # full_state = @state.strip # special case: New York + @state = State[@state] if @state.length > 2 + elsif !address_hash[:state].nil? + @state = address_hash[:state] + elsif !address_hash[:country].nil? + @state = address_hash[:country] + end + end + + def handle_zip + @zip = address_hash[:postal_code] + @plus4 = address_hash[:plus4] + if !@zip + @zip = @plus4 = "" + end + end + end +end diff --git a/lib/indirizzo/city.rb b/lib/indirizzo/city.rb new file mode 100644 index 0000000..a70717f --- /dev/null +++ b/lib/indirizzo/city.rb @@ -0,0 +1,17 @@ +module Indirizzo + class City + def self.city_parts(city) + strings = [] + city.map do |string| + tokens = string.split(" ") + strings |= (0...tokens.length).to_a.reverse.map do |i| + (i...tokens.length).map {|j| tokens[i..j].join(" ")} + end.flatten + end + # Don't return strings that consist solely of abbreviations. + # NOTE: Is this a micro-optimization that has edge cases that will break? + # Answer: Yes, it breaks on "Prairie" + strings.reject { |s| Std_Abbr.key?(s) }.uniq + end + end +end diff --git a/lib/indirizzo/helper.rb b/lib/indirizzo/helper.rb new file mode 100644 index 0000000..7c2a447 --- /dev/null +++ b/lib/indirizzo/helper.rb @@ -0,0 +1,30 @@ +module Indirizzo + class Helper + def self.remove_noise_words(strings) + # Don't return strings that consist solely of abbreviations. + # NOTE: Is this a micro-optimization that has edge cases that will break? + # Answer: Yes, it breaks on simple things like "Prairie St" or "Front St" + prefix = Regexp.new("^" + Prefix_Type.regexp.source + "\s*", Regexp::IGNORECASE) + suffix = Regexp.new("\s*" + Suffix_Type.regexp.source + "$", Regexp::IGNORECASE) + predxn = Regexp.new("^" + Directional.regexp.source + "\s*", Regexp::IGNORECASE) + sufdxn = Regexp.new("\s*" + Directional.regexp.source + "$", Regexp::IGNORECASE) + good_strings = strings.map {|s| + s = s.clone + s.gsub!(predxn, "") + s.gsub!(sufdxn, "") + s.gsub!(prefix, "") + s.gsub!(suffix, "") + s + } + good_strings.reject! {|s| s.empty?} + strings = good_strings if !good_strings.empty? {|s| not Std_Abbr.key?(s) and not Name_Abbr.key?(s)} + strings + end + + def self.clean(value) + value.strip \ + .gsub(/[^a-z0-9 ,'&@\/-]+/io, "") \ + .gsub(/\s+/o, " ") + end + end +end diff --git a/lib/indirizzo/match.rb b/lib/indirizzo/match.rb new file mode 100644 index 0000000..fc11c2b --- /dev/null +++ b/lib/indirizzo/match.rb @@ -0,0 +1,13 @@ +module Indirizzo + # Defines the matching of parsed address tokens. + Match = { + # FIXME: shouldn't have to anchor :number and :zip at start/end + :number => /^(\d+\W|[a-z]+)?(\d+)([a-z]?)\b/io, + :street => /(?:\b(?:\d+\w*|[a-z'-]+)\s*)+/io, + :city => /(?:\b[a-z][a-z'-]+\s*)+/io, + :state => State.regexp, + :zip => /\b(\d{5})(?:-(\d{4}))?\b/o, + :at => /\s(at|@|and|&)\s/io, + :po_box => /\b[P|p]*(OST|ost)*\.*\s*[O|o|0]*(ffice|FFICE)*\.*\s*[B|b][O|o|0][X|x]\b/ + } +end diff --git a/lib/indirizzo/number_helper.rb b/lib/indirizzo/number_helper.rb new file mode 100644 index 0000000..dc61886 --- /dev/null +++ b/lib/indirizzo/number_helper.rb @@ -0,0 +1,29 @@ +module Indirizzo + class NumberHelper + # Expands a token into a list of possible strings based on + # the Geocoder::US::Name_Abbr constant, and expands numerals and + # number words into their possible equivalents. + def self.expand_numbers (string) + if /\b\d+(?:st|nd|rd|th)?\b/o.match string + match = $& + num = $&.to_i + elsif Ordinals.regexp.match string + num = Ordinals[$&] + match = $& + elsif Cardinals.regexp.match string + num = Cardinals[$&] + match = $& + end + strings = [] + if num and num < 100 + [num.to_s, Ordinals[num], Cardinals[num]].each {|replace| + strings << string.sub(match, replace) + } + else + strings << string + end + strings + end + end +end + diff --git a/lib/indirizzo/parser.rb b/lib/indirizzo/parser.rb new file mode 100644 index 0000000..95b0d1e --- /dev/null +++ b/lib/indirizzo/parser.rb @@ -0,0 +1,110 @@ +require 'indirizzo/match' +require 'indirizzo/helper' +require 'indirizzo/street' +require 'indirizzo/constants' + +module Indirizzo + class Parser + def initialize(text, options={}) + @text = text + @options = options + end + + def parse + text = @text.clone + + @zip, @plus4, zip_index, zip_end_index = extract_zip_from_text(text) + + @country = @text[zip_end_index+1..-1].sub(/^\s*,\s*/, '').strip + @country = nil if @country == text + + @state, @full_state, @city, state_index = extract_state_from_text(text) + + @prenum, @number, @sufnum, number_end_index = process_number(text) + + # FIXME: special case: Name_Abbr gets a bit aggressive + # about replacing St with Saint. exceptional case: + # Sault Ste. Marie + + # FIXME: PO Box should geocode to ZIP + street_search_end_index = [state_index,zip_index,text.length].reject(&:nil?).min-1 + @street = text[number_end_index+1..street_search_end_index].scan(Match[:street]).map { |s| s and s.strip } + + @street = Street.expand(@street) if @options[:expand_streets] + # SPECIAL CASE: 1600 Pennsylvania 20050 + @street << @full_state if @street.empty? and @state.downcase != @full_state.downcase + + street_end_index = @street.map { |s| text.rindex(s) }.reject(&:nil?).min||0 + + process_city(text, street_end_index, street_search_end_index) + + return @city, @street, @number, @prenum, @sufnum, @full_state, @state, @zip, @plus4, @country + end + + private + + def extract_zip_from_text(text) + zip = text.scan(Match[:zip]).last + if zip + last_match = $& + zip_index = text.rindex(last_match) + zip_end_index = zip_index + last_match.length - 1 + zip, plus4 = zip.map {|s| s and s.strip } + else + zip = plus4 = "" + zip_index = text.length + zip_end_index = -1 + end + return zip, plus4, zip_index, zip_end_index + end + + def extract_state_from_text(text) + state = text.scan(Match[:state]).last + if state + last_match = $& + state_index = text.rindex(last_match) + idx = text.rindex(last_match) + full_state = state[0].strip # special case: New York + state = State[full_state] + city = "Washington" if state == "DC" && text[idx...idx+last_match.length] =~ /washington\s+d\.?c\.?/i + else + full_state = "" + state = "" + end + return state, full_state, city, state_index + end + + def process_city(text, street_end_index, street_search_end_index) + if @city.nil? || @city.empty? + @city = text[street_end_index..street_search_end_index+1].scan(Match[:city]) + if !@city.empty? + #@city = [@city[-1].strip] + @city = [@city.last.strip] + add = @city.map {|item| item.gsub(Name_Abbr.regexp) {|m| Name_Abbr[m]}} + @city |= add + @city.uniq! {|s| s.downcase} + else + @city = [] + end + + # SPECIAL CASE: no city, but a state with the same name. e.g. "New York" + @city << @full_state if @state.downcase != @full_state.downcase + end + end + + def process_number(text) + number = text.scan(Match[:number]).first + # FIXME: 230 Fish And Game Rd, Hudson NY 12534 + if number # and not intersection? + last_match = $& + number_index = text.index(last_match) + number_end_index = number_index + last_match.length - 1 + prenum, number, sufnum = number.map {|s| s and s.strip} + else + number_end_index = -1 + prenum = number = sufnum = "" + end + return prenum, number, sufnum, number_end_index + end + end +end diff --git a/lib/indirizzo/street.rb b/lib/indirizzo/street.rb new file mode 100644 index 0000000..22f49a6 --- /dev/null +++ b/lib/indirizzo/street.rb @@ -0,0 +1,39 @@ +require 'indirizzo/constants' +require 'indirizzo/number_helper' +require 'indirizzo/helper' + +module Indirizzo + class Street + def self.expand(street) + if !street.empty? && !street[0].nil? + street.map! {|s|s.strip} + add = street.map {|item| item.gsub(Name_Abbr.regexp) {|m| Name_Abbr[m]}} + street |= add + add = street.map {|item| item.gsub(Std_Abbr.regexp) {|m| Std_Abbr[m]}} + street |= add + street.map! {|item| NumberHelper.expand_numbers(item)} + street.flatten! + street.uniq! {|s| s.downcase} + else + street = [] + end + street + end + + def self.parts(street, number) + strings = [] + # Get all the substrings delimited by whitespace + street.each do |string| + tokens = string.split(" ") + strings |= (0...tokens.length).map do |i| + (i...tokens.length).map {|j| tokens[i..j].join(" ")} + end.flatten + end + strings = Helper.remove_noise_words(strings) + + # Try a simpler case of adding the @number in case everything is an abbr. + strings += [number] if strings.all? {|s| Std_Abbr.key?(s) || Name_Abbr.key?(s)} + strings.uniq + end + end +end diff --git a/test/test_address.rb b/test/test_address.rb index 68b5a64..3032e3a 100644 --- a/test/test_address.rb +++ b/test/test_address.rb @@ -21,14 +21,6 @@ def test_doesnt_downcase_street assert_equal "Pennsylvania Av", addr.street.first end - def test_expand_numbers - num_list = ["5", "fifth", "five"] - num_list.each {|n| - addr = Address.new(n) - assert_equal num_list, addr.expand_numbers(n).to_a.sort - } - end - def test_expand_street addr = Address.new("1 First St, Atlanta GA, 12345") expected_streets = ["1 st", "first st", "one st"] diff --git a/test/test_helper.rb b/test/test_helper.rb index 683618e..8f40974 100644 --- a/test/test_helper.rb +++ b/test/test_helper.rb @@ -1,2 +1,3 @@ -require 'cover_me' +require 'simplecov' +SimpleCov.start require 'test/unit' diff --git a/test/test_number_helper.rb b/test/test_number_helper.rb new file mode 100644 index 0000000..e2fb268 --- /dev/null +++ b/test/test_number_helper.rb @@ -0,0 +1,10 @@ +require 'test_helper' +include Indirizzo +class TestNumberHelper < Test::Unit::TestCase + def test_expand_numbers + num_list = ["5", "fifth", "five"] + num_list.each {|n| + assert_equal num_list, NumberHelper.expand_numbers(n).to_a.sort + } + end +end