From 61b2e53ae32fff0536d9f79c1261f55c270f27a8 Mon Sep 17 00:00:00 2001 From: Dave Worth Date: Sun, 20 Jan 2013 10:16:18 -0500 Subject: [PATCH 01/22] Update README.md add gem version badge for funsies --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 604db04..a6c5b25 100644 --- a/README.md +++ b/README.md @@ -5,6 +5,7 @@ and constants handling code) from [Geocommons](http://geocommons.com/)' [Geocoder::US 2.0](https://github.com/geocommons/geocoder) gem. [![Build Status](https://secure.travis-ci.org/daveworth/Indirizzo.png)](http://travis-ci.org/daveworth/Indirizzo) +[![Gem Version](https://badge.fury.io/rb/Indirizzo.png)](http://badge.fury.io/rb/Indirizzo) ## Background From 10b06a15a57dda66df7a3a73e513cbf96ea1286c Mon Sep 17 00:00:00 2001 From: Dave Worth Date: Sat, 4 May 2013 23:07:17 -0400 Subject: [PATCH 02/22] update rubygems in Gemfile --- Gemfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Gemfile b/Gemfile index a6ea715..01fb6c2 100644 --- a/Gemfile +++ b/Gemfile @@ -1,4 +1,4 @@ -source :rubygems +source 'https://rubygems.org' gem 'rake' From 713d984cc9afe734dc81f5013e4c92133f3a00ae Mon Sep 17 00:00:00 2001 From: Dave Worth Date: Sat, 4 May 2013 23:07:33 -0400 Subject: [PATCH 03/22] s/cover_me/simplecov/ --- Gemfile | 2 +- Rakefile | 12 ------------ test/test_helper.rb | 3 ++- 3 files changed, 3 insertions(+), 14 deletions(-) diff --git a/Gemfile b/Gemfile index 01fb6c2..a16bb93 100644 --- a/Gemfile +++ b/Gemfile @@ -3,7 +3,7 @@ source 'https://rubygems.org' gem 'rake' group :test, :development do - gem 'cover_me' + gem 'simplecov', require: false gem 'awesome_print' end diff --git a/Rakefile b/Rakefile index e134955..925d9de 100644 --- a/Rakefile +++ b/Rakefile @@ -17,15 +17,3 @@ Rake::TestTask.new(:test) do |test| end task :default => :test - -namespace :cover_me do - desc "Generates and opens code coverage report." - task :report do - require 'cover_me' - CoverMe.complete! - end -end - -task :test do - Rake::Task['cover_me:report'].invoke -end diff --git a/test/test_helper.rb b/test/test_helper.rb index 683618e..8f40974 100644 --- a/test/test_helper.rb +++ b/test/test_helper.rb @@ -1,2 +1,3 @@ -require 'cover_me' +require 'simplecov' +SimpleCov.start require 'test/unit' From 1d2fa70b824ef904d746e5b4a5134c79d699de3b Mon Sep 17 00:00:00 2001 From: Dave Worth Date: Sat, 4 May 2013 23:07:38 -0400 Subject: [PATCH 04/22] add pry --- Gemfile | 1 + 1 file changed, 1 insertion(+) diff --git a/Gemfile b/Gemfile index a16bb93..2e70ee3 100644 --- a/Gemfile +++ b/Gemfile @@ -5,6 +5,7 @@ gem 'rake' group :test, :development do gem 'simplecov', require: false gem 'awesome_print' + gem 'pry' end group :test do From 1352dafc0b1d045cbc5e99a264c0014b1f3c3df0 Mon Sep 17 00:00:00 2001 From: Dave Worth Date: Sat, 4 May 2013 23:09:21 -0400 Subject: [PATCH 05/22] extract NumberHelper --- lib/indirizzo/address.rb | 24 ++---------------------- lib/indirizzo/number_helper.rb | 29 +++++++++++++++++++++++++++++ test/test_address.rb | 8 -------- test/test_number_helper.rb | 10 ++++++++++ 4 files changed, 41 insertions(+), 30 deletions(-) create mode 100644 lib/indirizzo/number_helper.rb create mode 100644 test/test_number_helper.rb diff --git a/lib/indirizzo/address.rb b/lib/indirizzo/address.rb index 9889f5d..77874a6 100644 --- a/lib/indirizzo/address.rb +++ b/lib/indirizzo/address.rb @@ -1,4 +1,5 @@ require 'indirizzo/constants' +require 'indirizzo/number_helper' module Indirizzo # Defines the matching of parsed address tokens. @@ -101,29 +102,8 @@ def assign_text_to_address(text) end end - # Expands a token into a list of possible strings based on - # the Geocoder::US::Name_Abbr constant, and expands numerals and - # number words into their possible equivalents. def expand_numbers (string) - if /\b\d+(?:st|nd|rd|th)?\b/o.match string - match = $& - num = $&.to_i - elsif Ordinals.regexp.match string - num = Ordinals[$&] - match = $& - elsif Cardinals.regexp.match string - num = Cardinals[$&] - match = $& - end - strings = [] - if num and num < 100 - [num.to_s, Ordinals[num], Cardinals[num]].each {|replace| - strings << string.sub(match, replace) - } - else - strings << string - end - strings + NumberHelper.expand_numbers(string) end def parse_state(regex_match, text) diff --git a/lib/indirizzo/number_helper.rb b/lib/indirizzo/number_helper.rb new file mode 100644 index 0000000..dc61886 --- /dev/null +++ b/lib/indirizzo/number_helper.rb @@ -0,0 +1,29 @@ +module Indirizzo + class NumberHelper + # Expands a token into a list of possible strings based on + # the Geocoder::US::Name_Abbr constant, and expands numerals and + # number words into their possible equivalents. + def self.expand_numbers (string) + if /\b\d+(?:st|nd|rd|th)?\b/o.match string + match = $& + num = $&.to_i + elsif Ordinals.regexp.match string + num = Ordinals[$&] + match = $& + elsif Cardinals.regexp.match string + num = Cardinals[$&] + match = $& + end + strings = [] + if num and num < 100 + [num.to_s, Ordinals[num], Cardinals[num]].each {|replace| + strings << string.sub(match, replace) + } + else + strings << string + end + strings + end + end +end + diff --git a/test/test_address.rb b/test/test_address.rb index ccef9e3..0b771cd 100644 --- a/test/test_address.rb +++ b/test/test_address.rb @@ -16,14 +16,6 @@ def test_new assert_equal "1600 Pennsylvania Av, Washington DC", addr.text end - def test_expand_numbers - num_list = ["5", "fifth", "five"] - num_list.each {|n| - addr = Address.new(n) - assert_equal num_list, addr.expand_numbers(n).to_a.sort - } - end - def test_expand_street addr = Address.new("1 First St, Atlanta GA, 12345") expected_streets = ["1 st", "first st", "one st"] diff --git a/test/test_number_helper.rb b/test/test_number_helper.rb new file mode 100644 index 0000000..e2fb268 --- /dev/null +++ b/test/test_number_helper.rb @@ -0,0 +1,10 @@ +require 'test_helper' +include Indirizzo +class TestNumberHelper < Test::Unit::TestCase + def test_expand_numbers + num_list = ["5", "fifth", "five"] + num_list.each {|n| + assert_equal num_list, NumberHelper.expand_numbers(n).to_a.sort + } + end +end From 3636c08e8ab359e712c550a071de8e73b2c75354 Mon Sep 17 00:00:00 2001 From: Dave Worth Date: Sat, 4 May 2013 23:13:23 -0400 Subject: [PATCH 06/22] extract city class for #city_parts --- lib/indirizzo/address.rb | 11 +---------- lib/indirizzo/city.rb | 17 +++++++++++++++++ 2 files changed, 18 insertions(+), 10 deletions(-) create mode 100644 lib/indirizzo/city.rb diff --git a/lib/indirizzo/address.rb b/lib/indirizzo/address.rb index 77874a6..30fb016 100644 --- a/lib/indirizzo/address.rb +++ b/lib/indirizzo/address.rb @@ -241,16 +241,7 @@ def remove_noise_words(strings) end def city_parts - strings = [] - @city.map do |string| - tokens = string.split(" ") - strings |= (0...tokens.length).to_a.reverse.map {|i| - (i...tokens.length).map {|j| tokens[i..j].join(" ")}}.flatten - end - # Don't return strings that consist solely of abbreviations. - # NOTE: Is this a micro-optimization that has edge cases that will break? - # Answer: Yes, it breaks on "Prairie" - strings.reject { |s| Std_Abbr.key?(s) }.uniq + City.city_parts(@city) end def city= (strings) diff --git a/lib/indirizzo/city.rb b/lib/indirizzo/city.rb new file mode 100644 index 0000000..a70717f --- /dev/null +++ b/lib/indirizzo/city.rb @@ -0,0 +1,17 @@ +module Indirizzo + class City + def self.city_parts(city) + strings = [] + city.map do |string| + tokens = string.split(" ") + strings |= (0...tokens.length).to_a.reverse.map do |i| + (i...tokens.length).map {|j| tokens[i..j].join(" ")} + end.flatten + end + # Don't return strings that consist solely of abbreviations. + # NOTE: Is this a micro-optimization that has edge cases that will break? + # Answer: Yes, it breaks on "Prairie" + strings.reject { |s| Std_Abbr.key?(s) }.uniq + end + end +end From 8061535fa401df4cc21287d1ec84783dacfee17f Mon Sep 17 00:00:00 2001 From: Dave Worth Date: Sat, 4 May 2013 23:15:50 -0400 Subject: [PATCH 07/22] extract Match regexps --- lib/indirizzo/address.rb | 13 +------------ lib/indirizzo/match.rb | 13 +++++++++++++ 2 files changed, 14 insertions(+), 12 deletions(-) create mode 100644 lib/indirizzo/match.rb diff --git a/lib/indirizzo/address.rb b/lib/indirizzo/address.rb index 30fb016..4c4cb73 100644 --- a/lib/indirizzo/address.rb +++ b/lib/indirizzo/address.rb @@ -1,19 +1,8 @@ require 'indirizzo/constants' require 'indirizzo/number_helper' +require 'indirizzo/match' module Indirizzo - # Defines the matching of parsed address tokens. - Match = { - # FIXME: shouldn't have to anchor :number and :zip at start/end - :number => /^(\d+\W|[a-z]+)?(\d+)([a-z]?)\b/io, - :street => /(?:\b(?:\d+\w*|[a-z'-]+)\s*)+/io, - :city => /(?:\b[a-z][a-z'-]+\s*)+/io, - :state => State.regexp, - :zip => /\b(\d{5})(?:-(\d{4}))?\b/o, - :at => /\s(at|@|and|&)\s/io, - :po_box => /\b[P|p]*(OST|ost)*\.*\s*[O|o|0]*(ffice|FFICE)*\.*\s*[B|b][O|o|0][X|x]\b/ - } - # The Address class takes a US street address or place name and # constructs a list of possible structured parses of the address # string. diff --git a/lib/indirizzo/match.rb b/lib/indirizzo/match.rb new file mode 100644 index 0000000..fc11c2b --- /dev/null +++ b/lib/indirizzo/match.rb @@ -0,0 +1,13 @@ +module Indirizzo + # Defines the matching of parsed address tokens. + Match = { + # FIXME: shouldn't have to anchor :number and :zip at start/end + :number => /^(\d+\W|[a-z]+)?(\d+)([a-z]?)\b/io, + :street => /(?:\b(?:\d+\w*|[a-z'-]+)\s*)+/io, + :city => /(?:\b[a-z][a-z'-]+\s*)+/io, + :state => State.regexp, + :zip => /\b(\d{5})(?:-(\d{4}))?\b/o, + :at => /\s(at|@|and|&)\s/io, + :po_box => /\b[P|p]*(OST|ost)*\.*\s*[O|o|0]*(ffice|FFICE)*\.*\s*[B|b][O|o|0][X|x]\b/ + } +end From 6f61158b719de0bee46ce4175b14b810ec5b2355 Mon Sep 17 00:00:00 2001 From: Dave Worth Date: Sat, 4 May 2013 23:23:47 -0400 Subject: [PATCH 08/22] extract Street class and associated methods --- lib/indirizzo/address.rb | 50 ++++------------------------------------ lib/indirizzo/helper.rb | 24 +++++++++++++++++++ lib/indirizzo/street.rb | 40 ++++++++++++++++++++++++++++++++ 3 files changed, 69 insertions(+), 45 deletions(-) create mode 100644 lib/indirizzo/helper.rb create mode 100644 lib/indirizzo/street.rb diff --git a/lib/indirizzo/address.rb b/lib/indirizzo/address.rb index 4c4cb73..49f73c4 100644 --- a/lib/indirizzo/address.rb +++ b/lib/indirizzo/address.rb @@ -1,6 +1,7 @@ require 'indirizzo/constants' -require 'indirizzo/number_helper' require 'indirizzo/match' +require 'indirizzo/city' +require 'indirizzo/street' module Indirizzo # The Address class takes a US street address or place name and @@ -177,56 +178,15 @@ def parse end def expand_streets(street) - if !street.empty? && !street[0].nil? - street.map! {|s|s.strip} - add = street.map {|item| item.gsub(Name_Abbr.regexp) {|m| Name_Abbr[m]}} - street |= add - add = street.map {|item| item.gsub(Std_Abbr.regexp) {|m| Std_Abbr[m]}} - street |= add - street.map! {|item| expand_numbers(item)} - street.flatten! - street.map! {|s| s.downcase} - street.uniq! - else - street = [] - end - street + Street.expand(street) end def street_parts - strings = [] - # Get all the substrings delimited by whitespace - @street.each {|string| - tokens = string.split(" ") - strings |= (0...tokens.length).map {|i| - (i...tokens.length).map {|j| tokens[i..j].join(" ")}}.flatten - } - strings = remove_noise_words(strings) - - # Try a simpler case of adding the @number in case everything is an abbr. - strings += [@number] if strings.all? {|s| Std_Abbr.key? s or Name_Abbr.key? s} - strings.uniq + Street.parts(@street, @number) end def remove_noise_words(strings) - # Don't return strings that consist solely of abbreviations. - # NOTE: Is this a micro-optimization that has edge cases that will break? - # Answer: Yes, it breaks on simple things like "Prairie St" or "Front St" - prefix = Regexp.new("^" + Prefix_Type.regexp.source + "\s*", Regexp::IGNORECASE) - suffix = Regexp.new("\s*" + Suffix_Type.regexp.source + "$", Regexp::IGNORECASE) - predxn = Regexp.new("^" + Directional.regexp.source + "\s*", Regexp::IGNORECASE) - sufdxn = Regexp.new("\s*" + Directional.regexp.source + "$", Regexp::IGNORECASE) - good_strings = strings.map {|s| - s = s.clone - s.gsub!(predxn, "") - s.gsub!(sufdxn, "") - s.gsub!(prefix, "") - s.gsub!(suffix, "") - s - } - good_strings.reject! {|s| s.empty?} - strings = good_strings if !good_strings.empty? {|s| not Std_Abbr.key?(s) and not Name_Abbr.key?(s)} - strings + Helper.remove_noise_words(strings) end def city_parts diff --git a/lib/indirizzo/helper.rb b/lib/indirizzo/helper.rb new file mode 100644 index 0000000..a5b276d --- /dev/null +++ b/lib/indirizzo/helper.rb @@ -0,0 +1,24 @@ +module Indirizzo + class Helper + def self.remove_noise_words(strings) + # Don't return strings that consist solely of abbreviations. + # NOTE: Is this a micro-optimization that has edge cases that will break? + # Answer: Yes, it breaks on simple things like "Prairie St" or "Front St" + prefix = Regexp.new("^" + Prefix_Type.regexp.source + "\s*", Regexp::IGNORECASE) + suffix = Regexp.new("\s*" + Suffix_Type.regexp.source + "$", Regexp::IGNORECASE) + predxn = Regexp.new("^" + Directional.regexp.source + "\s*", Regexp::IGNORECASE) + sufdxn = Regexp.new("\s*" + Directional.regexp.source + "$", Regexp::IGNORECASE) + good_strings = strings.map {|s| + s = s.clone + s.gsub!(predxn, "") + s.gsub!(sufdxn, "") + s.gsub!(prefix, "") + s.gsub!(suffix, "") + s + } + good_strings.reject! {|s| s.empty?} + strings = good_strings if !good_strings.empty? {|s| not Std_Abbr.key?(s) and not Name_Abbr.key?(s)} + strings + end + end +end diff --git a/lib/indirizzo/street.rb b/lib/indirizzo/street.rb new file mode 100644 index 0000000..c1b98e9 --- /dev/null +++ b/lib/indirizzo/street.rb @@ -0,0 +1,40 @@ +require 'indirizzo/constants' +require 'indirizzo/number_helper' +require 'indirizzo/helper' + +module Indirizzo + class Street + def self.expand(street) + if !street.empty? && !street[0].nil? + street.map! {|s|s.strip} + add = street.map {|item| item.gsub(Name_Abbr.regexp) {|m| Name_Abbr[m]}} + street |= add + add = street.map {|item| item.gsub(Std_Abbr.regexp) {|m| Std_Abbr[m]}} + street |= add + street.map! {|item| NumberHelper.expand_numbers(item)} + street.flatten! + street.map! {|s| s.downcase} + street.uniq! + else + street = [] + end + street + end + + def self.parts(street, number) + strings = [] + # Get all the substrings delimited by whitespace + street.each do |string| + tokens = string.split(" ") + strings |= (0...tokens.length).map do |i| + (i...tokens.length).map {|j| tokens[i..j].join(" ")} + end.flatten + end + strings = Helper.remove_noise_words(strings) + + # Try a simpler case of adding the @number in case everything is an abbr. + strings += [number] if strings.all? {|s| Std_Abbr.key?(s) || Name_Abbr.key?(s)} + strings.uniq + end + end +end From 6899077187d386d5edb15755f6e39461cb0ca1e3 Mon Sep 17 00:00:00 2001 From: Dave Worth Date: Sat, 4 May 2013 23:46:43 -0400 Subject: [PATCH 09/22] extract crazy parser into a class --- lib/indirizzo/address.rb | 80 +-------------------------------- lib/indirizzo/parser.rb | 96 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 98 insertions(+), 78 deletions(-) create mode 100644 lib/indirizzo/parser.rb diff --git a/lib/indirizzo/address.rb b/lib/indirizzo/address.rb index 49f73c4..a5092a0 100644 --- a/lib/indirizzo/address.rb +++ b/lib/indirizzo/address.rb @@ -1,4 +1,5 @@ require 'indirizzo/constants' +require 'indirizzo/parser' require 'indirizzo/match' require 'indirizzo/city' require 'indirizzo/street' @@ -96,85 +97,8 @@ def expand_numbers (string) NumberHelper.expand_numbers(string) end - def parse_state(regex_match, text) - idx = text.rindex(regex_match) - @full_state = @state[0].strip # special case: New York - @state = State[@full_state] - @city = "Washington" if @state == "DC" && text[idx...idx+regex_match.length] =~ /washington\s+d\.?c\.?/i - text - end - def parse - text = @text.clone.downcase - - @zip = text.scan(Match[:zip]).last - if @zip - last_match = $& - zip_index = text.rindex(last_match) - zip_end_index = zip_index + last_match.length - 1 - @zip, @plus4 = @zip.map {|s| s and s.strip } - else - @zip = @plus4 = "" - zip_index = text.length - zip_end_index = -1 - end - - @country = @text[zip_end_index+1..-1].sub(/^\s*,\s*/, '').strip - @country = nil if @country == text - - @state = text.scan(Match[:state]).last - if @state - last_match = $& - state_index = text.rindex(last_match) - text = parse_state(last_match, text) - else - @full_state = "" - @state = "" - end - - @number = text.scan(Match[:number]).first - # FIXME: 230 Fish And Game Rd, Hudson NY 12534 - if @number # and not intersection? - last_match = $& - number_index = text.index(last_match) - number_end_index = number_index + last_match.length - 1 - @prenum, @number, @sufnum = @number.map {|s| s and s.strip} - else - number_end_index = -1 - @prenum = @number = @sufnum = "" - end - - # FIXME: special case: Name_Abbr gets a bit aggressive - # about replacing St with Saint. exceptional case: - # Sault Ste. Marie - - # FIXME: PO Box should geocode to ZIP - street_search_end_index = [state_index,zip_index,text.length].reject(&:nil?).min-1 - @street = text[number_end_index+1..street_search_end_index].scan(Match[:street]).map { |s| s and s.strip } - - @street = expand_streets(@street) if @options[:expand_streets] - # SPECIAL CASE: 1600 Pennsylvania 20050 - @street << @full_state if @street.empty? and @state.downcase != @full_state.downcase - - street_end_index = @street.map { |s| text.rindex(s) }.reject(&:nil?).min||0 - - if @city.nil? || @city.empty? - @city = text[street_end_index..street_search_end_index+1].scan(Match[:city]) - if !@city.empty? - #@city = [@city[-1].strip] - @city = [@city.last.strip] - add = @city.map {|item| item.gsub(Name_Abbr.regexp) {|m| Name_Abbr[m]}} - @city |= add - @city.map! {|s| s.downcase} - @city.uniq! - else - @city = [] - end - - # SPECIAL CASE: no city, but a state with the same name. e.g. "New York" - @city << @full_state if @state.downcase != @full_state.downcase - end - + @city, @street, @number, @prenum, @sufnum, @full_state, @state, @zip, @plus4, @country = Parser.new(@text, @options).parse end def expand_streets(street) diff --git a/lib/indirizzo/parser.rb b/lib/indirizzo/parser.rb new file mode 100644 index 0000000..33e328d --- /dev/null +++ b/lib/indirizzo/parser.rb @@ -0,0 +1,96 @@ +require 'indirizzo/match' +require 'indirizzo/street' +require 'indirizzo/constants' + +module Indirizzo + class Parser + def initialize(text, options={}) + @text = text + @options = options + end + + def parse + text = @text.clone.downcase + + @zip = text.scan(Match[:zip]).last + if @zip + last_match = $& + zip_index = text.rindex(last_match) + zip_end_index = zip_index + last_match.length - 1 + @zip, @plus4 = @zip.map {|s| s and s.strip } + else + @zip = @plus4 = "" + zip_index = text.length + zip_end_index = -1 + end + + @country = @text[zip_end_index+1..-1].sub(/^\s*,\s*/, '').strip + @country = nil if @country == text + + @state = text.scan(Match[:state]).last + if @state + last_match = $& + state_index = text.rindex(last_match) + text = parse_state(last_match, text) + else + @full_state = "" + @state = "" + end + + @number = text.scan(Match[:number]).first + # FIXME: 230 Fish And Game Rd, Hudson NY 12534 + if @number # and not intersection? + last_match = $& + number_index = text.index(last_match) + number_end_index = number_index + last_match.length - 1 + @prenum, @number, @sufnum = @number.map {|s| s and s.strip} + else + number_end_index = -1 + @prenum = @number = @sufnum = "" + end + + # FIXME: special case: Name_Abbr gets a bit aggressive + # about replacing St with Saint. exceptional case: + # Sault Ste. Marie + + # FIXME: PO Box should geocode to ZIP + street_search_end_index = [state_index,zip_index,text.length].reject(&:nil?).min-1 + @street = text[number_end_index+1..street_search_end_index].scan(Match[:street]).map { |s| s and s.strip } + + @street = Street.expand(@street) if @options[:expand_streets] + # SPECIAL CASE: 1600 Pennsylvania 20050 + @street << @full_state if @street.empty? and @state.downcase != @full_state.downcase + + street_end_index = @street.map { |s| text.rindex(s) }.reject(&:nil?).min||0 + + if @city.nil? || @city.empty? + @city = text[street_end_index..street_search_end_index+1].scan(Match[:city]) + if !@city.empty? + #@city = [@city[-1].strip] + @city = [@city.last.strip] + add = @city.map {|item| item.gsub(Name_Abbr.regexp) {|m| Name_Abbr[m]}} + @city |= add + @city.map! {|s| s.downcase} + @city.uniq! + else + @city = [] + end + + # SPECIAL CASE: no city, but a state with the same name. e.g. "New York" + @city << @full_state if @state.downcase != @full_state.downcase + end + + return @city, @street, @number, @prenum, @sufnum, @full_state, @state, @zip, @plus4, @country + end + + private + + def parse_state(regex_match, text) + idx = text.rindex(regex_match) + @full_state = @state[0].strip # special case: New York + @state = State[@full_state] + @city = "Washington" if @state == "DC" && text[idx...idx+regex_match.length] =~ /washington\s+d\.?c\.?/i + text + end + end +end From 7a32a6c9967562a5b883b0fbc541818fc6ed9a7b Mon Sep 17 00:00:00 2001 From: Dave Worth Date: Sat, 4 May 2013 23:49:14 -0400 Subject: [PATCH 10/22] extract #clean to Helper.clean --- lib/indirizzo/address.rb | 5 ++--- lib/indirizzo/helper.rb | 6 ++++++ 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/lib/indirizzo/address.rb b/lib/indirizzo/address.rb index a5092a0..704d744 100644 --- a/lib/indirizzo/address.rb +++ b/lib/indirizzo/address.rb @@ -3,6 +3,7 @@ require 'indirizzo/match' require 'indirizzo/city' require 'indirizzo/street' +require 'indirizzo/helper' module Indirizzo # The Address class takes a US street address or place name and @@ -34,9 +35,7 @@ def initialize (text, options={}) # Removes any characters that aren't strictly part of an address string. def clean (value) - value.strip \ - .gsub(/[^a-z0-9 ,'&@\/-]+/io, "") \ - .gsub(/\s+/o, " ") + Helper.clean(value) end def assign_text_to_address(text) diff --git a/lib/indirizzo/helper.rb b/lib/indirizzo/helper.rb index a5b276d..7c2a447 100644 --- a/lib/indirizzo/helper.rb +++ b/lib/indirizzo/helper.rb @@ -20,5 +20,11 @@ def self.remove_noise_words(strings) strings = good_strings if !good_strings.empty? {|s| not Std_Abbr.key?(s) and not Name_Abbr.key?(s)} strings end + + def self.clean(value) + value.strip \ + .gsub(/[^a-z0-9 ,'&@\/-]+/io, "") \ + .gsub(/\s+/o, " ") + end end end From 05f8e903d9b49ed29e69e69b0cd2488b3b028a2f Mon Sep 17 00:00:00 2001 From: Dave Worth Date: Sat, 4 May 2013 23:56:39 -0400 Subject: [PATCH 11/22] extract #assign_text_to_address to Parser --- lib/indirizzo/address.rb | 52 +---------------------------------- lib/indirizzo/parser.rb | 58 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 59 insertions(+), 51 deletions(-) diff --git a/lib/indirizzo/address.rb b/lib/indirizzo/address.rb index 704d744..2fe5f83 100644 --- a/lib/indirizzo/address.rb +++ b/lib/indirizzo/address.rb @@ -39,57 +39,7 @@ def clean (value) end def assign_text_to_address(text) - if !text[:address].nil? - @text = clean text[:address] - parse - else - @street = [] - @prenum = text[:prenum] - @sufnum = text[:sufnum] - if !text[:street].nil? - @street = text[:street].scan(Match[:street]) - end - @number = "" - if !@street.nil? - if text[:number].nil? - @street.map! { |single_street| - single_street.downcase! - @number = single_street.scan(Match[:number])[0].reject{|n| n.nil? || n.empty?}.first.to_s - single_street.sub! @number, "" - single_street.sub! /^\s*,?\s*/o, "" - } - else - @number = text[:number].to_s - end - @street = expand_streets(@street) if @options[:expand_streets] - street_parts - end - @city = [] - if !text[:city].nil? - @city.push(text[:city]) - @text = text[:city].to_s - else - @city.push("") - end - if !text[:region].nil? - # @state = [] - @state = text[:region] - if @state.length > 2 - # full_state = @state.strip # special case: New York - @state = State[@state] - end - elsif !text[:state].nil? - @state = text[:state] - elsif !text[:country].nil? - @state = text[:country] - end - - @zip = text[:postal_code] - @plus4 = text[:plus4] - if !@zip - @zip = @plus4 = "" - end - end + @text, @city, @street, @number, @prenum, @sufnum, @full_state, @state, @zip, @plus4, @country = Parser.extract_data_from_hash(text, @options) end def expand_numbers (string) diff --git a/lib/indirizzo/parser.rb b/lib/indirizzo/parser.rb index 33e328d..97970ef 100644 --- a/lib/indirizzo/parser.rb +++ b/lib/indirizzo/parser.rb @@ -1,4 +1,5 @@ require 'indirizzo/match' +require 'indirizzo/helper' require 'indirizzo/street' require 'indirizzo/constants' @@ -83,6 +84,63 @@ def parse return @city, @street, @number, @prenum, @sufnum, @full_state, @state, @zip, @plus4, @country end + def self.extract_data_from_hash(address_hash, options) + text = address_hash + if !text[:address].nil? + @text = Helper.clean text[:address] + return self.new(@text, options).parse + else + @street = [] + @prenum = text[:prenum] + @sufnum = text[:sufnum] + if !text[:street].nil? + @street = text[:street].scan(Match[:street]) + end + @number = "" + if !@street.nil? + if text[:number].nil? + @street.map! { |single_street| + single_street.downcase! + @number = single_street.scan(Match[:number])[0].reject{|n| n.nil? || n.empty?}.first.to_s + single_street.sub! @number, "" + single_street.sub! /^\s*,?\s*/o, "" + } + else + @number = text[:number].to_s + end + @street = Street.expand(@street) if options[:expand_streets] + #Street.parts + end + @city = [] + if !text[:city].nil? + @city.push(text[:city]) + @text = text[:city].to_s + else + @city.push("") + end + if !text[:region].nil? + # @state = [] + @state = text[:region] + if @state.length > 2 + # full_state = @state.strip # special case: New York + @state = State[@state] + end + elsif !text[:state].nil? + @state = text[:state] + elsif !text[:country].nil? + @state = text[:country] + end + + @zip = text[:postal_code] + @plus4 = text[:plus4] + if !@zip + @zip = @plus4 = "" + end + end + + return @text, @city, @street, @number, @prenum, @sufnum, @full_state, @state, @zip, @plus4, @country + end + private def parse_state(regex_match, text) From 69782f6b1a25290c24ca83443893ef99d9cf4a13 Mon Sep 17 00:00:00 2001 From: Dave Worth Date: Sun, 5 May 2013 00:01:10 -0400 Subject: [PATCH 12/22] extract terrible method to its own class instead --- lib/indirizzo/address.rb | 3 +- lib/indirizzo/address_hash_extractor.rb | 61 +++++++++++++++++++++++++ lib/indirizzo/parser.rb | 57 ----------------------- 3 files changed, 63 insertions(+), 58 deletions(-) create mode 100644 lib/indirizzo/address_hash_extractor.rb diff --git a/lib/indirizzo/address.rb b/lib/indirizzo/address.rb index 2fe5f83..89d7996 100644 --- a/lib/indirizzo/address.rb +++ b/lib/indirizzo/address.rb @@ -1,5 +1,6 @@ require 'indirizzo/constants' require 'indirizzo/parser' +require 'indirizzo/address_hash_extractor' require 'indirizzo/match' require 'indirizzo/city' require 'indirizzo/street' @@ -39,7 +40,7 @@ def clean (value) end def assign_text_to_address(text) - @text, @city, @street, @number, @prenum, @sufnum, @full_state, @state, @zip, @plus4, @country = Parser.extract_data_from_hash(text, @options) + @text, @city, @street, @number, @prenum, @sufnum, @full_state, @state, @zip, @plus4, @country = AddressHashExtractor.extract(text, @options) end def expand_numbers (string) diff --git a/lib/indirizzo/address_hash_extractor.rb b/lib/indirizzo/address_hash_extractor.rb new file mode 100644 index 0000000..3bb3e7a --- /dev/null +++ b/lib/indirizzo/address_hash_extractor.rb @@ -0,0 +1,61 @@ +module Indirizzo + class AddressHashExtractor + def self.extract(address_hash, options) + text = address_hash + if !text[:address].nil? + @text = Helper.clean text[:address] + return Parser.new(@text, options).parse + else + @street = [] + @prenum = text[:prenum] + @sufnum = text[:sufnum] + if !text[:street].nil? + @street = text[:street].scan(Match[:street]) + end + @number = "" + if !@street.nil? + if text[:number].nil? + @street.map! { |single_street| + single_street.downcase! + @number = single_street.scan(Match[:number])[0].reject{|n| n.nil? || n.empty?}.first.to_s + single_street.sub! @number, "" + single_street.sub! /^\s*,?\s*/o, "" + } + else + @number = text[:number].to_s + end + @street = Street.expand(@street) if options[:expand_streets] + #Street.parts + end + @city = [] + if !text[:city].nil? + @city.push(text[:city]) + @text = text[:city].to_s + else + @city.push("") + end + if !text[:region].nil? + # @state = [] + @state = text[:region] + if @state.length > 2 + # full_state = @state.strip # special case: New York + @state = State[@state] + end + elsif !text[:state].nil? + @state = text[:state] + elsif !text[:country].nil? + @state = text[:country] + end + + @zip = text[:postal_code] + @plus4 = text[:plus4] + if !@zip + @zip = @plus4 = "" + end + end + + return @text, @city, @street, @number, @prenum, @sufnum, @full_state, @state, @zip, @plus4, @country + end + + end +end diff --git a/lib/indirizzo/parser.rb b/lib/indirizzo/parser.rb index 97970ef..74656ee 100644 --- a/lib/indirizzo/parser.rb +++ b/lib/indirizzo/parser.rb @@ -84,63 +84,6 @@ def parse return @city, @street, @number, @prenum, @sufnum, @full_state, @state, @zip, @plus4, @country end - def self.extract_data_from_hash(address_hash, options) - text = address_hash - if !text[:address].nil? - @text = Helper.clean text[:address] - return self.new(@text, options).parse - else - @street = [] - @prenum = text[:prenum] - @sufnum = text[:sufnum] - if !text[:street].nil? - @street = text[:street].scan(Match[:street]) - end - @number = "" - if !@street.nil? - if text[:number].nil? - @street.map! { |single_street| - single_street.downcase! - @number = single_street.scan(Match[:number])[0].reject{|n| n.nil? || n.empty?}.first.to_s - single_street.sub! @number, "" - single_street.sub! /^\s*,?\s*/o, "" - } - else - @number = text[:number].to_s - end - @street = Street.expand(@street) if options[:expand_streets] - #Street.parts - end - @city = [] - if !text[:city].nil? - @city.push(text[:city]) - @text = text[:city].to_s - else - @city.push("") - end - if !text[:region].nil? - # @state = [] - @state = text[:region] - if @state.length > 2 - # full_state = @state.strip # special case: New York - @state = State[@state] - end - elsif !text[:state].nil? - @state = text[:state] - elsif !text[:country].nil? - @state = text[:country] - end - - @zip = text[:postal_code] - @plus4 = text[:plus4] - if !@zip - @zip = @plus4 = "" - end - end - - return @text, @city, @street, @number, @prenum, @sufnum, @full_state, @state, @zip, @plus4, @country - end - private def parse_state(regex_match, text) From 969a88428ece201e1b344b095ffd261f8fc2c56a Mon Sep 17 00:00:00 2001 From: Dave Worth Date: Sun, 5 May 2013 00:06:32 -0400 Subject: [PATCH 13/22] extract zip-parsing into method * this is hinky thanks to the weird indices but at least this is a start --- lib/indirizzo/parser.rb | 27 ++++++++++++++++----------- 1 file changed, 16 insertions(+), 11 deletions(-) diff --git a/lib/indirizzo/parser.rb b/lib/indirizzo/parser.rb index 74656ee..c2c1ea4 100644 --- a/lib/indirizzo/parser.rb +++ b/lib/indirizzo/parser.rb @@ -13,17 +13,7 @@ def initialize(text, options={}) def parse text = @text.clone.downcase - @zip = text.scan(Match[:zip]).last - if @zip - last_match = $& - zip_index = text.rindex(last_match) - zip_end_index = zip_index + last_match.length - 1 - @zip, @plus4 = @zip.map {|s| s and s.strip } - else - @zip = @plus4 = "" - zip_index = text.length - zip_end_index = -1 - end + @zip, @plus4, zip_index, zip_end_index = extract_zip_from_text(text) @country = @text[zip_end_index+1..-1].sub(/^\s*,\s*/, '').strip @country = nil if @country == text @@ -93,5 +83,20 @@ def parse_state(regex_match, text) @city = "Washington" if @state == "DC" && text[idx...idx+regex_match.length] =~ /washington\s+d\.?c\.?/i text end + + def extract_zip_from_text(text) + zip = text.scan(Match[:zip]).last + if zip + last_match = $& + zip_index = text.rindex(last_match) + zip_end_index = zip_index + last_match.length - 1 + zip, plus4 = zip.map {|s| s and s.strip } + else + zip = plus4 = "" + zip_index = text.length + zip_end_index = -1 + end + return zip, plus4, zip_index, zip_end_index + end end end From c2c7a59c6cf67fd2a73a345a5e3544b368f037c0 Mon Sep 17 00:00:00 2001 From: Dave Worth Date: Sun, 5 May 2013 00:12:58 -0400 Subject: [PATCH 14/22] extract state fetching into method * inline parse_state to avoid side-effects that were weird --- lib/indirizzo/parser.rb | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/lib/indirizzo/parser.rb b/lib/indirizzo/parser.rb index c2c1ea4..0ea7e9d 100644 --- a/lib/indirizzo/parser.rb +++ b/lib/indirizzo/parser.rb @@ -18,15 +18,7 @@ def parse @country = @text[zip_end_index+1..-1].sub(/^\s*,\s*/, '').strip @country = nil if @country == text - @state = text.scan(Match[:state]).last - if @state - last_match = $& - state_index = text.rindex(last_match) - text = parse_state(last_match, text) - else - @full_state = "" - @state = "" - end + @state, @full_state, @city, state_index = extract_state_from_text(text) @number = text.scan(Match[:number]).first # FIXME: 230 Fish And Game Rd, Hudson NY 12534 @@ -76,14 +68,6 @@ def parse private - def parse_state(regex_match, text) - idx = text.rindex(regex_match) - @full_state = @state[0].strip # special case: New York - @state = State[@full_state] - @city = "Washington" if @state == "DC" && text[idx...idx+regex_match.length] =~ /washington\s+d\.?c\.?/i - text - end - def extract_zip_from_text(text) zip = text.scan(Match[:zip]).last if zip @@ -98,5 +82,21 @@ def extract_zip_from_text(text) end return zip, plus4, zip_index, zip_end_index end + + def extract_state_from_text(text) + state = text.scan(Match[:state]).last + if state + last_match = $& + state_index = text.rindex(last_match) + idx = text.rindex(last_match) + full_state = state[0].strip # special case: New York + state = State[full_state] + city = "Washington" if state == "DC" && text[idx...idx+last_match.length] =~ /washington\s+d\.?c\.?/i + else + full_state = "" + state = "" + end + return state, full_state, city, state_index + end end end From 2ff67e282fe39de11d3f03eabd68aaf0ff1ff17e Mon Sep 17 00:00:00 2001 From: Dave Worth Date: Sun, 5 May 2013 00:17:23 -0400 Subject: [PATCH 15/22] extract city processing into method --- lib/indirizzo/parser.rb | 36 ++++++++++++++++++++---------------- 1 file changed, 20 insertions(+), 16 deletions(-) diff --git a/lib/indirizzo/parser.rb b/lib/indirizzo/parser.rb index 0ea7e9d..328c03a 100644 --- a/lib/indirizzo/parser.rb +++ b/lib/indirizzo/parser.rb @@ -46,22 +46,7 @@ def parse street_end_index = @street.map { |s| text.rindex(s) }.reject(&:nil?).min||0 - if @city.nil? || @city.empty? - @city = text[street_end_index..street_search_end_index+1].scan(Match[:city]) - if !@city.empty? - #@city = [@city[-1].strip] - @city = [@city.last.strip] - add = @city.map {|item| item.gsub(Name_Abbr.regexp) {|m| Name_Abbr[m]}} - @city |= add - @city.map! {|s| s.downcase} - @city.uniq! - else - @city = [] - end - - # SPECIAL CASE: no city, but a state with the same name. e.g. "New York" - @city << @full_state if @state.downcase != @full_state.downcase - end + process_city(text, street_end_index, street_search_end_index) return @city, @street, @number, @prenum, @sufnum, @full_state, @state, @zip, @plus4, @country end @@ -98,5 +83,24 @@ def extract_state_from_text(text) end return state, full_state, city, state_index end + + def process_city(text, street_end_index, street_search_end_index) + if @city.nil? || @city.empty? + @city = text[street_end_index..street_search_end_index+1].scan(Match[:city]) + if !@city.empty? + #@city = [@city[-1].strip] + @city = [@city.last.strip] + add = @city.map {|item| item.gsub(Name_Abbr.regexp) {|m| Name_Abbr[m]}} + @city |= add + @city.map! {|s| s.downcase} + @city.uniq! + else + @city = [] + end + + # SPECIAL CASE: no city, but a state with the same name. e.g. "New York" + @city << @full_state if @state.downcase != @full_state.downcase + end + end end end From d1f77ddc5048e9d0b863ad2af2b1f17976802bb2 Mon Sep 17 00:00:00 2001 From: Dave Worth Date: Sun, 5 May 2013 00:21:54 -0400 Subject: [PATCH 16/22] extract number processing into method --- lib/indirizzo/parser.rb | 27 ++++++++++++++++----------- 1 file changed, 16 insertions(+), 11 deletions(-) diff --git a/lib/indirizzo/parser.rb b/lib/indirizzo/parser.rb index 328c03a..3430184 100644 --- a/lib/indirizzo/parser.rb +++ b/lib/indirizzo/parser.rb @@ -20,17 +20,7 @@ def parse @state, @full_state, @city, state_index = extract_state_from_text(text) - @number = text.scan(Match[:number]).first - # FIXME: 230 Fish And Game Rd, Hudson NY 12534 - if @number # and not intersection? - last_match = $& - number_index = text.index(last_match) - number_end_index = number_index + last_match.length - 1 - @prenum, @number, @sufnum = @number.map {|s| s and s.strip} - else - number_end_index = -1 - @prenum = @number = @sufnum = "" - end + @prenum, @number, @sufnum, number_end_index = process_number(text) # FIXME: special case: Name_Abbr gets a bit aggressive # about replacing St with Saint. exceptional case: @@ -102,5 +92,20 @@ def process_city(text, street_end_index, street_search_end_index) @city << @full_state if @state.downcase != @full_state.downcase end end + + def process_number(text) + number = text.scan(Match[:number]).first + # FIXME: 230 Fish And Game Rd, Hudson NY 12534 + if number # and not intersection? + last_match = $& + number_index = text.index(last_match) + number_end_index = number_index + last_match.length - 1 + prenum, number, sufnum = number.map {|s| s and s.strip} + else + number_end_index = -1 + prenum = number = sufnum = "" + end + return prenum, number, sufnum, number_end_index + end end end From 4ce9b1f96c51c303788bdc3c103f77b423605765 Mon Sep 17 00:00:00 2001 From: Dave Worth Date: Sun, 5 May 2013 00:28:18 -0400 Subject: [PATCH 17/22] basic extraction refactor driven by tests --- lib/indirizzo/address_hash_extractor.rb | 111 +++++++++++++----------- 1 file changed, 62 insertions(+), 49 deletions(-) diff --git a/lib/indirizzo/address_hash_extractor.rb b/lib/indirizzo/address_hash_extractor.rb index 3bb3e7a..c571cc9 100644 --- a/lib/indirizzo/address_hash_extractor.rb +++ b/lib/indirizzo/address_hash_extractor.rb @@ -1,61 +1,74 @@ module Indirizzo class AddressHashExtractor def self.extract(address_hash, options) - text = address_hash - if !text[:address].nil? - @text = Helper.clean text[:address] - return Parser.new(@text, options).parse + AddressHashExtractor.new(address_hash, options).extract + end + + def initialize(address_hash, options={}) + @address_hash = address_hash + @options = options + end + + def extract + if !@address_hash[:address].nil? + @text = Helper.clean @address_hash[:address] + return Parser.new(@text, @options).parse else - @street = [] - @prenum = text[:prenum] - @sufnum = text[:sufnum] - if !text[:street].nil? - @street = text[:street].scan(Match[:street]) - end - @number = "" - if !@street.nil? - if text[:number].nil? - @street.map! { |single_street| - single_street.downcase! - @number = single_street.scan(Match[:number])[0].reject{|n| n.nil? || n.empty?}.first.to_s - single_street.sub! @number, "" - single_street.sub! /^\s*,?\s*/o, "" - } - else - @number = text[:number].to_s - end - @street = Street.expand(@street) if options[:expand_streets] - #Street.parts - end - @city = [] - if !text[:city].nil? - @city.push(text[:city]) - @text = text[:city].to_s + handle_hash + end + + return @text, @city, @street, @number, @prenum, @sufnum, @full_state, @state, @zip, @plus4, @country + end + + private + def handle_hash + text = @address_hash + @street = [] + @prenum = text[:prenum] + @sufnum = text[:sufnum] + if !text[:street].nil? + @street = text[:street].scan(Match[:street]) + end + @number = "" + if !@street.nil? + if text[:number].nil? + @street.map! { |single_street| + single_street.downcase! + @number = single_street.scan(Match[:number])[0].reject{|n| n.nil? || n.empty?}.first.to_s + single_street.sub! @number, "" + single_street.sub! /^\s*,?\s*/o, "" + } else - @city.push("") - end - if !text[:region].nil? - # @state = [] - @state = text[:region] - if @state.length > 2 - # full_state = @state.strip # special case: New York - @state = State[@state] - end - elsif !text[:state].nil? - @state = text[:state] - elsif !text[:country].nil? - @state = text[:country] + @number = text[:number].to_s end - - @zip = text[:postal_code] - @plus4 = text[:plus4] - if !@zip - @zip = @plus4 = "" + @street = Street.expand(@street) if @options[:expand_streets] + #Street.parts + end + @city = [] + if !text[:city].nil? + @city.push(text[:city]) + @text = text[:city].to_s + else + @city.push("") + end + if !text[:region].nil? + # @state = [] + @state = text[:region] + if @state.length > 2 + # full_state = @state.strip # special case: New York + @state = State[@state] end + elsif !text[:state].nil? + @state = text[:state] + elsif !text[:country].nil? + @state = text[:country] end - return @text, @city, @street, @number, @prenum, @sufnum, @full_state, @state, @zip, @plus4, @country + @zip = text[:postal_code] + @plus4 = text[:plus4] + if !@zip + @zip = @plus4 = "" + end end - end end From 43fd2fdb8b33f16a88d97b2eb6adcf32808d728a Mon Sep 17 00:00:00 2001 From: Dave Worth Date: Sun, 5 May 2013 00:30:37 -0400 Subject: [PATCH 18/22] extract city extraction code into method --- lib/indirizzo/address_hash_extractor.rb | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/lib/indirizzo/address_hash_extractor.rb b/lib/indirizzo/address_hash_extractor.rb index c571cc9..be863d0 100644 --- a/lib/indirizzo/address_hash_extractor.rb +++ b/lib/indirizzo/address_hash_extractor.rb @@ -44,13 +44,9 @@ def handle_hash @street = Street.expand(@street) if @options[:expand_streets] #Street.parts end - @city = [] - if !text[:city].nil? - @city.push(text[:city]) - @text = text[:city].to_s - else - @city.push("") - end + + handle_city + if !text[:region].nil? # @state = [] @state = text[:region] @@ -70,5 +66,15 @@ def handle_hash @zip = @plus4 = "" end end + + def handle_city + @city = [] + if !@address_hash[:city].nil? + @city.push(@address_hash[:city]) + @text = @address_hash[:city].to_s + else + @city.push("") + end + end end end From 2c8180b5a14c0acfc4aec2afb2c2a0ffecf1ed17 Mon Sep 17 00:00:00 2001 From: Dave Worth Date: Sun, 5 May 2013 00:32:21 -0400 Subject: [PATCH 19/22] extract state extraction into method --- lib/indirizzo/address_hash_extractor.rb | 29 ++++++++++++++----------- 1 file changed, 16 insertions(+), 13 deletions(-) diff --git a/lib/indirizzo/address_hash_extractor.rb b/lib/indirizzo/address_hash_extractor.rb index be863d0..c7dcfba 100644 --- a/lib/indirizzo/address_hash_extractor.rb +++ b/lib/indirizzo/address_hash_extractor.rb @@ -46,19 +46,7 @@ def handle_hash end handle_city - - if !text[:region].nil? - # @state = [] - @state = text[:region] - if @state.length > 2 - # full_state = @state.strip # special case: New York - @state = State[@state] - end - elsif !text[:state].nil? - @state = text[:state] - elsif !text[:country].nil? - @state = text[:country] - end + handle_state @zip = text[:postal_code] @plus4 = text[:plus4] @@ -76,5 +64,20 @@ def handle_city @city.push("") end end + + def handle_state + if !@address_hash[:region].nil? + # @state = [] + @state = @address_hash[:region] + if @state.length > 2 + # full_state = @state.strip # special case: New York + @state = State[@state] + end + elsif !@address_hash[:state].nil? + @state = @address_hash[:state] + elsif !@address_hash[:country].nil? + @state = @address_hash[:country] + end + end end end From 8c3aa96534030ef28326ee393c01f9540875ec92 Mon Sep 17 00:00:00 2001 From: Dave Worth Date: Sun, 5 May 2013 00:33:34 -0400 Subject: [PATCH 20/22] extract state and number extraction into method --- lib/indirizzo/address_hash_extractor.rb | 38 ++++++++++++++----------- 1 file changed, 21 insertions(+), 17 deletions(-) diff --git a/lib/indirizzo/address_hash_extractor.rb b/lib/indirizzo/address_hash_extractor.rb index c7dcfba..ae937e7 100644 --- a/lib/indirizzo/address_hash_extractor.rb +++ b/lib/indirizzo/address_hash_extractor.rb @@ -22,16 +22,22 @@ def extract private def handle_hash - text = @address_hash + handle_street_and_numbers + handle_city + handle_state + handle_zip + end + + def handle_street_and_numbers @street = [] - @prenum = text[:prenum] - @sufnum = text[:sufnum] - if !text[:street].nil? - @street = text[:street].scan(Match[:street]) + @prenum = @address_hash[:prenum] + @sufnum = @address_hash[:sufnum] + if !@address_hash[:street].nil? + @street = @address_hash[:street].scan(Match[:street]) end @number = "" if !@street.nil? - if text[:number].nil? + if @address_hash[:number].nil? @street.map! { |single_street| single_street.downcase! @number = single_street.scan(Match[:number])[0].reject{|n| n.nil? || n.empty?}.first.to_s @@ -39,19 +45,9 @@ def handle_hash single_street.sub! /^\s*,?\s*/o, "" } else - @number = text[:number].to_s + @number = @address_hash[:number].to_s end @street = Street.expand(@street) if @options[:expand_streets] - #Street.parts - end - - handle_city - handle_state - - @zip = text[:postal_code] - @plus4 = text[:plus4] - if !@zip - @zip = @plus4 = "" end end @@ -79,5 +75,13 @@ def handle_state @state = @address_hash[:country] end end + + def handle_zip + @zip = @address_hash[:postal_code] + @plus4 = @address_hash[:plus4] + if !@zip + @zip = @plus4 = "" + end + end end end From 78d4617f39c552491bbd8d889bf90af729aa03e2 Mon Sep 17 00:00:00 2001 From: Dave Worth Date: Sun, 5 May 2013 00:34:21 -0400 Subject: [PATCH 21/22] use attr_accessor correctly --- lib/indirizzo/address_hash_extractor.rb | 39 +++++++++++++------------ 1 file changed, 20 insertions(+), 19 deletions(-) diff --git a/lib/indirizzo/address_hash_extractor.rb b/lib/indirizzo/address_hash_extractor.rb index ae937e7..b3d0eda 100644 --- a/lib/indirizzo/address_hash_extractor.rb +++ b/lib/indirizzo/address_hash_extractor.rb @@ -8,10 +8,11 @@ def initialize(address_hash, options={}) @address_hash = address_hash @options = options end + attr_accessor :address_hash def extract - if !@address_hash[:address].nil? - @text = Helper.clean @address_hash[:address] + if !address_hash[:address].nil? + @text = Helper.clean address_hash[:address] return Parser.new(@text, @options).parse else handle_hash @@ -30,14 +31,14 @@ def handle_hash def handle_street_and_numbers @street = [] - @prenum = @address_hash[:prenum] - @sufnum = @address_hash[:sufnum] - if !@address_hash[:street].nil? - @street = @address_hash[:street].scan(Match[:street]) + @prenum = address_hash[:prenum] + @sufnum = address_hash[:sufnum] + if !address_hash[:street].nil? + @street = address_hash[:street].scan(Match[:street]) end @number = "" if !@street.nil? - if @address_hash[:number].nil? + if address_hash[:number].nil? @street.map! { |single_street| single_street.downcase! @number = single_street.scan(Match[:number])[0].reject{|n| n.nil? || n.empty?}.first.to_s @@ -45,7 +46,7 @@ def handle_street_and_numbers single_street.sub! /^\s*,?\s*/o, "" } else - @number = @address_hash[:number].to_s + @number = address_hash[:number].to_s end @street = Street.expand(@street) if @options[:expand_streets] end @@ -53,32 +54,32 @@ def handle_street_and_numbers def handle_city @city = [] - if !@address_hash[:city].nil? - @city.push(@address_hash[:city]) - @text = @address_hash[:city].to_s + if !address_hash[:city].nil? + @city.push(address_hash[:city]) + @text = address_hash[:city].to_s else @city.push("") end end def handle_state - if !@address_hash[:region].nil? + if !address_hash[:region].nil? # @state = [] - @state = @address_hash[:region] + @state = address_hash[:region] if @state.length > 2 # full_state = @state.strip # special case: New York @state = State[@state] end - elsif !@address_hash[:state].nil? - @state = @address_hash[:state] - elsif !@address_hash[:country].nil? - @state = @address_hash[:country] + elsif !address_hash[:state].nil? + @state = address_hash[:state] + elsif !address_hash[:country].nil? + @state = address_hash[:country] end end def handle_zip - @zip = @address_hash[:postal_code] - @plus4 = @address_hash[:plus4] + @zip = address_hash[:postal_code] + @plus4 = address_hash[:plus4] if !@zip @zip = @plus4 = "" end From 2dd0890801d3b9ce7304bca778a044af7fa2198e Mon Sep 17 00:00:00 2001 From: Dave Worth Date: Sun, 5 May 2013 00:37:11 -0400 Subject: [PATCH 22/22] tiny cleanup to maybe appease code climate --- lib/indirizzo/address_hash_extractor.rb | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/lib/indirizzo/address_hash_extractor.rb b/lib/indirizzo/address_hash_extractor.rb index b3d0eda..4568fa6 100644 --- a/lib/indirizzo/address_hash_extractor.rb +++ b/lib/indirizzo/address_hash_extractor.rb @@ -64,12 +64,9 @@ def handle_city def handle_state if !address_hash[:region].nil? - # @state = [] @state = address_hash[:region] - if @state.length > 2 - # full_state = @state.strip # special case: New York - @state = State[@state] - end + # full_state = @state.strip # special case: New York + @state = State[@state] if @state.length > 2 elsif !address_hash[:state].nil? @state = address_hash[:state] elsif !address_hash[:country].nil?