diff --git a/README.md b/README.md index d42666c..23b0987 100644 --- a/README.md +++ b/README.md @@ -26,6 +26,10 @@ Sitemapper.configure do |c| c.sitemap_host = "https://sitemaps.aws.whatever.com" # default nil + c.index_file_name = "my_index_file_name" # default "sitemap_index" + + c.sitemap_file_name = "my_file_name" # default "sitemap" + # The max number of elements to add to each sitemap c.max_urls = 20 # default 500 @@ -80,9 +84,11 @@ Same goes for in you want to add an image. Use `Sitemapper::ImageMap` and pass ` ## Saving your XML -Sitemapper gives you the raw XML in strings. This gives you the option to save that data however you wish. Maybe you're crazy and want to store it in your DB? Maybe you're running on Heroku and can't just write locally, so you need to ship it off to AWS. What ever the case, you have that freedom. +`Sitemapper.build` gives you the raw XML in strings. This gives you the option to save that data however you wish. Maybe you're crazy and want to store it in your DB? Maybe you're running on Heroku and can't just write locally, so you need to ship it off to AWS. Whatever the case, you have that freedom. + +There are a few options built in: `LocalStorage` and `AwsStorage`. These are config options through `config.storage`. -There's a few options you have built in. `LocalStorage`, and `AwsStorage`. These are config options through `config.storage` +You can also use `Sitemapper.stream` to save the XML data one file at a time. This is useful for very large sitemaps which wouldn't fit in memory. This option won't return XML, and will instead use whatever storage (e.g. `LocalStorage` or `AwsStorage`) you have configured. 
### LocalStorage @@ -137,6 +143,19 @@ Sitemapper.store(sitemaps, "my-prod-bucket/sitemaps") Lastly, so the searchengines know where your sitemaps are located (unless you aliased `/sitemap_index.xml`), you'll want to update your [robots.txt](http://www.robotstxt.org/) with `Sitemap: https://my-sitemap-host.com` +### Storing files one at a time + +Use `Sitemapper.stream` in place of `Sitemapper.build` to save files one at a time. For example: + +```crystal +Sitemapper.stream do |builder| + builder.add("/about", changefreq: "yearly", priority: 0.1) + builder.add("/profiles/somedude", changefreq: "always", priority: 0.9) +end +``` + +`Sitemapper.stream` accepts optional arguments for `host`, `max_urls`, `use_index`, `storage`, and `storage_path`. All of them default to the options saved inside `Sitemapper.configure`. + ## Notifying Search Engines Once you have your sitemaps updated, it's usually a good idea to let the search engines know. Generally, they will crawl your site regularly anyway, but this at least gets things moving a little quicker. To do this, you can use the `ping_search_engines` method. 
diff --git a/spec/sitemapper_builder_spec.cr b/spec/sitemapper_in_memory_builder_spec.cr similarity index 81% rename from spec/sitemapper_builder_spec.cr rename to spec/sitemapper_in_memory_builder_spec.cr index ae17dd9..226f321 100644 --- a/spec/sitemapper_builder_spec.cr +++ b/spec/sitemapper_in_memory_builder_spec.cr @@ -1,15 +1,15 @@ require "./spec_helper" -describe Sitemapper::Builder do +describe Sitemapper::InMemoryBuilder do describe "#add" do it "adds /tacos to the paths" do - builder = Sitemapper::Builder.new(host: "", max_urls: 20, use_index: true) + builder = Sitemapper::InMemoryBuilder.new(host: "", max_urls: 20, use_index: true) builder.add("/tacos") builder.paginator.paths.size.should eq 1 end it "adds /burritors with a changefreq of weekly" do - builder = Sitemapper::Builder.new(host: "", max_urls: 20, use_index: true) + builder = Sitemapper::InMemoryBuilder.new(host: "", max_urls: 20, use_index: true) builder.add("/burritos", changefreq: "weekly") builder.paginator.paths.size.should eq 1 end @@ -17,7 +17,7 @@ describe Sitemapper::Builder do describe "#generate" do it "returns an array with 1 hash" do - builder = Sitemapper::Builder.new(host: "", max_urls: 20, use_index: false) + builder = Sitemapper::InMemoryBuilder.new(host: "", max_urls: 20, use_index: false) builder.add("/tacos") xml = builder.generate xml.size.should eq 1 @@ -26,7 +26,7 @@ describe Sitemapper::Builder do end it "returns an array with 4 hashes" do - builder = Sitemapper::Builder.new(host: "", max_urls: 1, use_index: true) + builder = Sitemapper::InMemoryBuilder.new(host: "", max_urls: 1, use_index: true) builder.add("/tacos/1") builder.add("/tacos/2") builder.add("/tacos/3") @@ -36,7 +36,7 @@ describe Sitemapper::Builder do end it "generates some valid sitemap xml" do - builder = Sitemapper::Builder.new(host: "http://food.com", max_urls: 100, use_index: true) + builder = Sitemapper::InMemoryBuilder.new(host: "http://food.com", max_urls: 100, use_index: true) 
builder.add("/tacos") xml = builder.generate.as(Array).first["data"] xml.should contain <<-XML @@ -53,7 +53,7 @@ describe Sitemapper::Builder do end it "generates the xml with a video tag data" do - builder = Sitemapper::Builder.new(host: "http://food.com", max_urls: 100, use_index: true) + builder = Sitemapper::InMemoryBuilder.new(host: "http://food.com", max_urls: 100, use_index: true) video = Sitemapper::VideoMap.new(thumbnail_loc: "http://video.org/sample.mpg", title: "Video", description: "This is a video", tags: ["red", "blue"]) builder.add("/tacos", video: video) xml = builder.generate.as(Array).first["data"] @@ -75,7 +75,7 @@ describe Sitemapper::Builder do end it "generates the xml with image tag data" do - builder = Sitemapper::Builder.new(host: "http://food.com", max_urls: 100, use_index: true) + builder = Sitemapper::InMemoryBuilder.new(host: "http://food.com", max_urls: 100, use_index: true) image = Sitemapper::ImageMap.new(loc: "http://image.org/sample.jpg", caption: "This is an image") builder.add("/tacos", image: image) xml = builder.generate.as(Array).first["data"] @@ -93,7 +93,7 @@ describe Sitemapper::Builder do end it "generates the sitemap_index with the specified host" do - builder = Sitemapper::Builder.new(host: "http://food.com", max_urls: 100, use_index: true) + builder = Sitemapper::InMemoryBuilder.new(host: "http://food.com", max_urls: 100, use_index: true) builder.add("/burgers") xml = builder.generate.as(Array).find { |h| h["name"] == "sitemap_index.xml" }.as(Hash(String, String)) xml["data"].should contain <<-XML @@ -103,7 +103,7 @@ describe Sitemapper::Builder do it "generates the sitemap_index with a custom sitemap host" do Sitemapper.configure { |c| c.sitemap_host = "https://sitemaps.myapp.com" } - builder = Sitemapper::Builder.new(host: "http://food.com", max_urls: 100, use_index: true) + builder = Sitemapper::InMemoryBuilder.new(host: "http://food.com", max_urls: 100, use_index: true) builder.add("/burgers") xml = 
builder.generate.as(Array).find { |h| h["name"] == "sitemap_index.xml" }.as(Hash(String, String)) xml["data"].should contain <<-XML diff --git a/src/sitemapper.cr b/src/sitemapper.cr index 4f3e6b3..da550c2 100644 --- a/src/sitemapper.cr +++ b/src/sitemapper.cr @@ -6,9 +6,8 @@ require "./sitemapper/video_map" require "./sitemapper/image_map" require "./sitemapper/sitemap_options" require "./sitemapper/paginator" -require "./sitemapper/builder" +require "./sitemapper/builder/*" require "./sitemapper/storage" -require "./sitemapper/streamer" require "./sitemapper/storage/*" require "./sitemapper/ping_bot" @@ -19,6 +18,8 @@ module Sitemapper setting use_index : Bool = false setting host : String, example: "https://mysite.com" setting sitemap_host : String? = nil + setting index_file_name : String = "sitemap_index" + setting sitemap_file_name : String = "sitemap" setting max_urls : Int32 = 500 setting storage : Sitemapper::Storage.class = Sitemapper::LocalStorage setting compress : Bool = true @@ -30,7 +31,7 @@ module Sitemapper Sitemapper.settings end - # Build your sitemaps. The block arg is an instance of `Sitemapper::Builder`. + # Build your sitemaps. The block arg is an instance of `Sitemapper::InMemoryBuilder`. # Args default to the configuration, but can be overriden. # ``` # Sitemapper.build(max_urls: 20) do |builder| @@ -43,12 +44,13 @@ module Sitemapper use_index : Bool = config.use_index, & ) : Array(Hash(String, String)) - builder = Sitemapper::Builder.new(host, max_urls, use_index) + builder = Sitemapper::InMemoryBuilder.new(host, max_urls, use_index) yield builder builder.generate end - # Build your sitemaps, streaming each file. The block arg is an instance of `Sitemapper::Streamer`. + # Build your sitemaps, saving each file once it reaches `max_urls`. + # The block arg is an instance of `Sitemapper::StreamBuilder`. # Args default to the configuration, but can be overriden. 
# ``` # Sitemapper.stream(path: "tmp/sitemaps") do |builder| @@ -62,10 +64,10 @@ module Sitemapper storage : Sitemapper::Storage.class = config.storage, storage_path : String = config.storage_path, & - ) : Array(Hash(String, String)) - builder = Sitemapper::Streamer.new(host, max_urls, use_index, storage, storage_path) + ) : Void + builder = Sitemapper::StreamBuilder.new(host, max_urls, use_index, storage, storage_path) yield builder - builder.generate + builder.finish end # Store your sitemap xml files. diff --git a/src/sitemapper/builder.cr b/src/sitemapper/builder.cr index 5c97e4c..1382f5a 100644 --- a/src/sitemapper/builder.cr +++ b/src/sitemapper/builder.cr @@ -1,5 +1,5 @@ module Sitemapper - class Builder + abstract class Builder XMLNS_SCHEMA = "http://www.sitemaps.org/schemas/sitemap/0.9" XMLNS_VIDEO_SCHEMA = "http://www.google.com/schemas/sitemap-video/1.1" XMLNS_IMAGE_SCHEMA = "http://www.google.com/schemas/sitemap-image/1.1" @@ -21,6 +21,11 @@ module Sitemapper self end + def index_add(path) : self + paginator.index_add(path) + self + end + def generate : Array(Hash(String, String)) paginator.total_pages.times do |page| filename = filename_for_page(page) @@ -59,7 +64,7 @@ module Sitemapper end private def build_xml_for_page(items) - XML.build(indent: " ", version: "1.0", encoding: "UTF-8") do |xml| + XML.build(indent: " ") do |xml| xml.element("urlset", xmlns: XMLNS_SCHEMA, "xmlns:video": XMLNS_VIDEO_SCHEMA, "xmlns:image": XMLNS_IMAGE_SCHEMA, "xmlns:xsi": XMLNS_XSI, "xsi:schemaLocation": XSI_SCHEMA_LOCATION) do items.each do |info| build_xml_from_info(xml, info) @@ -86,12 +91,22 @@ module Sitemapper end end - private def filename_for_page(page) - if paginator.total_pages == 1 - "sitemap.xml" - else - "sitemap#{page + 1}.xml" + private def generate_index(filenames : Array(String)) : Hash(String, String) + doc = XML.build(indent: " ") do |xml| + xml.element("sitemapindex", xmlns: XMLNS_SCHEMA, "xmlns:video": XMLNS_VIDEO_SCHEMA, "xmlns:image": 
XMLNS_IMAGE_SCHEMA, "xmlns:xsi": XMLNS_XSI, "xsi:schemaLocation": XSI_INDEX_SCHEMA_LOCATION) do + filenames.each do |filename| + xml.element("sitemap") do + sitemap_name = filename + (Sitemapper.config.compress ? ".gz" : "") + sitemap_url = [(Sitemapper.config.sitemap_host || @host), sitemap_name].join('/') + + xml.element("loc") { xml.text sitemap_url } + xml.element("lastmod") { xml.text Time.utc.to_s("%FT%X%:z") } + end + end + end end + filename = Sitemapper.config.index_file_name + ".xml" + {"name" => filename, "data" => doc} end end end diff --git a/src/sitemapper/builder/in_memory_builder.cr b/src/sitemapper/builder/in_memory_builder.cr new file mode 100644 index 0000000..f0f3854 --- /dev/null +++ b/src/sitemapper/builder/in_memory_builder.cr @@ -0,0 +1,59 @@ +require "../builder" + +module Sitemapper + # This class builds a list of sitemaps in memory, but doesn't save them. The + # caller must eventually call `Sitemapper.store` to save the resulting list + # of sitemaps. + class InMemoryBuilder < Builder + XMLNS_SCHEMA = "http://www.sitemaps.org/schemas/sitemap/0.9" + XMLNS_VIDEO_SCHEMA = "http://www.google.com/schemas/sitemap-video/1.1" + XMLNS_IMAGE_SCHEMA = "http://www.google.com/schemas/sitemap-image/1.1" + # See: https://sitemaps.org/protocol.html#validating + XMLNS_XSI = "http://www.w3.org/2001/XMLSchema-instance" + XSI_SCHEMA_LOCATION = "http://www.sitemaps.org/schemas/sitemap/0.9 http://www.sitemaps.org/schemas/sitemap/0.9/sitemap.xsd" + XSI_INDEX_SCHEMA_LOCATION = "http://www.sitemaps.org/schemas/sitemap/0.9 http://www.sitemaps.org/schemas/sitemap/0.9/siteindex.xsd" + + getter paginator : Paginator + + def initialize(@host : String, @max_urls : Int32, @use_index : Bool) + @paginator = Paginator.new(limit: @max_urls) + @sitemaps = [] of Hash(String, String) + end + + def add(path, **kwargs) : self + options = SitemapOptions.new(**kwargs) + paginator.add(path, options) + self + end + + def index_add(path) : self + paginator.index_add(path) + self + 
end + + def generate : Array(Hash(String, String)) + paginator.total_pages.times do |page| + filename = filename_for_page(page) + doc = build_xml_for_page(paginator.items(page + 1)) + + @sitemaps << {"name" => filename, "data" => doc} + end + + if @use_index + filenames = paginator.index_items + filenames += @sitemaps.map { |sitemap| sitemap["name"] } + @sitemaps << generate_index(filenames) + end + + @sitemaps + end + + private def filename_for_page(page) + if paginator.total_pages == 1 + Sitemapper.config.sitemap_file_name + ".xml" + else + Sitemapper.config.sitemap_file_name + "#{page + 1}.xml" + end + end + end +end diff --git a/src/sitemapper/builder/stream_builder.cr b/src/sitemapper/builder/stream_builder.cr new file mode 100644 index 0000000..9f84737 --- /dev/null +++ b/src/sitemapper/builder/stream_builder.cr @@ -0,0 +1,64 @@ +require "../builder" + +module Sitemapper + # This class builds sitemap files one at a time, saving each as it reaches + # the limit of `@max_urls`. Callers don't need to call `Sitemapper.store` + # afterwards. + class StreamBuilder < Builder + getter paginator : Paginator + + def initialize(@host : String, @max_urls : Int32, @use_index : Bool, @storage : Sitemapper::Storage.class, @storage_path : String) + @paginator = Paginator.new(limit: @max_urls) + @filenames = [] of String + @index_filenames = [] of String + @sitemaps = [] of Hash(String, String) + @current_page = 1 + end + + def add(path, **kwargs) : self + options = SitemapOptions.new(**kwargs) + paginator.add(path, options) + if paginator.paths.size.modulo(@max_urls).zero? 
+ flush + end + self + end + + def index_add(path) : self + @index_filenames << path + self + end + + def flush + filename = filename_for_page(@current_page) + doc = build_xml_for_page(paginator.items(1)) + @filenames << filename + + storage = @storage.new([{"name" => filename, "data" => doc}]) + storage.save(@storage_path) + + @current_page += 1 + @paginator = Paginator.new(limit: @max_urls) + end + + def finish : Void + unless paginator.paths.empty? + flush + end + + if @use_index + save_index + end + end + + private def save_index : Void + index = generate_index(@index_filenames + @filenames) + storage = @storage.new([index]) + storage.save(@storage_path) + end + + private def filename_for_page(page) + Sitemapper.config.sitemap_file_name + "#{page}.xml" + end + end +end diff --git a/src/sitemapper/paginator.cr b/src/sitemapper/paginator.cr index 55c420f..88839a5 100644 --- a/src/sitemapper/paginator.cr +++ b/src/sitemapper/paginator.cr @@ -2,20 +2,30 @@ module Sitemapper class Paginator DEFAULT_LIMIT = 500 property paths : Array(Tuple(String, SitemapOptions)) + property index_paths : Array(String) def initialize(@limit : Int32 = DEFAULT_LIMIT) @paths = [] of Tuple(String, SitemapOptions) + @index_paths = [] of String end def add(path : String, options : SitemapOptions) @paths << {path, options} end + def index_add(path : String) + @index_paths << path + end + def items(current_page : Int32) offset = (current_page * @limit) - @limit @paths[offset, @limit] end + def index_items + @index_paths + end + # This is calculated each time since you could # get 1 the first time, then add to it and get 2 the second def total_pages : Int32 diff --git a/src/sitemapper/streamer.cr b/src/sitemapper/streamer.cr index 2969a98..94dbff9 100644 --- a/src/sitemapper/streamer.cr +++ b/src/sitemapper/streamer.cr @@ -16,6 +16,11 @@ module Sitemapper self end + def index_add(path) : self + paginator.index_add(path) + self + end + def flush : Nil filename = filename_for_current_page doc = 
build_xml_for_page(paginator.items(1)) @@ -32,7 +37,7 @@ module Sitemapper end private def filename_for_current_page : String - "sitemap#{@current_page}.xml" + Sitemapper.config.sitemap_file_name + "#{@current_page}.xml" end end end