diff --git a/.gitignore b/.gitignore
index adb8bb6..c52787e 100644
--- a/.gitignore
+++ b/.gitignore
@@ -2,9 +2,6 @@
 .bundle
 .idea
 *.iml
-spec/data/sample_data
-hathi_upd*
-hathi_full*
 .env
 .devenv
 archive
diff --git a/Gemfile.lock b/Gemfile.lock
index 56292af..92b242b 100644
--- a/Gemfile.lock
+++ b/Gemfile.lock
@@ -10,7 +10,7 @@ GIT
 PATH
   remote: .
   specs:
-    hathifiles_database (0.4.1)
+    hathifiles_database (0.5.0)
       date_named_file
       dotenv
       ettin
diff --git a/README.md b/README.md
index 7e13fa5..ea13873 100644
--- a/README.md
+++ b/README.md
@@ -50,29 +50,23 @@ These are intended to be run under Docker for development purposes.
 ```
 exe
-├── catchup
-├── daily_run
-├── hathifiles_database_clear_everything_out
-├── hathifiles_database_convert
-├── hathifiles_database_full
-├── hathifiles_database_full_update
-├── hathifiles_database_update
-└── swap_production_and_reindex
+└── hathifiles_database_full_update
 ```
-These are exported by the `gemspec` as the gem's executables.
-- `catchup` _deprecated_ loads multiple `upd` files
-- `daily_run` _deprecated_ (contains hardcoded paths) loads today's `upd` file
-- `hathifiles_database_clear_everything_out` interactive script to reinitialize the database
-- `hathifiles_database_convert` _deprecated_ interactive script to dump `hathifiles` database to tab-delimited files
-- `hathifiles_database_full` _deprecated_ load a single `full` hathifile
+This is exported by the `gemspec` as the gem's executable.
+
 - `hathifiles_database_full_update` the preferred date-independent method for loading `full` and `upd` hathifiles
-- `hathifiles_database_update` _deprecated_ load a single `upd` hathifile
-- `swap_production_and_reindex` _deprecated_ swaps tables between `hathifiles` and `hathifiles_reindex` databases
-`swap_production_and_reindex` used to be part of the workflow for clearing and rebuilding the
-production database from an auxiliary database. With Argo Workflows we should no longer need to
-do this as `hathifiles_database_full_update` should be touching only the changed/deleted rows
-in the `full` monthly hathifile.
+
+## Environment Variables
+- Default Database Credentials -- override by passing keyword arguments to the `DB::Connection` initializer.
+  - `MARIADB_HATHIFILES_RW_USERNAME`
+  - `MARIADB_HATHIFILES_RW_PASSWORD`
+  - `MARIADB_HATHIFILES_RW_HOST`
+  - `MARIADB_HATHIFILES_RW_DATABASE`
+- Filesystem
+  - `HATHIFILES_DIR` path to hathifiles archive
+- Other
+  - `PUSHGATEWAY` Prometheus push gateway URL
 
 ## Pitfalls
diff --git a/docker-compose.yml b/docker-compose.yml
index a3b5348..56d6b2a 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -13,14 +13,10 @@ services:
   test:
     build: .
     environment:
-      # Used by dumper.rb
-      HATHIFILES_MYSQL_USER: "ht_rights"
-      HATHIFILES_MYSQL_PASSWORD: "ht_rights"
-      HATHIFILES_MYSQL_HOST: "mariadb"
-      HATHIFILES_MYSQL_DATABASE: "ht"
-      # Used by connection.rb
-      # TODO: construct this based on the above variables
-      HATHIFILES_MYSQL_CONNECTION: "mysql2://ht_rights:ht_rights@mariadb/ht"
+      MARIADB_HATHIFILES_RW_USERNAME: "ht_rights"
+      MARIADB_HATHIFILES_RW_PASSWORD: "ht_rights"
+      MARIADB_HATHIFILES_RW_HOST: "mariadb"
+      MARIADB_HATHIFILES_RW_DATABASE: "ht"
       HATHIFILES_DIR: "/usr/src/app/spec/data"
       PUSHGATEWAY: http://pushgateway:9091
     volumes:
diff --git a/exe/catchup b/exe/catchup
deleted file mode 100755
index 440d25e..0000000
--- a/exe/catchup
+++ /dev/null
@@ -1,58 +0,0 @@
-#!/usr/bin/env ruby
-
-require "date_named_file"
-require "hathifiles_database"
-require "dotenv"
-require "date"
-
-HF_FILES = "/htapps/archive/hathifiles"
-LOGFILE_DIR = "../logs/hathifiles_database"
-
-def usage
-  puts "
-
-  Usage:
-    ruby #{__FILE__} <'dev' or 'production'>
-    e.g ruby #{__FILE__} 20211101 production
-        ruby #{__FILE__} first_of_month dev # or just 'fom'
-        ruby #{__FILE__} yesterday production
-
-  "
-  exit 1
-end
-
-usage if ARGV.size != 2
-
-devprod = ARGV[1].downcase
-envfilename = case devprod.downcase
-when "dev"
-  ".devenv"
-when "prod"
-  ".env"
-else
-  puts "\nUnknown target '#{devprod}'"
-  usage
-  exit 1
-end
-envfile = Pathname.new(__dir__).parent + envfilename
-
-start_date = ARGV[0].downcase
-today = DateTime.now
-if %w[fom first_of_month].include? start_date
-  start_date = today - today.day + 1
-elsif %w[yesterday].include? start_date
-  start_date = today - 1
-end
-
-Dotenv.load(envfile)
-
-connection_string = ENV["HATHIFILES_MYSQL_CONNECTION"]
-connection = HathifilesDatabase.new(connection_string)
-
-template = DateNamedFile.new "#{HF_FILES}/hathi_upd_%Y%m%d.txt.gz"
-files = template.daily_through_yesterday(start_date)
-
-files.each do |f|
-  connection.logger.info "Starting work on #{f}"
-  connection.update_from_file f
-end
diff --git a/exe/daily_run b/exe/daily_run
deleted file mode 100755
index dcd7989..0000000
--- a/exe/daily_run
+++ /dev/null
@@ -1,16 +0,0 @@
-#!/bin/bash
-
-HFDB_DIR=/htapps/babel/hathifiles/hathifiles_database/
-HFDB_LOGDIR=/htapps/babel/hathifiles/logs/hathifiles_database
-HF_FILES=/htapps/archive/hathifiles
-
-YESTERDAY8=`date --date="yesterday" "+%Y%m%d"`
-
-SOURCEFILE=$HF_FILES/hathi_upd_${YESTERDAY8}.txt.gz
-LOGFILE=$HFDB_LOGDIR/daily_${YESTERDAY8}.txt
-
-PATH=/l/local/rbenv/shims:/l/local/rbenv/bin:$PATH
-
-cd $HFDB_DIR
-bundle exec ruby exe/hathifiles_database_update $SOURCEFILE > $LOGFILE 2>&1
-
diff --git a/exe/hathifiles_database_clear_everything_out b/exe/hathifiles_database_clear_everything_out
deleted file mode 100755
index c0d9ed8..0000000
--- a/exe/hathifiles_database_clear_everything_out
+++ /dev/null
@@ -1,22 +0,0 @@
-$:.unshift "../lib"
-
-require 'hathifiles_database'
-require 'pathname'
-require 'tty-prompt'
-require 'dotenv'
-
-envfile = Pathname.new(__dir__).parent + '.env'
-Dotenv.load(envfile)
-
-
-filename = ARGV[0]
-tempdir = Pathname.new('.').realdirpath + 'tmp'
-
-prompt = TTY::Prompt.new
-
-
-connection_string = ENV['HATHIFILES_MYSQL_CONNECTION'] || prompt.ask("Connection string:")
-connection = HathifilesDatabase.new(connection_string)
-
-connection.recreate_tables!
-
diff --git a/exe/hathifiles_database_convert b/exe/hathifiles_database_convert
deleted file mode 100755
index 906902c..0000000
--- a/exe/hathifiles_database_convert
+++ /dev/null
@@ -1,28 +0,0 @@
-
-here = Pathname.new __dir__
-lib = here.parent + "lib"
-$:.unshift lib.to_s
-
-require 'hathifiles_database'
-require 'pathname'
-require 'tty-prompt'
-require 'dotenv'
-
-envfile = Pathname.new(__dir__).parent + '.env'
-Dotenv.load(envfile)
-
-prompt = TTY::Prompt.new
-
-filename = ARGV[0]
-dirname = ARGV[1] || prompt.ask("Dir for files")
-
-
-dirname = Pathname.new(dirname)
-dirname.mkpath
-
-connection_string = ENV['HATHIFILES_MYSQL_CONNECTION'] || prompt.ask("Connection string:")
-connection = HathifilesDatabase.new(connection_string)
-
-datafile = HathifilesDatabase::Datafile.new(filename)
-datafile.dump_files_for_data_import(dirname)
-
diff --git a/exe/hathifiles_database_full b/exe/hathifiles_database_full
deleted file mode 100755
index 1344ebe..0000000
--- a/exe/hathifiles_database_full
+++ /dev/null
@@ -1,32 +0,0 @@
-#!/usr/bin/env ruby
-
-# Invoke MonthlyUpdate class to load and delete the differences between
-# the current database and a full monthly hathifile.
-
-$LOAD_PATH.unshift "../lib"
-
-require "dotenv"
-require "pathname"
-require "tmpdir"
-
-require "hathifiles_database"
-require "hathifiles_database/monthly_update"
-
-envfile = Pathname.new(__dir__).parent + ".env"
-Dotenv.load(envfile)
-
-# Use the monthly hathifile from command line
-hathifile = ARGV[0]
-# Fall back to the most recent monthly hathifile if not specified
-if hathifile.nil?
-  hathifile = Dir.glob(File.join(ENV["HATHIFILES_DIR"], "hathi_full*")).max
-end
-
-connection = HathifilesDatabase.new(ENV["HATHIFILES_MYSQL_CONNECTION"])
-Dir.mktmpdir do |tempdir|
-  HathifilesDatabase::MonthlyUpdate.new(
-    connection: connection,
-    hathifile: hathifile,
-    output_directory: tempdir
-  ).run
-end
diff --git a/exe/hathifiles_database_full_update b/exe/hathifiles_database_full_update
index f5ebe16..49a8eeb 100755
--- a/exe/hathifiles_database_full_update
+++ b/exe/hathifiles_database_full_update
@@ -6,7 +6,6 @@
 
 $LOAD_PATH.unshift "../lib"
 
-require "cgi"
 require "dotenv"
 require "logger"
 require "pathname"
@@ -18,30 +17,7 @@ require "hathifiles_database"
 envfile = Pathname.new(__dir__).parent + ".env"
 Dotenv.load(envfile)
 
-# This is the "right" way to do the connection if there is a chance the password
-# will contain non-URI-safe characters (as is likely to be the case).
-# We are careful not to let the URI::InvalidURIError backtrace get logged since
-# it can disclose the password.
-# In future we should have a HathifilesDatabase::DB::Connection implementation that
-# passes the individual ENV bits to Sequel, then we can deprecate the use of a connection
-# string/URI.
-
-# See https://github.com/hathitrust/rights_database/blob/main/lib/rights_database/db.rb
-# for a representative implementation.
-
-mysql_user = ENV["HATHIFILES_MYSQL_USER"]
-mysql_password = CGI.escape ENV["HATHIFILES_MYSQL_PASSWORD"]
-mysql_host = ENV["HATHIFILES_MYSQL_HOST"]
-mysql_database = ENV["HATHIFILES_MYSQL_DATABASE"]
-connection_uri = "mysql2://#{mysql_user}:#{mysql_password}@#{mysql_host}/#{mysql_database}"
-
-begin
-  connection = HathifilesDatabase.new(connection_uri)
-rescue URI::InvalidURIError
-  Logger.new($stderr).fatal("invalid URI in database connection string")
-  exit 1
-end
-
+connection = HathifilesDatabase.new
 hathifiles = HathifilesDatabase::Hathifiles.new(
   hathifiles_directory: ENV["HATHIFILES_DIR"],
   connection: connection
@@ -54,19 +30,21 @@ tracker = PushMetrics.new(
   logger: connection.logger
 )
 
-Dir.mktmpdir do |tempdir|
-  # `missing_full_hathifiles` returns an Array with zero or one element
-  # since only the most recent monthly file (if any) is of interest.
-  #
-  # We always process the full file first, then any updates.
-  # Whether or not this is strictly necessary (the update released
-  # on the same day as the full file may be superfluous), this is how
-  # `hathitrust_catalog_indexer` does it.
-  connection.logger.info "full hathifiles: #{hathifiles.missing_full_hathifiles}"
-  if hathifiles.missing_full_hathifiles.any?
-    hathifile = File.join(ENV["HATHIFILES_DIR"], hathifiles.missing_full_hathifiles.first)
-    connection.logger.info "processing monthly #{hathifile}"
-    HathifilesDatabase::MonthlyUpdate.new(
+# `missing_full_hathifiles` returns an Array with zero or one element
+# since only the most recent monthly file (if any) is of interest.
+#
+# We always process the full file first, then any updates.
+# Whether or not this is strictly necessary (the update released
+# on the same day as the full file may be superfluous), this is how
+# `hathitrust_catalog_indexer` does it.
+missing_hathifiles = hathifiles.missing_full_hathifiles + hathifiles.missing_update_hathifiles
+
+connection.logger.info "hathifiles to process: #{missing_hathifiles}"
+missing_hathifiles.each do |hathifile|
+  Dir.mktmpdir do |tempdir|
+    hathifile = File.join(ENV["HATHIFILES_DIR"], hathifile)
+    connection.logger.info "processing #{hathifile}"
+    HathifilesDatabase::DeltaUpdate.new(
       connection: connection,
       hathifile: hathifile,
       output_directory: tempdir
@@ -75,14 +53,5 @@
       tracker.on_batch { |_t| connection.logger.info tracker.batch_line }
     end
   end
-  connection.logger.info "updates: #{hathifiles.missing_update_hathifiles}"
-  hathifiles.missing_update_hathifiles.each do |hathifile|
-    hathifile = File.join(ENV["HATHIFILES_DIR"], hathifile)
-    connection.logger.info "processing update #{hathifile}"
-    connection.update_from_file(hathifile) do |records_inserted|
-      tracker.increment records_inserted
-      tracker.on_batch { |_t| connection.logger.info tracker.batch_line }
-    end
-  end
 end
 tracker.log_final_line
diff --git a/exe/hathifiles_database_update b/exe/hathifiles_database_update
deleted file mode 100755
index 93ef43e..0000000
--- a/exe/hathifiles_database_update
+++ /dev/null
@@ -1,29 +0,0 @@
-#!/usr/bin/env ruby
-
-# Invoke Connection class to load a hathifile, using the most recent one if
-# not specified on the command line.
-
-$LOAD_PATH.unshift "../lib"
-
-require "dotenv"
-require "hathifiles_database"
-require "pathname"
-require "tmpdir"
-
-envfile = Pathname.new(__dir__).parent + ".env"
-Dotenv.load(envfile)
-
-# Use the daily hathifile from command line
-hathifile = ARGV[0]
-# Fall back to the most recent daily hathifile if not specified
-if hathifile.nil?
-  hathifile = Dir.glob(File.join(ENV["HATHIFILES_DIR"], "hathi_upd*")).max
-end
-
-connection = HathifilesDatabase.new(ENV["HATHIFILES_MYSQL_CONNECTION"])
-
-# Do all operations in temporary directory so intermediate files get cleaned up
-Dir.mktmpdir do |tempdir|
-  connection.logger.info "Starting work on #{hathifile}"
-  connection.update_from_file hathifile
-end
diff --git a/exe/swap_production_and_reindex b/exe/swap_production_and_reindex
deleted file mode 100755
index 9add13b..0000000
--- a/exe/swap_production_and_reindex
+++ /dev/null
@@ -1,43 +0,0 @@
-#!/usr/bin/env ruby
-
-require "date_named_file"
-require "hathifiles_database"
-require "dotenv"
-
-HF_FILES = "/htapps/archive/hathifiles"
-LOGFILE_DIR = "../logs/hathifiles_database"
-
-envfile = Pathname.new(__dir__).parent + ".env"
-# devenvfile = Pathname.new(__dir__).parent + ".devenv"
-
-Dotenv.load(envfile)
-connection_string = ENV["HATHIFILES_MYSQL_CONNECTION"]
-connection = HathifilesDatabase.new(connection_string)
-
-production = connection.rawdb
-
-Dotenv.load(envfile)
-connection_string = ENV["HATHIFILES_MYSQL_CONNECTION"]
-reindex_connection = HathifilesDatabase.new(connection_string)
-
-reindex = reindex_connection.rawdb
-
-def prod(t)
-  "hathifiles.#{t}"
-end
-
-def tmp(t)
-  "hathifiles.#{t}_swap"
-end
-
-def ri(t)
-  "hathifiles_reindex.#{t}"
-end
-
-# Get tables that are in both
-tables = production.tables.intersection(reindex.tables)
-
-renames = tables.flat_map { |t| [[prod(t), tmp(t)], [ri(t), prod(t)], [tmp(t), ri(t)]] }
-sql = "RENAME TABLE " + renames.map { |x| x.join(" TO ") }.join(", ")
-
-production.run(sql)
diff --git a/lib/hathifiles_database.rb b/lib/hathifiles_database.rb
index 2c4a55c..be23f73 100644
--- a/lib/hathifiles_database.rb
+++ b/lib/hathifiles_database.rb
@@ -2,13 +2,13 @@
 
 require "hathifiles_database/version"
 require "hathifiles_database/datafile"
+require "hathifiles_database/delta_update"
 require "hathifiles_database/db/connection"
 require "hathifiles_database/hathifiles"
 require "hathifiles_database/log"
-require "hathifiles_database/monthly_update"
 
 module HathifilesDatabase
-  def self.new(connection_string)
-    HathifilesDatabase::DB::Connection.new(connection_string)
+  def self.new(...)
+    HathifilesDatabase::DB::Connection.new(...)
   end
 end
diff --git a/lib/hathifiles_database/db/connection.rb b/lib/hathifiles_database/db/connection.rb
index cc80531..36babb6 100644
--- a/lib/hathifiles_database/db/connection.rb
+++ b/lib/hathifiles_database/db/connection.rb
@@ -16,8 +16,6 @@ module HathifilesDatabase
   class DB
     class Connection
       extend HathifilesDatabase::Exception
-
-      LOGGER = Logger.new($stderr)
 
       MIGRATION_DIR = Pathname.new(__dir__) + "migrations"
 
       attr_accessor :logger, :rawdb, :slice_size
@@ -25,18 +23,29 @@
       # We take the name of the main table from the constant
       # MAINTABLE and the names of the foreign tables from the
      # keys in the line's #foreign_table_data hash
-      # @param [String] connection_string A valid Sequel connection string
-      # (see https://sequel.jeremyevans.net/rdoc/files/doc/opening_databases_rdoc.html)
-      # @param [#info] logger A logger object that responds to, e.g., `#warn`,
+
+      # @param [Hash] kwargs
+      # @option kwargs [String] :username (database connection)
+      # @option kwargs [String] :password (database connection)
+      # @option kwargs [String] :host (database connection)
+      # @option kwargs [String] :database (database connection)
+      # @option kwargs [Logger] :logger object that responds to, e.g., `#warn`,
       # `#info`, etc.
-      def initialize(connection_string, logger: LOGGER)
-        @rawdb = Sequel.connect(connection_string + "?local_infile=1&CharSet=utf8mb4")
-        # __setobj__(@rawdb)
+      def initialize(**kwargs)
+        @rawdb = Sequel.connect(
+          adapter: "mysql2",
+          user: kwargs[:username] || ENV["MARIADB_HATHIFILES_RW_USERNAME"],
+          password: kwargs[:password] || ENV["MARIADB_HATHIFILES_RW_PASSWORD"],
+          host: kwargs[:host] || ENV["MARIADB_HATHIFILES_RW_HOST"],
+          database: kwargs[:database] || ENV["MARIADB_HATHIFILES_RW_DATABASE"],
+          config_local_infile: 1,
+          encoding: "utf8mb4"
+        )
+        @logger = kwargs[:logger] || Constants::LOGGER
         @main_table = @rawdb[Constants::MAINTABLE]
         @foreign_tables = Constants::FOREIGN_TABLES.values.each_with_object({}) do |tablename, h|
           h[tablename] = @rawdb[tablename]
         end
-        @logger = logger
         @slice_size = 100
       end
@@ -52,7 +61,6 @@ def initialize(connection_string, logger: LOGGER)
       def update_from_file(
         filepath,
         linespec = LineSpec.default_linespec,
-        logger: Constants::LOGGER,
         deletes_file: nil,
         hathifile_to_log: filepath,
         &block
diff --git a/lib/hathifiles_database/delta_update.rb b/lib/hathifiles_database/delta_update.rb
new file mode 100644
index 0000000..46935d4
--- /dev/null
+++ b/lib/hathifiles_database/delta_update.rb
@@ -0,0 +1,222 @@
+# frozen_string_literal: true
+
+require "tempfile"
+require "zlib"
+
+require "hathifiles_database/dumper"
+require "hathifiles_database/hathifiles"
+
+module HathifilesDatabase
+  # Updates the hf family of database tables with a monthly or update hathifile.
+  # Tries to avoid disruption and thrashing by computing a delta using the
+  # current hathifile and the state of the database and diffing
+  # two derivative files based on the current hf table and the hathifile
+  # to be inserted.
+
+  # By sorting and comparing these dumps we arrive at a "changes" file
+  # and (monthly) a "deletions" file which can be submitted to the `Connection` class.
+
+  # Here are the files that are generated in tmpdir in the process of running an update:
+  #
+  # hf_currentYYYYMMDD-* (a `Tempfile`-generated filename)
+  #   Unsorted dump of the current hf table created by `mysql` executable
+  #
+  # hf_current.txt
+  #   Sorted version of the above
+  #
+  # hf_current_ids.txt
+  #   Just the sorted HTIDs from hf_current.txt, used to generate statistics-oriented files below.
+  #
+  # hf.tsv (and other *.tsv files)
+  #   Formatted hathifile info in database form provided by Dumper and ultimately Datafile
+  #   This file is sorted to create `*.new` (below), and all the TSVs are subsequently deleted.
+  #
+  # hathi_{upd,full}_YYYYMMDD.txt.gz.new
+  #   Sorted dump of the above
+  #
+  # hathi_{upd,full}_YYYYMMDD.txt.gz.new_ids
+  #   Just the sorted HTIDs from *.new
+  #
+  # hathi_{upd,full}_YYYYMMDD.txt.gz.all_changes
+  #   `*.new` - hf_current.txt, modified or added records
+  #   ===================================================================================
+  #   Primary file submitted to the `Connection` class to modify the database.
+ # =================================================================================== + # + # hathi_{upd,full}_YYMMDD.txt.gz.all_changes_ids + # Just the sorted HTIDs from *.all_changes + # + # hathi_{upd,full}_YYMMDD.txt.gz.additions + # HTIDs not in `hf_current_ids.txt` but in the hathifile's `*.new_ids` + # + # hathi_{upd,full}_YYMMDD.txt.gz.updates + # HTIDs common to `hf_current_ids.txt` and `*.all_changes_ids` + # + # hathi_{upd,full}_YYMMDD.txt.gz.deletions + # hf_current.txt HTIDs - hf_new.txt HTIDs + # Note: only created for full hathifiles, not updates + # =================================================================================== + # When available, submitted to the `Connection` class to modify the database. + # =================================================================================== + # + + # See exe/hathifiles_database_full_update for minimal usage example. + class DeltaUpdate + attr_reader :connection, :hathifile, :output_directory, :dumper + + def initialize(connection:, hathifile:, output_directory:) + @connection = connection + @hathifile = hathifile.to_s # in case it's a Pathname + @output_directory = output_directory + @dumper = Dumper.new(connection) + @full = File.basename(hathifile).match? Hathifiles::FULL_RE + end + + # Assembles the additions and deletions files and submits them to the connection + # for application to the database. + def run(&block) + connection.update_from_file( + all_changes, + deletes_file: deletions, + hathifile_to_log: hathifile, + &block + ) + end + + # Dumps the current contents of hf table to a file and sorts it. + # @return [String] path to sorted dump of the current hf database + def current_dump + @current_dump ||= File.join(output_directory, "hf_current.txt").tap do |output_file| + Tempfile.create("hf_current") do |tempfile| + connection.logger.info "dumping current hf table to #{tempfile.path}" + dumper.dump_current(output_file: tempfile.path) + tempfile.flush + run_system_command "sort #{tempfile.path} > #{output_file}" + end + end + end + + # Extract the HTIDs from `new_dump` into hf_new_ids.txt and sort the result. + # @return [String] path to sorted dump based on monthly hathifile + def current_ids + @current_ids ||= File.join(output_directory, "hf_current_ids.txt").tap do |output_file| + connection.logger.info "extracting ids from #{current_dump} to #{output_file}" + run_system_command "cut -f 1 #{current_dump} | sort > #{output_file}" + end + end + + # Dumps a simulated hf table from a hathifile and sorts it. + # Also dumps the auxiliary tables but we ignore them. + # @return [String] path to sorted dump based on the new hathifile + def new_dump + @new_dump ||= hathifile_derivative("new").tap do |output_file| + connection.logger.info "dumping new database values from #{hathifile} to #{output_directory}" + dump_file_paths = dumper.dump_from_file(hathifile: hathifile, output_directory: output_directory) + run_system_command "sort #{dump_file_paths[:hf]} > #{output_file}" + # Delete the dump TSVs since we no longer need them + dump_file_paths.each_value do |value| + FileUtils.rm(value) + end + end + end + + # Extract the HTIDs from `.new` into sorted `new_ids` file. 
+    # @return [String] path to sorted HTIDs from the new hathifile
+    def new_ids
+      @new_ids ||= hathifile_derivative("new_ids").tap do |output_file|
+        connection.logger.info "extracting ids from #{new_dump} to #{output_file}"
+        run_system_command "cut -f 1 #{new_dump} | sort > #{output_file}"
+      end
+    end
+
+    # Creates .all_changes file with only the records added or changed in new_dump
+    # but not current_dump. This file can be loaded just like an ordinary daily update.
+    # @return [String] path to changes file
+    def all_changes
+      @all_changes ||= hathifile_derivative("all_changes").tap do |output_file|
+        comm_cmd = "comm -13 #{current_dump} #{new_dump} > #{output_file}"
+        run_system_command comm_cmd
+      end
+    end
+
+    # Creates .all_changes_ids file with only the HTIDs added or changed in new_dump
+    # but not current_dump. This is used in generating `updates` for statistics.
+    # @return [String] path to changes ids file
+    def all_changes_ids
+      @all_changes_ids ||= hathifile_derivative("all_changes_ids").tap do |output_file|
+        connection.logger.info "extracting ids from #{all_changes} to #{output_file}"
+        run_system_command "cut -f 1 #{all_changes} | sort > #{output_file}"
+      end
+    end
+
+    # Creates .additions file with only the records added by the new hathifile,
+    # not currently in database. This file is only for gathering stats.
+    def additions
+      @additions ||= hathifile_derivative("additions").tap do |output_file|
+        # Additions are HTIDs not in the current ids but in the new ids
+        # We want lines only in file2 (new ids)
+        comm_cmd = "comm -13 #{current_ids} #{new_ids} > #{output_file}"
+        run_system_command comm_cmd
+      end
+    end
+
+    # Creates .updates file with only the records modified by the new hathifile,
+    # already in database but with different data. This file is only for gathering stats.
+    def updates
+      @updates ||= hathifile_derivative("updates").tap do |output_file|
+        # Updates are HTIDs common to both files, i.e., column 3 only
+        comm_cmd = "comm -12 #{current_ids} #{all_changes_ids} > #{output_file}"
+        run_system_command comm_cmd
+      end
+    end
+
+    # Creates .deletions file with only the records in the database but not present
+    # in the new hathifile. This file is a newline-delimited list of HTIDs.
+    # Does not do deletions when the hathifile is an update -- that would trash the hf table!
+    # @return [String, nil] path to deletions file or nil when processing an update file
+    def deletions
+      return nil unless @full
+
+      @deletions ||= hathifile_derivative("deletions").tap do |output_file|
+        comm_cmd = "comm -23 #{current_ids} #{new_ids} > #{output_file}"
+        run_system_command comm_cmd
+      end
+    end
+
+    # Hash with the number of lines in the hathifile, the changes, and deletions.
+    # This is for getting a handle on performance of delta computation vs wholesale replacement
+    # of database contents.
+    def statistics
+      @statistics ||= {
+        additions: linecount(path: additions),
+        all_changes: linecount(path: all_changes),
+        deletions: linecount(path: deletions),
+        hathifile_lines: gzip_linecount(path: hathifile),
+        updates: linecount(path: updates)
+      }
+    end
+
+    private
+
+    # @return [String] path to hathifile derivative with suffix
+    def hathifile_derivative(suffix)
+      File.join(output_directory, File.basename(hathifile)) + "." + suffix
+    end
+
+    # Log a shellout and execute it
+    def run_system_command(cmd)
+      connection.logger.info cmd
+      system(cmd, exception: true)
+    end
+
+    def linecount(path:)
+      return 0 if path.nil?
+
+      `wc -l "#{path}"`.strip.split(" ")[0].to_i
+    end
+
+    def gzip_linecount(path:)
+      Zlib::GzipReader.open(path, encoding: "utf-8") { |gz| gz.count }
+    end
+  end
+end
diff --git a/lib/hathifiles_database/dumper.rb b/lib/hathifiles_database/dumper.rb
index 17c2d81..9af55e0 100644
--- a/lib/hathifiles_database/dumper.rb
+++ b/lib/hathifiles_database/dumper.rb
@@ -27,18 +27,15 @@ def dump_current(output_file:)
 
   # Create a TSV database dump based on a hathifile without
   # actually writing anything to the database.
-  # Used for constructing the delta between a monthly hathifile and the current
-  # state of the database.
+  # Used for constructing the delta between the DB and a new hathifile.
   def dump_from_file(hathifile:, output_directory:)
     datafile = HathifilesDatabase::Datafile.new(hathifile)
-    datafile.dump_files_for_data_import(output_directory)
+    datafile.dump_files_for_data_import(output_directory, nodate_suffix: true)
   end
 
   private
 
   def dump_cmd(ini_file:, output_file:)
-    # Use ENV under Docker and default under k8s
-    db = ENV.fetch("HATHIFILES_MYSQL_DATABASE", "hathifiles")
     # gsub to collapse newlines and multiple space into one line
     <<~END_CMD.gsub(/\s+/, " ")
       mysql
@@ -46,9 +43,9 @@
       --skip-column-names
      --batch
      --raw
-      --host=#{ENV["HATHIFILES_MYSQL_HOST"]}
+      --host=#{ENV["MARIADB_HATHIFILES_RW_HOST"]}
      --execute='#{dump_sql}'
-      #{db}
+      #{ENV["MARIADB_HATHIFILES_RW_DATABASE"]}
      > #{output_file}
    END_CMD
  end
@@ -56,7 +53,6 @@
   # Dump the hf table into a form that can be diffed and resubmitted as a hathifile.
   # No need to do an ORDER BY as we postprocess output using the `sort` command.
   def dump_sql
-    # gsub to collapse newlines and multiple space into one line
     @dump_sql ||= <<~END_SQL
       SELECT htid, access, rights_code, bib_num, description,
       source,
@@ -73,8 +69,8 @@ def mysql_ini
     <<~END_INI
       [client]
-      user="#{ENV["HATHIFILES_MYSQL_USER"]}"
-      password="#{ENV["HATHIFILES_MYSQL_PASSWORD"]}"
+      user="#{ENV["MARIADB_HATHIFILES_RW_USERNAME"]}"
+      password="#{ENV["MARIADB_HATHIFILES_RW_PASSWORD"]}"
     END_INI
   end
 end
diff --git a/lib/hathifiles_database/hathifiles.rb b/lib/hathifiles_database/hathifiles.rb
index d8fc400..a24164a 100644
--- a/lib/hathifiles_database/hathifiles.rb
+++ b/lib/hathifiles_database/hathifiles.rb
@@ -22,10 +22,7 @@ class Hathifiles
 
   attr_reader :hathifiles_directory, :connection
 
-  def initialize(
-    hathifiles_directory: ENV["HATHIFILES_DIR"],
-    connection: HathifilesDatabase.new(ENV["HATHIFILES_MYSQL_CONNECTION"])
-  )
+  def initialize(connection:, hathifiles_directory: ENV["HATHIFILES_DIR"])
     @hathifiles_directory = hathifiles_directory
     @connection = connection
   end
@@ -77,8 +74,7 @@ def latest_update_hathifiles
   # @param type [String|Symbol] "full" or "upd"
   # @return [Array] hathifile basenames in arbitrary order
   def all_of_type(type:)
-    type = type.to_s
-    re = (type == "full") ? FULL_RE : UPD_RE
+    re = (type.to_s == "full") ? FULL_RE : UPD_RE
     Dir.glob(File.join(hathifiles_directory, "*"))
       .map { |hathifile| File.basename(hathifile) }
       .select { |hathifile| hathifile.match? re }
diff --git a/lib/hathifiles_database/log.rb b/lib/hathifiles_database/log.rb
index e1ebf15..4eb5475 100644
--- a/lib/hathifiles_database/log.rb
+++ b/lib/hathifiles_database/log.rb
@@ -1,7 +1,5 @@
 # frozen_string_literal: true
 
-require "sequel"
-
 # Handles reading and writing hathifiles.hf_log table which is where
 # we record state so hathifiles_database as a whole can be date independent.
 module HathifilesDatabase
diff --git a/lib/hathifiles_database/monthly_update.rb b/lib/hathifiles_database/monthly_update.rb
deleted file mode 100644
index 217ded9..0000000
--- a/lib/hathifiles_database/monthly_update.rb
+++ /dev/null
@@ -1,97 +0,0 @@
-# frozen_string_literal: true
-
-require "tempfile"
-
-require "hathifiles_database/dumper"
-
-module HathifilesDatabase
-  # Updates the hf family of database tables with a full monthly hathifile.
-  # Tries to avoid disruption and thrashing by computing a delta using the
-  # current monthly hathifile and the state of the database and diffing
-  # two derivative files based on the current hf table and the hathifile
-  # to be inserted.
-
-  # By sorting and comparing these dumps we arrive at an "additions" file
-  # and a "deletions" file which can be submitted to the Connection class.
-
-  # See exe/hathifiles_database_full for minimal usage example.
-  class MonthlyUpdate
-    attr_reader :connection, :hathifile, :output_directory, :dumper
-
-    def initialize(connection:, hathifile:, output_directory:)
-      @connection = connection
-      @hathifile = hathifile.to_s # in case it's a Pathname
-      @output_directory = output_directory
-      @dumper = Dumper.new(connection)
-    end
-
-    # Assembles the additions and deletions files and submits them to the connection
-    # for application to the database.
-    def run(&block)
-      connection.update_from_file(
-        additions,
-        deletes_file: deletions,
-        hathifile_to_log: hathifile,
-        &block
-      )
-    end
-
-    # Dumps the current contents of hf table to a file and sorts it.
-    # @return [String] path to sorted dump of the current hf database
-    def current_dump
-      @current_dump ||= File.join(output_directory, "hf_current.txt").tap do |output_file|
-        Tempfile.create("hf_current") do |tempfile|
-          connection.logger.info "dumping current hf table to #{tempfile.path}"
-          dumper.dump_current(output_file: tempfile.path)
-          tempfile.flush
-          run_system_command "sort #{tempfile.path} > #{output_file}"
-        end
-      end
-    end
-
-    # Dumps a simulated hf table from a hathifile and sorts it.
-    # Also dumps the auxiliary tables but we ignore them.
-    # @return [String] path to sorted dump based on monthly hathifile
-    def new_dump
-      @new_dump ||= File.join(output_directory, "hf_new.txt").tap do |output_file|
-        connection.logger.info "dumping new database values from #{hathifile} to #{output_directory}"
-        dump_file_paths = dumper.dump_from_file(hathifile: hathifile, output_directory: output_directory)
-        run_system_command "sort #{dump_file_paths[:hf]} > #{output_file}"
-      end
-    end
-
-    # Creates .additions file with only the records added or changed in new_dump
-    # but not current_dump. This file can be loaded just like an ordinary daily update.
-    # @return [String] path to additions file
-    def additions
-      @additions ||= hathifile_derivative("additions").tap do |output_file|
-        comm_cmd = "comm -13 #{current_dump} #{new_dump} > #{output_file}"
-        run_system_command comm_cmd
-      end
-    end
-
-    # Creates .deletions file with only the records not in new_dump
-    # but present in current_dump. This file is a newline-delimited list
-    # of HTIDs.
-    # @return [String] path to deletions file
-    def deletions
-      @deletions ||= hathifile_derivative("deletions").tap do |output_file|
-        comm_cmd = "bash -c 'comm -23 <(cut -f 1 #{current_dump} | sort) <(cut -f 1 #{new_dump} | sort) > #{output_file}'"
-        run_system_command comm_cmd
-      end
-    end
-
-    private
-
-    # @return [String] path to hathifile derivative with suffix
-    def hathifile_derivative(suffix)
-      File.join(output_directory, Pathname.new(hathifile).basename.to_s) + "." + suffix
-    end
-
-    # Log a shellout and execute it
-    def run_system_command(cmd)
-      connection.logger.info cmd
-      system(cmd, exception: true)
-    end
-  end
-end
diff --git a/lib/hathifiles_database/version.rb b/lib/hathifiles_database/version.rb
index 653bba7..678c62c 100644
--- a/lib/hathifiles_database/version.rb
+++ b/lib/hathifiles_database/version.rb
@@ -1,3 +1,3 @@
 module HathifilesDatabase
-  VERSION = "0.4.1"
+  VERSION = "0.5.0"
 end
diff --git a/spec/connection_spec.rb b/spec/connection_spec.rb
index 29b0273..f195416 100644
--- a/spec/connection_spec.rb
+++ b/spec/connection_spec.rb
@@ -2,12 +2,12 @@
 require_relative "../lib/hathifiles_database/constants"
 
 RSpec.describe HathifilesDatabase::DB::Connection do
-  let(:conn) { described_class.new(ENV["HATHIFILES_MYSQL_CONNECTION"]) }
+  let(:conn) { described_class.new }
   let(:txt_datafile_path) { data_file_path "sample_10.txt" }
-  let(:gz_datafile_path) { data_file_path "sample_100.txt.gz" }
+  let(:gz_datafile_path) { data_file_path "hathi_full_20250101.txt.gz" }
 
   before(:all) do
-    described_class.new(ENV["HATHIFILES_MYSQL_CONNECTION"]).recreate_tables!
+    described_class.new.recreate_tables!
   end
 
   before(:each) do
diff --git a/spec/data/README.txt b/spec/data/README.txt
new file mode 100644
index 0000000..743735e
--- /dev/null
+++ b/spec/data/README.txt
@@ -0,0 +1,5 @@
+The full file is 100 sample records
+The update file is the same sample of 100 with the first 10 modified from "deny ic" to "allow pd"
+
+The sample_10 file is more manageable but should probably be renamed.
+And we don't really care about being able to load non-gzipped hathifiles (do we?)
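For context, a minimal usage sketch (illustrative, not part of the patch) of the connection API the specs above now exercise: `HathifilesDatabase.new` forwards its arguments to `DB::Connection#initialize`, which falls back to the `MARIADB_HATHIFILES_RW_*` environment variables for any keyword left unset.

```ruby
require "hathifiles_database"

# With MARIADB_HATHIFILES_RW_* set (as in docker-compose.yml), no arguments are needed:
connection = HathifilesDatabase.new

# Any credential can be overridden by keyword; these values mirror the
# development settings in docker-compose.yml:
connection = HathifilesDatabase.new(
  username: "ht_rights",
  password: "ht_rights",
  host: "mariadb",
  database: "ht"
)
```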
diff --git a/spec/data/sample_100.txt.gz b/spec/data/hathi_full_20250101.txt.gz
similarity index 100%
rename from spec/data/sample_100.txt.gz
rename to spec/data/hathi_full_20250101.txt.gz
diff --git a/spec/data/hathi_upd_20250101.txt.gz b/spec/data/hathi_upd_20250101.txt.gz
new file mode 100644
index 0000000..bdf610c
Binary files /dev/null and b/spec/data/hathi_upd_20250101.txt.gz differ
diff --git a/spec/delta_update_spec.rb b/spec/delta_update_spec.rb
new file mode 100644
index 0000000..31f9b8a
--- /dev/null
+++ b/spec/delta_update_spec.rb
@@ -0,0 +1,119 @@
+# frozen_string_literal: true
+
+require "tmpdir"
+
+TEST_HTID = "test.001"
+
+RSpec.describe HathifilesDatabase::DeltaUpdate do
+  around(:example) do |ex|
+    HathifilesDatabase::Constants::ALL_TABLES.each do |table|
+      conn.rawdb[table].delete
+    end
+    conn.rawdb[:hf].insert(htid: TEST_HTID)
+    Dir.mktmpdir do |dir|
+      @output_directory = dir
+      ex.run
+    end
+  end
+
+  def line_count(file)
+    `wc -l "#{file}"`.strip.split(" ")[0].to_i
+  end
+
+  let(:conn) { HathifilesDatabase.new }
+  let(:full_hathifile) { data_file_path "hathi_full_20250101.txt.gz" }
+  let(:upd_hathifile) { data_file_path "hathi_upd_20250101.txt.gz" }
+  let(:delta) { described_class.new(connection: conn, hathifile: full_hathifile, output_directory: @output_directory) }
+
+  describe "#current_dump" do
+    it "creates a readable file" do
+      expect(File.readable?(delta.current_dump)).to eq(true)
+    end
+
+    it "creates a file with the same number of rows as the hf table" do
+      expect(line_count(delta.current_dump)).to eq(conn.rawdb[:hf].count)
+    end
+  end
+
+  describe "#new_dump" do
+    it "creates a readable file" do
+      expect(File.readable?(delta.new_dump)).to eq(true)
+    end
+
+    it "creates a file with the same number of rows as the hathifile" do
+      hathifile_line_count = `zcat "#{full_hathifile}" | wc -l`.strip.split(" ")[0].to_i
+      expect(line_count(delta.new_dump)).to eq(hathifile_line_count)
+    end
+  end
+
+  describe "#all_changes" do
+    it "creates a readable file" do
+      expect(File.readable?(delta.all_changes)).to eq(true)
+    end
+
+    it "finds all entries in hathifile" do
+      added_lines = File.readlines(delta.all_changes).map(&:chomp)
+      expect(added_lines.count).to eq(100)
+    end
+
+    it "finds no entries in hathifile if it has already been loaded" do
+      conn.update_from_file full_hathifile
+      added_lines = File.readlines(delta.all_changes).map(&:chomp)
+      expect(added_lines.count).to eq(0)
+    end
+  end
+
+  describe "#deletions" do
+    context "with a full file" do
+      it "creates a readable file" do
+        expect(File.readable?(delta.deletions)).to eq(true)
+      end
+
+      it "finds only test entry not in hathifile" do
+        deleted_htids = File.readlines(delta.deletions).map(&:chomp)
+        expect(deleted_htids.count).to eq(1)
+        expect(deleted_htids[0]).to eq(TEST_HTID)
+      end
+    end
+
+    context "with an upd file" do
+      it "returns nil" do
+        upd_delta = described_class.new(connection: conn, hathifile: upd_hathifile, output_directory: @output_directory)
+        expect(upd_delta.deletions).to be_nil
+      end
+    end
+  end
+
+  describe "#run" do
+    it "runs to completion and writes a log entry" do
+      delta.run
+      expect(conn.rawdb[:hf_log].count).to eq 1
+      expect(conn.rawdb[:hf].count).to eq 100
+    end
+
+    it "has no effect when run a second time" do
+      delta.run
+      new_delta = described_class.new(connection: conn, hathifile: full_hathifile, output_directory: @output_directory)
+      new_delta.run
+      expected = {additions: 0, all_changes: 0, deletions: 0, hathifile_lines: 100, updates: 0}
+      expect(new_delta.statistics).to eq expected
+    end
+
+    it "loads only the 10 changed entries between the monthly and update" do
+      delta.run
+      new_delta = described_class.new(connection: conn, hathifile: upd_hathifile, output_directory: @output_directory)
+      new_delta.run
+      expected = {additions: 0, all_changes: 10, deletions: 0, hathifile_lines: 100, updates: 10}
+      expect(new_delta.statistics).to eq expected
+      expect(line_count(new_delta.all_changes)).to eq 10
+    end
+  end
+
+  describe "#statistics" do
+    it "returns a statistics hash with reasonable values" do
+      delta.run
+      expected = {additions: 100, all_changes: 100, deletions: 1, hathifile_lines: 100, updates: 0}
+      expect(delta.statistics).to eq expected
+    end
+  end
+end
diff --git a/spec/dumper_spec.rb b/spec/dumper_spec.rb
index 57cf893..1bc662a 100644
--- a/spec/dumper_spec.rb
+++ b/spec/dumper_spec.rb
@@ -5,11 +5,11 @@
 require_relative "../lib/hathifiles_database/dumper"
 
 RSpec.describe HathifilesDatabase::Dumper do
-  let(:conn) { HathifilesDatabase.new(ENV["HATHIFILES_MYSQL_CONNECTION"]) }
+  let(:conn) { HathifilesDatabase.new }
   let(:dumper) { described_class.new(conn) }
   let(:all_tables) { [HathifilesDatabase::Constants::MAINTABLE] + HathifilesDatabase::Constants::FOREIGN_TABLES.values }
   let(:txt_datafile_path) { data_file_path "sample_10.txt" }
-  let(:gz_datafile_path) { data_file_path "sample_100.txt.gz" }
+  let(:gz_datafile_path) { data_file_path "hathi_full_20250101.txt.gz" }
 
   before(:each) do
     all_tables.each do |table|
diff --git a/spec/hathifiles_database_spec.rb b/spec/hathifiles_database_spec.rb
index 8d8c1d9..0a25e7e 100644
--- a/spec/hathifiles_database_spec.rb
+++ b/spec/hathifiles_database_spec.rb
@@ -4,6 +4,6 @@
   end
 
   it "can create a connection" do
-    expect(HathifilesDatabase.new(ENV["HATHIFILES_MYSQL_CONNECTION"])).to be_a HathifilesDatabase::DB::Connection
+    expect(HathifilesDatabase.new).to be_a HathifilesDatabase::DB::Connection
   end
 end
diff --git a/spec/hathifiles_spec.rb b/spec/hathifiles_spec.rb
index 930d798..ebcaef0 100644
--- a/spec/hathifiles_spec.rb
+++ b/spec/hathifiles_spec.rb
@@ -13,7 +13,7 @@
     conn.rawdb[:hf_log].delete
   end
 
-  let(:conn) { HathifilesDatabase.new(ENV["HATHIFILES_MYSQL_CONNECTION"]) }
+  let(:conn) { HathifilesDatabase.new }
   let(:log) { HathifilesDatabase::Log.new(connection: conn) }
   let(:hathifiles) { described_class.new(hathifiles_directory: @output_directory, connection: conn) }
 
diff --git a/spec/log_spec.rb b/spec/log_spec.rb
index f35aff0..790e3e0 100644
--- a/spec/log_spec.rb
+++ b/spec/log_spec.rb
@@ -7,7 +7,7 @@
     conn.rawdb[:hf_log].delete
   end
 
-  let(:conn) { HathifilesDatabase.new(ENV["HATHIFILES_MYSQL_CONNECTION"]) }
+  let(:conn) { HathifilesDatabase.new }
   let(:log) { described_class.new(connection: conn) }
 
   describe ".new" do
diff --git a/spec/monthly_update_spec.rb b/spec/monthly_update_spec.rb
deleted file mode 100644
index 2f2ce4d..0000000
--- a/spec/monthly_update_spec.rb
+++ /dev/null
@@ -1,87 +0,0 @@
-# frozen_string_literal: true
-
-require "tmpdir"
-require_relative "../lib/hathifiles_database/monthly_update"
-
-TEST_HTID = "test.001"
-
-RSpec.describe HathifilesDatabase::MonthlyUpdate do
-  around(:example) do |ex|
-    HathifilesDatabase::Constants::ALL_TABLES.each do |table|
-      conn.rawdb[table].delete
-    end
-    conn.rawdb[:hf].insert(htid: TEST_HTID)
-    Dir.mktmpdir do |dir|
-      @output_directory = dir
-      ex.run
-    end
-  end
-
-  def word_count(file)
-    `wc -l "#{file}"`.strip.split(" ")[0].to_i
-  end
-
-  let(:conn) { HathifilesDatabase.new(ENV["HATHIFILES_MYSQL_CONNECTION"]) }
-  let(:hathifile) { data_file_path "sample_100.txt.gz" }
-  let(:monthly) { described_class.new(connection: conn, hathifile: hathifile, output_directory: @output_directory) }
-
-  describe "#current_dump" do
-    it "creates a readable file" do
-      expect(File.readable?(monthly.current_dump)).to eq(true)
-    end
-
-    it "creates a file with the same number of rows as the hf table" do
-      expect(word_count(monthly.current_dump)).to eq(conn.rawdb[:hf].count)
-    end
-  end
-
-  describe "#new_dump" do
-    it "creates a readable file" do
-      expect(File.readable?(monthly.new_dump)).to eq(true)
-    end
-
-    it "creates a file with the same number of rows as the hf table" do
-      hathifile_line_count = `zcat "#{hathifile}" | wc -l`.strip.split(" ")[0].to_i
-      expect(word_count(monthly.new_dump)).to eq(hathifile_line_count)
-    end
-  end
-
-  describe "#additions" do
-    it "creates a readable file" do
-      expect(File.readable?(monthly.additions)).to eq(true)
-    end
-
-    it "finds all entries in hathifile" do
-      added_lines = File.readlines(monthly.additions).map(&:chomp)
-      expect(added_lines.count).to eq(100)
-    end
-
-    it "finds no entries in hathifile if it has already been loaded" do
-      conn.update_from_file hathifile
-      added_lines = File.readlines(monthly.additions).map(&:chomp)
-      expect(added_lines.count).to eq(0)
-    end
-  end
-
-  describe "#deletions" do
-    it "creates a readable file" do
-      expect(File.readable?(monthly.deletions)).to eq(true)
-    end
-
-    it "finds only test entry not in hathifile" do
-      deleted_htids = File.readlines(monthly.deletions).map(&:chomp)
-      expect(deleted_htids.count).to eq(1)
-      expect(deleted_htids[0]).to eq(TEST_HTID)
-    end
-  end
-
-  describe "#run" do
-    it "runs to completion and writes a log entry" do
-      conn.rawdb.transaction do
-        monthly.run
-        expect(conn.rawdb[:hf_log].count).to eq 1
-        raise Sequel::Rollback
-      end
-    end
-  end
-end
diff --git a/spec/spec_helper.rb b/spec/spec_helper.rb
index 5f8d5b4..029161f 100644
--- a/spec/spec_helper.rb
+++ b/spec/spec_helper.rb
@@ -34,7 +34,7 @@
 # Verify that a file exists in the spec/data dir and return its path
 # @param [String] relative_path The relative_path within spec/data
 def data_file_path(relative_path)
-  path = SPEC_DATA_DIR + relative_path
+  path = File.join(SPEC_DATA_DIR, relative_path)
   raise "File #{relative_path} not found under #{SPEC_DATA_DIR}" unless File.exist?(path)
 
   path
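For reference, the delta computation in `DeltaUpdate` and the specs above rests on three `comm(1)` invocations over sorted files. A minimal sketch (file names here are illustrative; the class guarantees sortedness by piping every dump through `sort` first): `comm` emits lines unique to the first file, lines unique to the second, and lines common to both as columns 1 through 3, and the numeric flags suppress columns.

```ruby
# Column 2 only (-13 hides columns 1 and 3): lines unique to the new dump,
# i.e. rows to insert or update.
system("comm -13 hf_current.txt hf_new.txt > all_changes.txt", exception: true)

# Column 1 only (-23): HTIDs unique to the current table, i.e. rows to delete
# (full hathifiles only; skipped for upd files).
system("comm -23 hf_current_ids.txt hf_new_ids.txt > deletions.txt", exception: true)

# Column 3 only (-12): HTIDs present in both, i.e. changed rows, counted for statistics.
system("comm -12 hf_current_ids.txt all_changes_ids.txt > updates.txt", exception: true)
```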