diff --git a/project.clj b/project.clj index 6afad7a..eb46c64 100644 --- a/project.clj +++ b/project.clj @@ -1,4 +1,4 @@ -(defproject event-data-percolator "0.3.0" +(defproject event-data-percolator "0.3.1" :description "Event Data Percolator" :url "http://eventdata.crossref.org/" :license {:name "MIT License" @@ -6,7 +6,7 @@ :dependencies [[org.clojure/clojure "1.8.0"] [org.clojure/core.cache "0.6.5"] [org.clojure/core.async "0.2.395"] - [event-data-common "0.1.30"] + [event-data-common "0.1.32"] [enlive "1.1.6"] [org.clojure/core.memoize "0.5.8"] [commons-codec/commons-codec "1.10"] diff --git a/release.sh b/release.sh deleted file mode 100755 index 02ae128..0000000 --- a/release.sh +++ /dev/null @@ -1,7 +0,0 @@ -# To build and tag a version: - -: ${TAG:?"Need to set TAG for release"} - -docker build -f Dockerfile -t crossref/event-data-percolator:$TAG . - -docker push crossref/event-data-percolator:$TAG diff --git a/src/event_data_percolator/action.clj b/src/event_data_percolator/action.clj index ec1c160..1f461d8 100644 --- a/src/event_data_percolator/action.clj +++ b/src/event_data_percolator/action.clj @@ -26,7 +26,7 @@ "Take an Action, decorate with 'duplicate' field IFF there's an action ID. If there's duplicate information (a chunk of JSON representing a previous Evidence Record), associate it with the Action, otherwise pass it through. The store is updated with the values in the 'push' process." - [action evidence-record-id] + [context evidence-record action] (if-let [id (:id action)] (let [k (str "action/" id) @@ -42,20 +42,23 @@ (defn process-observations-candidates "Step Process all the observations of an Action to generate Candidates. Collect Candidates. If it's a duplicate action, the candidate extractor won't run." - [action domain-set web-trace-atom] + [context evidence-record action] (let [observations (:observations action) duplicate? (:duplicate action) - processed-observations (map #(observation/process-observation % duplicate? 
domain-set web-trace-atom) observations)] + processed-observations (map + #(observation/process-observation context % duplicate?) + observations)] (-> action (assoc :processed-observations processed-observations) (dissoc :observations)))) (defn match-candidates "Attempt to match all candidates into DOIs." - [action web-trace-atom] + [context evidence-record action] (let [matches (mapcat (fn [observation] - (map #(match/match-candidate % web-trace-atom) (:candidates observation))) + (map #(match/match-candidate context %) (:candidates observation))) (:processed-observations action)) + ; if the :match field wasn't set then it didn't work. Remove these. successful (filter :match matches)] (assoc action :matches successful))) @@ -80,7 +83,7 @@ (defn dedupe-matches "Deduplicate matches." - [action] + [context evidence-record action] (let [matches (:matches action) ; Dedupe by the :match field (i.e. the matched DOI). @@ -142,7 +145,7 @@ (defn create-events-for-action "Update action to include a seq of Events generated from observations in the Action. Plus extra-events if included, and if there were any matches." - [evidence-record action] + [context evidence-record action] (let [events-from-matches (map (partial create-event-from-match evidence-record action) (:matches action)) events-from-extras (when (not-empty (:matches action)) (map (partial create-event-from-extra-event evidence-record) (:extra-events action))) diff --git a/src/event_data_percolator/evidence_record.clj b/src/event_data_percolator/evidence_record.clj index 450c2f0..63296d4 100644 --- a/src/event_data_percolator/evidence_record.clj +++ b/src/event_data_percolator/evidence_record.clj @@ -68,74 +68,69 @@ (catch RuntimeException e e))) (defn map-actions - "Map over actions within an Input Evidence Record, leaving the rest intact." - [f evidence-record] + "Map over actions within an Input Evidence Record, leaving the rest intact. 
+ call (f context evidence-record action)" + [context f evidence-record] (assoc evidence-record :pages (map (fn [page] (assoc page - :actions (map f (:actions page)))) (:pages evidence-record)))) + :actions (map #(f context evidence-record %) (:actions page)))) (:pages evidence-record)))) (defn url "Associate a URL based on the ID." - [evidence-record] + [context evidence-record] (assoc evidence-record :url (generate-url (:id evidence-record)))) (defn dedupe-actions "Dedupe actions in an input Evidence Record." - [evidence-record] + [context evidence-record] (log/debug "Deduping in " (:id evidence-record)) - (map-actions #(action/dedupe-action % (:id evidence-record)) evidence-record)) + (map-actions context action/dedupe-action evidence-record)) (defn candidates "Produce candidates in input evidence-record." - [evidence-record domain-set web-trace-atom] + [context evidence-record] (log/debug "Candidates in " (:id evidence-record)) - (map-actions #(action/process-observations-candidates % domain-set web-trace-atom) evidence-record)) + (map-actions context action/process-observations-candidates evidence-record)) (defn match "Match candidates in input evidence-record." - [evidence-record web-trace-atom] + [context evidence-record] (log/debug "Match in " (:id evidence-record)) - (map-actions #(action/match-candidates % web-trace-atom) evidence-record)) + (map-actions context action/match-candidates evidence-record)) (defn dedupe-matches "Dedupe matches WITHIN an action." - [evidence-record] + [context evidence-record] (log/debug "Dedupe in " (:id evidence-record)) - (map-actions action/dedupe-matches evidence-record)) + (map-actions context action/dedupe-matches evidence-record)) (defn events "Generate an Event for each candidate match, update extra Events." 
- [evidence-record] + [context evidence-record] (log/debug "Events in " (:id evidence-record)) - (->> evidence-record - (map-actions (partial action/create-events-for-action evidence-record)))) + (map-actions context action/create-events-for-action evidence-record)) (def percolator-version (System/getProperty "event-data-percolator.version")) (defn process - [evidence-record domain-artifact-version domain-set] - ; an atom that's passed around to functions that might want to log which URLs they access - ; and their respose codes. - (let [web-trace-atom (atom []) - result (-> - evidence-record - url - dedupe-actions - (candidates domain-set web-trace-atom) - (match web-trace-atom) - dedupe-matches - (assoc-in [:percolator :artifacts :domain-set-artifact-version] domain-artifact-version) - (assoc-in [:percolator :software-version] percolator-version) - events) + [context evidence-record] + (let [result (->> + evidence-record + (url context) + (dedupe-actions context) + (candidates context) + (match context) + (dedupe-matches context) + (#(assoc-in % [:percolator :artifacts :domain-set-artifact-version] (:domain-list-artifact-version context))) + (#(assoc-in % [:percolator :software-version] percolator-version)) + (events context)) + ; There are lazy sequences in here. Force the entire structure to be realized. - ; This is necessary because the web-trace-atom's value is observed at this point, - ; so we need to be confident that everything that's going to happen, has happened. - realized (clojure.walk/postwalk identity result) - with-trace (assoc realized :web-trace @web-trace-atom)] - (log/debug "Finished processing" (:id with-trace)) - with-trace)) + realized (clojure.walk/postwalk identity result)] + (log/debug "Finished processing" (:id realized)) + realized)) (defn extract-all-events "Extract all events for pushing downstream." 
diff --git a/src/event_data_percolator/match.clj b/src/event_data_percolator/match.clj index 7235ea7..98cd0ee 100644 --- a/src/event_data_percolator/match.clj +++ b/src/event_data_percolator/match.clj @@ -19,13 +19,14 @@ :landing-page-url landing-page-url/match-landing-page-url-candidate}) (defn match-unrecognised-type - [candidate] + [context candidate] (assoc candidate :match nil :error :unrecognised-candidate-type)) (defn match-candidate - [candidate web-trace-atom] + [context candidate] (let [f (candidate-processors (:type candidate) match-unrecognised-type)] - (f candidate web-trace-atom))) + (f context candidate))) + diff --git a/src/event_data_percolator/matchers/doi_url.clj b/src/event_data_percolator/matchers/doi_url.clj index a949d84..56b937c 100644 --- a/src/event_data_percolator/matchers/doi_url.clj +++ b/src/event_data_percolator/matchers/doi_url.clj @@ -4,7 +4,7 @@ [event-data-percolator.matchers.plain-doi :as plain-doi])) (defn match-doi-url-candidate - [candidate web-trace-atom] + [context candidate] (assoc candidate - :match (-> candidate :value crdoi/non-url-doi plain-doi/match-plain-doi))) + :match (->> candidate :value crdoi/non-url-doi (plain-doi/match-plain-doi context)))) diff --git a/src/event_data_percolator/matchers/landing_page_url.clj b/src/event_data_percolator/matchers/landing_page_url.clj index 73b6a67..9b01372 100644 --- a/src/event_data_percolator/matchers/landing_page_url.clj +++ b/src/event_data_percolator/matchers/landing_page_url.clj @@ -22,11 +22,11 @@ (defn try-from-get-params "If there's a DOI in a get parameter of a URL, find it" - [url] + [context url] (try (let [params (-> url cemerick-url/query->map clojure.walk/keywordize-keys) doi-like-values (keep (fn [[k v]] (when (re-matches whole-doi-re v) v)) params) - extant (keep doi/validate-cached doi-like-values)] + extant (keep (partial doi/validate-cached context) doi-like-values)] (-> extant first normalize-doi-if-exists)) ; Some things look like URLs but turn out not to 
be. @@ -34,7 +34,7 @@ (defn try-doi-from-url-text "Match an embedded DOI, try various treatments to make it fit." - [url] + [context url] (let [matches (map second (re-seq doi-re url)) last-slash (map #(clojure.string/replace % #"^(10\.\d+/(.*))/.*$" "$1") matches) @@ -56,15 +56,15 @@ candidates (distinct (concat last-slash first-slash semicolon hashchar question-mark amp-mark)) - extant (keep doi/validate-cached candidates)] + extant (keep (partial doi/validate-cached context) candidates)] (-> extant first normalize-doi-if-exists))) (defn try-pii-from-url-text - [url] + [context url] (->> url pii/find-candidate-piis - (map (comp pii/validate-pii :value)) + (map (comp (partial pii/validate-pii context) :value)) first)) (def interested-tag-attrs @@ -90,13 +90,12 @@ (def interested-tag-text "List of selectors whose text content we're interested in." - [ - ; e.g. http://jnci.oxfordjournals.org/content/108/6/djw160.full + [; e.g. http://jnci.oxfordjournals.org/content/108/6/djw160.full "span.slug-doi"]) (defn try-fetched-page-metadata-content "Extract DOI from Metadata tags." - [text] + [context text] (try (when text (let [document (Jsoup/parse text) @@ -118,7 +117,7 @@ interested-values (distinct (concat interested-attr-values interested-text-values)) ; Try to normalize by removing recognised prefixes, then resolve - extant (keep (comp doi/validate-cached crdoi/non-url-doi) interested-values)] + extant (keep (comp (partial doi/validate-cached context) crdoi/non-url-doi) interested-values)] (-> extant first normalize-doi-if-exists))) ; We're getting text from anywhere. Anything could happen. 
@@ -128,8 +127,8 @@ nil)))) (defn try-fetched-page-metadata - [url web-trace-atom] - (-> url (web/fetch-respecting-robots web-trace-atom) :body try-fetched-page-metadata-content)) + [context url] + (->> url (web/fetch-respecting-robots context) :body (try-fetched-page-metadata-content context))) (def redis-db-number (delay (Integer/parseInt (get env :landing-page-cache-redis-db "0")))) @@ -151,14 +150,14 @@ ; This one function is responsible for all outgoing web traffic. Cache its results. ; Other results are derived algorithmically, so there's no use caching those. (defn try-fetched-page-metadata-cached - [url web-trace-atom] + [context url] (if skip-cache - (try-fetched-page-metadata url web-trace-atom) + (try-fetched-page-metadata context url) (if-let [cached-result (store/get-string @redis-cache-store url)] (if (= cached-result "NULL") nil cached-result) - (if-let [result (try-fetched-page-metadata url web-trace-atom)] + (if-let [result (try-fetched-page-metadata context url)] ; success (do (redis/set-string-and-expiry-seconds @redis-cache-store url @success-expiry-seconds result) @@ -177,15 +176,15 @@ (defn match-landing-page-url "Try a multitude of ways to match, cheapest first." - [url web-trace-atom] + [context url] ; Step through lazy seq, an item at a time. 
(or - (try-from-get-params url) - (try-doi-from-url-text url) - (try-pii-from-url-text url) - (try-fetched-page-metadata-cached url web-trace-atom))) + (try-from-get-params context url) + (try-doi-from-url-text context url) + (try-pii-from-url-text context url) + (try-fetched-page-metadata-cached context url))) (defn match-landing-page-url-candidate - [candidate web-trace-atom] + [context candidate] (assoc candidate - :match (match-landing-page-url (:value candidate) web-trace-atom))) + :match (match-landing-page-url context (:value candidate)))) diff --git a/src/event_data_percolator/matchers/pii.clj b/src/event_data_percolator/matchers/pii.clj index a4e500b..5d17fc5 100644 --- a/src/event_data_percolator/matchers/pii.clj +++ b/src/event_data_percolator/matchers/pii.clj @@ -3,6 +3,6 @@ (:require [event-data-percolator.util.pii :as pii])) (defn match-pii-candidate - [candidate web-trace-atom] + [context candidate] (assoc candidate - :match (pii/validate-pii (:value candidate)))) + :match (pii/validate-pii context (:value candidate)))) diff --git a/src/event_data_percolator/matchers/plain_doi.clj b/src/event_data_percolator/matchers/plain_doi.clj index 8f22c4c..aeba80a 100644 --- a/src/event_data_percolator/matchers/plain_doi.clj +++ b/src/event_data_percolator/matchers/plain_doi.clj @@ -5,11 +5,11 @@ (defn match-plain-doi "Return a canonical DOI if this is a valid, extant DOI." 
- [plain-doi] - (when-let [validated (doi/validate-cached plain-doi)] + [context plain-doi] + (when-let [validated (doi/validate-cached context plain-doi)] (crdoi/normalise-doi validated))) (defn match-plain-doi-candidate - [candidate web-trace-atom] + [context candidate] (assoc candidate - :match (match-plain-doi (:value candidate)))) + :match (match-plain-doi context (:value candidate)))) diff --git a/src/event_data_percolator/matchers/shortdoi_url.clj b/src/event_data_percolator/matchers/shortdoi_url.clj index d07f4e4..4d175fa 100644 --- a/src/event_data_percolator/matchers/shortdoi_url.clj +++ b/src/event_data_percolator/matchers/shortdoi_url.clj @@ -5,18 +5,18 @@ (defn match-shortdoi-url "Return a canonical DOI if this is a valid, extant Short DOI." - [short-doi-url] + [context short-doi-url] (let [valid-url (try (new URL short-doi-url) (catch Exception _ nil)) shortdoi-path (when valid-url (let [the-path (.getPath valid-url)] ; Drop leading slash, unless there isn't a path. (when-not (clojure.string/blank? the-path) (.substring the-path 1)))) - validated (doi/validate-cached shortdoi-path)] + validated (doi/validate-cached context shortdoi-path)] (when validated (crdoi/normalise-doi validated)))) (defn match-shortdoi-url-candidate - [candidate web-trace-atom] + [context candidate] (assoc candidate - :match (match-shortdoi-url (:value candidate)))) + :match (match-shortdoi-url context (:value candidate)))) diff --git a/src/event_data_percolator/observation.clj b/src/event_data_percolator/observation.clj index fb3ac35..428ecb2 100644 --- a/src/event_data_percolator/observation.clj +++ b/src/event_data_percolator/observation.clj @@ -38,23 +38,25 @@ (defn unrecognised-observation-type "An observation processor for unrecognised types. Just pass through and set unrecognised flag." 
- [observation domain-list web-trace-atom] + [context observation] (assoc observation :error :unrecognised-observation-type)) (defn process-observation "Process an observation, extracting candidates unless it's part of a duplicate action." - [observation duplicate? domain-set web-trace-atom] + [context observation duplicate?] ; Choose a dispatch function or pass-through if unrecognised-observation-type. (let [sensitive? (:sensitive observation) typ (:type observation) + ; How to process this? If it's a duplicate, pass through and don't do anything. ; Otherwise choose the right processing function, or 'unrecognised'. f (if duplicate? - (fn [observation _ _] observation) + (fn [_ observation] observation) (process-types typ unrecognised-observation-type)) - processed (f observation domain-set web-trace-atom) + processed (f context observation) ; Now assign common hash, and remove sensitive info if required. result (postflight-process processed sensitive?)] + result)) diff --git a/src/event_data_percolator/observation_types/content_url.clj b/src/event_data_percolator/observation_types/content_url.clj index d470936..b0642ed 100644 --- a/src/event_data_percolator/observation_types/content_url.clj +++ b/src/event_data_percolator/observation_types/content_url.clj @@ -1,7 +1,8 @@ (ns event-data-percolator.observation-types.content-url "Extract unlinked DOIs, unlinked URLs and linked URLs (including DOIs) from HTML document at given URL." (:require [event-data-percolator.observation-types.html :as html] - [event-data-percolator.util.web :as web]) + [event-data-percolator.util.web :as web] + [event-data-common.evidence-log :as evidence-log]) (:import [org.jsoup Jsoup] [org.apache.commons.codec.digest DigestUtils] [java.net URL])) @@ -23,21 +24,28 @@ (landing-page-domain-set (landing-page-domain-set domain)))) (defn process-content-url-observation - [observation landing-page-domain-set web-trace-atom] + [context observation] (let [input (:input-url observation "") valid? 
(url-valid? input) - domain-allowed (not (url-is-landing-page landing-page-domain-set input)) + domain-allowed (not (url-is-landing-page (:domain-set context) input)) proceed (and valid? domain-allowed) ; The :ignore-robots flag is passed in by Agents that have specific exemptions. ; E.g. Wikipedia sites' API is excluded for general-purpose robots but allowed for our uses. content (when proceed (if (:ignore-robots observation) - (web/fetch-ignoring-robots input web-trace-atom) - (web/fetch-respecting-robots input web-trace-atom)))] + (web/fetch-ignoring-robots context input) + (web/fetch-respecting-robots context input)))] - (when-let [newsfeed-links (html/newsfeed-links-from-html (:body content) input)] - (when web-trace-atom - (swap! web-trace-atom concat (map (fn [link] {:url link :type :newsfeed-url}) newsfeed-links)))) + (doseq [newsfeed-link (html/newsfeed-links-from-html (:body content) input)] + (evidence-log/log! { + ; Service + :s "percolator" + ; Component + :c "newsfeed-link" + ; Evidence Record ID + :r (:id context) + ; URL + :u newsfeed-link})) (if-not domain-allowed (assoc observation :error :skipped-domain) @@ -45,5 +53,5 @@ (assoc observation :error :failed-fetch-url) (let [; Attach content then pass the thing to the HTML processor for heavy lifting. new-observation (assoc observation :input-content (:body content)) - html-observations (html/process-html-content-observation new-observation landing-page-domain-set web-trace-atom)] + html-observations (html/process-html-content-observation context new-observation)] html-observations))))) diff --git a/src/event_data_percolator/observation_types/html.clj b/src/event_data_percolator/observation_types/html.clj index dc99a8c..0927ed7 100644 --- a/src/event_data_percolator/observation_types/html.clj +++ b/src/event_data_percolator/observation_types/html.clj @@ -62,17 +62,20 @@ (defn process-html-content-observation "Process an observation of type html-content." 
- [observation landing-page-domain-set web-trace-atom] + [context observation] (let [input (:input-content observation "") candidate-urls (links-from-html input) text (plaintext-from-html input) ; Get all the candidates from the plaintext view. - plaintext-candidates (:candidates (plaintext/process-plaintext-content-observation (assoc observation :input-content text) landing-page-domain-set web-trace-atom)) + plaintext-candidates (:candidates (plaintext/process-plaintext-content-observation + context + (assoc observation :input-content text))) ; Then merge new candidates. candidates (concat plaintext-candidates - (keep #(url/url-to-landing-page-url-candidate % landing-page-domain-set) candidate-urls) + (keep #(url/url-to-landing-page-url-candidate % (:domain-set context)) candidate-urls) (keep url/url-to-doi-url-candidate candidate-urls))] + (assoc observation :candidates candidates))) diff --git a/src/event_data_percolator/observation_types/plaintext.clj b/src/event_data_percolator/observation_types/plaintext.clj index d5701be..5806cfb 100644 --- a/src/event_data_percolator/observation_types/plaintext.clj +++ b/src/event_data_percolator/observation_types/plaintext.clj @@ -40,12 +40,13 @@ (defn process-plaintext-content-observation "Process an observation of type plaintext-content." 
- [observation landing-page-domain-set web-trace-atom] + [context observation] (let [input (:input-content observation "") possible-urls (possible-urls-from-text input) candidates (concat (candidate-dois-from-text input) (candidate-piis-from-text input) (keep url/url-to-doi-url-candidate possible-urls) - (keep #(url/url-to-landing-page-url-candidate % landing-page-domain-set) possible-urls))] + (keep #(url/url-to-landing-page-url-candidate % (:domain-set context)) possible-urls))] + (assoc observation :candidates candidates))) diff --git a/src/event_data_percolator/observation_types/url.clj b/src/event_data_percolator/observation_types/url.clj index 7a7e1eb..b44a71c 100644 --- a/src/event_data_percolator/observation_types/url.clj +++ b/src/event_data_percolator/observation_types/url.clj @@ -39,10 +39,11 @@ (defn process-url-observation "Process a url observation into a candidate url. Check if valid and if on the domain list." - [observation landing-page-domain-set web-trace-atom] + [context observation] (let [input (:input-url observation "") ; single input input, but candidate responses are always lists. candidates (remove nil? 
[(url-to-doi-url-candidate input) - (url-to-landing-page-url-candidate input landing-page-domain-set)])] + (url-to-landing-page-url-candidate input (:domain-set context))])] + (assoc observation :candidates candidates))) diff --git a/src/event_data_percolator/process.clj b/src/event_data_percolator/process.clj index 1f94876..d572dec 100644 --- a/src/event_data_percolator/process.clj +++ b/src/event_data_percolator/process.clj @@ -13,7 +13,7 @@ [clojure.core.memoize :as memo] [clojure.data.json :as json] [org.httpkit.client :as client] - [event-data-common.status :as status] + [event-data-common.evidence-log :as evidence-log] [robert.bruce :refer [try-try-again]]) (:import [org.apache.kafka.clients.producer KafkaProducer Producer ProducerRecord] [org.apache.kafka.clients.consumer KafkaConsumer Consumer ConsumerRecords] @@ -21,24 +21,24 @@ (def domain-list-artifact-name "crossref-domain-list") -(defn retrieve-domain-list +(defn retrieve-domain-set "Return tuple of [version-url, domain-list-set]" [] (log/info "Retrieving domain list artifact") - (status/send! "percolator" "artifact" "fetch" -1 1 "domain-list") + ; Fetch the cached copy of the domain list. (let [domain-list-artifact-version (artifact/fetch-latest-version-link domain-list-artifact-name) ; ~ 5KB string, set of ~ 8000 - domain-list (-> domain-list-artifact-name artifact/fetch-latest-artifact-stream clojure.java.io/reader line-seq set)] - [domain-list-artifact-version domain-list])) + domain-set (-> domain-list-artifact-name artifact/fetch-latest-artifact-stream clojure.java.io/reader line-seq set)] + [domain-list-artifact-version domain-set])) (def cache-milliseconds "One hour" 3600000) -(def cached-domain-list +(def cached-domain-set "Cache the domain list and version url. It's very rarely actually updated." 
- (memo/ttl retrieve-domain-list {} :ttl/threshold cache-milliseconds)) + (memo/ttl retrieve-domain-set {} :ttl/threshold cache-milliseconds)) (def evidence-store (delay @@ -59,13 +59,12 @@ (def kafka-producer (delay - (let [properties (java.util.Properties.)] - (.put properties "bootstrap.servers" (:global-kafka-bootstrap-servers env)) - (.put properties "acks", "all") - (.put properties "retries", (int 5)) - (.put properties "key.serializer", "org.apache.kafka.common.serialization.StringSerializer") - (.put properties "value.serializer", "org.apache.kafka.common.serialization.StringSerializer") - (KafkaProducer. properties)))) + (KafkaProducer. { + "bootstrap.servers" (:global-kafka-bootstrap-servers env) + "acks" "all" + "retries" (int 5) + "key.serializer" "org.apache.kafka.common.serialization.StringSerializer" + "value.serializer" "org.apache.kafka.common.serialization.StringSerializer"}))) (defn storage-key-for-evidence-record-id [id] @@ -103,10 +102,18 @@ [evidence-record-input] (log/info "Processing" (:id evidence-record-input)) (let [id (:id evidence-record-input) - [domain-list-artifact-version domain-list] (cached-domain-list) + [domain-list-artifact-version domain-set] (cached-domain-set) + + ; Execution context for all processing involved in processing this Evidence Record. + ; This context is passed to all functions that need it. + ; Don't include the input evidence record, as it's modified in a few steps. + ; Keeping the original version around could be confusing. + context {:id id + :domain-set domain-set + :domain-list-artifact-version domain-list-artifact-version} ; Actually do the work of processing an Evidence Record. - evidence-record-processed (evidence-record/process evidence-record-input domain-list-artifact-version domain-list) + evidence-record-processed (evidence-record/process context evidence-record-input) ; Remove the JWT before saving as a public record. 
public-evidence-record (dissoc evidence-record-processed :jwt) @@ -129,7 +136,11 @@ (.send @kafka-producer (ProducerRecord. topic (:id event) - (json/write-str (assoc event :jwt jwt))))) + (json/write-str (assoc event :jwt jwt)))) + + (evidence-log/log! { + :s "percolator" :c "event" :f "send" + :r (:id context) :n (:id event)})) (log/info "Finished saving" id))) @@ -157,53 +168,80 @@ (defn process-kafka-inputs "Process an input stream from Kafka in this thread." [] - (let [properties (java.util.Properties.)] - (.put properties "bootstrap.servers" (:global-kafka-bootstrap-servers env)) - (.put properties "group.id" "percolator-process") - (.put properties "key.deserializer", "org.apache.kafka.common.serialization.StringDeserializer") - (.put properties "value.deserializer", "org.apache.kafka.common.serialization.StringDeserializer") - - ; This is only used in the absence of an existing marker for the group. - (.put properties "auto.offset.reset" "earliest") - - (let [consumer (KafkaConsumer. properties) - topic-name (:percolator-input-evidence-record-topic env)] - (log/info "Subscribing to" topic-name) - (.subscribe consumer (list topic-name)) - (log/info "Subscribed to" topic-name "got" (count (or (.assignment consumer) [])) "assigned partitions") - (loop [] - (log/info "Polling...") - (let [^ConsumerRecords records (.poll consumer (int 10000)) - lag (lag-for-assigned-partitions consumer) - c (atom 0)] - (log/info "Lag for partitions:" lag) - - (doseq [[topic-number topic-lag] lag] - (status/send! "percolator" "input-queue" "lag" topic-number topic-lag)) - - (log/info "Got" (.count records) "records." (.hashCode records)) - (doseq [^ConsumerRecords record records] - (swap! c inc) - - (log/info "Start processing:" (.key record) "size:" (.serializedValueSize record) @c "/" (.count records)) - - (status/send! "percolator" "input-queue" "message-size" (.serializedValueSize record)) - (status/send! 
"percolator" "input-queue" "time-lag" (- (System/currentTimeMillis) (.timestamp record))) - - (let [value (.value record) - evidence-record (json/read-str value :key-fn keyword) - schema-errors (evidence-record/validation-errors evidence-record)] - (log/info "Look at" (:id evidence-record)) - (if schema-errors - (log/error "Schema errors with input Evidence Record id" (:id evidence-record) schema-errors) - (duplicate-guard - (json/read-str (.value record) :key-fn keyword) - process-and-save))) - (log/info "Finished processing record" (.key record))) - - (log/info "Finished processing records" (.count records) "records." (.hashCode records)) - ; The only way this ends is violently. - (recur)))))) + (let [consumer (KafkaConsumer. + {"bootstrap.servers" (:global-kafka-bootstrap-servers env) + "group.id" "percolator-process" + "key.deserializer", "org.apache.kafka.common.serialization.StringDeserializer" + "value.deserializer", "org.apache.kafka.common.serialization.StringDeserializer" + "auto.offset.reset" "earliest"}) + + topic-name (:percolator-input-evidence-record-topic env)] + + (log/info "Subscribing to" topic-name) + (.subscribe consumer (list topic-name)) + + (log/info "Subscribed to" topic-name "got" (count (or (.assignment consumer) [])) "assigned partitions") + (loop [] + (log/info "Polling...") + (let [^ConsumerRecords records (.poll consumer (int 10000)) + lag (lag-for-assigned-partitions consumer) + c (atom 0)] + (log/info "Lag for partitions:" lag) + + (doseq [[partition-number partition-lag] lag] + (evidence-log/log! { + ; Service + :s "percolator" + ; Component + :c "process" + ; Facet + :f "input-message-lag" + ; Partition + :p partition-number + ; Value + :v partition-lag})) + + (log/info "Got" (.count records) "records." (.hashCode records)) + (doseq [^ConsumerRecords record records] + (swap! c inc) + + (log/info "Start processing:" (.key record) "size:" (.serializedValueSize record) @c "/" (.count records)) + + (evidence-log/log! 
{ + :s "percolator" :c "process" :f "input-message-size" + :v (.serializedValueSize record)}) + + (evidence-log/log! { + :s "percolator" :c "process" :f "input-message-time-lag" + :v (- (System/currentTimeMillis) (.timestamp record))}) + + (let [value (.value record) + evidence-record (json/read-str value :key-fn keyword) + schema-errors (evidence-record/validation-errors evidence-record)] + (log/info "Look at" (:id evidence-record)) + + (evidence-log/log! { + :s "percolator" :c "process" :f "start" + ; Evidence Record ID + :r (:id evidence-record)}) + + (if schema-errors + (log/error "Schema errors with input Evidence Record id" (:id evidence-record) schema-errors) + (duplicate-guard + (json/read-str (.value record) :key-fn keyword) + + ; This is where all the work happens! + process-and-save)) + + (log/info "Finished processing record" (.key record)) + + (evidence-log/log! { + :s "percolator" :c "process" :f "finish" + :r (:id evidence-record)}))) + + (log/info "Finished processing records" (.count records) "records." (.hashCode records))) + ; The only way this ends is violently. + (recur)))) (defn process-kafka-inputs-concurrently "Run a number of threads to process inputs." diff --git a/src/event_data_percolator/util/doi.clj b/src/event_data_percolator/util/doi.clj index 1517503..9032859 100644 --- a/src/event_data_percolator/util/doi.clj +++ b/src/event_data_percolator/util/doi.clj @@ -8,7 +8,7 @@ [event-data-common.storage.store :as store] [event-data-common.storage.redis :as redis] [clojure.tools.logging :as log] - [event-data-common.status :as status] + [event-data-common.evidence-log :as evidence-log] [clojure.data.json :as json])) (def doi-re #"(10\.\d{4,9}/[^\s]+)") @@ -22,44 +22,52 @@ (defn resolve-doi "Resolve and validate a DOI or ShortDOI, expressed as not-URL form. May or may not be URLEscaped. Return the DOI." - [doi] + [context doi] - (status/send! 
"percolator" "doi-api" "request" nil 1 doi) (let [is-short-doi (not (re-matches #"^10\.\d+/.*" doi)) + ; if it looks like a full DOI, look that up. It it looks like a handle, different syntax. input-handle (if is-short-doi (str "10/" doi) doi) + response @(try-try-again {:sleep 5000 :tries 2} #(http/get (str "https://doi.org/api/handles/" input-handle) {:as :text})) + status (:status response) body (when (= 200 status) (-> response :body (json/read-str :key-fn keyword))) + ; Either get the validated handle, or for a short DOI, the DOI it's aliased to. handle (when body - (status/send! "percolator" "doi-api" "match" nil 1 doi) + (evidence-log/log! { + :s "percolator" :c "resolve-doi" :f "success" + :r (:id context) :d doi}) + (if is-short-doi (->> body :values (filter #(= (:type %) "HS_ALIAS")) first :data :value) (:handle body)))] (when-not body - (status/send! "percolator" "doi-api" "no-match" nil 1 doi)) + (evidence-log/log! { + :s "percolator" :c "resolve-doi" :f "failure" + :r (:id context) :d doi})) handle)) (defn resolve-doi-maybe-escaped "Try to resolve a possibly URL-encoded DOI. If it can be decoded and still resolve, return it decoded." - [original] + [context original] (if ; %2F is the URL-encoded slash which is present in every DOI. (and original (re-find #"%2[Ff]" original)) (let [decoded (try (URLDecoder/decode original "UTF-8") (catch java.lang.IllegalArgumentException _ nil))] - (if-let [resolved (when decoded (resolve-doi decoded))] + (if-let [resolved (when decoded (resolve-doi context decoded))] resolved - (resolve-doi original))) - (resolve-doi original))) + (resolve-doi context original))) + (resolve-doi context original))) (defn drop-right-char "Drop a character from the right of a string. @@ -77,7 +85,7 @@ (def max-drops 5) (defn validate-doi-dropping "For a given suspected DOI or shortDOI, validate that it exists, possibly chopping some of the end off to get there." 
- [doi] + [context doi] (loop [i 0 doi doi] ; Terminate if we're at the end of clipping things off or the DOI no longer looks like an DOI. @@ -90,7 +98,7 @@ ; Stop recursion. nil ; Or try this substring. - (if-let [clean-doi (resolve-doi-maybe-escaped doi)] + (if-let [clean-doi (resolve-doi-maybe-escaped context doi)] ; resolve-doi may alter the DOI it returns, e.g. resolving a shortDOI to a real DOI or lower-casing. ; We have a working DOI! @@ -98,7 +106,7 @@ ; If there is a question mark, try removing it to see if it still works. (if (.contains clean-doi "?") (let [before-q (first (.split clean-doi "\\?"))] - (if (resolve-doi before-q) + (if (resolve-doi context before-q) before-q clean-doi)) clean-doi) @@ -125,14 +133,14 @@ (defn validate-cached "Take a suspected DOI or ShortDOI and return the correct full well-formed, extant DOI. This is the function you want." - [suspected-doi] + [context suspected-doi] (if skip-cache - (validate-doi-dropping suspected-doi) + (validate-doi-dropping context suspected-doi) (if-let [cached-result (store/get-string @redis-cache-store suspected-doi)] (if (= cached-result "NULL") nil cached-result) - (if-let [result (validate-doi-dropping suspected-doi)] + (if-let [result (validate-doi-dropping context suspected-doi)] ; success (do (redis/set-string-and-expiry-seconds @redis-cache-store suspected-doi @success-expiry-seconds result) diff --git a/src/event_data_percolator/util/pii.clj b/src/event_data_percolator/util/pii.clj index 196df83..e355122 100644 --- a/src/event_data_percolator/util/pii.clj +++ b/src/event_data_percolator/util/pii.clj @@ -6,7 +6,7 @@ [crossref.util.doi :as crdoi] [robert.bruce :refer [try-try-again]] [clojure.tools.logging :as log] - [event-data-common.status :as status] + [event-data-common.evidence-log :as evidence-log] [clojure.data.json :as json])) (def pii-re #"([SB][0-9XB\-]{16,20})") @@ -22,7 +22,7 @@ (defn validate-pii "Validate a PII and return the DOI if it's been used as an alternative ID." 
- [pii] + [context pii] (when-not (clojure.string/blank? pii) (let [result (try (try-try-again {:sleep 5000 :tries 2} @@ -37,21 +37,25 @@ nil))) items (get-in result ["message" "items"])] - - (status/send! "percolator" "metadata-api" "request") (when-not result (log/error "Failed to retrieve PII from API for" pii) - (status/send! "percolator" "metadata-api" "fail")) + (evidence-log/log! { + :s "percolator" :c "lookup-pii" :f "failed" + :r (:id context) + :v pii})) (when result - (status/send! "percolator" "metadata-api" "ok")) + (evidence-log/log! { + :s "percolator" :c "lookup-pii" :f "success" + :r (:id context) + :v pii})) ; Only return when there's exactly one match. ; If so, check that the DOI exists and in the process normalize (don't trust the API's indexed data). (when (= 1 (count items)) (let [possible-doi (get (first items) "DOI") - extant-doi (doi/validate-cached possible-doi)] + extant-doi (doi/validate-cached context possible-doi)] (when extant-doi (crdoi/normalise-doi extant-doi))))))) diff --git a/src/event_data_percolator/util/web.clj b/src/event_data_percolator/util/web.clj index 8f1acd2..0849c59 100644 --- a/src/event_data_percolator/util/web.clj +++ b/src/event_data_percolator/util/web.clj @@ -7,7 +7,7 @@ [event-data-common.storage.store :as store] [org.httpkit.client :as client] [clojure.core.memoize :as memo] - [event-data-common.status :as status] + [event-data-common.evidence-log :as evidence-log] [event-data-percolator.consts]) (:import [java.net URL] [crawlercommons.robots SimpleRobotRulesParser BaseRobotRules] @@ -20,63 +20,97 @@ (defn fetch "Fetch the content at a URL as a string, following redirects and accepting cookies. Take an optional atom to which sequences of urls and status codes will be appended." - ([url] (fetch url nil)) - ([url trace-atom] - (status/send! 
"percolator" "web-fetch" "request" nil 1 url) - (try - (loop [headers {"Referer" "https://eventdata.crossref.org" - "User-Agent" event-data-percolator.consts/user-agent-for-robots} - depth 0 - url url] - (if (> depth redirect-depth) - nil - (let [result @(http/get url {:follow-redirects false :headers headers :as :text :throw-exceptions true}) - error (:error result) - cookie (-> result :headers :set-cookie) - new-headers (merge headers (when cookie {"Cookie" cookie}))] - - - ; Trace. Two kinds of exception handling, the returned error and the try-catch below. - (when trace-atom - (if-not error - (swap! trace-atom concat [{:url url :type :request :status (:status result)}]) - (swap! trace-atom concat [{:url url :type :request - :error (cond - (instance? org.httpkit.client.TimeoutException error) :timeout-error - :default :unknown-error)}]))) - (if (#{200 401} (:status result)) - (status/send! "percolator" "web-fetch" "ok" nil 1 url) - (status/send! "percolator" "web-fetch" "fail" nil 1 url)) - - (condp = (:status result) - 200 result - ; Weirdly some Nature pages return 401 with the content. http://www.nature.com/nrendo/journal/v10/n9/full/nrendo.2014.114.html - 401 result - 301 (recur new-headers (inc depth) (-> result :headers :location)) - 303 (recur new-headers (inc depth) (-> result :headers :location)) - 302 (recur new-headers (inc depth) (-> result :headers :location)) - nil)))) + [context url] - ; On error just return nil, but add exception to trace. - (catch java.net.URISyntaxException exception (when trace-atom - (do (swap! trace-atom concat [{:error :url-syntax-error :type :request :url url}]) - nil))) + (evidence-log/log! 
{
+                       ; Service
+                       :s "percolator"
+                       ; Component
+                       :c "fetch"
+                       ; Facet
+                       :f "request"
+                       ; Evidence Record ID
+                       :r (:id context)
+                       ; DOI
+                       :d ""
+                       ; URL
+                       :u url})
+
+  (try
+    (loop [headers {"Referer" "https://eventdata.crossref.org"
+                    "User-Agent" event-data-percolator.consts/user-agent-for-robots}
+           depth 0
+           url url]
+      (if (> depth redirect-depth)
+        nil
+        (let [result @(http/get url {:follow-redirects false :headers headers :as :text :throw-exceptions true})
+              error (:error result)
+              cookie (-> result :headers :set-cookie)
+              new-headers (merge headers (when cookie {"Cookie" cookie}))]
+
+          (evidence-log/log! {
+            ; Service
+            :s "percolator"
+            ; Component
+            :c "fetch"
+            ; Facet
+            :f "response"
+            ; Evidence Record ID
+            :r (:id context)
+            ; DOI
+            :d ""
+            ; URL
+            :u url})
+
+          (condp = (:status result)
+            200 result
+            ; Weirdly some Nature pages return 401 with the content. http://www.nature.com/nrendo/journal/v10/n9/full/nrendo.2014.114.html
+            401 result
+            301 (recur new-headers (inc depth) (-> result :headers :location))
+            303 (recur new-headers (inc depth) (-> result :headers :location))
+            302 (recur new-headers (inc depth) (-> result :headers :location))
+            nil))))
+
+    ; On error just return nil, but add exception to trace.
+    (catch java.net.URISyntaxException exception
+      (do
+        (evidence-log/log! {
+          :s "percolator" :c "fetch"
+          :f "error" :v "uri-syntax-exception"
+          :r (:id context) :u url})
+        nil))
 
-    (catch java.net.UnknownHostException exception (when trace-atom
-                                                     (do (swap! trace-atom concat [{:error :unknown-host-error :type :request :url url}])
-                                                       nil)))
+    (catch java.net.UnknownHostException exception
+      (do
+        (evidence-log/log! {
+          :s "percolator" :c "fetch"
+          :f "error" :v "unknown-host-exception"
+          :r (:id context) :u url})
+        nil))
 
-    (catch org.httpkit.client.TimeoutException exception (when trace-atom
-                                                           (do (swap! 
trace-atom concat [{:error :timeout-error :type :request :url url}]) - nil))) + (catch org.httpkit.client.TimeoutException exception + (do + (evidence-log/log! { + :s "percolator" :c "fetch" + :f "error" :v "timeout-exception" + :r (:id context) :u url}) + nil)) - (catch org.httpkit.ProtocolException exception (when trace-atom - (do (swap! trace-atom concat [{:error :timeout-error :type :request :url url}]) - nil))) + (catch org.httpkit.ProtocolException exception + (do + (evidence-log/log! { + :s "percolator" :c "fetch" + :f "error" :v "protocol-exception" + :r (:id context) :u url}) + nil)) - (catch Exception exception (when trace-atom - (do (swap! trace-atom concat [{:error :unknown :exception-message (.getMessage exception) :type :request :url url}]) - nil)))))) + (catch Exception exception + (do + (evidence-log/log! { + :s "percolator" :c "fetch" + :f "error" :v "unknown-exception" + :r (:id context) :u url}) + nil)))) (def redis-cache-store (delay (redis/build "robot-cache:" (:percolator-robots-cache-redis-host env) @@ -90,14 +124,14 @@ (defn fetch-robots-cached "Return robots file. Return nil if it doesn't exist." - [robots-file-url] + [context robots-file-url] (if skip-cache - (:body (fetch robots-file-url)) + (:body (fetch nil robots-file-url)) (if-let [cached-result (store/get-string @redis-cache-store robots-file-url)] (if (= cached-result "NULL") nil cached-result) - (let [result (:body (fetch robots-file-url))] + (let [result (:body (fetch context robots-file-url))] (redis/set-string-and-expiry-seconds @redis-cache-store robots-file-url @expiry-seconds (or result "NULL")) result)))) @@ -109,41 +143,62 @@ (defn get-rules "Get a Robot Rules object for the given robots.txt file url. Or nil if there aren't any." 
-  [robots-file-url]
-  (when-let [file-content (fetch-robots-cached robots-file-url)]
+  [context robots-file-url]
+  (when-let [file-content (fetch-robots-cached context robots-file-url)]
     (parse-rules robots-file-url file-content)))
 
 ; The robots files are cached in Redis, but must be re-parsed. Keep track of the thousand-odd most visited sites.
+; Note that this calls fetch, which records Evidence Logs about its activity. Because this is cached, a prior
+; request under a previous Evidence Record may have satisfied the robots.txt request.
 (def get-rules-cached (memo/lu get-rules :lu/threshold 1024))
 
 (defn allowed?
-  [url-str]
+  [context url-str]
   (let [robots-file-url (new URL (new URL url-str) "/robots.txt")
-        rules (get-rules-cached (str robots-file-url))
+        rules (get-rules-cached context (str robots-file-url))
+
+        ; If there's no robots file, proceed.
         allowed (if-not rules
                   true
                   (.isAllowed rules url-str))]
-    (if allowed
-      (status/send! "percolator" "robot" "allowed" nil 1 url-str)
-      (status/send! "percolator" "robot" "not-allowed" nil 1 url-str))
-
     allowed))
 
 (defn fetch-respecting-robots
   "Fetch URL, respecting robots.txt directives for domain."
-  [url trace-atom]
-  (let [allowed (allowed? url)]
-    (if-not allowed
-      (do
-        (when trace-atom
-          (swap! trace-atom concat [{:error :robots-forbidden :type :request :url url}]))
-        nil)
-      (fetch url trace-atom))))
+  [context url]
+
+  (let [allowed (allowed? context url)]
+    (evidence-log/log! {
+      ; Service
+      :s "percolator"
+      ; Component
+      :c "robot-check"
+      ; Facet
+      :f "result"
+      ; Value
+      :v (boolean allowed)
+      ; Evidence Record ID
+      :r (:id context)
+      ; URL
+      :u url})
+
+    (when allowed
+      (fetch context url))))
 
 (defn fetch-ignoring-robots
   "Fetch URL, ignoring any robots.txt directives"
-  [url trace-atom]
-  ; Just an alias, but intentional.
-  (fetch url trace-atom)
-
+  [context url]
+
+  (evidence-log/log! 
{ + ; Service + :s "percolator" + ; Component + :c "robot-check" + ; Facet + :f "skip" + ; Evidence Record ID + :r (:id context) + ; URL + :u url}) + + (fetch context url)) diff --git a/test/event_data_percolator/action_test.clj b/test/event_data_percolator/action_test.clj index 54362b0..0135e03 100644 --- a/test/event_data_percolator/action_test.clj +++ b/test/event_data_percolator/action_test.clj @@ -133,7 +133,7 @@ :url "http://example.com/evidence/123456" :pages [{:actions [input-action]}]} - result-action (action/create-events-for-action evidence-record input-action)] + result-action (action/create-events-for-action util/mock-context evidence-record input-action)] (is (empty? (:events result-action)) "No Events should have been emitted."))) (testing "When there are are extra Events and there was at least one match, those Extra Events should be emitted, with the requisite fields." @@ -162,7 +162,7 @@ :url "http://example.com/evidence/123456" :pages [{:actions [input-action]}]} - result-action (action/create-events-for-action evidence-record input-action)] + result-action (action/create-events-for-action util/mock-context evidence-record input-action)] (is (= (count (:events result-action)) 3) "Three Events should have been emitted, one from the match and two from the extras.") ; compare with out :id field, that's random. 
@@ -218,7 +218,7 @@ :url "http://example.com/evidence/123456" :pages [{:actions [input-action]}]} - result-action (action/create-events-for-action evidence-record input-action)] + result-action (action/create-events-for-action util/mock-context evidence-record input-action)] (is (= (count (:events result-action)) 1) "One Events should have been emitted, from the match.") (is (= (map #(dissoc % :id) (:events result-action)) [{:license "http://example.com/license" @@ -259,6 +259,6 @@ {:type :landing-page-url :value "https://www.example.com/article/123456789" :match "https://doi.org/10.5555/12345678"} {:type :doi-url, :value "https://doi.org/10.6666/24242424" :match "https://doi.org/10.6666/24242424"}]} - result (action/dedupe-matches input-action)] + result (action/dedupe-matches util/mock-context util/mock-evidence-record input-action)] (is (= result expected-result))))) diff --git a/test/event_data_percolator/evidence_record_test.clj b/test/event_data_percolator/evidence_record_test.clj index 2d616f0..06894a8 100644 --- a/test/event_data_percolator/evidence_record_test.clj +++ b/test/event_data_percolator/evidence_record_test.clj @@ -13,7 +13,7 @@ (deftest ^:unit url (testing "url should add url based on the id." (let [bundle {:id "20170101-twitter-123456789"} - result (evidence-record/url bundle)] + result (evidence-record/url util/mock-context bundle)] (is (:id result) "Original bundle data preserved.") (is (= (:url result) "https://evidence.eventdata.crossref.org/evidence/20170101-twitter-123456789") "URL should be set")))) @@ -34,8 +34,8 @@ {:type "url" :input-url "http://doi.org/10.5555/22222"}]}]}]} - ; Supply empty domain list as we're not testing landing page extraction. 
- result (evidence-record/candidates evidence-record #{} (atom []))] + result (evidence-record/candidates util/mock-context evidence-record)] + (is (= result {:id "1234" :pages [{:actions [{:unrelated :junk @@ -50,12 +50,8 @@ {:type "url" :input-url "http://doi.org/10.5555/22222" :candidates [{:type :doi-url, :value "http://doi.org/10.5555/22222"}]}]}]}]}) - "Overall structure preserved. Candidates are attached to actions and input-content-hash where appropriate. Unrelated information at all levels carried through."))))) -(defn doi-ok - "Fake OK return from DOI proxy." - [handle] - {:status 200 :body (json/write-str {"handle" handle})}) + "Overall structure preserved. Candidates are attached to actions and input-content-hash where appropriate. Unrelated information at all levels carried through."))))) (deftest ^:component match (testing "match should transform candidates, but leave structure intact" @@ -78,7 +74,8 @@ {:type "url" :input-url "http://doi.org/10.5555/22222" :candidates [{:type :doi-url, :value "http://doi.org/10.5555/22222"}]}]}]}]} - result (evidence-record/match evidence-record nil)] + + result (evidence-record/match util/mock-context evidence-record)] (is (= result {:id "1234" :pages [{:actions @@ -103,6 +100,7 @@ {:type :doi-url :value "http://doi.org/10.5555/22222" :match "https://doi.org/10.5555/22222"}]}]}]}) + "Overall structure preserved. Matches gathered for each action over candidates. 
Occurred-at carried through to match."))))) @@ -139,7 +137,8 @@ {:type :doi-url :value "http://doi.org/10.5555/22222" :match "https://doi.org/10.5555/22222"}]}]}]} - result (evidence-record/events evidence-record) + + result (evidence-record/events util/mock-context evidence-record) events (-> result :pages first :actions first :events)] (is (= (count events) 2) "Two events produced from two matches.") @@ -199,7 +198,7 @@ {:ignore-this :stuff :actions [:some-dummy-action-object-2 :some-dummy-action-object-3]}]} - result (evidence-record/map-actions str input)] + result (evidence-record/map-actions util/mock-context (fn [context evidence-record action] (str action)) input)] (is (= result {:id "1234" :pages [ @@ -211,7 +210,7 @@ ":some-dummy-action-object-3"]}]}))))) (deftest ^:component end-to-end-process - (testing "End-to-end processing of Input Bundle should result in an Evidence Record with Events and HTTP tracing." + (testing "End-to-end processing of Input Bundle should result in an Evidence Record with Events." ; A single redirect so that we can demonstrate that the trace is captured. (fake/with-fake-http ["http://article.com/article/22222" {:status 303 :headers {:location "http://article.com/article/22222-X"}} "http://article.com/article/22222-X" {:status 200 :body ""} @@ -220,8 +219,8 @@ ; This one throws a timeout error, which should be reported "http://article.com/article/XXXXX" (fn [a b c] (throw (new org.httpkit.client.TimeoutException "I got bored")))] - (let [domain-list #{"article.com"} - evidence-record {:id "1234" + + (let [evidence-record {:id "1234" :artifacts {:other :value} ; pass-through any artifact info from input package. 
:pages [ {:actions [ @@ -233,20 +232,14 @@ {:type "url" :input-url "http://article.com/article/XXXXX"}]}]}]} - result (evidence-record/process evidence-record "http://d1v52iseus4yyg.cloudfront.net/a/crossref-domain-list/versions/1482489046417" domain-list)] - + result (evidence-record/process (assoc util/mock-context :domain-set #{"article.com"}) evidence-record)] + (is (= (-> result :percolator :artifacts :domain-set-artifact-version) - "http://d1v52iseus4yyg.cloudfront.net/a/crossref-domain-list/versions/1482489046417") - "Domain list artifact version should be correctly set.") + (:domain-list-artifact-version util/mock-context)) + "Domain list artifact version should be correctly set from context object.") (is (= (-> result :artifacts :other) :value) "Pre-existing values in artifacts are passed through.") - (is (= (set (:web-trace result)) - #{{:type :request :url "http://article.com/article/22222" :status 303 } - {:type :request :url "http://article.com/article/22222-X" :status 200 } - {:type :request :url "http://article.com/article/XXXXX" :error :timeout-error}}) - "All HTTP access should be recorded") - ; The rest of the pieces are tested above. (is (= 1 (-> result :pages first :actions first :events count)) "One event should be found") (is (-> result :id)))))) @@ -256,9 +249,8 @@ (deftest ^:component deduplication-across-bundles ; This is the most likely case. (testing "Duplicates can be detected between a evidence-records" - (fake/with-fake-http ["https://doi.org/api/handles/10.5555/12345678" (doi-ok "10.5555/12345678")] - (let [domain-list #{} - ; We submit the same input bundle twice. + (fake/with-fake-http ["https://doi.org/api/handles/10.5555/12345678" (util/doi-ok "10.5555/12345678")] + (let [; We submit the same input bundle twice. 
evidence-record {:id "1234" :pages [ {:actions [ @@ -269,12 +261,12 @@ {:type "plaintext" :input-content "10.5555/12345678"}]}]}]} - result-1 (evidence-record/process evidence-record "http://d1v52iseus4yyg.cloudfront.net/a/crossref-domain-list/versions/1482489046417" domain-list) + result-1 (evidence-record/process util/mock-context evidence-record) ; Now save the action IDs. This is normally triggered in 'push'. push-output-bundle-result (action/store-action-duplicates result-1) - result-2 (evidence-record/process evidence-record "http://d1v52iseus4yyg.cloudfront.net/a/crossref-domain-list/versions/1482489046417" domain-list) + result-2 (evidence-record/process util/mock-context evidence-record) evidence-record-id-1 (:id result-1) evidence-record-id-2 (:id result-2) @@ -300,9 +292,8 @@ (deftest ^:component action-id-can-be-ommitted (testing "Action IDs can be ommitted if it's sensible to do so, e.g. low chance of collision, very high rate of input per Wikipedia" - (fake/with-fake-http ["https://doi.org/api/handles/10.5555/9898989898" (doi-ok "10.5555/9898989898")] - (let [domain-list #{} - evidence-record {:id "1234" + (fake/with-fake-http ["https://doi.org/api/handles/10.5555/9898989898" (util/doi-ok "10.5555/9898989898")] + (let [evidence-record {:id "1234" :pages [ {:actions [ ; Same actions in the input bundle. In reality this shouldn't happen, but do it to verify that they aren't deduplicated. @@ -318,12 +309,12 @@ :input-content "10.5555/9898989898"}]}]}]} ; Also send twice. - result-1 (evidence-record/process evidence-record "http://d1v52iseus4yyg.cloudfront.net/a/crossref-domain-list/versions/1482489046417" domain-list) + result-1 (evidence-record/process util/mock-context evidence-record) ; Now save the action IDs. This is normally triggered in 'push'. 
push-output-bundle-result (action/store-action-duplicates result-1) - result-2 (evidence-record/process evidence-record "http://d1v52iseus4yyg.cloudfront.net/a/crossref-domain-list/versions/1482489046417" domain-list)] + result-2 (evidence-record/process util/mock-context evidence-record)] (is (= (dissoc result-1 :id diff --git a/test/event_data_percolator/matchers/doi_url_test.clj b/test/event_data_percolator/matchers/doi_url_test.clj index 795bc69..e4c2256 100644 --- a/test/event_data_percolator/matchers/doi_url_test.clj +++ b/test/event_data_percolator/matchers/doi_url_test.clj @@ -10,19 +10,19 @@ (deftest ^:component match-doi-url-candidate (testing "match-doi-url-candidate matches valid DOI." (fake/with-fake-http ["https://doi.org/api/handles/10.5555/12345678" {:status 200 :body (json/write-str {"handle" "10.5555/12345678"})}] - (let [result (doi-url/match-doi-url-candidate {:value "https://doi.org/10.5555/12345678"} nil)] + (let [result (doi-url/match-doi-url-candidate util/mock-context {:value "https://doi.org/10.5555/12345678"})] (is (= result {:value "https://doi.org/10.5555/12345678", :match "https://doi.org/10.5555/12345678"}))))) (testing "match-doi-url-candidate does not match nonexistent DOI." ; It will try to drop a few off the end to match, with different encodings. Tolerate this. (fake/with-fake-http [#"https://doi.org/api/handles/10.5555/12" (util/doi-not-found) #"https://doi.org/api/handles/10.5555%2F12" (util/doi-not-found)] - (let [result (doi-url/match-doi-url-candidate {:value "http://doi.org/10.5555/12345678"} nil)] + (let [result (doi-url/match-doi-url-candidate util/mock-context {:value "http://doi.org/10.5555/12345678"})] (is (= result {:value "http://doi.org/10.5555/12345678", :match nil}))))) (testing "match-doi-url-candidate normalizes DOI." (fake/with-fake-http ["https://doi.org/api/handles/10.5555/12345678" (util/doi-ok "10.5555/12345678")] ; Use dx.doi.org resolver and HTTPs. Should be normalized to doi.org and HTTPS. 
- (let [result (doi-url/match-doi-url-candidate {:value "http://dx.doi.org/10.5555/12345678"} nil)] + (let [result (doi-url/match-doi-url-candidate util/mock-context {:value "http://dx.doi.org/10.5555/12345678"})] (is (= result {:value "http://dx.doi.org/10.5555/12345678", :match "https://doi.org/10.5555/12345678"})))))) diff --git a/test/event_data_percolator/matchers/landing_page_url_test.clj b/test/event_data_percolator/matchers/landing_page_url_test.clj index a7abd3d..41a1bf7 100644 --- a/test/event_data_percolator/matchers/landing_page_url_test.clj +++ b/test/event_data_percolator/matchers/landing_page_url_test.clj @@ -18,20 +18,20 @@ (testing "try-from-get-params can find valid DOIs" (fake/with-fake-http ["https://doi.org/api/handles/10.5555/12345678" (util/doi-ok "10.5555/12345678")] (doseq [input url-params-inputs] - (let [result (landing-page-url/try-from-get-params input)] + (let [result (landing-page-url/try-from-get-params util/mock-context input)] (is (= result "https://doi.org/10.5555/12345678")))))) (testing "try-from-get-params nil if DOI doesn't exist" (fake/with-fake-http ["https://doi.org/api/handles/10.5555/12345678" (util/doi-not-found)] (doseq [input url-params-inputs] - (let [result (landing-page-url/try-from-get-params input)] + (let [result (landing-page-url/try-from-get-params util/mock-context input)] (is (= result nil))))))) (deftest ^:component try-doi-from-url-text-extra (testing "URL with embedded DOI plus text." 
(fake/with-fake-http ["https://doi.org/api/handles/10.5235/219174411798862578" (util/doi-ok "10.5235/219174411798862578") #"https://doi.org/api/handles/10.5235/219174411798862578/.*" (util/doi-not-found)] - (is (= (landing-page-url/try-doi-from-url-text "http://www.nomos-elibrary.de/10.5235/219174411798862578/criminal-law-issues-in-the-case-law-of-the-european-court-of-justice-a-general-overview-jahrgang-1-2011-heft-2") + (is (= (landing-page-url/try-doi-from-url-text util/mock-context "http://www.nomos-elibrary.de/10.5235/219174411798862578/criminal-law-issues-in-the-case-law-of-the-european-court-of-justice-a-general-overview-jahrgang-1-2011-heft-2") "https://doi.org/10.5235/219174411798862578"))))) (deftest ^:component try-doi-from-url-text-not-exist @@ -39,7 +39,7 @@ (fake/with-fake-http ["https://doi.org/api/handles/10.5235/219174411798862XXX" (util/doi-not-found) #"https://doi.org/api/handles/10.5235/.*" (util/doi-not-found) "https://doi.org/api/handles/10.5235/219174411798862XXX/criminal-law-issues-in-the-case-law-of-the-european-court-of-justice-a-general-overview-jahrgang-1-2011-heft-2" (util/doi-not-found)] - (is (= (landing-page-url/try-doi-from-url-text "http://www.nomos-elibrary.de/10.5235/219174411798862578/criminal-law-issues-in-the-case-law-of-the-european-court-of-justice-a-general-overview-jahrgang-1-2011-heft-2") + (is (= (landing-page-url/try-doi-from-url-text util/mock-context "http://www.nomos-elibrary.de/10.5235/219174411798862578/criminal-law-issues-in-the-case-law-of-the-european-court-of-justice-a-general-overview-jahrgang-1-2011-heft-2") nil))))) (deftest ^:component try-doi-from-url-text-jsessionid @@ -48,7 +48,7 @@ ; stuff that we may look for after the end of the legit DOI ; NB regex escaped #"https://doi.org/api/handles/10.1002/1521-3951\(200009\)221:1<453::AID-PSSB453>3.0.CO;2-Q/.*" (util/doi-not-found)] - (is (= (landing-page-url/try-doi-from-url-text 
"http://onlinelibrary.wiley.com/doi/10.1002/1521-3951(200009)221:1<453::AID-PSSB453>3.0.CO;2-Q/abstract;jsessionid=FAD5B5661A7D092460BEEDA0D55204DF.f02t01") + (is (= (landing-page-url/try-doi-from-url-text util/mock-context "http://onlinelibrary.wiley.com/doi/10.1002/1521-3951(200009)221:1<453::AID-PSSB453>3.0.CO;2-Q/abstract;jsessionid=FAD5B5661A7D092460BEEDA0D55204DF.f02t01") "https://doi.org/10.1002/1521-3951(200009)221:1<453::aid-pssb453>3.0.co;2-q"))))) (deftest ^:component try-doi-from-url-text-slash-extras @@ -58,14 +58,14 @@ "https://doi.org/api/handles/10.7815%2Fijorcs.21.2011.012%2Farul-anitha" (util/doi-not-found) "https://doi.org/api/handles/10.7815/ijorcs.21.2011.012" (util/doi-ok "10.7815/ijorcs.21.2011.012")] - (is (= (landing-page-url/try-doi-from-url-text "http://www.ijorcs.org/manuscript/id/12/doi:10.7815/ijorcs.21.2011.012/arul-anitha/network-security-using-linux-intrusion-detection-system") + (is (= (landing-page-url/try-doi-from-url-text util/mock-context "http://www.ijorcs.org/manuscript/id/12/doi:10.7815/ijorcs.21.2011.012/arul-anitha/network-security-using-linux-intrusion-detection-system") "https://doi.org/10.7815/ijorcs.21.2011.012"))))) (deftest ^:component try-doi-from-url-text-url-escape (testing "URL with embedded URL-escaped DOI." 
(fake/with-fake-http ["https://doi.org/api/handles/10.1007%2Fs00423-015-1364-1" (util/doi-ok "10.1007/s00423-015-1364-1") "https://doi.org/api/handles/10.1007/s00423-015-1364-1" (util/doi-ok "10.1007/s00423-015-1364-1")] - (is (= (landing-page-url/try-doi-from-url-text "http://link.springer.com/article/10.1007%2Fs00423-015-1364-1") + (is (= (landing-page-url/try-doi-from-url-text util/mock-context "http://link.springer.com/article/10.1007%2Fs00423-015-1364-1") "https://doi.org/10.1007/s00423-015-1364-1"))))) (deftest ^:component try-pii-from-url-text-pii @@ -77,14 +77,14 @@ "https://doi.org/api/handles/10.1016/s0169-5347(01)02380-1" (util/doi-ok "10.1016/s0169-5347(01)02380-1")] - (is (= (landing-page-url/try-pii-from-url-text "http://api.elsevier.com/content/article/PII:S0169534701023801?httpAccept=text/plain") + (is (= (landing-page-url/try-pii-from-url-text util/mock-context "http://api.elsevier.com/content/article/PII:S0169534701023801?httpAccept=text/plain") "https://doi.org/10.1016/s0169-5347(01)02380-1"))))) (deftest ^:component try-fetched-page-metadata-citation-doi (testing "DOI can be fetched from meta tag: citation_doi" (fake/with-fake-http ["http://www.ncbi.nlm.nih.gov/pmc/articles/PMC4852986/?report=classic" (slurp "resources/PMC4852986") "https://doi.org/api/handles/10.1007/s10461-013-0685-8" (util/doi-ok "10.1007/s10461-013-0685-8")] - (is (= (landing-page-url/try-fetched-page-metadata "http://www.ncbi.nlm.nih.gov/pmc/articles/PMC4852986/?report=classic" nil) + (is (= (landing-page-url/try-fetched-page-metadata util/mock-context "http://www.ncbi.nlm.nih.gov/pmc/articles/PMC4852986/?report=classic") "https://doi.org/10.1007/s10461-013-0685-8")))) ; NB pubsonline.informs.org sends different HTML to different agents (Firefox vs Curl). 
@@ -99,14 +99,14 @@ ; Misidentified short-dois #"https://doi.org/api/handles/.*" (util/doi-not-found) ] - (is (= (landing-page-url/try-fetched-page-metadata "http://pubsonline.informs.org/doi/abs/10.1287/mnsc.2016.2427" nil) + (is (= (landing-page-url/try-fetched-page-metadata util/mock-context "http://pubsonline.informs.org/doi/abs/10.1287/mnsc.2016.2427") "https://doi.org/10.1287/mnsc.2016.2427"))))) (deftest ^:component try-fetched-page-metadat-dc-identifier (testing "DOI can be fetched from meta tag: DC.identifier (different case)" (fake/with-fake-http ["https://figshare.com/articles/A_Modeler_s_Tale/3423371/1" (slurp "resources/A_Modeler_s_Tale") "https://doi.org/api/handles/10.6084/m9.figshare.3423371.v1" (util/doi-ok "10.6084/m9.figshare.3423371.v1")] - (is (= (landing-page-url/try-fetched-page-metadata "https://figshare.com/articles/A_Modeler_s_Tale/3423371/1" nil) + (is (= (landing-page-url/try-fetched-page-metadata util/mock-context "https://figshare.com/articles/A_Modeler_s_Tale/3423371/1") "https://doi.org/10.6084/m9.figshare.3423371.v1"))))) (deftest ^:component try-fetched-page-metadata-dc-identifier-doi @@ -116,7 +116,7 @@ ; Misidentified short-dois #"https://doi.org/api/handles/.*" (util/doi-not-found)] - (is (= (landing-page-url/try-fetched-page-metadata "http://www.circumpolarhealthjournal.net/index.php/ijch/article/view/18594/html" nil) + (is (= (landing-page-url/try-fetched-page-metadata util/mock-context "http://www.circumpolarhealthjournal.net/index.php/ijch/article/view/18594/html") "https://doi.org/10.3402/ijch.v71i0.18594"))))) (deftest ^:component try-fetched-page-metadata-prism-url @@ -125,14 +125,14 @@ "https://doi.org/api/handles/10.1186/s13054-016-1322-5" (util/doi-ok "10.1186/s13054-016-1322-5") ; Misidentified short-dois #"https://doi.org/api/handles/.*" (util/doi-not-found)] - (is (= (landing-page-url/try-fetched-page-metadata "http://ccforum.biomedcentral.com/articles/10.1186/s13054-016-1322-5" nil) + (is (= 
(landing-page-url/try-fetched-page-metadata util/mock-context "http://ccforum.biomedcentral.com/articles/10.1186/s13054-016-1322-5") "https://doi.org/10.1186/s13054-016-1322-5"))))) (deftest ^:component try-fetched-page-metadata (testing "DOI can be fetched from meta tag: citation_doi" (fake/with-fake-http ["http://jnci.oxfordjournals.org/content/108/6/djw160.full" (slurp "resources/djw160.full") "https://doi.org/api/handles/10.1093/jnci/djw160" (util/doi-ok "10.1093/jnci/djw160")] - (is (= (landing-page-url/try-fetched-page-metadata "http://jnci.oxfordjournals.org/content/108/6/djw160.full" nil) + (is (= (landing-page-url/try-fetched-page-metadata util/mock-context "http://jnci.oxfordjournals.org/content/108/6/djw160.full") "https://doi.org/10.1093/jnci/djw160"))))) ; Regression test for https://github.com/CrossRef/event-data-percolator/issues/29 @@ -141,6 +141,6 @@ (fake/with-fake-http [; This first URL was getting called. Now shouldn't be, but left as an illustration. "https://doi.org/api/handles/10.1007/s11906-017-0700-y?platform=hootsuite" (util/doi-ok "10.1007/s11906-017-0700-y") "https://doi.org/api/handles/10.1007/s11906-017-0700-y" (util/doi-ok "10.1007/s11906-017-0700-y")] - (let [result (landing-page-url/try-doi-from-url-text "http://link.springer.com/article/10.1007/s11906-017-0700-y?platform=hootsuite")] + (let [result (landing-page-url/try-doi-from-url-text util/mock-context "http://link.springer.com/article/10.1007/s11906-017-0700-y?platform=hootsuite")] (is (= result "https://doi.org/10.1007/s11906-017-0700-y") "Question mark character should not be included in DOI"))))) diff --git a/test/event_data_percolator/matchers/pii_test.clj b/test/event_data_percolator/matchers/pii_test.clj index 23bd396..0913beb 100644 --- a/test/event_data_percolator/matchers/pii_test.clj +++ b/test/event_data_percolator/matchers/pii_test.clj @@ -15,7 +15,7 @@ :body (json/write-str {:message {:items [{:DOI "10.5555/12345678"}]}})} 
"https://doi.org/api/handles/10.5555/12345678" (util/doi-ok "10.5555/12345678")] - (let [result (pii/match-pii-candidate {:value "S232251141300001-2"} nil)] + (let [result (pii/match-pii-candidate util/mock-context {:value "S232251141300001-2"})] (is (= result {:value "S232251141300001-2", :match "https://doi.org/10.5555/12345678"}))))) (testing "match-pii-candidate doesn't match DOI if not unique mapping." @@ -25,7 +25,7 @@ :body (json/write-str {:message {:items [{:DOI "10.5555/12345678"} {:DOI "10.5555/11111"}]}})}] - (let [result (pii/match-pii-candidate {:value "S232251141300001-2"} nil)] + (let [result (pii/match-pii-candidate util/mock-context {:value "S232251141300001-2"})] (is (= result {:value "S232251141300001-2", :match nil}))))) (testing "match-pii-candidate doesn't match DOI if it doesn't exist." @@ -37,19 +37,19 @@ "https://doi.org/api/handles/10.5555/NOT_FOUND" (util/doi-not-found) ; And attempts to chop the end off #"https://doi.org/api/handles/1.*" (util/doi-not-found)] - (let [result (pii/match-pii-candidate {:value "S232251141300001-2"} nil)] + (let [result (pii/match-pii-candidate nil {:value "S232251141300001-2"})] (is (= result {:value "S232251141300001-2", :match nil}))))) (testing "empty PII should never result in a match or query" ; Ensure that no network activity is made. (fake/with-fake-http [] - (let [result (pii/match-pii-candidate {:value ""} nil)] + (let [result (pii/match-pii-candidate util/mock-context {:value ""})] (is (= result {:value "", :match nil}))))) (testing "nill PII should never result in a match or query" ; Ensure that no network activity is made. (fake/with-fake-http [] - (let [result (pii/match-pii-candidate {:value nil} nil)] + (let [result (pii/match-pii-candidate nil {:value nil})] (is (= result {:value nil, :match nil}))))) (testing "match-pii-candidate can deal with non-JSON response." 
@@ -57,7 +57,7 @@ {:status 200 :headers {:content-type "application/json"} :body "BANG"}] - (let [result (pii/match-pii-candidate {:value "CRASHING-XML"} nil)] + (let [result (pii/match-pii-candidate util/mock-context {:value "CRASHING-XML"})] (is (= result {:value "CRASHING-XML", :match nil}))))) (testing "match-pii-candidate can deal with empty response." @@ -65,12 +65,12 @@ {:status 200 :headers {:content-type "application/json"} :body ""}] - (let [result (pii/match-pii-candidate {:value "CRASHING-EMPTY"} nil)] + (let [result (pii/match-pii-candidate util/mock-context {:value "CRASHING-EMPTY"})] (is (= result {:value "CRASHING-EMPTY", :match nil}))))) (testing "match-pii-candidate can deal with exception." (fake/with-fake-http ["https://api.crossref.org/v1/works" #(throw (new Exception "Something went wrong."))] - (let [result (pii/match-pii-candidate {:value "CRASHING-EXCEPTION"} nil)] + (let [result (pii/match-pii-candidate util/mock-context {:value "CRASHING-EXCEPTION"})] (is (= result {:value "CRASHING-EXCEPTION", :match nil})))))) diff --git a/test/event_data_percolator/matchers/plain_doi_test.clj b/test/event_data_percolator/matchers/plain_doi_test.clj index 7ca71c1..7b5ec17 100644 --- a/test/event_data_percolator/matchers/plain_doi_test.clj +++ b/test/event_data_percolator/matchers/plain_doi_test.clj @@ -9,7 +9,7 @@ (deftest ^:component match-plain-doi-candidate (testing "match-plain-doi-candidate matches valid DOI." (fake/with-fake-http ["https://doi.org/api/handles/10.5555/12345678" (util/doi-ok "10.5555/12345678")] - (let [result (plain-doi/match-plain-doi-candidate {:value "10.5555/12345678"} nil)] + (let [result (plain-doi/match-plain-doi-candidate util/mock-context {:value "10.5555/12345678"})] (is (= result {:value "10.5555/12345678", :match "https://doi.org/10.5555/12345678"}))))) (testing "match-plain-doi-candidate does not match nonexistent DOI." @@ -20,6 +20,6 @@ ; Nor do subsequent ones. 
#"https://doi.org/api/handles/10.5555.*" (util/doi-not-found)] - (let [result (plain-doi/match-plain-doi-candidate {:value "10.5555/12345678"} nil)] + (let [result (plain-doi/match-plain-doi-candidate util/mock-context {:value "10.5555/12345678"})] (is (= result {:value "10.5555/12345678", :match nil})))))) diff --git a/test/event_data_percolator/matchers/shortdoi_url_test.clj b/test/event_data_percolator/matchers/shortdoi_url_test.clj index f6bcae3..553448f 100644 --- a/test/event_data_percolator/matchers/shortdoi_url_test.clj +++ b/test/event_data_percolator/matchers/shortdoi_url_test.clj @@ -7,27 +7,27 @@ (deftest ^:component match-shortdoi-url-candidate (testing "match-shortdoi-url-candidate matches valid shortDOI and converts to full DOI." (fake/with-fake-http ["https://doi.org/api/handles/10/hvx" (util/short-doi-ok "10.5555/12345678")] - (let [result (shortdoi-url/match-shortdoi-url-candidate {:value "http://doi.org/hvx"} nil)] + (let [result (shortdoi-url/match-shortdoi-url-candidate util/mock-context {:value "http://doi.org/hvx"})] (is (= result {:value "http://doi.org/hvx" :match "https://doi.org/10.5555/12345678"})))))) ; Regression for https://github.com/CrossRef/event-data-percolator/issues/40 (deftest ^:component match-shortdoi-url-candidate-empty (testing "match-shortdoi-url-candidate handles empty-doi" (fake/with-fake-http [] - (let [result (shortdoi-url/match-shortdoi-url-candidate {:value "http://doi.org/"} nil)] + (let [result (shortdoi-url/match-shortdoi-url-candidate util/mock-context {:value "http://doi.org/"})] (is (nil? (:match result)) "Should return nil without throwing exception.")))) (testing "match-shortdoi-url-candidate handles empty value" (fake/with-fake-http [] - (let [result (shortdoi-url/match-shortdoi-url-candidate {:value ""} nil)] + (let [result (shortdoi-url/match-shortdoi-url-candidate util/mock-context {:value ""})] (is (nil? 
(:match result)) "Should return nil without throwing exception.")))) (testing "match-shortdoi-url-candidate handles URL with no path" (fake/with-fake-http [] - (let [result (shortdoi-url/match-shortdoi-url-candidate {:value "http://example.com"} nil)] + (let [result (shortdoi-url/match-shortdoi-url-candidate util/mock-context {:value "http://example.com"})] (is (nil? (:match result)) "Should return nil without throwing exception.")))) (testing "match-shortdoi-url-candidate handles null value" (fake/with-fake-http [] - (let [result (shortdoi-url/match-shortdoi-url-candidate {:value nil} nil)] + (let [result (shortdoi-url/match-shortdoi-url-candidate util/mock-context {:value nil})] (is (nil? (:match result)) "Should return nil without throwing exception."))))) diff --git a/test/event_data_percolator/observation_test.clj b/test/event_data_percolator/observation_test.clj index 7667a42..978e03d 100644 --- a/test/event_data_percolator/observation_test.clj +++ b/test/event_data_percolator/observation_test.clj @@ -15,40 +15,69 @@ (deftest ^:unit process-observation-negative (testing "Unrecognised observation types should be passed through, with 'unrecognised' flag set." - (let [result (observation/process-observation (assoc valid-candidate :type "YOU-PROBABLY-HAVENT-HEARD-OF-IT") false #{} (atom []))] + (let [result (observation/process-observation + util/mock-context + (assoc valid-candidate :type "YOU-PROBABLY-HAVENT-HEARD-OF-IT") false)] + (is (-> result :candidates empty?) "Candidates should not be attached when type not recognised.") (is (= :unrecognised-observation-type (-> result :error)) "Unrecognised flag should be set."))) (testing "When sensitive, input-content should be removed in all cases." ; Sensitive false first. 
- (let [result (observation/process-observation valid-candidate false #{} (atom []))] + (let [result (observation/process-observation + util/mock-context + valid-candidate + false)] + (is (-> result :candidates not-empty) "The type is recognised, indicated by presence of candidates.") (is (-> result :input-content) "The input content is passed through when sensitive is false")) - (let [result (observation/process-observation (assoc valid-candidate :sensitive true) false #{} (atom []))] + (let [result (observation/process-observation + util/mock-context + (assoc valid-candidate :sensitive true) + false)] + (is (-> result :candidates not-empty) "The type is recognised, indicated by presence of candidates.") (is (nil? (-> result :input-content)) "The input content is removed when sensitive is true"))) (testing "When there is a duplicate, candidates should not be extracted" - (let [result (observation/process-observation valid-candidate true #{} (atom []))] + (let [result (observation/process-observation + util/mock-context + valid-candidate true)] (is (-> result :candidates empty?) "Candidates should not be extracted when duplicate."))) (testing "When there is a duplicate, sensitive should still result in input-content being removed.") - (let [result (observation/process-observation (assoc valid-candidate :sensitive true) true #{} (atom []))] + (let [result (observation/process-observation + util/mock-context + (assoc valid-candidate :sensitive true) + true)] + (is (-> result :input-content empty?) "Input content should be removed.") (is (-> result :input-content-hash) "Input content hash should be included."))) (deftest ^:unit process-observation (testing "When the observation type is recognised, the appropriate observation processor should be applied and candidates produced." 
- (let [result (observation/process-observation valid-candidate false #{} (atom []))] + (let [result (observation/process-observation + util/mock-context + valid-candidate + false)] + ; Content of candidates is tested elsewhere. (is (-> result :candidates not-empty) "Candidates are attached as a result of processor running."))) (testing "When the observation type is recognised, any extra fields should be carried through." - (let [result (observation/process-observation (assoc valid-candidate :random-extra :value) false #{} (atom []))] + (let [result (observation/process-observation + util/mock-context + (assoc valid-candidate :random-extra :value) + false)] + (is (= :value (:random-extra result)) "Extra field passed through."))) (testing "When the observation type is recognised, input-content-hash should be supplied for input-content." - (let [result (observation/process-observation valid-candidate false #{} (atom []))] + (let [result (observation/process-observation + util/mock-context + valid-candidate + false)] + (is (:input-content-hash result) "Input hash attached.")))) diff --git a/test/event_data_percolator/observation_types/content_url_test.clj b/test/event_data_percolator/observation_types/content_url_test.clj index 1086803..1dc51be 100644 --- a/test/event_data_percolator/observation_types/content_url_test.clj +++ b/test/event_data_percolator/observation_types/content_url_test.clj @@ -17,31 +17,36 @@ (is (false? (content-url/url-valid? "http://example.com/somefile.pdf"))))) (deftest ^:unit process-content-url-observation - (testing "process-content-url-observation should set error when URL isn't allowed" - (let [result (content-url/process-content-url-observation {:input-url nil} #{"example.com"} (atom []))] + (testing "process-content-url-observation should set error when URL isn't allowed because it's empty and therefore doesn't match the domain." 
+ (let [result (content-url/process-content-url-observation + util/mock-context + {:input-url nil})] + (is (:error result)))) (testing "process-content-url-observation should set error when URL can't be retrieved" (fake/with-fake-http ["http://cannot-be-retrieved.com/abc" {:status 404}] (let [result (content-url/process-content-url-observation - {:input-url "http://cannot-be-retrieved.com/abc"} - #{} (atom []))] + util/mock-context + {:input-url "http://cannot-be-retrieved.com/abc"})] (is (:error result))))) (testing "process-content-url-observation not visit landing page domains" ; Assert that no web call is made. (fake/with-fake-http [] (let [result (content-url/process-content-url-observation - {:input-url "http://publisher-site.com/this/page"} - #{"publisher-site.com"} (atom []))] - + (assoc util/mock-context :domain-set #{"example.com"}) + {:input-url "http://example.com/this/page"})] + (is (= (:error result) :skipped-domain) "Results in :skipped-domain error") (is (nil? (:input-content result)) "No content is returned.")))) (testing "process-content-url-observation should set candidates on match where there are matches" (fake/with-fake-http ["http://can-be-retrieved.com/abc" "Webpage content 10.5555/12345678"] - (let [result (content-url/process-content-url-observation {:input-url "http://can-be-retrieved.com/abc"} #{} (atom []))] + (let [result (content-url/process-content-url-observation + util/mock-context + {:input-url "http://can-be-retrieved.com/abc"})] (is (nil? (:error result))) ; Simplest possible thing that returns candidates (actually passed all the way through to plain-text). 
@@ -54,7 +59,8 @@ (fake/with-fake-http ["http://disallow-robots.com/abc" "Disalowed content 10.5555/12345678" "http://disallow-robots.com/robots.txt" "User-agent: *\nDisallow: /"] (let [result (content-url/process-content-url-observation - {:input-url "http://disallow-robots.com/abc"} #{} (atom []))] + util/mock-context + {:input-url "http://disallow-robots.com/abc"})] ; No candidates should be matched because of robots exclusion. (is (= result {:input-url "http://disallow-robots.com/abc" @@ -64,8 +70,10 @@ (testing "Fetch usually respects robots.txt Allow" (fake/with-fake-http ["http://allow-robots.com/abc" "Allowed content 10.5555/12345678" "http://allow-robots.com/robots.txt" "User-agent: *\nAllow: /"] + (let [result (content-url/process-content-url-observation - {:input-url "http://allow-robots.com/abc"} #{} (atom []))] + util/mock-context + {:input-url "http://allow-robots.com/abc"})] (is (= result {:input-url "http://allow-robots.com/abc" :input-content "Allowed content 10.5555/12345678" @@ -76,8 +84,9 @@ (fake/with-fake-http ["http://disallow-robots.com/abc" "Disallow robots content 10.5555/12345678" "http://disallow-robots.com/robots.txt" "User-agent: *\nDisallow: /"] (let [result (content-url/process-content-url-observation + util/mock-context {:input-url "http://disallow-robots.com/abc" - :ignore-robots true} #{} (atom []))] + :ignore-robots true})] ; No candidates should be matched because of robots exclusion. 
(is (= result {:input-url "http://disallow-robots.com/abc" @@ -90,8 +99,9 @@ (fake/with-fake-http ["http://allow-robots.com/abc" "Allow robots content 10.5555/12345678" "http://allow-robots.com/robots.txt" "User-agent: *\nAllow: /"] (let [result (content-url/process-content-url-observation + util/mock-context {:input-url "http://allow-robots.com/abc" - :ignore-robots true} #{} (atom []))] + :ignore-robots true})] (is (= result {:input-url "http://allow-robots.com/abc" :ignore-robots true diff --git a/test/event_data_percolator/observation_types/html_test.clj b/test/event_data_percolator/observation_types/html_test.clj index 86b4f02..985dd7a 100644 --- a/test/event_data_percolator/observation_types/html_test.clj +++ b/test/event_data_percolator/observation_types/html_test.clj @@ -2,7 +2,8 @@ "Tests for the html extractor. Unstructured extraction is passed from html to plaintext namespace functions, so proper testing of DOI and URL extraction from plaintext are performed in plaintext.test" (:require [clojure.test :refer :all] - [event-data-percolator.observation-types.html :as html])) + [event-data-percolator.observation-types.html :as html] + [event-data-percolator.test-util :as util])) (def plain-text "this is just some text 10.5555/11111") @@ -55,9 +56,9 @@ (deftest ^:unit process-html-content-observation (testing "Plain DOIs can be extracted from text nodes" (let [result (html/process-html-content-observation - {:type "html" :input-content "the quick brown 10.5555/1111 jumps"} - domain-set - (atom []))] + util/mock-context + {:type "html" :input-content "the quick brown 10.5555/1111 jumps"})] + (is (= result {:type "html" :input-content "the quick brown 10.5555/1111 jumps" :candidates [{:value "10.5555/1111" :type :plain-doi}]}) @@ -65,9 +66,9 @@ (testing "URL DOIs can be extracted from text nodes" (let [result (html/process-html-content-observation - {:type "html" :input-content "

the quick brown 10.5555/1111 jumps

"} - domain-set - (atom []))] + util/mock-context + {:type "html" :input-content "

the quick brown 10.5555/1111 jumps

"})] + (is (= result {:type "html" :input-content "

the quick brown 10.5555/1111 jumps

" :candidates [{:value "10.5555/1111" :type :plain-doi}]}) @@ -75,9 +76,9 @@ (testing "Hyperlinked URL DOIs can be extracted from links" (let [result (html/process-html-content-observation - {:type "html" :input-content "cliquez ici"} - domain-set - (atom [])) + (assoc util/mock-context :domain-set #{"doi.org"}) + {:type "html" :input-content "cliquez ici"}) + expected {:type "html" :input-content "cliquez ici" :candidates [{:type :landing-page-url, :value "http://doi.org/10.5555/22222"} @@ -87,9 +88,8 @@ (testing "ShortDOI URL DOIs can be extracted from text nodes" (let [result (html/process-html-content-observation - {:type "html" :input-content "

http://doi.org/abcd

"} - domain-set - (atom []))] + (assoc util/mock-context :domain-set #{"doi.org"}) + {:type "html" :input-content "

http://doi.org/abcd

"})] (is (= result {:type "html" :input-content "

http://doi.org/abcd

" @@ -99,9 +99,9 @@ (testing "ShortDOI hyperlinked DOIs can be extracted from links" (let [result (html/process-html-content-observation - {:type "html" :input-content "short and sweet"} - domain-set - (atom [])) + (assoc util/mock-context :domain-set #{"doi.org"}) + {:type "html" :input-content "short and sweet"}) + expected {:type "html" :input-content "short and sweet" :candidates [{:type :landing-page-url, :value "http://doi.org/abcd"} @@ -111,9 +111,9 @@ (testing "PIIs can be extracted from text nodes" (let [result (html/process-html-content-observation - {:type "html" :input-content "this is my PII S232251141300001-2 there"} - domain-set - (atom []))] + util/mock-context + {:type "html" :input-content "this is my PII S232251141300001-2 there"})] + (is (= result {:type "html" :input-content "this is my PII S232251141300001-2 there" :candidates [{:value "S232251141300001-2" :type :pii}]}) @@ -121,9 +121,9 @@ (testing "Landing Page URLs can be extracted from text nodes" (let [result (html/process-html-content-observation - {:type "html" :input-content "one two three http://example.com/four five http://ignore.com/four"} - domain-set - (atom []))] + (assoc util/mock-context :domain-set #{"example.com"}) + {:type "html" :input-content "one two three http://example.com/four five http://ignore.com/four"})] + (is (= result {:type "html" :input-content "one two three http://example.com/four five http://ignore.com/four" :candidates [{:value "http://example.com/four" :type :landing-page-url}]}) @@ -132,9 +132,9 @@ (testing "Landing Page URLs can be extracted from links" (let [result (html/process-html-content-observation - {:type "html" :input-content "

this ignore me!

"} - domain-set - (atom []))] + (assoc util/mock-context :domain-set #{"example.com"}) + {:type "html" :input-content "

this ignore me!

"})] + (is (= result {:type "html" :input-content "

this ignore me!

" :candidates [{:value "http://example.com/five" :type :landing-page-url}]}) @@ -144,14 +144,15 @@ (testing "HTML that contains a DOI in the link and in the text returns one candidate per match type. This will later be de-duped in event-data-percolator.action-test/match-candidates-deupe ." (let [html "10.5555/12345678" result (html/process-html-content-observation - {:type "html" :input-content html} - domain-set - (atom []))] + (assoc util/mock-context :domain-set #{"example.com" "doi.org"}) + {:type "html" :input-content html})] + (is (= result {:type "html" :input-content html :candidates [{:type :plain-doi :value "10.5555/12345678"} {:type :landing-page-url :value "https://doi.org/10.5555/12345678"} {:type :doi-url :value "https://doi.org/10.5555/12345678"}]}) + "Three different kinds of candidates retrieved when DOI is linked and in text.")))) (def rss-html diff --git a/test/event_data_percolator/observation_types/plaintext_test.clj b/test/event_data_percolator/observation_types/plaintext_test.clj index 3c421af..8838ec5 100644 --- a/test/event_data_percolator/observation_types/plaintext_test.clj +++ b/test/event_data_percolator/observation_types/plaintext_test.clj @@ -1,35 +1,47 @@ (ns event-data-percolator.observation-types.plaintext-test (:require [clojure.test :refer :all] - [event-data-percolator.observation-types.plaintext :as plaintext])) + [event-data-percolator.observation-types.plaintext :as plaintext] + [event-data-percolator.test-util :as util])) (def domain-set #{"example.com" "example.net"}) (deftest ^:unit process-plaintext-content-observation (testing "Plain DOIs can be extracted from text" - (let [result (plaintext/process-plaintext-content-observation {:type "html" :input-content "the quick brown 10.5555/1111 jumps"} domain-set (atom []))] + (let [result (plaintext/process-plaintext-content-observation + util/mock-context + {:type "html" :input-content "the quick brown 10.5555/1111 jumps"})] + (is (= result {:type "html" :input-content "the 
quick brown 10.5555/1111 jumps" :candidates [{:value "10.5555/1111" :type :plain-doi}]}) "One plain DOI candidate returned."))) (testing "ShortDOI URL DOIs can be extracted from text" - (let [result (plaintext/process-plaintext-content-observation {:type "html" :input-content "this is a shortdoi http://doi.org/abcd"} domain-set (atom []))] + (let [result (plaintext/process-plaintext-content-observation + util/mock-context + {:type "html" :input-content "this is a shortdoi http://doi.org/abcd"})] + (is (= result {:type "html" :input-content "this is a shortdoi http://doi.org/abcd" :candidates [{:value "http://doi.org/abcd" :type :shortdoi-url}]}) "One ShortDOI URL candidate found when unlinked"))) (testing "PIIs can be extracted from text" - (let [result (plaintext/process-plaintext-content-observation {:type "html" :input-content "this is my PII S232251141300001-2 there"} domain-set (atom []))] + (let [result (plaintext/process-plaintext-content-observation + util/mock-context + {:type "html" :input-content "this is my PII S232251141300001-2 there"})] + (is (= result {:type "html" :input-content "this is my PII S232251141300001-2 there" :candidates [{:value "S232251141300001-2" :type :pii}]}) "PII candidate found in text"))) (testing "Landing Page URLs can be extracted from text" - (let [result (plaintext/process-plaintext-content-observation {:type "html" :input-content "one two three http://example.com/four five http://ignore.com/four"} domain-set (atom []))] + (let [result (plaintext/process-plaintext-content-observation + util/mock-context + {:type "html" :input-content "one two three http://example.com/four five http://ignore.com/four"})] + (is (= result {:type "html" :input-content "one two three http://example.com/four five http://ignore.com/four" :candidates [{:value "http://example.com/four" :type :landing-page-url}]}) "Article landing page from known domain can be extracted from text. 
Non-matching domains ignored.")))) - diff --git a/test/event_data_percolator/observation_types/url_test.clj b/test/event_data_percolator/observation_types/url_test.clj index 877bd75..5c9522e 100644 --- a/test/event_data_percolator/observation_types/url_test.clj +++ b/test/event_data_percolator/observation_types/url_test.clj @@ -1,43 +1,61 @@ (ns event-data-percolator.observation-types.url-test (:require [clojure.test :refer :all] - [event-data-percolator.observation-types.url :as url])) + [event-data-percolator.observation-types.url :as url] + [event-data-percolator.test-util :as util])) (def domain-set #{"example.com" "example.net"}) (deftest ^:unit process-url-observation (testing "URL DOIs on doi.org can be matched" - (let [result (url/process-url-observation {:type "url" :input-url "https://doi.org/10.5555/1111"} domain-set (atom []))] + (let [result (url/process-url-observation + util/mock-context + {:type "url" :input-url "https://doi.org/10.5555/1111"})] + (is (= result {:type "url" :input-url "https://doi.org/10.5555/1111" :candidates [{:value "https://doi.org/10.5555/1111" :type :doi-url}]})))) (testing "URL DOIs on dx.doi.org can be matched" - (let [result (url/process-url-observation {:type "url" :input-url "http://dx.doi.org/10.5555/1111"} domain-set (atom []))] + (let [result (url/process-url-observation + util/mock-context + {:type "url" :input-url "http://dx.doi.org/10.5555/1111"})] + (is (= result {:type "url" :input-url "http://dx.doi.org/10.5555/1111" :candidates [{:value "http://dx.doi.org/10.5555/1111" :type :doi-url}]})))) (testing "ShortDOI URL DOIs can be extracted from text" - (let [result (url/process-url-observation {:type "url" :input-url "http://doi.org/abcd"} domain-set (atom []))] + (let [result (url/process-url-observation + util/mock-context + {:type "url" :input-url "http://doi.org/abcd"})] + (is (= result {:type "url" :input-url "http://doi.org/abcd" :candidates [{:value "http://doi.org/abcd" :type :shortdoi-url}]})))) (testing 
"Landing Page URLs can be extracted from text" - (let [result (url/process-url-observation {:type "url" :input-url "http://example.com/four"} domain-set (atom []))] + (let [result (url/process-url-observation + (assoc util/mock-context :domain-set #{"example.com"}) + {:type "url" :input-url "http://example.com/four"})] + (is (= result {:type "url" :input-url "http://example.com/four" :candidates [{:value "http://example.com/four" :type :landing-page-url}]})))) (testing "Landing Page URLs not on recognised domain list are not extracted." - (let [result (url/process-url-observation {:type "url" :input-url "http://bad-example.com/four"} domain-set (atom []))] + (let [result (url/process-url-observation + (assoc util/mock-context :domain-set #{"example.com"}) + {:type "url" :input-url "http://bad-example.com/four"})] + (is (= result {:type "url" :input-url "http://bad-example.com/four" :candidates []})))) (testing "Nil input handled ok." - (let [result (url/process-url-observation {:type "url" :input-url nil} domain-set (atom []))] + (let [result (url/process-url-observation + util/mock-context + {:type "url" :input-url nil})] + (is (= result {:type "url" :input-url nil :candidates []}))))) - diff --git a/test/event_data_percolator/test_util.clj b/test/event_data_percolator/test_util.clj index 34d9d75..a459002 100644 --- a/test/event_data_percolator/test_util.clj +++ b/test/event_data_percolator/test_util.clj @@ -1,6 +1,18 @@ (ns event-data-percolator.test-util (:require [clojure.data.json :as json])) +; Mock context. No tests should rely on the values here. +(def mock-context + {:id "20170101-myagent-1234" + :domain-set #{"example.com"} + :domain-list-artifact-version "http://d1v52iseus4yyg.cloudfront.net/a/crossref-domain-list/versions/1482489046417"}) + +; Mock evidence record. No tests should rely on the values here. +(def mock-evidence-record + {}) + +; These are Fake HTTP responses. + (defn doi-ok "Fake OK return from DOI Handle API." 
[handle] diff --git a/test/event_data_percolator/util/doi_test.clj b/test/event_data_percolator/util/doi_test.clj index 240288b..fe091da 100644 --- a/test/event_data_percolator/util/doi_test.clj +++ b/test/event_data_percolator/util/doi_test.clj @@ -10,16 +10,16 @@ (deftest ^:unit resolve-escaped (testing "resolve-doi-maybe-escaped should return an unescaped DOI, if input unescaped and it's valid" (fake/with-fake-http ["https://doi.org/api/handles/10.1007/s00423-015-1364-1" (util/doi-ok "10.1007/s00423-015-1364-1")] - (is (= (doi/resolve-doi-maybe-escaped "10.1007/s00423-015-1364-1") + (is (= (doi/resolve-doi-maybe-escaped util/mock-context "10.1007/s00423-015-1364-1") "10.1007/s00423-015-1364-1")))) (testing "resolve-doi-maybe-escaped should return an unescaped DOI, if input URL escaped and it's valid" (fake/with-fake-http ["https://doi.org/api/handles/10.1007/s00423-015-1364-1" (util/doi-ok "10.1007/s00423-015-1364-1")] - (is (= (doi/resolve-doi-maybe-escaped "10.1007%2fs00423-015-1364-1") + (is (= (doi/resolve-doi-maybe-escaped util/mock-context "10.1007%2fs00423-015-1364-1") "10.1007/s00423-015-1364-1") "Works with lower case escaped slash.") - (is (= (doi/resolve-doi-maybe-escaped "10.1007%2Fs00423-015-1364-1") + (is (= (doi/resolve-doi-maybe-escaped util/mock-context "10.1007%2Fs00423-015-1364-1") "10.1007/s00423-015-1364-1") "Works with upper case escaped slash.")))) @@ -44,37 +44,37 @@ ; Regression for https://github.com/CrossRef/event-data-percolator/issues/33 and 25 (deftest ^:unit qmarks-hash-not-included-resolve-doi (testing "Trailing question marks and hashes are discarded by resolve-doi" - (is (= (doi/resolve-doi "10.1111/nicc.12290#.wlw5yueemak.twitter") + (is (= (doi/resolve-doi util/mock-context "10.1111/nicc.12290#.wlw5yueemak.twitter") "10.1111/nicc.12290")) - (is (= (doi/resolve-doi "10.2752/136270497779613666?journalcode=rfft20") + (is (= (doi/resolve-doi util/mock-context "10.2752/136270497779613666?journalcode=rfft20") 
"10.2752/136270497779613666")) - (is (= (doi/resolve-doi "10.1111/(issn)1475-6811?hootpostid=cdae1d8ac3a881bcc0152faf4bb970a1") + (is (= (doi/resolve-doi util/mock-context "10.1111/(issn)1475-6811?hootpostid=cdae1d8ac3a881bcc0152faf4bb970a1") "10.1111/(issn)1475-6811")) - (is (= (doi/resolve-doi "10.1007/s11739-017-1643-7?utm_content=bufferd3cda&utm_medium=social&utm_source=twitter.com&utm_campaign=buffer") + (is (= (doi/resolve-doi util/mock-context "10.1007/s11739-017-1643-7?utm_content=bufferd3cda&utm_medium=social&utm_source=twitter.com&utm_campaign=buffer") "10.1007/s11739-017-1643-7")) ; https://github.com/CrossRef/event-data-percolator/issues/25 - (is (= (doi/resolve-doi "10.1007/s00127-017-1346-4?wt_mc=internal.event.1.sem.articleauthoronlinefirst") + (is (= (doi/resolve-doi util/mock-context "10.1007/s00127-017-1346-4?wt_mc=internal.event.1.sem.articleauthoronlinefirst") "10.1007/s00127-017-1346-4")))) (deftest ^:unit shortdoi-resolve-doi (testing "ShortDOIs are resolved to normal DOIs by resolve-doi" - (is (= (doi/resolve-doi "hvx") + (is (= (doi/resolve-doi util/mock-context "hvx") "10.5555/12345678")))) ; Regression for https://github.com/CrossRef/event-data-percolator/issues/31 (deftest ^:unit empty-doi-resolve-doi (testing "An empty DOI doesn't resolve as extant" - (is (= (doi/resolve-doi "") nil)) - (is (= (doi/validate-doi-dropping "") nil)) - (is (= (doi/validate-doi-dropping "https://doi.org/") nil)))) + (is (= (doi/resolve-doi util/mock-context "") nil)) + (is (= (doi/validate-doi-dropping util/mock-context "") nil)) + (is (= (doi/validate-doi-dropping util/mock-context "https://doi.org/") nil)))) ; Regression for https://github.com/CrossRef/event-data-percolator/issues/30 (deftest ^:unit nonexistent-doi-resolve-doi (testing "Nonexisting DOIs that return 303 from proxy shouldn't be accepted as extant." 
- (is (= (doi/resolve-doi "www.uclouvain.be/784506.html") nil)) - (is (= (doi/validate-doi-dropping "https://doi.org/www.uclouvain.be/784506.html") nil)))) + (is (= (doi/resolve-doi util/mock-context "www.uclouvain.be/784506.html") nil)) + (is (= (doi/validate-doi-dropping util/mock-context "https://doi.org/www.uclouvain.be/784506.html") nil))))