Skip to content
This repository has been archived by the owner on Jul 26, 2019. It is now read-only.

Commit

Permalink
Convert to use Evidence Log #58
Browse files Browse the repository at this point in the history
 - use new common library that replaces status service with evidence log
 - new context object represents execution context for each evidence record
 - fit context into call path to all places where logging is required
 - tests
 - still need to unify logging vocabulary
  • Loading branch information
Joe Wass committed Aug 2, 2017
1 parent 2e94a04 commit fa4eccd
Show file tree
Hide file tree
Showing 33 changed files with 601 additions and 417 deletions.
4 changes: 2 additions & 2 deletions project.clj
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
(defproject event-data-percolator "0.3.0"
(defproject event-data-percolator "0.3.1"
:description "Event Data Percolator"
:url "http://eventdata.crossref.org/"
:license {:name "MIT License"
:url "https://opensource.org/licenses/MIT"}
:dependencies [[org.clojure/clojure "1.8.0"]
[org.clojure/core.cache "0.6.5"]
[org.clojure/core.async "0.2.395"]
[event-data-common "0.1.30"]
[event-data-common "0.1.32"]
[enlive "1.1.6"]
[org.clojure/core.memoize "0.5.8"]
[commons-codec/commons-codec "1.10"]
Expand Down
7 changes: 0 additions & 7 deletions release.sh

This file was deleted.

17 changes: 10 additions & 7 deletions src/event_data_percolator/action.clj
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@
"Take an Action, decorate with 'duplicate' field IFF there's an action ID.
If there's duplicate information (a chunk of JSON representing a previous Evidence Record), associate it with the Action, otherwise pass it through.
The store is updated with the values in the 'push' process."
[action evidence-record-id]
[context evidence-record action]

(if-let [id (:id action)]
(let [k (str "action/" id)
Expand All @@ -42,20 +42,23 @@
; Runs observation/process-observation over every entry in the action's :observations,
; stores the results under :processed-observations and removes :observations.
; NOTE(review): rendered diff without +/- markers. Where the argument vector or a binding
; appears twice, the first is presumably the replaced form and the second its replacement;
; confirm against the commit.
(defn process-observations-candidates
"Step Process all the observations of an Action to generate Candidates. Collect Candidates.
If it's a duplicate action, the candidate extractor won't run."
; Presumably the replaced signature: domain-set and web-trace-atom passed explicitly.
[action domain-set web-trace-atom]
; Presumably the replacement signature. evidence-record is not used in the lines shown.
[context evidence-record action]
(let [observations (:observations action)
; Value of the action's :duplicate key, forwarded to every observation.
duplicate? (:duplicate action)
; Form that forwards domain-set and web-trace-atom to each observation.
processed-observations (map #(observation/process-observation % duplicate? domain-set web-trace-atom) observations)]
; Form that forwards context instead. map is lazy, so nothing runs until the seq is consumed.
processed-observations (map
#(observation/process-observation context % duplicate?)
observations)]
(-> action
(assoc :processed-observations processed-observations)
(dissoc :observations))))

; Runs match/match-candidate over every candidate of every processed observation and
; stores the ones that produced a truthy :match under :matches.
; NOTE(review): rendered diff without +/- markers. Where a line appears twice, the first is
; presumably the replaced form and the second its replacement; confirm against the commit.
(defn match-candidates
"Attempt to match all candidates into DOIs."
; Presumably the replaced signature.
[action web-trace-atom]
; Presumably the replacement signature. evidence-record is not used in the lines shown.
[context evidence-record action]
; mapcat flattens the per-observation candidate results into a single seq.
(let [matches (mapcat (fn [observation]
; Form that passes the candidate first and web-trace-atom second.
(map #(match/match-candidate % web-trace-atom) (:candidates observation)))
; Form that passes context first and the candidate second.
(map #(match/match-candidate context %) (:candidates observation)))
(:processed-observations action))

; if the :match field wasn't set then it didn't work. Remove these.
successful (filter :match matches)]
(assoc action :matches successful)))
Expand All @@ -80,7 +83,7 @@

(defn dedupe-matches
"Deduplicate matches."
[action]
[context evidence-record action]
(let [matches (:matches action)

; Dedupe by the :match field (i.e. the matched DOI).
Expand Down Expand Up @@ -142,7 +145,7 @@

(defn create-events-for-action
"Update action to include a seq of Events generated from observations in the Action. Plus extra-events if included, and if there were any matches."
[evidence-record action]
[context evidence-record action]

(let [events-from-matches (map (partial create-event-from-match evidence-record action) (:matches action))
events-from-extras (when (not-empty (:matches action)) (map (partial create-event-from-extra-event evidence-record) (:extra-events action)))
Expand Down
65 changes: 30 additions & 35 deletions src/event_data_percolator/evidence_record.clj
Original file line number Diff line number Diff line change
Expand Up @@ -68,74 +68,69 @@
(catch RuntimeException e e)))

; Applies f to every action on every page of the evidence record and returns the record
; with the mapped actions in place. Both maps are lazy.
; NOTE(review): rendered diff without +/- markers. The docstring, argument vector and
; :actions line each appear twice, presumably replaced form first; confirm against the commit.
(defn map-actions
"Map over actions within an Input Evidence Record, leaving the rest intact."
[f evidence-record]
"Map over actions within an Input Evidence Record, leaving the rest intact.
call (f context evidence-record action)"
[context f evidence-record]
(assoc evidence-record
:pages (map (fn [page]
(assoc page
; One-argument form: f receives only the action.
:actions (map f (:actions page)))) (:pages evidence-record))))
; Three-argument form: f receives context, the evidence record as passed in here, then the action.
:actions (map #(f context evidence-record %) (:actions page)))) (:pages evidence-record))))

; Returns the evidence record with :url set to (generate-url id), where id is the record's :id.
; NOTE(review): rendered diff without +/- markers. [evidence-record] is presumably the
; replaced argument vector and [context evidence-record] its replacement; confirm.
; context is not used in the body shown.
(defn url
"Associate a URL based on the ID."
[evidence-record]
[context evidence-record]
(assoc evidence-record
:url (generate-url (:id evidence-record))))

; Logs the record id at debug level, then applies action/dedupe-action to every action
; via map-actions.
; NOTE(review): rendered diff without +/- markers. Where a line appears twice, the first is
; presumably the replaced form and the second its replacement; confirm against the commit.
(defn dedupe-actions
"Dedupe actions in an input Evidence Record."
[evidence-record]
[context evidence-record]
(log/debug "Deduping in " (:id evidence-record))
; Form that wraps dedupe-action in a closure passing the action and the record's :id.
(map-actions #(action/dedupe-action % (:id evidence-record)) evidence-record))
; Form that hands dedupe-action to map-actions directly, along with context.
(map-actions context action/dedupe-action evidence-record))

; Logs the record id at debug level, then applies action/process-observations-candidates
; to every action via map-actions.
; NOTE(review): rendered diff without +/- markers. Where a line appears twice, the first is
; presumably the replaced form and the second its replacement; confirm against the commit.
(defn candidates
"Produce candidates in input evidence-record."
[evidence-record domain-set web-trace-atom]
[context evidence-record]
(log/debug "Candidates in " (:id evidence-record))
; Form that closes over domain-set and web-trace-atom.
(map-actions #(action/process-observations-candidates % domain-set web-trace-atom) evidence-record))
; Form that passes context through map-actions instead.
(map-actions context action/process-observations-candidates evidence-record))

; Logs the record id at debug level, then applies action/match-candidates to every action
; via map-actions.
; NOTE(review): rendered diff without +/- markers. Where a line appears twice, the first is
; presumably the replaced form and the second its replacement; confirm against the commit.
(defn match
"Match candidates in input evidence-record."
[evidence-record web-trace-atom]
[context evidence-record]
(log/debug "Match in " (:id evidence-record))
; Form that closes over web-trace-atom.
(map-actions #(action/match-candidates % web-trace-atom) evidence-record))
; Form that passes context through map-actions instead.
(map-actions context action/match-candidates evidence-record))

; Logs the record id at debug level, then applies action/dedupe-matches to every action
; via map-actions.
; NOTE(review): rendered diff without +/- markers. Where a line appears twice, the first is
; presumably the replaced form and the second its replacement; confirm against the commit.
(defn dedupe-matches
"Dedupe matches WITHIN an action."
[evidence-record]
[context evidence-record]
(log/debug "Dedupe in " (:id evidence-record))
; Two-argument call to map-actions (f, evidence-record).
(map-actions action/dedupe-matches evidence-record))
; Three-argument call to map-actions (context, f, evidence-record).
(map-actions context action/dedupe-matches evidence-record))

; Logs the record id at debug level, then applies action/create-events-for-action to every
; action via map-actions.
; NOTE(review): rendered diff without +/- markers. Where a line appears twice, the first is
; presumably the replaced form and the second its replacement; confirm against the commit.
(defn events
"Generate an Event for each candidate match, update extra Events."
[evidence-record]
[context evidence-record]
(log/debug "Events in " (:id evidence-record))
; Threaded form: ->> supplies evidence-record as the last argument to map-actions, and
; partial pre-binds the same record as create-events-for-action's first argument.
(->> evidence-record
(map-actions (partial action/create-events-for-action evidence-record))))
; Direct form: map-actions itself supplies context and the record to the function.
(map-actions context action/create-events-for-action evidence-record))

; Software version string, read once at load time from the JVM system property
; "event-data-percolator.version". System/getProperty yields nil when the property is unset.
(def percolator-version (System/getProperty "event-data-percolator.version"))

; Runs the whole pipeline over one evidence record: url, dedupe-actions, candidates, match,
; dedupe-matches, stamping of :percolator metadata, then events. The result is then fully
; realized before being returned.
; NOTE(review): rendered diff without +/- markers. Two versions of the body are interleaved
; below; the one built around web-trace-atom is presumably the replaced form and the one
; built around context its replacement. Confirm against the commit.
(defn process
; Signature of the web-trace-atom version.
[evidence-record domain-artifact-version domain-set]
; an atom that's passed around to functions that might want to log which URLs they access
; and their response codes.
(let [web-trace-atom (atom [])
; -> threads the record as the FIRST argument of each step.
result (->
evidence-record
url
dedupe-actions
(candidates domain-set web-trace-atom)
(match web-trace-atom)
dedupe-matches
(assoc-in [:percolator :artifacts :domain-set-artifact-version] domain-artifact-version)
(assoc-in [:percolator :software-version] percolator-version)
events)
; Signature of the context version.
[context evidence-record]
; ->> threads the record as the LAST argument, so each step takes context first.
(let [result (->>
evidence-record
(url context)
(dedupe-actions context)
(candidates context)
(match context)
(dedupe-matches context)
; assoc-in needs the record as its first argument, so these two steps are wrapped in
; anonymous functions. The version is read from context's :domain-list-artifact-version
; key but stored under :domain-set-artifact-version.
(#(assoc-in % [:percolator :artifacts :domain-set-artifact-version] (:domain-list-artifact-version context)))
(#(assoc-in % [:percolator :software-version] percolator-version))
(events context))

; There are lazy sequences in here. Force the entire structure to be realized.
; This is necessary because the web-trace-atom's value is observed at this point,
; so we need to be confident that everything that's going to happen, has happened.
realized (clojure.walk/postwalk identity result)
; web-trace-atom version only: attach the atom's current value under :web-trace.
with-trace (assoc realized :web-trace @web-trace-atom)]
(log/debug "Finished processing" (:id with-trace))
with-trace))
; context version: postwalk with identity walks every nested value, forcing lazy seqs.
realized (clojure.walk/postwalk identity result)]
(log/debug "Finished processing" (:id realized))
realized))

(defn extract-all-events
"Extract all events for pushing downstream."
Expand Down
7 changes: 4 additions & 3 deletions src/event_data_percolator/match.clj
Original file line number Diff line number Diff line change
Expand Up @@ -19,13 +19,14 @@
:landing-page-url landing-page-url/match-landing-page-url-candidate})

; Fallback processor: returns the candidate with :match nil and
; :error :unrecognised-candidate-type.
; NOTE(review): rendered diff without +/- markers. [candidate] is presumably the replaced
; argument vector and [context candidate] its replacement; confirm. context is not used.
(defn match-unrecognised-type
[candidate]
[context candidate]
(assoc candidate
:match nil
:error :unrecognised-candidate-type))

; Dispatches a candidate to the processor registered for its :type and returns that
; processor's result.
; NOTE(review): rendered diff without +/- markers. Where a line appears twice, the first is
; presumably the replaced form and the second its replacement; confirm against the commit.
(defn match-candidate
[candidate web-trace-atom]
[context candidate]
; Look up the processor by :type; match-unrecognised-type is the not-found default.
(let [f (candidate-processors (:type candidate) match-unrecognised-type)]
; Call with (candidate, web-trace-atom).
(f candidate web-trace-atom)))
; Call with (context, candidate).
(f context candidate)))


4 changes: 2 additions & 2 deletions src/event_data_percolator/matchers/doi_url.clj
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
[event-data-percolator.matchers.plain-doi :as plain-doi]))

; Sets :match on the candidate by passing its :value through crdoi/non-url-doi and then
; plain-doi/match-plain-doi.
; NOTE(review): rendered diff without +/- markers. Where a line appears twice, the first is
; presumably the replaced form and the second its replacement; confirm against the commit.
(defn match-doi-url-candidate
[candidate web-trace-atom]
[context candidate]
(assoc candidate
; Thread-first form: match-plain-doi receives the DOI as its only argument.
:match (-> candidate :value crdoi/non-url-doi plain-doi/match-plain-doi)))
; Thread-last form: the DOI lands after context, i.e. (match-plain-doi context doi).
:match (->> candidate :value crdoi/non-url-doi (plain-doi/match-plain-doi context))))

43 changes: 21 additions & 22 deletions src/event_data_percolator/matchers/landing_page_url.clj
Original file line number Diff line number Diff line change
Expand Up @@ -22,19 +22,19 @@

; Parses the URL's query parameters, keeps the values that wholly match whole-doi-re,
; validates them, and passes the first surviving one to normalize-doi-if-exists.
; NOTE(review): rendered diff without +/- markers. Where a line appears twice, the first is
; presumably the replaced form and the second its replacement; confirm against the commit.
(defn try-from-get-params
"If there's a DOI in a get parameter of a URL, find it"
[url]
[context url]
(try
(let [params (-> url cemerick-url/query->map clojure.walk/keywordize-keys)
; Parameter names are ignored; only values that match the whole-DOI regex are kept.
doi-like-values (keep (fn [[k v]] (when (re-matches whole-doi-re v) v)) params)
; keep drops any nil results of validate-cached. This form calls it with the value only.
extant (keep doi/validate-cached doi-like-values)]
; This form pre-binds context as validate-cached's first argument.
extant (keep (partial doi/validate-cached context) doi-like-values)]
(-> extant first normalize-doi-if-exists))

; Some things look like URLs but turn out not to be.
(catch IllegalArgumentException _ nil)))

(defn try-doi-from-url-text
"Match an embedded DOI, try various treatments to make it fit."
[url]
[context url]
(let [matches (map second (re-seq doi-re url))

last-slash (map #(clojure.string/replace % #"^(10\.\d+/(.*))/.*$" "$1") matches)
Expand All @@ -56,15 +56,15 @@

candidates (distinct (concat last-slash first-slash semicolon hashchar question-mark amp-mark))

extant (keep doi/validate-cached candidates)]
extant (keep (partial doi/validate-cached context) candidates)]
(-> extant first normalize-doi-if-exists)))

; Finds candidate PIIs in the URL text, validates each candidate's :value, and returns the
; first result.
; NOTE(review): rendered diff without +/- markers. Where a line appears twice, the first is
; presumably the replaced form and the second its replacement; confirm against the commit.
; NOTE(review): this uses map followed by first, where try-from-get-params uses keep. If
; validate-pii can return nil, a nil for the first candidate is returned even when a later
; candidate would validate. Confirm whether that is intended.
(defn try-pii-from-url-text
[url]
[context url]
(->>
url
pii/find-candidate-piis
; Form that calls validate-pii with the value only.
(map (comp pii/validate-pii :value))
; Form that pre-binds context as validate-pii's first argument.
(map (comp (partial pii/validate-pii context) :value))
first))

(def interested-tag-attrs
Expand All @@ -90,13 +90,12 @@

; NOTE(review): rendered diff without +/- markers. The vector opening appears twice (a bare
; "[" followed by a comment line, then "[" and the comment on one line), which looks like a
; formatting-only change; confirm against the commit. The vector holds a single selector.
(def interested-tag-text
"List of selectors whose text content we're interested in."
[
; e.g. http://jnci.oxfordjournals.org/content/108/6/djw160.full
[; e.g. http://jnci.oxfordjournals.org/content/108/6/djw160.full
"span.slug-doi"])

(defn try-fetched-page-metadata-content
"Extract DOI from Metadata tags."
[text]
[context text]
(try
(when text
(let [document (Jsoup/parse text)
Expand All @@ -118,7 +117,7 @@
interested-values (distinct (concat interested-attr-values interested-text-values))

; Try to normalize by removing recognised prefixes, then resolve
extant (keep (comp doi/validate-cached crdoi/non-url-doi) interested-values)]
extant (keep (comp (partial doi/validate-cached context) crdoi/non-url-doi) interested-values)]

(-> extant first normalize-doi-if-exists)))
; We're getting text from anywhere. Anything could happen.
Expand All @@ -128,8 +127,8 @@
nil))))

; Fetches the URL with web/fetch-respecting-robots, takes the :body of the response and
; hands it to try-fetched-page-metadata-content.
; NOTE(review): rendered diff without +/- markers. Where a line appears twice, the first is
; presumably the replaced form and the second its replacement; confirm against the commit.
(defn try-fetched-page-metadata
[url web-trace-atom]
; Thread-first form: url is the first argument to the fetch, web-trace-atom the second.
(-> url (web/fetch-respecting-robots web-trace-atom) :body try-fetched-page-metadata-content))
[context url]
; Thread-last form: context comes first in both calls, and the threaded value comes last.
(->> url (web/fetch-respecting-robots context) :body (try-fetched-page-metadata-content context)))

; Redis DB number, taken from the :landing-page-cache-redis-db key of env (default "0") and
; parsed as an integer. Wrapped in delay, so the lookup and parse happen on first deref.
(def redis-db-number (delay (Integer/parseInt (get env :landing-page-cache-redis-db "0"))))

Expand All @@ -151,14 +150,14 @@
; This one function is responsible for all outgoing web traffic. Cache its results.
; Other results are derived algorithmically, so there's no use caching those.
(defn try-fetched-page-metadata-cached
[url web-trace-atom]
[context url]
(if skip-cache
(try-fetched-page-metadata url web-trace-atom)
(try-fetched-page-metadata context url)
(if-let [cached-result (store/get-string @redis-cache-store url)]
(if (= cached-result "NULL")
nil
cached-result)
(if-let [result (try-fetched-page-metadata url web-trace-atom)]
(if-let [result (try-fetched-page-metadata context url)]
; success
(do
(redis/set-string-and-expiry-seconds @redis-cache-store url @success-expiry-seconds result)
Expand All @@ -177,15 +176,15 @@

; Tries four matching strategies in order and returns the first truthy result, or the last
; strategy's falsey result if none succeeds.
; NOTE(review): rendered diff without +/- markers. The argument vector and the four strategy
; calls each appear twice, presumably replaced forms first; confirm against the commit.
(defn match-landing-page-url
"Try a multitude of ways to match, cheapest first."
[url web-trace-atom]
[context url]
; Step through lazy seq, an item at a time.
; or short-circuits, so a later strategy only runs when every earlier one returned nil or false.
(or
; Forms without context; only the cached page fetch takes web-trace-atom.
(try-from-get-params url)
(try-doi-from-url-text url)
(try-pii-from-url-text url)
(try-fetched-page-metadata-cached url web-trace-atom)))
; Forms that take context as the first argument of every strategy.
(try-from-get-params context url)
(try-doi-from-url-text context url)
(try-pii-from-url-text context url)
(try-fetched-page-metadata-cached context url)))

; Sets :match on the candidate to the result of match-landing-page-url for its :value.
; NOTE(review): rendered diff without +/- markers. Where a line appears twice, the first is
; presumably the replaced form and the second its replacement; confirm against the commit.
(defn match-landing-page-url-candidate
[candidate web-trace-atom]
[context candidate]
(assoc candidate
; Call with (value, web-trace-atom).
:match (match-landing-page-url (:value candidate) web-trace-atom)))
; Call with (context, value).
:match (match-landing-page-url context (:value candidate))))
4 changes: 2 additions & 2 deletions src/event_data_percolator/matchers/pii.clj
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,6 @@
(:require [event-data-percolator.util.pii :as pii]))

; Sets :match on the candidate to the result of pii/validate-pii for its :value.
; NOTE(review): rendered diff without +/- markers. Where a line appears twice, the first is
; presumably the replaced form and the second its replacement; confirm against the commit.
; web-trace-atom is not used in the body shown.
(defn match-pii-candidate
[candidate web-trace-atom]
[context candidate]
(assoc candidate
; Call with the value only.
:match (pii/validate-pii (:value candidate))))
; Call with (context, value).
:match (pii/validate-pii context (:value candidate))))
8 changes: 4 additions & 4 deletions src/event_data_percolator/matchers/plain_doi.clj
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,11 @@

; Validates the DOI with doi/validate-cached and, when that returns a truthy value, returns
; it passed through crdoi/normalise-doi. Otherwise returns nil.
; NOTE(review): rendered diff without +/- markers. Where a line appears twice, the first is
; presumably the replaced form and the second its replacement; confirm against the commit.
(defn match-plain-doi
"Return a canonical DOI if this is a valid, extant DOI."
[plain-doi]
; Form that validates with the DOI only.
(when-let [validated (doi/validate-cached plain-doi)]
[context plain-doi]
; Form that validates with (context, DOI).
(when-let [validated (doi/validate-cached context plain-doi)]
(crdoi/normalise-doi validated)))

; Sets :match on the candidate to the result of match-plain-doi for its :value.
; NOTE(review): rendered diff without +/- markers. Where a line appears twice, the first is
; presumably the replaced form and the second its replacement; confirm against the commit.
; web-trace-atom is not used in the body shown.
(defn match-plain-doi-candidate
[candidate web-trace-atom]
[context candidate]
(assoc candidate
; Call with the value only.
:match (match-plain-doi (:value candidate))))
; Call with (context, value).
:match (match-plain-doi context (:value candidate))))
8 changes: 4 additions & 4 deletions src/event_data_percolator/matchers/shortdoi_url.clj
Original file line number Diff line number Diff line change
Expand Up @@ -5,18 +5,18 @@

; Parses the input as a URL, takes its path without the leading slash, validates that with
; doi/validate-cached and returns the normalised result, or nil if validation fails.
; NOTE(review): rendered diff without +/- markers. Where a line appears twice, the first is
; presumably the replaced form and the second its replacement; confirm against the commit.
(defn match-shortdoi-url
"Return a canonical DOI if this is a valid, extant Short DOI."
[short-doi-url]
[context short-doi-url]
; Any exception from the URL constructor is swallowed, leaving valid-url nil.
(let [valid-url (try (new URL short-doi-url) (catch Exception _ nil))
shortdoi-path (when valid-url
(let [the-path (.getPath valid-url)]
; Drop leading slash, unless there isn't a path.
(when-not (clojure.string/blank? the-path)
(.substring the-path 1))))
; shortdoi-path is nil when the URL did not parse or had a blank path. It is passed to
; validate-cached as-is either way. This form passes the path only.
validated (doi/validate-cached shortdoi-path)]
; This form passes (context, path).
validated (doi/validate-cached context shortdoi-path)]
(when validated
(crdoi/normalise-doi validated))))

; Sets :match on the candidate to the result of match-shortdoi-url for its :value.
; NOTE(review): rendered diff without +/- markers. Where a line appears twice, the first is
; presumably the replaced form and the second its replacement; confirm against the commit.
; web-trace-atom is not used in the body shown.
(defn match-shortdoi-url-candidate
[candidate web-trace-atom]
[context candidate]
(assoc candidate
; Call with the value only.
:match (match-shortdoi-url (:value candidate))))
; Call with (context, value).
:match (match-shortdoi-url context (:value candidate))))
Loading

0 comments on commit fa4eccd

Please sign in to comment.