diff --git a/README.md b/README.md index 1aa0f29..ccbc925 100644 --- a/README.md +++ b/README.md @@ -60,21 +60,8 @@ with some examples. ## Make logs available You will need to run the script on a computer where the log files you're trying to process are available on the file system for the script to access. -## Download the free IP to geolocation database -The geo-ip uses GeoLite2 data created by MaxMind and is available from - -Internet Archive -(you only need the country database in binary database format). - -GeoLite2 is a free IP geolocation database that must be installed. You can download the -database above. Choose the GeoLite2 Country database (binary, gzipped) and extract it to -the maxmind_geoip directory inside the application to use with default configuration, -or put it elsewhere and configure the path as mentioned below. - -Newer versions of the database cannot be used with the current version of the script since -additional licensing terms are required such as registering for accounts, having an auto-update -functionality and ensuring it runs regularly. The script has not been updated to take these -additional requirements into account. +## Download the free IP to geolocation database (Optional) +The geo-ip uses GeoLite2 data created by MaxMind to assign views and downloads to specific countries. It can be installed as discussed in the Dataverse Guides. ## Set up the configuration file The script takes a number of different configuration parameters in order to run correctly. See **config/config.yaml** for an example. To change the configuration you may edit it at config/config.yaml or you can put it at a different location and then specify it with an environment variable when starting the script like the example below.
diff --git a/config/config.py b/config/config.py index d7fa371..cc05728 100644 --- a/config/config.py +++ b/config/config.py @@ -156,7 +156,7 @@ def start_time(self): def end_time(self): return datetime.datetime.combine(self.end_date, datetime.datetime.min.time()) + datetime.timedelta(days=1) - # memorization of last day + # memoization of last day def last_day(self): """The last day available in the period, either yesterday if in same month, or else last day of month if it has passed""" if self.last_p_day is not None: @@ -170,7 +170,7 @@ def last_day(self): def month_complete(self): return (self.run_date >= self.end_time()) - # gets/memorizes the robots regexp + # gets/memoizes the robots regexp def robots_regexp(self): """Get the list of robots/crawlers from a list that is one per line from the URL and make a regular expression for the detection""" @@ -184,7 +184,7 @@ def robots_regexp(self): self.robots_reg = re.compile('|'.join(lines)) return self.robots_reg - # gets/memorizes the machines regexp + # gets/memoizes the machines regexp def machines_regexp(self): """Get the list of machines from a list that is one per line from the URL and make a regular expression for the detection""" @@ -198,7 +198,7 @@ def machines_regexp(self): self.machines_reg = re.compile('|'.join(lines)) return self.machines_reg - # gets/memorizes the hit-type regexp + # gets/memoizes the hit-type regexp def hit_type_regexp(self): """Make hit type regular expressions for investigation vs request""" if self.hit_type_reg is not None: diff --git a/config/config.yaml b/config/config.yaml index 7c19f54..1da2b47 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -33,8 +33,8 @@ path_types: # if something is a robot or machine user-agent. 
The text file has one regular expression per line #robots_url: https://raw.githubusercontent.com/CDLUC3/Make-Data-Count/master/user-agents/lists/robot.txt #machines_url: https://raw.githubusercontent.com/CDLUC3/Make-Data-Count/master/user-agents/lists/machine.txt -robots_url: https://raw.githubusercontent.com/IQSS/counter-processor/refs/heads/goto-gdcc/user-agents/lists/robots.txt -machines_url: https://raw.githubusercontent.com/IQSS/counter-processor/refs/heads/goto-gdcc/user-agents/lists/machine.txt +robots_url: https://raw.githubusercontent.com/IQSS/counter-processor/refs/heads/main/user-agents/lists/robots.txt +machines_url: https://raw.githubusercontent.com/IQSS/counter-processor/refs/heads/main/user-agents/lists/machine.txt # the year and month for the report you are creating. year_month: 2018-04 @@ -42,14 +42,14 @@ year_month: 2018-04 # Output formats are only json currently. tsv was planned but spec was never finalized. # Don't put the filename extension, the code will tack on the tsv or json extension for you. -output_file: tmp/test_out +output_file: tmp/make-data-count-report output_format: json # Allows the report to have volume (size) info, which DataCite doesn't accept yet output_volume: False # the name of the platform that goes into your reports -platform: Harvard Dataverse +platform: Dataverse # Don't put your api token in here if you're going to commit it, but put in separate secrets.yaml in same # directory as the config or else set a environment variable when starting up in order to override the key. diff --git a/documentation/step-by-step-install.md b/documentation/step-by-step-install.md index 27719bc..f8c97a7 100644 --- a/documentation/step-by-step-install.md +++ b/documentation/step-by-step-install.md @@ -3,7 +3,7 @@ ## Get the code
-git clone https://github.com/IQSS/counter-processor.git +git clone https://github.com/gdcc/counter-processor.git cd counter-processor git checkout branch-or-tag diff --git a/user-agents/lists/machine.txt b/user-agents/lists/machine.txt new file mode 100644 index 0000000..b595287 --- /dev/null +++ b/user-agents/lists/machine.txt @@ -0,0 +1,36 @@ +# this file is generated from a master file, please modify it and regenerate it with the generate_lists.rb script +^ruby$ +AddThis +aria2\/\d +CakePHP +ColdFusion +curl\/ +^\%?default\%?$ +Dispatch\/\d +EBSCO\sEJS\sContent\sServer +Fetch(\s|\+)API(\s|\+)Request +geturl +gvfs\/ +HttpComponents\/1.1 +http.?client +Indy Library +^java\/\d{1,2}.\d +libcurl +libhttp +libwww +lwp +Microsoft(\s|\+)URL(\s|\+)Control +Microsoft Office Existence Discovery +ng\/2\. +no_user_agent +pear.php.net +PHP\/ +PycURL +python +rss +^undefined$ +^unknown$ +URL2File +urllib +Wget +wordpress \ No newline at end of file diff --git a/user-agents/lists/robots.txt b/user-agents/lists/robots.txt new file mode 100644 index 0000000..9ef6f3f --- /dev/null +++ b/user-agents/lists/robots.txt @@ -0,0 +1,202 @@ +# this file is generated from a master file, please modify it and regenerate it with the generate_lists.rb script +bot +spider +crawl +[^a]fish +^voyager\/ +ADmantX +alexa +Alexandria(\s|\+)prototype(\s|\+)project +AllenTrack +almaden +appie +API[\+\s]scraper +Arachmo +architext +ArchiveTeam +arks +asterias +atomz +BDFetch +baidu +biglotron +BingPreview +binlar +Blackboard[\+\s]Safeassign +blaiz\-bee +bloglines +blogpulse +boitho\.com\-dc +bookmark\-manager +Brutus\/AET +BUbiNG +bwh3_user_agent +celestial +cfnetwork +checklink +checkprivacy +China\sLocal\sBrowse\s2\.6 +cloakDetect +coccoc\/1\.0 +collection@infegy.com +com\.plumanalytics +combine +contentmatch +ContentSmartz +convera +core +CoverScout +cursor +custo +DataCha0s\/2\.0 +daumoa +DeuSu\/ +Docoloc +docomo +DSurf +DTS Agent +easydl +EmailSiphon +EmailWolf +Embedly +EThOS\+\(British\+Library\)
+facebookexternalhit\/ +feedburner +FeedFetcher +feedreader +ferret +findlinks +Fulltext +Funnelback +G-i-g-a-b-o-t +Goldfire(\s|\+)Server +google +Grammarly +grub +gulliver +harvest +heritrix +holmes +htdig +htmlparser +HTTPFetcher +httrack +ia_archiver +ichiro +iktomi +ilse +^integrity\/\d +internetseer +intute +iSiloX +iskanie +jeeves +jobo +kyluka +larbin +lilina +link.?check +LinkLint-checkonly +^LinkParser\/ +^LinkSaver\/ +linkscan +LinkTiger +linkwalker +lipperhey +livejournal\.com +LOCKSS +ltx71 +lycos[\_\+] +mail.ru +mediapartners\-google +megite +MetaURI[\+\s]API\/\d\.\d +mimas +mnogosearch +moget +motor +MuscatFerre +myweb +nagios +^NetAnts\/\d +netcraft +netluchs +Ning +nomad +nutch +^oaDOI$ +ocelli +Offline(\s|\+)Navigator +onetszukaj +OurBrowser +panscient +parsijoo +EasyBib[\+\s]AutoCite[\+\s] +perman +pioneer +playmusic\.com +playstarmusic\.com +^Postgenomic(\s|\+)v2 +powermarks +proximic +Qwantify +Readpaper +redalert +Riddler +robozilla +scan4mail +scientificcommons +scirus +scooter +Scrapy\/\d +^scrutiny\/\d +SearchBloxIntra +shoutcast +SkypeUriPreview +slurp +sogou +speedy +Strider +summify +sunrise +Sysomos +T\-H\-U\-N\-D\-E\-R\-S\-T\-O\-N\-E +tailrank +Teleport(\s|\+)Pro +Teoma +titan +^Traackr\.com$ +Trove +twiceler +ucsd +ultraseek +urlaliasbuilder +validator +virus.detector +voila +^voltron$ +voyager\/ +w3af.org +Wanadoo +Web(\s|\+)Downloader +WebCloner +webcollage +WebCopier +Webinator +weblayers +Webmetrics +webmirror +webmon +webreaper +WebStripper +WebZIP +worm +www.gnip.com +WWW\-Mechanize +xenu +y!j +yacy +yahoo +yandex +zeus +zyborg \ No newline at end of file