Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 0 additions & 13 deletions .env
Original file line number Diff line number Diff line change
Expand Up @@ -55,15 +55,6 @@ CKAN__PLUGINS=tracking image_view text_view datajson_validator geodatagov datago
# Enable ckanext-saml2 in local development
# add "saml2auth" to CKAN__PLUGINS list

# Harvest settings
CKAN__HARVEST__MQ__TYPE=redis
CKAN__HARVEST__MQ__HOSTNAME=redis
CKAN__HARVEST__MQ__PORT=6379
CKAN__HARVEST__MQ__REDIS_DB=1
CKAN__HARVEST__LOG_LEVEL=info

CKAN__HARVEST__STATUS_MAIL__ALL=True

CKANEXT__GEODATAGOV__BUREAU_CSV__URL=https://resources.data.gov/schemas/dcat-us/v1.1/omb_bureau_codes.csv
CKANEXT__GEODATAGOV__BUREAU_CSV__URL_DEFAULT=https://resources.data.gov/schemas/dcat-us/v1.1/omb_bureau_codes.csv

Expand Down Expand Up @@ -166,10 +157,6 @@ CKANEXT__SAML2AUTH__REQUESTED_AUTHN_CONTEXT_COMPARISON=exact
# Avoid double package_show call to add tracking info
CKANEXT__DATAGOVCATALOG__ADD_PACKAGES_TRACKING_INFO=false

# remove ckanext-harvest dependency
CKANEXT__DATAGOVTHEME__HARVEST_NEXT=True
CKANEXT__DATAGOVTHEME__HARVEST_ADMIN_URL=http://localhost:8080

# Render recent view using AJAX call to boost page loading speed
CKANEXT__DATAGOVTHEME__JS_RECENT_VIEW=true

Expand Down
10 changes: 4 additions & 6 deletions .github/workflows/ckan.yml
Original file line number Diff line number Diff line change
Expand Up @@ -22,12 +22,10 @@ on:
required: true
type: choice
options:
- 'ckan geodatagov check-stuck-jobs'
- 'ckan geodatagov db-solr-sync --dryrun'
- 'ckan geodatagov db-solr-sync --cleanup_solr'
- 'ckan geodatagov db-solr-sync --update_solr'
- 'ckan geodatagov db-solr-sync --cleanup_solr --update_solr'
- 'ckan harvester run'
- 'ckan geodatagov db-solr-sync-next --dryrun'
- 'ckan geodatagov db-solr-sync-next --cleanup_solr'
- 'ckan geodatagov db-solr-sync-next --update_solr'
- 'ckan geodatagov db-solr-sync-next --cleanup_solr --update_solr'
- 'ckan geodatagov sitemap-to-s3'
- 'ckan geodatagov tracking-update'
memory:
Expand Down
21 changes: 4 additions & 17 deletions .github/workflows/ckan_auto.yml
Original file line number Diff line number Diff line change
Expand Up @@ -5,18 +5,14 @@ on:
schedule:
- cron: '30 7 * * *' # Tracking Update -- every day at 2:30am EST
- cron: '0 2 * * *' # S3 Sitemap Update -- every day at 9pm EST
- cron: '4/15 * * * *' # Harvester Check -- every 15 mins
- cron: '0 3 * * *' # DB-Solr-Sync -- every day at 10pm EST
- cron: '30 6 * * *' # Check Stuck Jobs -- every day at 1:30am EST
- cron: '0 3 * * *' # DB-Solr-Sync-Next -- every day at 10pm EST

env:
ERROR: false
# Make sure 'schedule-cron' matches these variables.
SCHEDULE_TRACKING: '30 7 * * *'
SCHEDULE_SITEMAP: '0 2 * * *'
SCHEDULE_HARVESTING: '4/15 * * * *'
SCHEDULE_DBSOLR_SYNC: '0 3 * * *'
SCHEDULE_STUCK_JOBS: '30 6 * * *'

jobs:
setup-matrix:
Expand All @@ -39,46 +35,37 @@ jobs:
# schedule: sitemap update
# - Only run on [prod]
# - Run on [catalog-gather] app
# schedule: harvesting update
# - Only run on [development, staging, prod]
# schedule: stuck jobs check
# - Only run on [staging, prod]
# - Create Error issue: if >0 stuck jobs, automated_ckan_error.md
# schedule: db-solr-sync
# schedule: db-solr-sync-next
# - Only run on [staging, prod]
# - Create Error issue: if runtime longer than 30 mins
# - Create Informational issue: db-solr-sync-info.md
MATRIX=$(cat << MAT
{
"schedule": ["${{env.SCHEDULE_TRACKING}}", "${{env.SCHEDULE_SITEMAP}}",
"${{env.SCHEDULE_HARVESTING}}", "${{env.SCHEDULE_STUCK_JOBS}}",
"${{env.SCHEDULE_DBSOLR_SYNC}}"],
"environ": ["development", "staging", "prod"],
"include": [ {"app": "catalog-admin"},
{"error_seconds": 22000},
{"info_issue": false},
{"issue_template": ".github/automated_ckan_error.md"},
{"monitor": true},
{"schedule": "${{env.SCHEDULE_HARVESTING}}", "command": "ckan harvester run"},
{"schedule": "${{env.SCHEDULE_TRACKING}}", "command": "ckan geodatagov tracking-update"},
{"schedule": "${{env.SCHEDULE_TRACKING}}", "error_seconds": 12600},
{"schedule": "${{env.SCHEDULE_TRACKING}}", "info_issue": true},
{"schedule": "${{env.SCHEDULE_TRACKING}}", "issue_template": ".github/tracking-update-info.md"},
{"schedule": "${{env.SCHEDULE_SITEMAP}}", "command": "ckan geodatagov sitemap-to-s3"},
{"schedule": "${{env.SCHEDULE_SITEMAP}}", "app": "catalog-gather"},
{"schedule": "${{env.SCHEDULE_DBSOLR_SYNC}}", "command": "ckan geodatagov db-solr-sync"},
{"schedule": "${{env.SCHEDULE_DBSOLR_SYNC}}", "command": "ckan geodatagov db-solr-sync-next"},
{"schedule": "${{env.SCHEDULE_DBSOLR_SYNC}}", "error_seconds": 1800},
{"schedule": "${{env.SCHEDULE_DBSOLR_SYNC}}", "info_issue": true},
{"schedule": "${{env.SCHEDULE_DBSOLR_SYNC}}", "issue_template": ".github/db-solr-sync-info.md"},
{"schedule": "${{env.SCHEDULE_STUCK_JOBS}}", "command": "ckan geodatagov check-stuck-jobs"},
{"environ": "development", "ram": "1G"},
{"environ": "staging", "ram": "2500M"},
{"environ": "prod", "ram": "3G"}
],
"exclude": [ {"schedule": "${{env.SCHEDULE_SITEMAP}}", "environ": "development"},
{"schedule": "${{env.SCHEDULE_SITEMAP}}", "environ": "staging"},
{"schedule": "${{env.SCHEDULE_DBSOLR_SYNC}}", "environ": "development"},
{"schedule": "${{env.SCHEDULE_STUCK_JOBS}}", "environ": "development"}
{"schedule": "${{env.SCHEDULE_DBSOLR_SYNC}}", "environ": "development"}
],
}
MAT
Expand Down
2 changes: 1 addition & 1 deletion .profile
Original file line number Diff line number Diff line change
Expand Up @@ -119,7 +119,7 @@ export CKAN_SMTP_SERVER=$(vcap_get_service smtp .credentials.smtp_server)
export CKAN_SMTP_STARTTLS=True
export CKAN_SMTP_USER=$(vcap_get_service smtp .credentials.smtp_user)
export CKAN_SMTP_PASSWORD=$(vcap_get_service smtp .credentials.smtp_password)
export CKAN_SMTP_MAIL_FROM=harvester@$(vcap_get_service smtp .credentials.domain_arn | grep -o "ses-[[:alnum:]]\+.ssb.data.gov")
export CKAN_SMTP_MAIL_FROM=datagovhelp@gsa.gov
export [email protected]

# S3 settings
Expand Down
7 changes: 0 additions & 7 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -165,13 +165,6 @@ update-tracking-info:
# https://docs.ckan.org/en/2.8/maintaining/tracking.html
docker compose exec ckan ckan tracking update

harvest:
# Pass any of the following arguments to run them
# ARGS=run make harvest
# ARGS=gather-consumer make harvest
# ARGS=fetch-consumer make harvest
docker compose exec ckan ckan harvester $(ARGS)

vulnerability-check:
# Check for no usage of SSL_free_buffers. # Details: https://github.com/GSA/data.gov/issues/4781
! docker compose run --rm -T ckan grep -riI "SSL_free_buffers" /usr/local/lib/python3.10/site-packages/ && echo "Vulnerable SSL_free_buffers is not used"
3 changes: 0 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -220,9 +220,6 @@ This will start a new container, displaying the standard output in your
terminal. If you add a breakpoint in a source file in the `src` folder (`import
pdb; pdb.set_trace()`) you will be able to inspect it in this terminal next time
the code is executed.
If you are testing a harvest process (gather/fetch/run), try turning off the command
to start in the background in the `ckan/docker-entrypoint.d/10-setup-harvest.sh`.
Then, run the relevant command manually (`make harvest fetch-queue`) after startup.

## SAML2

Expand Down
3 changes: 1 addition & 2 deletions ckan/requirements.in
Original file line number Diff line number Diff line change
@@ -1,8 +1,7 @@
# CKAN requirements and extensions
git+https://github.com/GSA/[email protected]#egg=ckan
git+https://github.com/ckan/[email protected]#egg=ckanext-dcat
-e git+https://github.com/GSA/[email protected]#egg=ckanext-harvest
-e git+https://github.com/GSA/[email protected]#egg=ckanext-spatial
-e git+https://github.com/ckan/ckanext-spatial.git@v2.1.1#egg=ckanext-spatial
git+https://github.com/GSA/ckanext-saml2auth.git@ckan-2-11-datagov#egg=ckanext-saml2auth

-e git+https://github.com/GSA/ckanext-datagovcatalog.git@harvest-next#egg=ckanext_datagovcatalog
Expand Down
9 changes: 4 additions & 5 deletions ckan/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -11,16 +11,15 @@ cffi==1.17.1
chardet==5.2.0
charset-normalizer==3.4.1
ckan @ git+https://github.com/GSA/ckan.git@0482ac3a602815c32998904b46e490e4ec6fbfbf
-e git+https://github.com/GSA/ckanext-datagovcatalog.git@4065ed607509f6f99cbe692cb0d97f4008c3b1af#egg=ckanext_datagovcatalog
-e git+https://github.com/GSA/ckanext-datagovtheme.git@a59ee495c731459cdccc7785eef888f329da45b7#egg=ckanext_datagovtheme
-e git+https://github.com/GSA/ckanext-datagovcatalog.git@b60f69e4cb15d3cdce80698e5b4b2dc406a2ca05#egg=ckanext_datagovcatalog
-e git+https://github.com/GSA/ckanext-datagovtheme.git@8cf3e64df10e2ddba897f65fa2424b1d62a28428#egg=ckanext_datagovtheme
ckanext-datajson==0.1.28
ckanext-dcat @ git+https://github.com/ckan/ckanext-dcat@b8ebf24004cd3f3edb7f9d01c87c20259c102093
ckanext-envvars==0.0.6
-e git+https://github.com/GSA/ckanext-geodatagov.git@aa30619f197abe0f918d547fc81db327d520dc8f#egg=ckanext_geodatagov
-e git+https://github.com/GSA/ckanext-harvest.git@8da16a1f993f0fbf18f9fc810ab6ed7dfb7f871a#egg=ckanext_harvest
-e git+https://github.com/GSA/ckanext-geodatagov.git@0ec4ae5fc826693fa3a9e954e5fb77735f9e38e2#egg=ckanext_geodatagov
ckanext-metrics-dashboard==0.1.7
ckanext-saml2auth @ git+https://github.com/GSA/ckanext-saml2auth.git@99f35585c219a5cd39717b8c42cc54cdd959dfb4
-e git+https://github.com/GSA/ckanext-spatial.git@6d83a53efa1e9ff225daf4e7a5751b98367ee7f2#egg=ckanext_spatial
-e git+https://github.com/ckan/ckanext-spatial.git@v2.1.1#egg=ckanext_spatial
ckantoolkit==0.0.7
click==8.1.7
cryptography==44.0.2
Expand Down
38 changes: 6 additions & 32 deletions ckan/setup/ckan.ini
Original file line number Diff line number Diff line change
Expand Up @@ -230,15 +230,9 @@ ckan.jobs.timeout = 180
# Disable this in favor of CKANEXT__DATAGOVTHEME__JS_RECENT_VIEW
ckanext.datagovcatalog.add_packages_tracking_info = false

# remove ckanext-harvest dependency
ckanext.datagovtheme.harvest_next = true

# Render recent view using AJAX call to boost page loading speed
ckanext.datagovtheme.js_recent_view = true

# Max number of resources to be allowed in a dataset to be harvested
ckanext.datajson.max_resource_count = 800

## Spatial settings
ckanext.spatial.search_backend = solr-bbox
# Customize map widget
Expand All @@ -259,20 +253,6 @@ ckanext.spatial.common_map.type = custom
ckanext.spatial.common_map.custom_url = /maptiles/{z}/{x}/{y}.png
ckanext.spatial.common_map.attribution = <a href="http://openstreetmap.org/copyright">OpenStreetMap</a> contributors

## Harvest settings
# ckanext-harvest will use ckan.redis.url if redis configuration
# is not specified here.

# Mark as finished Jobs in 'Running' status after x minutes (4320 min = 72 hours)
ckan.harvest.timeout = 4320

# define the time frame in days to clean the harvest logs
ckan.harvest.log_timeframe = 180

ckan.harvest.mq.type = redis
ckanext.harvest.email = on
ckan.harvest.status_mail.all=True

## SAML2auth Settings
# TODO fetch and verify remote metadata https://github.com/GSA/datagov-deploy/issues/2860
ckanext.saml2auth.idp_metadata.location=local
Expand Down Expand Up @@ -320,7 +300,7 @@ ckanext-archiver.max_content_length=10240

## Logging configuration
[loggers]
keys = root, ckan, ckanext, werkzeug, saml2, model, harvest
keys = root, ckan, ckanext, werkzeug, saml2, model

[handlers]
keys = console,consoleerror
Expand All @@ -329,41 +309,35 @@ keys = console,consoleerror
keys = generic

[logger_root]
level = WARNING
level = DEBUG
handlers = console,consoleerror

[logger_werkzeug]
level = WARNING
level = DEBUG
handlers = console,consoleerror
qualname = werkzeug
propagate = 0

[logger_ckan]
level = INFO
level = DEBUG
handlers = console,consoleerror
qualname = ckan
propagate = 0

[logger_model]
level = INFO
level = DEBUG
handlers = console,consoleerror
qualname = ckan.model
propagate = 0

[logger_ckanext]
level = INFO
handlers = console,consoleerror
qualname = ckanext
propagate = 0

[logger_harvest]
level = DEBUG
handlers = console,consoleerror
qualname = ckanext
propagate = 0

[logger_saml2]
level = INFO
level = DEBUG
handlers = console,consoleerror
qualname = saml2
propagate = 0
Expand Down
27 changes: 0 additions & 27 deletions docs/harvesters.md

This file was deleted.

2 changes: 0 additions & 2 deletions docs/harvesters/waf-collections.md

This file was deleted.

2 changes: 0 additions & 2 deletions docs/harvesters/waf.md

This file was deleted.

2 changes: 2 additions & 0 deletions e2e/cypress/fixtures/child_dataset.json
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
{
"name": "child",
"title": "Child Dataset",
"notes": "This is a child dataset.",
"owner_org": "test-organization-collection",
"extras": [
{
Expand Down
2 changes: 2 additions & 0 deletions e2e/cypress/fixtures/parent_dataset.json
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
{
"name": "parent",
"title": "Parent Dataset",
"notes": "This is a parent dataset.",
"owner_org": "test-organization-collection",
"extras": [
{
Expand Down
1 change: 1 addition & 0 deletions e2e/cypress/integration/dataset.cy.js
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ describe('Dataset', () => {
"name": packageId,
"title": title,
"owner_org": orgId,
"notes": "This is a test dataset created for e2e testing.",
"extras": [
{
"key": "publisher",
Expand Down
2 changes: 0 additions & 2 deletions manifest.yml
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,6 @@ applications:
NEW_RELIC_CONFIG_FILE: /home/vcap/app/ckan/setup/newrelic.ini
CKAN_SITE_URL: https://((route-public))
CKAN___GOOGLEANALYTICS__ID: ((googleanalytics_id))
HARVEST_ADMIN_URL: ((harvest_admin_url))

- name: ((app_name))-admin
buildpacks:
Expand Down Expand Up @@ -61,7 +60,6 @@ applications:
NEW_RELIC_MONITOR_MODE: ((new_relic_monitor_mode))
NEW_RELIC_CONFIG_FILE: /home/vcap/app/ckan/setup/newrelic.ini
CKAN_SITE_URL: https://((route-external-admin))
HARVEST_ADMIN_URL: ((harvest_admin_url))

- name: ((app_name))-proxy
buildpacks:
Expand Down
Loading