Skip to content

Commit

Permalink
Add edxorg api course metadata to marts (#1480)
Browse files Browse the repository at this point in the history
* incorporate edxorg courses ingested from api to intermediate and mart

* update

* cleanup and fix errors

* fix the courserun_readable_id format
  • Loading branch information
rachellougee authored Feb 21, 2025
1 parent 0441c0b commit cda833d
Show file tree
Hide file tree
Showing 7 changed files with 258 additions and 8 deletions.
55 changes: 54 additions & 1 deletion src/ol_dbt/models/intermediate/edxorg/_int_edxorg__models.yml
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,9 @@ models:
- name: courserun_semester
description: str, The semester during which the course was launched e.g Fall 2020
- name: courserun_enrollment_start_date
description: date, registration open date for the course
description: timestamp, the course enrollment start date.
- name: courserun_enrollment_end_date
description: timestamp, the course enrollment end date.
- name: courserun_start_date
description: date, The date on which the course starts
- name: courserun_end_date
Expand All @@ -35,6 +37,26 @@ models:
description: str, The institution that is linked to the course. Values are currently
- name: courserun_instructors
description: str, List of instructors associated with the course
- name: course_topics
description: str, List of the academic subjects that this course covers.
- name: courserun_pace
description: str, the pacing of the course. Possible values are 'self_paced' or
'instructor_paced'.
- name: courserun_is_published
description: boolean, indicating whether the course run is published and open
for enrollment
- name: courserun_enrollment_mode
description: str, audit, verified, credit, or professional.
- name: courserun_availability
description: str, the availability of the course run. Possible values are 'Upcoming',
'Starting Soon', 'Current' or 'Archived'
- name: courserun_duration
description: str, the number of weeks to complete the course. e.g. '4 weeks'
- name: courserun_time_commitment
description: str, short description of the weekly time commitment (e.g.
'5-7 hours per week')
- name: courserun_estimated_hours
description: int, the estimated number of hours to complete the course.
- name: micromasters_program_id
description: int, foreign key to int__micromasters__programs
tests:
Expand Down Expand Up @@ -870,3 +892,34 @@ models:
tests:
- dbt_expectations.expect_compound_columns_to_be_unique:
column_list: ["program_uuid", "course_readable_id"]

- name: int__edxorg__mitx_product
description: MITx course available modes and prices on edX.org
columns:
- name: courserun_readable_id
description: str, unique identifier for the course run in {org}+{course number}+{run}
format.
tests:
- not_null
- name: courserun_mode
description: str, The course mode that the course offers. Possible values are
audit, credit, honor, professional education, or verified.
tests:
- not_null
- name: price
description: float, the cost in USD of a verified certificate, a professional
education certificate, or academic credit for the course.
- name: currency
description: str, the currency in which the course accepts payment. This value
is USD.
- name: upgrade_deadline
description: timestamp, the deadline for learners to upgrade from the audit track
to the verified certificate track.
- name: credit_provider
description: str, the institution that offers academic credit for learners who
pass the course.
- name: credit_hours
description: str, the number of credit hours that learners who pass the course
earn.
runs:
courserun_time_commitment:
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,90 @@
--- It also adds a field micromasters_program_id so that we can use it to get program requirements from MicroMasters


with runs as (
-- Rows of the BigQuery-based course run staging model, restricted to the platform
-- named by the "edxorg" project variable.
with runs_from_bigquery as (
    select *
    from {{ ref('stg__edxorg__bigquery__mitx_courserun') }}
    where courserun_platform = '{{ var("edxorg") }}'
)

-- All rows of the API-based course run staging model.
, courseruns as (
    select *
    from {{ ref('stg__edxorg__api__courserun') }}
)

-- All rows of the API-based course staging model.
, courses as (
    select *
    from {{ ref('stg__edxorg__api__course') }}
)

-- Flattens the JSON staff array of each API course run into one comma-separated
-- string of "first_name last_name" values, keyed by courserun_readable_id.
, instructors as (
    select
        courseruns.courserun_readable_id
        , array_join(
            array_agg(
                -- concat() returns NULL as soon as one argument is NULL, and array_join()
                -- skips NULL elements, so an instructor with only one name part would
                -- silently disappear. Default each part to '' and trim the result instead;
                -- a completely empty name is turned back into NULL so it is still skipped.
                nullif(
                    trim(
                        concat(
                            coalesce(json_extract_scalar(staff.instructor, '$.first_name'), '')
                            , ' '
                            , coalesce(json_extract_scalar(staff.instructor, '$.last_name'), '')
                        )
                    )
                    , ''
                )
            )
            , ', '
        ) as instructor_names
    from courseruns
    -- courserun_instructors is JSON text; a run whose value is NULL un-nests to zero rows
    -- and is therefore absent from this CTE.
    cross join unnest(cast(json_parse(courseruns.courserun_instructors) as array (json))) as staff (instructor) -- noqa
    group by courseruns.courserun_readable_id
)

-- API course runs enriched with course-level organizations and topics, plus the
-- flattened instructor names.
, runs_from_api as (
    select
        courseruns.*
        , courses.course_organizations
        , instructors.instructor_names
        -- course_topics is joined into a single comma-separated string
        , array_join(courses.course_topics, ', ') as course_topics
    from courseruns
    inner join courses
        on courseruns.course_readable_id = courses.course_readable_id
    -- Must be a left join: the instructors CTE is built with "cross join unnest" and so has
    -- no row for a course run without staff. An inner join here would drop those runs
    -- (and all of their API metadata) entirely; with a left join instructor_names is NULL.
    left join instructors
        on courseruns.courserun_readable_id = instructors.courserun_readable_id
)

-- Combines API and BigQuery course runs with a full outer join on the course run id.
-- Where both sources provide a value, the API value wins and BigQuery is the fallback.
, runs as (
    select
        -- taken from BigQuery only
        runs_from_bigquery.course_number
        , runs_from_bigquery.courserun_semester
        -- taken from the API only
        , runs_from_api.courserun_enrollment_end_on as courserun_enrollment_end_date
        , runs_from_api.courserun_short_description as courserun_description
        , runs_from_api.course_topics
        , runs_from_api.courserun_pace
        , runs_from_api.courserun_time_commitment
        , runs_from_api.courserun_estimated_hours
        , runs_from_api.courserun_duration
        , runs_from_api.courserun_enrollment_mode
        , runs_from_api.courserun_availability
        , runs_from_api.courserun_is_published
        -- API id is rewritten from course-v1:{org}+{course}+{run} to {org}/{course}/{run}
        -- so that it lines up with the BigQuery id
        , coalesce(
            replace(replace(runs_from_api.courserun_readable_id, 'course-v1:', ''), '+', '/')
            , runs_from_bigquery.courserun_readable_id
        ) as courserun_readable_id
        , coalesce(
            runs_from_api.courserun_is_self_paced
            , runs_from_bigquery.courserun_is_self_paced
        ) as courserun_is_self_paced
        , coalesce(
            runs_from_api.course_readable_id
            , runs_from_bigquery.course_readable_id
        ) as course_readable_id
        , coalesce(
            runs_from_api.courserun_title
            , runs_from_bigquery.courserun_title
        ) as courserun_title
        , coalesce(
            runs_from_api.courserun_marketing_url
            , runs_from_bigquery.courserun_url
        ) as courserun_url
        , coalesce(
            runs_from_api.course_organizations
            , runs_from_bigquery.courserun_institution
        ) as courserun_institution
        , coalesce(
            runs_from_api.instructor_names
            , runs_from_bigquery.courserun_instructors
        ) as courserun_instructors
        , coalesce(
            runs_from_api.courserun_enrollment_start_on
            , runs_from_bigquery.courserun_enrollment_start_date
        ) as courserun_enrollment_start_date
        , coalesce(
            runs_from_api.courserun_start_on
            , runs_from_bigquery.courserun_start_date
        ) as courserun_start_date
        , coalesce(
            runs_from_api.courserun_end_on
            , runs_from_bigquery.courserun_end_date
        ) as courserun_end_date
    from runs_from_api
    full outer join runs_from_bigquery
        on
            runs_from_bigquery.courserun_readable_id
            = replace(replace(runs_from_api.courserun_readable_id, 'course-v1:', ''), '+', '/')
)

--- MicroMasters's course_edx_key can either be {org}+{course_number} or course-v1:{org}+{course_number}, so it
-- can't be directly used to link courses between edx and MM, it needs to be formatted as {org}/{course_number}
, micromasters_courses as (
Expand All @@ -30,9 +108,19 @@ select
, runs.courserun_institution
, runs.courserun_instructors
, runs.courserun_enrollment_start_date
, runs.courserun_enrollment_end_date
, runs.courserun_start_date
, runs.courserun_end_date
, runs.courserun_is_self_paced
, runs.courserun_description
, runs.course_topics
, runs.courserun_is_published
, runs.courserun_pace
, runs.courserun_time_commitment
, runs.courserun_estimated_hours
, runs.courserun_duration
, runs.courserun_enrollment_mode
, runs.courserun_availability
, micromasters_courses.program_id as micromasters_program_id
, micromasters_courses.course_id as micromasters_course_id

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
-- One output row per element of a course run's enrollment-mode (seat) JSON array.
-- Course runs whose courserun_enrollment_modes is NULL un-nest to zero rows and are not emitted.
with courseruns as (
    select *
    from {{ ref('stg__edxorg__api__courserun') }}
)

select
    -- passed through unchanged from the staging model
    courseruns.courserun_readable_id
    , cast(json_extract_scalar(seats.seat_json, '$.price') as decimal(38, 2)) as price
    , json_extract_scalar(seats.seat_json, '$.type') as courserun_mode
    , json_extract_scalar(seats.seat_json, '$.currency') as currency
    -- the remaining seat attributes are kept as the varchar that json_extract_scalar returns
    , json_extract_scalar(seats.seat_json, '$.upgrade_deadline') as upgrade_deadline
    , json_extract_scalar(seats.seat_json, '$.credit_provider') as credit_provider
    , json_extract_scalar(seats.seat_json, '$.credit_hours') as credit_hours
from courseruns
cross join unnest(cast(json_parse(courseruns.courserun_enrollment_modes) as array (json))) as seats (seat_json) -- noqa
7 changes: 4 additions & 3 deletions src/ol_dbt/models/marts/combined/_marts__combined__models.yml
Original file line number Diff line number Diff line change
Expand Up @@ -503,15 +503,16 @@ models:
row_condition: "program_id =2 and platform_name ='edX.org'"

- name: marts__combined__products
description: combined course or program products from MITx Online and xPro
description: combined course or program products from MITx Online, xPro, or edX.org
columns:
- name: platform
description: str, application where data is from - MITx Online or xPro
description: str, application where data is from - MITx Online, xPro, or edX.org
tests:
- not_null
- name: product_platform
description: str, indicating the partners for course/program. The possible values
are Emeritus, xPRO, Simplilearn, Global Alumni, WHU, Susskind and MITx Online.
are Emeritus, xPRO, Simplilearn, Global Alumni, WHU, Susskind, MITx Online,
and edX.org
tests:
- not_null
- name: product_readable_id
Expand Down
72 changes: 72 additions & 0 deletions src/ol_dbt/models/marts/combined/marts__combined__products.sql
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,14 @@ with mitxonline_product as (
select * from {{ ref('int__mitxpro__program_runs') }}
)

-- edX.org seat/price rows, one per course run enrollment mode.
, edxorg_product as (
    select *
    from {{ ref('int__edxorg__mitx_product') }}
)

-- edX.org course run metadata.
, edxorg_runs as (
    select *
    from {{ ref('int__edxorg__mitx_courseruns') }}
)

, mitxonline_product_view as (
select
mitxonline_product.product_id
Expand Down Expand Up @@ -149,6 +157,39 @@ with mitxonline_product as (
where mitxpro_product.product_type in ('program', 'course run')
)

-- edX.org "verified" seats shaped like the other platforms' product views.
-- Course runs that already appear in mitxonline_product_view are excluded.
, edxorg_product_view as (
    select
        edxorg_runs.courserun_readable_id as product_readable_id
        , edxorg_runs.courserun_title as product_name
        , edxorg_runs.courserun_description as product_description
        , edxorg_product.price as list_price
        , edxorg_runs.courserun_start_date as start_on
        , edxorg_runs.courserun_end_date as end_on
        , edxorg_runs.courserun_enrollment_start_date as enrollment_start_on
        , edxorg_runs.courserun_enrollment_end_date as enrollment_end_on
        , edxorg_product.upgrade_deadline
        , edxorg_runs.courserun_pace as pace
        , edxorg_runs.courserun_duration as duration
        , edxorg_runs.courserun_time_commitment as time_commitment
        , edxorg_runs.course_topics as topics
        , edxorg_runs.courserun_instructors as instructors
        -- a NULL courserun_is_published (e.g. run without API data) is reported as not live
        , if(edxorg_runs.courserun_is_published, true, false) as is_live
        , if(
            edxorg_runs.micromasters_program_id is not null
            , 'MicroMasters Credential'
            , 'Certificate of Completion'
        ) as certification_type
    from edxorg_product
    -- int__edxorg__mitx_product passes the API id through unchanged, while
    -- int__edxorg__mitx_courseruns rewrites it by stripping 'course-v1:' and turning '+' into '/'.
    -- Apply the same rewrite here, otherwise the two ids never compare equal and this view is empty.
    inner join edxorg_runs
        on
            replace(replace(edxorg_product.courserun_readable_id, 'course-v1:', ''), '+', '/')
            = edxorg_runs.courserun_readable_id
    -- anti-join: keep only edX.org runs with no matching MITx Online product
    left join mitxonline_product_view
        on edxorg_product.courserun_readable_id = mitxonline_product_view.product_readable_id
    where
        edxorg_product.courserun_mode = 'verified'
        and mitxonline_product_view.product_readable_id is null
)

select
'{{ var("mitxonline") }}' as platform
, '{{ var("mitxonline") }}' as product_platform
Expand Down Expand Up @@ -208,3 +249,34 @@ select
, 'xPro' as offered_by
, is_live
from mitxpro_product_view

union all

-- edX.org branch of the combined product list. Attributes that edxorg_product_view
-- does not provide are emitted as NULL or as fixed literals.
select
    '{{ var("edxorg") }}' as platform
    , '{{ var("edxorg") }}' as product_platform
    , product_readable_id
    , product_name
    , null as product_id
    -- every edX.org row is reported as a course run
    , 'course run' as product_type
    , product_description
    , list_price
    , null as product_is_active
    , false as product_is_private
    , null as product_created_on
    , start_on
    , end_on
    , enrollment_start_on
    , enrollment_end_on
    , upgrade_deadline
    , pace
    , duration
    , time_commitment
    , certification_type
    , 'Online' as delivery
    , null as continuing_education_credits
    , topics
    , instructors
    , 'MITx' as offered_by
    , is_live
from edxorg_product_view
12 changes: 10 additions & 2 deletions src/ol_dbt/models/staging/edxorg/_stg__edxorg__models.yml
Original file line number Diff line number Diff line change
Expand Up @@ -1084,8 +1084,10 @@ models:
- name: courserun_languages
description: str, the language for this course run.
- name: courserun_pace
description: str, the pacing of the course. Possible values are 'self-paced' or
'instructor-paced'.
description: str, the pacing of the course. Possible values are 'self_paced' or
'instructor_paced'.
- name: courserun_is_self_paced
description: boolean, indicating whether or not the course is self paced
- name: courserun_enrollment_mode
description: str, audit, verified, credit, or professional.
- name: courserun_status
Expand All @@ -1101,6 +1103,12 @@ models:
description: array, list of instructors associated with the course
- name: courserun_duration
description: str, the number of weeks to complete the course. e.g. '4 weeks'
- name: courserun_time_commitment
description: str, short description of the weekly time commitment (e.g.
'5-7 hours per week')
- name: courserun_is_published
description: boolean, indicating whether the course run is published and open
for enrollment
- name: courserun_min_weekly_hours
description: int, the minimum number of estimated hours of effort per week.
- name: courserun_max_weekly_hours
Expand Down
16 changes: 15 additions & 1 deletion src/ol_dbt/models/staging/edxorg/stg__edxorg__api__courserun.sql
Original file line number Diff line number Diff line change
Expand Up @@ -17,18 +17,31 @@ with source as (
, min_effort as courserun_min_weekly_hours
, max_effort as courserun_max_weekly_hours
, estimated_hours as courserun_estimated_hours
, pacing_type as courserun_pace
, enrollment_type as courserun_enrollment_mode
, availability as courserun_availability
, status as courserun_status
, is_enrollable as courserun_is_enrollable
, pacing_type as courserun_pace
, if(staff = '[]', null, staff) as courserun_instructors
, if(seats = '[]', null, seats) as courserun_enrollment_modes
, json_query(image, 'lax $.url' omit quotes) as courserun_image_url
, if(status = 'published' and is_enrollable, true, false) as courserun_is_published
, case
when pacing_type = 'self_paced' then true
when pacing_type = 'instructor_paced' then false
end as courserun_is_self_paced
, case
when weeks_to_complete = 1 then cast(weeks_to_complete as varchar) || ' week'
when weeks_to_complete > 1 then cast(weeks_to_complete as varchar) || ' weeks'
end as courserun_duration
, case
when min_effort is not null and max_effort is not null
then cast(min_effort as varchar) || '-' || cast(max_effort as varchar) || ' hours per week'
when min_effort is not null and max_effort is null
then cast(min_effort as varchar) || ' hours per week'
when min_effort is null and max_effort is not null
then cast(max_effort as varchar) || ' hours per week'
end as courserun_time_commitment
, {{ cast_timestamp_to_iso8601('start_on') }} as courserun_start_on
, {{ cast_timestamp_to_iso8601('end_on') }} as courserun_end_on
, {{ cast_timestamp_to_iso8601('enrollment_start') }} as courserun_enrollment_start_on
Expand All @@ -40,3 +53,4 @@ with source as (
)

-- Exclude course runs whose title starts with "delete" (case-insensitive).
-- NULL titles are kept explicitly: "not like" evaluates to NULL for them, which the
-- where clause would otherwise treat as false and silently drop the row.
select * from cleaned
where
    courserun_title is null
    or lower(courserun_title) not like 'delete%'

0 comments on commit cda833d

Please sign in to comment.