Merge pull request #4 from jipc3/construccion

name: Dynamic Databricks Notebook Deploy

on:
  push:
    branches:
      - main

jobs:
  deploy:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout Repository
        uses: actions/checkout@v3

      - name: Install jq & curl
        run: sudo apt-get update && sudo apt-get install -y jq curl
      - name: Export multiple notebooks (raw)
        run: |
          ORIGIN_HOST=${{ secrets.DATABRICKS_ORIGIN_HOST }}
          ORIGIN_TOKEN=${{ secrets.DATABRICKS_ORIGIN_TOKEN }}
          NOTEBOOK_BASE="/Workspace/Users/jeremypalma2022@gmail.com/Proyect_Databricks_GitHub"
          NOTEBOOK_PATHS=(
            "process/Ingest_supercias_compania"
            "process/Ingest_supercias_ranking"
            "process/Ingest_supercias_sector"
            "process/Ingest_supercias_segmento"
            "process/Load_supercias"
            "process/Transform_supercias"
            "scripts/Preparacion_Ambiente"
          ) # Add more paths as needed
          mkdir -p notebooks_to_deploy
          for nb_path in "${NOTEBOOK_PATHS[@]}"; do
            nb=$(basename "$nb_path")
            echo "Exporting $nb_path in raw mode..."
            curl -s -X GET \
              -H "Authorization: Bearer $ORIGIN_TOKEN" \
              "$ORIGIN_HOST/api/2.0/workspace/export?path=$NOTEBOOK_BASE/$nb_path&format=SOURCE&direct_download=true" \
              --output "notebooks_to_deploy/$nb.py"
          done
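      # With direct_download=true the export endpoint streams the raw notebook source, so a
      # failed export (for example a wrong path) leaves a JSON error body in the .py file
      # instead of code. A minimal sanity check, assuming such errors always carry an
      # "error_code" field, would be something like:
      #   if grep -l '"error_code"' notebooks_to_deploy/*.py; then echo "An export failed"; exit 1; fi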
      - name: Deploy notebooks to Destination Workspace
        run: |
          DEST_HOST=${{ secrets.DATABRICKS_DEST_HOST }}
          DEST_TOKEN=${{ secrets.DATABRICKS_DEST_TOKEN }}
          DEST_BASE="/prod/scripts/main"
          for file in notebooks_to_deploy/*.py; do
            name=$(basename "$file" .py)
            dest_path="$DEST_BASE/$name"
            echo "Creating folder $DEST_BASE if it does not exist..."
            # mkdirs behaves like mkdir -p, so calling it on every iteration is harmless
            curl -s -X POST \
              -H "Authorization: Bearer $DEST_TOKEN" \
              -H "Content-Type: application/json" \
              -d "{\"path\":\"$DEST_BASE\"}" \
              "$DEST_HOST/api/2.0/workspace/mkdirs"
            echo "Importing $file → $dest_path"
            response=$(curl -s -X POST \
              -H "Authorization: Bearer $DEST_TOKEN" \
              -H "Content-Type: multipart/form-data" \
              -F "path=$dest_path" \
              -F "format=SOURCE" \
              -F "language=PYTHON" \
              -F "overwrite=true" \
              -F "content=@$file" \
              "$DEST_HOST/api/2.0/workspace/import")
            echo "Response: $response"
          done
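      # Each import can be confirmed with the workspace get-status endpoint; a minimal sketch,
      # run per notebook with the same $dest_path used in the loop above:
      #   curl -s -H "Authorization: Bearer $DEST_TOKEN" \
      #     "$DEST_HOST/api/2.0/workspace/get-status?path=$dest_path" | jq '{path, object_type, language}'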
      - name: Check if workflow exists and delete if necessary
        run: |
          DEST_HOST=${{ secrets.DATABRICKS_DEST_HOST }}
          DEST_TOKEN=${{ secrets.DATABRICKS_DEST_TOKEN }}
          WORKFLOW_NAME="WF_ADB"
          echo "Checking whether the workflow exists: $WORKFLOW_NAME"
          # List all workflows and search by name
          workflows_response=$(curl -s -X GET \
            -H "Authorization: Bearer $DEST_TOKEN" \
            "$DEST_HOST/api/2.1/jobs/list")
          # Extract the job_id if the workflow already exists
          existing_job_id=$(echo "$workflows_response" | jq -r --arg name "$WORKFLOW_NAME" '.jobs[]? | select(.settings.name == $name) | .job_id')
          if [ "$existing_job_id" != "" ] && [ "$existing_job_id" != "null" ]; then
            echo "Workflow found with ID: $existing_job_id. Deleting..."
            delete_response=$(curl -s -X POST \
              -H "Authorization: Bearer $DEST_TOKEN" \
              -H "Content-Type: application/json" \
              -d "{\"job_id\": $existing_job_id}" \
              "$DEST_HOST/api/2.1/jobs/delete")
            echo "Delete response: $delete_response"
          else
            echo "No existing workflow found with name: $WORKFLOW_NAME"
          fi
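      # Caveat: /api/2.1/jobs/list returns jobs in pages (20 per page by default), so on a
      # workspace with many jobs the name lookup above could miss WF_ADB. A hedged alternative,
      # assuming the 2.1 `name` filter parameter is available on this workspace:
      #   curl -s -H "Authorization: Bearer $DEST_TOKEN" \
      #     "$DEST_HOST/api/2.1/jobs/list?name=$WORKFLOW_NAME" | jq -r '.jobs[0].job_id // empty'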
      - name: Create Databricks Workflow WF_ADB
        run: |
          DEST_HOST=${{ secrets.DATABRICKS_DEST_HOST }}
          DEST_TOKEN=${{ secrets.DATABRICKS_DEST_TOKEN }}
          DEST_BASE="/prod/scripts/main"
          echo "Creating workflow: WF_ADB"
          # Build the workflow JSON
          cat > workflow_config.json << 'EOF'
          {
            "name": "WF_ADB",
            "format": "MULTI_TASK",
            "tasks": [
              {
                "task_key": "Preparacion_Ambiente",
                "description": "Runs notebook Preparacion_Ambiente",
                "notebook_task": {
                  "notebook_path": "/prod/scripts/main/Preparacion_Ambiente",
                  "source": "WORKSPACE"
                },
                "job_cluster_key": "serverless_cluster",
                "timeout_seconds": 3600,
                "max_retries": 2
              },
              {
                "task_key": "Ingest_supercias_compania",
                "description": "Runs notebook Ingest_supercias_compania",
                "notebook_task": {
                  "notebook_path": "/prod/scripts/main/Ingest_supercias_compania",
                  "source": "WORKSPACE"
                },
                "job_cluster_key": "serverless_cluster",
                "timeout_seconds": 3600,
                "max_retries": 2,
                "depends_on": [
                  {
                    "task_key": "Preparacion_Ambiente"
                  }
                ]
              },
              {
                "task_key": "Ingest_supercias_ranking",
                "description": "Runs notebook Ingest_supercias_ranking",
                "notebook_task": {
                  "notebook_path": "/prod/scripts/main/Ingest_supercias_ranking",
                  "source": "WORKSPACE"
                },
                "job_cluster_key": "serverless_cluster",
                "timeout_seconds": 3600,
                "max_retries": 2,
                "depends_on": [
                  {
                    "task_key": "Preparacion_Ambiente"
                  }
                ]
              },
              {
                "task_key": "Ingest_supercias_sector",
                "description": "Runs notebook Ingest_supercias_sector",
                "notebook_task": {
                  "notebook_path": "/prod/scripts/main/Ingest_supercias_sector",
                  "source": "WORKSPACE"
                },
                "job_cluster_key": "serverless_cluster",
                "timeout_seconds": 3600,
                "max_retries": 2,
                "depends_on": [
                  {
                    "task_key": "Preparacion_Ambiente"
                  }
                ]
              },
              {
                "task_key": "Ingest_supercias_segmento",
                "description": "Runs notebook Ingest_supercias_segmento",
                "notebook_task": {
                  "notebook_path": "/prod/scripts/main/Ingest_supercias_segmento",
                  "source": "WORKSPACE"
                },
                "job_cluster_key": "serverless_cluster",
                "timeout_seconds": 3600,
                "max_retries": 2,
                "depends_on": [
                  {
                    "task_key": "Preparacion_Ambiente"
                  }
                ]
              },
              {
                "task_key": "Transform_supercias",
                "description": "Runs notebook Transform_supercias",
                "notebook_task": {
                  "notebook_path": "/prod/scripts/main/Transform_supercias",
                  "source": "WORKSPACE"
                },
                "job_cluster_key": "serverless_cluster",
                "timeout_seconds": 3600,
                "max_retries": 2,
                "depends_on": [
                  {
                    "task_key": "Ingest_supercias_compania"
                  },
                  {
                    "task_key": "Ingest_supercias_ranking"
                  },
                  {
                    "task_key": "Ingest_supercias_sector"
                  },
                  {
                    "task_key": "Ingest_supercias_segmento"
                  }
                ]
              },
              {
                "task_key": "Load_supercias",
                "description": "Runs notebook Load_supercias",
                "notebook_task": {
                  "notebook_path": "/prod/scripts/main/Load_supercias",
                  "source": "WORKSPACE"
                },
                "job_cluster_key": "serverless_cluster",
                "timeout_seconds": 3600,
                "max_retries": 2,
                "depends_on": [
                  {
                    "task_key": "Transform_supercias"
                  }
                ]
              }
            ],
            "job_clusters": [
              {
                "job_cluster_key": "serverless_cluster",
                "new_cluster": {
                  "spark_version": "16.4.x-scala2.12",
                  "node_type_id": "Standard_D4s_v3",
                  "driver_node_type_id": "Standard_D4s_v3",
                  "num_workers": 0,
                  "data_security_mode": "SINGLE_USER",
                  "runtime_engine": "STANDARD",
                  "custom_tags": {
                    "environment": "production",
                    "team": "ext_users"
                  }
                }
              }
            ],
            "schedule": {
              "quartz_cron_expression": "0 0 8 * * ?",
              "timezone_id": "America/Lima",
              "pause_status": "PAUSED"
            },
            "email_notifications": {
              "on_failure": [],
              "on_success": [],
              "no_alert_for_skipped_runs": false
            },
            "webhook_notifications": {},
            "timeout_seconds": 7200,
            "max_concurrent_runs": 1,
            "tags": {
              "environment": "production",
              "created_by": "github_actions",
              "project": "automated_deployment"
            }
          }
          EOF
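          # Note: despite the "serverless_cluster" key, the spec above defines a classic job
          # cluster. With num_workers set to 0 it is effectively a single-node cluster, which
          # on some workspaces also needs the single-node spark_conf/ResourceClass tag
          # (an assumption worth checking if jobs/create rejects the cluster spec).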
          # Create the workflow
          create_response=$(curl -s -X POST \
            -H "Authorization: Bearer $DEST_TOKEN" \
            -H "Content-Type: application/json" \
            -d @workflow_config.json \
            "$DEST_HOST/api/2.1/jobs/create")
          echo "Workflow creation response: $create_response"
          # Extract the job_id from the response
          job_id=$(echo "$create_response" | jq -r '.job_id')
          if [ "$job_id" != "" ] && [ "$job_id" != "null" ]; then
            echo "Workflow 'WF_ADB' created successfully with ID: $job_id"
            # Fetch details of the newly created workflow
            workflow_details=$(curl -s -X GET \
              -H "Authorization: Bearer $DEST_TOKEN" \
              "$DEST_HOST/api/2.1/jobs/get?job_id=$job_id")
            echo "Workflow details:"
            echo "$workflow_details" | jq '.settings | {name, tasks: (.tasks | map({task_key, notebook_task: .notebook_task.notebook_path}))}'
          else
            echo "Error creating the workflow"
            echo "Full response: $create_response"
            exit 1
          fi
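      # If an immediate test run is wanted after creation, the job can be triggered with the
      # run-now endpoint, using the job_id returned by jobs/create (a sketch, not part of this deploy):
      #   curl -s -X POST -H "Authorization: Bearer $DEST_TOKEN" \
      #     -H "Content-Type: application/json" -d "{\"job_id\": $job_id}" \
      #     "$DEST_HOST/api/2.1/jobs/run-now"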
      - name: Validate Workflow Configuration
        run: |
          DEST_HOST=${{ secrets.DATABRICKS_DEST_HOST }}
          DEST_TOKEN=${{ secrets.DATABRICKS_DEST_TOKEN }}
          WORKFLOW_NAME="WF_ADB"
          echo "Validating the configuration of the created workflow..."
          # List the workflows and find the one just created
          workflows_list=$(curl -s -X GET \
            -H "Authorization: Bearer $DEST_TOKEN" \
            "$DEST_HOST/api/2.1/jobs/list")
          job_id=$(echo "$workflows_list" | jq -r --arg name "$WORKFLOW_NAME" '.jobs[]? | select(.settings.name == $name) | .job_id')
          if [ "$job_id" != "" ] && [ "$job_id" != "null" ]; then
            echo "Workflow found with ID: $job_id"
            # Fetch the detailed configuration
            job_details=$(curl -s -X GET \
              -H "Authorization: Bearer $DEST_TOKEN" \
              "$DEST_HOST/api/2.1/jobs/get?job_id=$job_id")
            echo "Workflow summary:"
            echo "Name: $(echo "$job_details" | jq -r '.settings.name')"
            echo "Number of tasks: $(echo "$job_details" | jq '.settings.tasks | length')"
            echo ""
            echo "Configured tasks:"
            echo "$job_details" | jq -r '.settings.tasks[] | "- " + .task_key + " → " + .notebook_task.notebook_path'
            echo ""
            echo "Configured cluster:"
            echo "Type: $(echo "$job_details" | jq -r '.settings.job_clusters[0].new_cluster.node_type_id')"
            echo "Workers: $(echo "$job_details" | jq -r '.settings.job_clusters[0].new_cluster.num_workers')"
            echo "Spark Version: $(echo "$job_details" | jq -r '.settings.job_clusters[0].new_cluster.spark_version')"
          else
            echo "Could not find the created workflow"
            exit 1
          fi
      - name: Clean up
        run: |
          rm -rf notebooks_to_deploy
          rm -f workflow_config.json
      - name: Done
        run: |
          echo "Deployment completed successfully!"
          echo ""
          echo "Summary:"
          echo "Notebooks deployed: Preparacion_Ambiente, Ingest_supercias_compania, Ingest_supercias_ranking, Ingest_supercias_sector, Ingest_supercias_segmento, Transform_supercias, Load_supercias"
          echo "Workflow created: WF_ADB"
          echo "Tasks configured: one per notebook, starting with Preparacion_Ambiente, then the four ingests in parallel, then Transform_supercias and Load_supercias"
          echo "Cluster: serverless_cluster configured"
          echo ""
          echo "Open your Databricks workspace to see the workflow in the 'Workflows' section"