Configuración YAML
Pipeline declarativo y auditable.
godml.yml (⚙️ Configuración Principal)
# Main pipeline configuration: model identity, dataset, model settings,
# quality metrics, governance metadata and deployment targets.
# NOTE(review): the nesting below was reconstructed from a flattened listing;
# confirm the hierarchy against the consuming tool's schema.
name: mi-modelo-churn
version: 1.0.0
provider: mlflow
description: "Modelo de clasificación para predecir baja de clientes"

# 📊 Dataset
dataset:
  uri: ./data/churn.csv
  hash: auto  # Automatic hash for traceability

# 🤖 Model
model:
  type: xgboost  # xgboost | random_forest | lightgbm
  source: core  # core | custom
  hyperparameters:
    max_depth: 5
    eta: 0.3
    objective: binary:logistic

# 📈 Quality metrics
metrics:
  - name: auc
    threshold: 0.85  # Minimum required threshold
  - name: accuracy
    threshold: 0.80

# 🏛️ Governance
governance:
  owner: "equipo-ml@empresa.com"
  # Each tag is a single-key mapping inside the list.
  tags:
    - project: churn-prediction
    - environment: development
    - compliance: gdpr

# 🚀 Deployment
deploy:
  realtime: false
  batch_output: ./outputs/predictions.csv

# 🐳 Per-environment configuration
# NOTE(review): assumed to be a top-level key (sibling of `deploy`) — confirm.
deploy_config:
  dev:
    docker_tag: mi-modelo:dev
    port: 8000
    host: 0.0.0.0
  production:
    docker_tag: mi-modelo:prod
    port: 8080
    host: 0.0.0.0
📊 DataPrep Service — Preparación Declarativa
recipe.yml (Estructura de receta YAML)
# recipe.yml - Data preparation recipe
# Declares inputs, an ordered list of transformation steps, validations,
# outputs and a compliance setting for a data-preparation run.
# NOTE(review): the nesting below was reconstructed from a flattened listing;
# confirm the hierarchy against the consuming tool's schema.
dataprep:
  # 📥 INPUTS
  inputs:
    - name: raw_customers
      connector: csv
      uri: ./data/customers.csv
      options:
        sep: ","
        encoding: utf-8

  # 🔧 TRANSFORMATIONS
  steps:
    # Select columns
    - op: select
      params:
        columns: ["id", "age", "income", "email"]
    # Fill null values
    - op: fillna
      params:
        strategy: mean
        columns: ["age", "income"]
    # One-hot encoding
    # NOTE(review): "category" is not among the columns listed in the
    # `select` step above — confirm this column is available at this point.
    - op: one_hot
      params:
        columns: ["category"]
        drop_first: true
    # Standard scaling
    - op: standard_scale
      params:
        columns: ["income", "age"]
    # Remove duplicates
    - op: drop_duplicates
      params:
        subset: ["id"]

  # ✅ VALIDATIONS
  validations:
    - type: expect_non_null
      args: ["id", "email"]
    - type: expect_unique
      args: ["id"]
    # NOTE(review): `age` is also listed under `standard_scale`; if
    # validations run on the transformed data, verify this range still applies.
    - type: expect_range
      args:
        column: age
        min: 18
        max: 100

  # 📤 OUTPUTS
  outputs:
    - name: clean_data
      connector: csv
      uri: ./data/customers_clean.csv

  # 🔒 COMPLIANCE
  # NOTE(review): assumed to be nested under `dataprep` like the sections
  # above; it may instead be a top-level key — confirm.
  governance:
    compliance: pci-dss  # Automatically applies PCI-DSS