Saltar al contenido principal

Configuración YAML

Pipeline declarativo y auditable.

godml.yml (⚙️ Configuración Principal)
---
# godml.yml - main pipeline configuration.
# Declares the model identity, dataset, model type and hyperparameters,
# quality metrics, governance metadata, and deployment settings.
name: mi-modelo-churn
version: "1.0.0"
provider: mlflow
description: "Modelo de clasificación para predecir baja de clientes"

# 📊 Dataset
dataset:
  uri: ./data/churn.csv
  hash: auto  # Automatic hash for traceability

# 🤖 Model
model:
  type: xgboost  # xgboost | random_forest | lightgbm
  source: core  # core | custom
  hyperparameters:
    max_depth: 5
    eta: 0.3
    objective: "binary:logistic"

# 📈 Quality metrics
metrics:
  - name: auc
    threshold: 0.85  # Minimum required threshold
  - name: accuracy
    threshold: 0.80

# 🏛️ Governance
governance:
  owner: "equipo-ml@empresa.com"
  tags:
    - project: churn-prediction
    - environment: development
    - compliance: gdpr

# 🚀 Deployment
deploy:
  realtime: false
  batch_output: ./outputs/predictions.csv

# 🐳 Per-environment configuration
deploy_config:
  dev:
    docker_tag: "mi-modelo:dev"
    port: 8000
    host: "0.0.0.0"
  production:
    docker_tag: "mi-modelo:prod"
    port: 8080
    host: "0.0.0.0"

📊 DataPrep Service — Preparación Declarativa

recipe.yml (Estructura de receta YAML)
---
# recipe.yml - data preparation recipe.
# Declares the inputs, the ordered transformation steps, the validations,
# the outputs, and the compliance setting of a DataPrep run.
dataprep:
  # 📥 INPUTS
  inputs:
    - name: raw_customers
      connector: csv
      uri: ./data/customers.csv
      options:
        sep: ","
        encoding: utf-8

  # 🔧 TRANSFORMATIONS
  steps:
    # Select columns
    - op: select
      params:
        columns: ["id", "age", "income", "email"]

    # Fill null values
    - op: fillna
      params:
        strategy: mean
        columns: ["age", "income"]

    # One-hot encoding
    # NOTE(review): "category" is not in the columns kept by the select step
    # above - confirm whether it should be added there.
    - op: one_hot
      params:
        columns: ["category"]
        drop_first: true

    # Standard scaling
    - op: standard_scale
      params:
        columns: ["income", "age"]

    # Drop duplicates
    - op: drop_duplicates
      params:
        subset: ["id"]

  # ✅ VALIDATIONS
  # NOTE(review): "age" is standard-scaled in the steps above; if validations
  # run on the transformed data, the 18-100 range may not hold - confirm.
  validations:
    - type: expect_non_null
      args: ["id", "email"]

    - type: expect_unique
      args: ["id"]

    - type: expect_range
      args:
        column: age
        min: 18
        max: 100

  # 📤 OUTPUTS
  outputs:
    - name: clean_data
      connector: csv
      uri: ./data/customers_clean.csv

  # 🔒 COMPLIANCE
  # NOTE(review): nesting under `dataprep` is inferred; the original
  # indentation was lost - confirm against the recipe schema.
  governance:
    compliance: pci-dss  # Automatically applies PCI-DSS