| { |
| "pipeline": { |
| "name": "customer_analytics_pipeline", |
| "description": "End-to-end customer analytics data pipeline", |
| "version": "2.1.0", |
| "owner": "data-engineering@company.com", |
| "created": "2025-01-15", |
| "schedule": "daily at 02:00 UTC" |
| }, |
| "sources": [ |
| { |
| "id": "src_salesforce", |
| "name": "Salesforce CRM", |
| "type": "api", |
| "connection": { |
| "endpoint": "https://company.salesforce.com/api/v52.0", |
| "auth": "oauth2" |
| }, |
| "objects": ["Account", "Contact", "Opportunity", "Lead"], |
| "incremental_field": "LastModifiedDate" |
| }, |
| { |
| "id": "src_stripe", |
| "name": "Stripe Payments", |
| "type": "api", |
| "connection": { |
| "endpoint": "https://api.stripe.com/v1", |
| "auth": "api_key" |
| }, |
| "objects": ["charges", "customers", "subscriptions", "invoices"] |
| }, |
| { |
| "id": "src_postgres_app", |
| "name": "Application Database", |
| "type": "database", |
| "connection": { |
| "host": "app-db.internal", |
| "port": 5432, |
| "database": "production" |
| }, |
| "tables": ["users", "user_events", "feature_flags", "subscriptions"] |
| }, |
| { |
| "id": "src_segment", |
| "name": "Segment Events", |
| "type": "stream", |
| "connection": { |
| "type": "kafka", |
| "topic": "segment-events", |
| "bootstrap_servers": "kafka.internal:9092" |
| }, |
| "events": ["page", "track", "identify"] |
| }, |
| { |
| "id": "src_google_analytics", |
| "name": "Google Analytics 4", |
| "type": "api", |
| "connection": { |
| "property_id": "GA4-123456789" |
| }, |
| "metrics": ["sessions", "users", "conversions", "revenue"] |
| } |
| ], |
| "stages": [ |
| { |
| "id": "extract", |
| "name": "Data Extraction", |
| "steps": [ |
| { |
| "id": "ext_salesforce", |
| "source": "src_salesforce", |
| "output": "landing/salesforce/", |
| "format": "parquet", |
| "partitions": ["date"], |
| "mode": "incremental" |
| }, |
| { |
| "id": "ext_stripe", |
| "source": "src_stripe", |
| "output": "landing/stripe/", |
| "format": "parquet", |
| "mode": "incremental" |
| }, |
| { |
| "id": "ext_postgres", |
| "source": "src_postgres_app", |
| "output": "landing/app_db/", |
| "format": "parquet", |
| "mode": "cdc" |
| }, |
| { |
| "id": "ext_segment", |
| "source": "src_segment", |
| "output": "landing/segment/", |
| "format": "parquet", |
| "mode": "streaming" |
| }, |
| { |
| "id": "ext_ga4", |
| "source": "src_google_analytics", |
| "output": "landing/ga4/", |
| "format": "parquet", |
| "mode": "batch" |
| } |
| ] |
| }, |
| { |
| "id": "transform", |
| "name": "Data Transformation", |
| "steps": [ |
| { |
| "id": "tfm_customer_identity", |
| "name": "Customer Identity Resolution", |
| "inputs": ["ext_salesforce", "ext_stripe", "ext_postgres"], |
| "output": "curated/customer_identity/", |
| "logic": "Match and merge customer identities across systems using email, phone, and probabilistic matching", |
| "technology": "Spark" |
| }, |
| { |
| "id": "tfm_event_enrichment", |
| "name": "Event Enrichment", |
| "inputs": ["ext_segment", "ext_ga4", "tfm_customer_identity"], |
| "output": "curated/events_enriched/", |
| "logic": "Join events with customer identity and add session context" |
| }, |
| { |
| "id": "tfm_revenue_calc", |
| "name": "Revenue Calculation", |
| "inputs": ["ext_stripe", "ext_salesforce", "tfm_customer_identity"], |
| "output": "curated/revenue/", |
| "logic": "Calculate MRR, ARR, churn, and expansion revenue metrics" |
| }, |
| { |
| "id": "tfm_product_usage", |
| "name": "Product Usage Metrics", |
| "inputs": ["ext_postgres", "tfm_event_enrichment"], |
| "output": "curated/product_usage/", |
| "logic": "Aggregate product usage by customer and feature" |
| } |
| ] |
| }, |
| { |
| "id": "model", |
| "name": "Data Modeling", |
| "steps": [ |
| { |
| "id": "mdl_dim_customer", |
| "name": "Customer Dimension", |
| "inputs": ["tfm_customer_identity", "tfm_revenue_calc"], |
| "output": "warehouse.dim_customer", |
| "type": "scd_type_2" |
| }, |
| { |
| "id": "mdl_dim_product", |
| "name": "Product Dimension", |
| "inputs": ["ext_postgres"], |
| "output": "warehouse.dim_product" |
| }, |
| { |
| "id": "mdl_fct_events", |
| "name": "Events Fact", |
| "inputs": ["tfm_event_enrichment", "mdl_dim_customer", "mdl_dim_product"], |
| "output": "warehouse.fct_events", |
| "grain": "event" |
| }, |
| { |
| "id": "mdl_fct_revenue", |
| "name": "Revenue Fact", |
| "inputs": ["tfm_revenue_calc", "mdl_dim_customer"], |
| "output": "warehouse.fct_revenue", |
| "grain": "transaction" |
| }, |
| { |
| "id": "mdl_fct_usage", |
| "name": "Usage Fact", |
| "inputs": ["tfm_product_usage", "mdl_dim_customer", "mdl_dim_product"], |
| "output": "warehouse.fct_usage", |
| "grain": "daily_customer_feature" |
| } |
| ] |
| }, |
| { |
| "id": "aggregate", |
| "name": "Aggregations & Marts", |
| "steps": [ |
| { |
| "id": "agg_customer_360", |
| "name": "Customer 360 View", |
| "inputs": ["mdl_dim_customer", "mdl_fct_events", "mdl_fct_revenue", "mdl_fct_usage"], |
| "output": "marts.customer_360", |
| "refresh": "hourly" |
| }, |
| { |
| "id": "agg_revenue_metrics", |
| "name": "Revenue Metrics", |
| "inputs": ["mdl_fct_revenue", "mdl_dim_customer"], |
| "output": "marts.revenue_metrics", |
| "refresh": "daily" |
| }, |
| { |
| "id": "agg_product_analytics", |
| "name": "Product Analytics", |
| "inputs": ["mdl_fct_usage", "mdl_fct_events", "mdl_dim_product"], |
| "output": "marts.product_analytics", |
| "refresh": "daily" |
| }, |
| { |
| "id": "agg_health_score", |
| "name": "Customer Health Score", |
| "inputs": ["agg_customer_360", "agg_revenue_metrics", "agg_product_analytics"], |
| "output": "marts.customer_health_score", |
| "logic": "ML-based health score prediction" |
| } |
| ] |
| }, |
| { |
| "id": "publish", |
| "name": "Data Publishing", |
| "steps": [ |
| { |
| "id": "pub_looker", |
| "name": "Looker Semantic Layer", |
| "inputs": ["agg_customer_360", "agg_revenue_metrics", "agg_product_analytics"], |
| "output": "looker://models/customer_analytics", |
| "type": "semantic_model" |
| }, |
| { |
| "id": "pub_salesforce_sync", |
| "name": "Salesforce Sync", |
| "inputs": ["agg_customer_360", "agg_health_score"], |
| "output": "salesforce://Account.HealthScore__c", |
| "type": "reverse_etl" |
| }, |
| { |
| "id": "pub_ml_features", |
| "name": "ML Feature Store", |
| "inputs": ["agg_customer_360", "agg_product_analytics"], |
| "output": "feast://customer_features", |
| "type": "feature_store" |
| } |
| ] |
| } |
| ], |
| "data_quality": { |
| "rules": [ |
| {"table": "mdl_dim_customer", "check": "unique", "column": "customer_id"}, |
| {"table": "mdl_fct_revenue", "check": "not_null", "columns": ["customer_id", "amount", "transaction_date"]}, |
| {"table": "agg_revenue_metrics", "check": "freshness", "max_delay_hours": 2} |
| ] |
| }, |
| "notes": "Comprehensive ETL pipeline sample showing data flow from multiple sources through extraction, transformation, modeling, aggregation, and publishing stages." |
| } |
|
|