digraph DataPipeline {
  rankdir=TB;
  // ratio=0.75; // Uncomment to constrain the drawing's aspect ratio
  compound=true;

  // Global node styles for a consistent appearance
  node [shape=box, style=filled, fontname="Helvetica", fillcolor="#D8EBF7"];

  // Platform nodes get a distinct cylinder style
  node [shape=cylinder, fillcolor="#C1E1C1", style=filled];
  ApacheAirflow [label="Apache Airflow\n(Orchestration)"];
  GCS [label="Google Cloud Storage"];
  BigQuery [label="BigQuery\n(Data Processing and Storage)"];

  // Restore the default node style for operators within the subgraph
  node [shape=box, fillcolor="#D8EBF7"];

  // Split PythonOperators with XCom handoff and cleanup
  subgraph cluster_split_operator_approach {
    label="Data Pipeline Operations";
    style=filled;
    fillcolor="#F5F5F5"; // Light grey background for this cluster

    // Common ingestion step (orchestrated by Airflow, moves data to GCS)
    download_upload [label="Download CSV from HTTPS\n& Upload to GCS\n(BashOperator)"];
    bf_preprocess_op [label="1. Preprocess Data (PythonOperator)\n(Reads data via BigFrames)", height=1.5];
    bf_validate_write_op [label="2. Validate & Write (PythonOperator)\n(Validates & writes to the final BigQuery table)", height=1.5];
    cleanup_preprocess_table_op [label="3. Cleanup Temporary Table\n(BigQueryDeleteTableOperator)\n(trigger_rule='all_done')", style="filled,dashed"];
  }

  // Connect Airflow to the operators it orchestrates
  ApacheAirflow -> download_upload [label="Orchestrates", lhead=cluster_split_operator_approach];

  // Data Flow: Ingestion
  download_upload -> GCS [label="Uploads Data"];

  // Data Flow: Processing (GCS to BigQuery via BigFrames operations)
  GCS -> bf_preprocess_op [label="Data Source"]; // bf_preprocess_op reads from GCS via the BigQuery engine
  bf_preprocess_op -> BigQuery [label="BigFrames processing"]; // BigFrames operates *in* BigQuery
  bf_validate_write_op -> BigQuery [label="Validates & writes\nfinal table"]; // The final table lives in BigQuery
  cleanup_preprocess_table_op -> BigQuery [style=dashed, label="Cleans Up"]; // Cleanup also runs against BigQuery

  // Dependencies within the Airflow DAG logic
  download_upload -> bf_preprocess_op [label="Task Dependency"];
  bf_preprocess_op -> bf_validate_write_op [label="Temporary Table ID\n(XCom)"];

  // Cleanup dependencies (trigger_rule="all_done")
  bf_preprocess_op -> cleanup_preprocess_table_op [label="Cleanup Trigger", style=dashed, color=gray, constraint=false];
  bf_validate_write_op -> cleanup_preprocess_table_op [label="Cleanup Trigger", style=dashed, color=gray, constraint=false];

  // General note on BigFrames execution (see the Python sketches below)
  note_bigframes_execution [label="BigFrames operations execute\ndirectly within BigQuery's engine,\nminimizing Airflow worker load.", shape=note, fontsize=10, fillcolor="#FFFACD"];
}
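
For readers who want to see the DAG structure this diagram encodes, here is a minimal, hypothetical Airflow sketch of the split-operator pattern: two PythonOperators linked by an XCom (the returned temporary table ID) plus a cleanup task gated by trigger_rule="all_done". Every name below (dag_id, task IDs, bucket, dataset, URL) is an illustrative assumption, not something the diagram specifies.

```python
# Hypothetical sketch of the DAG shown above; all names are assumptions.
from datetime import datetime

from airflow import DAG
from airflow.operators.bash import BashOperator
from airflow.operators.python import PythonOperator
from airflow.providers.google.cloud.operators.bigquery import (
    BigQueryDeleteTableOperator,
)
from airflow.utils.trigger_rule import TriggerRule


def preprocess_data() -> str:
    """Stage preprocessed data in a temporary BigQuery table.

    Returning the table ID publishes it to XCom -- the
    "Temporary Table ID (XCom)" edge in the diagram.
    """
    temp_table = "example_dataset.preprocessed_temp"  # assumed table ID
    # ... BigFrames preprocessing would run here (see the next sketch) ...
    return temp_table


def validate_and_write(ti) -> None:
    """Pull the staged table ID from XCom, validate, write the final table."""
    temp_table = ti.xcom_pull(task_ids="preprocess_data")
    # ... BigFrames validation and write to the final table ...


with DAG(
    dag_id="data_pipeline",
    start_date=datetime(2024, 1, 1),
    schedule=None,
    catchup=False,
) as dag:
    download_upload = BashOperator(
        task_id="download_upload",
        bash_command=(
            "curl -sSL https://example.com/data.csv -o /tmp/data.csv "
            "&& gsutil cp /tmp/data.csv gs://example-bucket/raw/data.csv"
        ),
    )
    preprocess = PythonOperator(
        task_id="preprocess_data", python_callable=preprocess_data
    )
    validate_write = PythonOperator(
        task_id="validate_and_write", python_callable=validate_and_write
    )
    cleanup = BigQueryDeleteTableOperator(
        task_id="cleanup_temp_table",
        # Templated pull of the table ID returned by preprocess_data.
        deletion_dataset_table="{{ ti.xcom_pull(task_ids='preprocess_data') }}",
        ignore_if_missing=True,
        # Runs whether upstream succeeded or failed, so the temporary
        # table never leaks -- the dashed "Cleanup Trigger" edges.
        trigger_rule=TriggerRule.ALL_DONE,
    )

    download_upload >> preprocess >> validate_write
    [preprocess, validate_write] >> cleanup
```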
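
The yellow note is the key design point: bigframes.pandas builds BigQuery SQL instead of pulling data onto the Airflow worker. A minimal sketch of what the preprocessing callable's body could look like, again with an assumed bucket, dataset, and transformation:

```python
# Minimal BigFrames sketch; bucket, dataset, and the dropna() step are
# illustrative assumptions, not taken from the diagram.
import bigframes.pandas as bpd

# Reading a CSV from GCS yields a BigQuery-backed DataFrame; the data
# stays in BigQuery rather than in the worker's memory.
df = bpd.read_csv("gs://example-bucket/raw/data.csv")

# Transformations compile to BigQuery SQL and execute in its engine,
# which is why the note says "minimizing Airflow worker load".
df = df.dropna()

# Persist the result as the temporary staging table for the next task.
df.to_gbq("example_dataset.preprocessed_temp", if_exists="replace")
```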