Modify Data Pipeline to perform Global Energy Transition Analysis

its-ammu · its-ammu · commit 843926512fb3 · 2025-03-24T20:12:27.000+05:30
- Implement the Global Energy Transition Analysis Pipeline in pipeline.py, replacing sample data generation with live data fetched from the World Bank API at run time.
- Enhance data processing with energy transition metrics and regional analysis.
- Add interactive visualizations using Plotly and a web dashboard with Dash.
- Update README.md to reflect new features and usage instructions. Modify requirements.txt to include necessary libraries for data fetching and visualization.
diff --git a/README.md b/README.md
@@ -1,6 +1,16 @@
-# Simple Data Pipeline with Prefect
+# Global Energy Transition Analysis Pipeline with Prefect
 
-This project demonstrates a simple data pipeline using Prefect. The pipeline generates sample data, processes it by adding derived columns, and saves the results to a CSV file.
+This project implements a data pipeline that analyzes global energy consumption and renewable energy adoption data from the World Bank. The pipeline tracks energy transition progress across countries and regions, providing insights into the shift towards sustainable energy sources.
+
+## Features
+
+- Live data fetching from the World Bank API on each run
+- Multi-indicator energy analysis
+- Regional trend analysis
+- Interactive visualizations using Plotly
+- Web dashboard using Dash
+- Automated report generation
+- Prefect workflow management
 
 ## Setup
 
@@ -23,11 +33,16 @@ python pipeline.py
 ```
 
 The pipeline will:
-1. Generate sample time series data
-2. Process the data by adding:
-   - 7-day rolling mean
-   - Boolean flag indicating if value is above mean
-3. Save the processed data to a CSV file with timestamp
+1. Fetch the latest energy consumption data from the World Bank
+2. Process and analyze energy transition metrics
+3. Generate interactive visualizations
+4. Create a web dashboard
+5. Generate a summary report
+
+Outputs:
+- Interactive visualizations in the `output` directory
+- Summary report in `output/energy_report.txt`
+- Web dashboard available at http://localhost:8050
 
 ## Cloud Deployment
 
@@ -40,6 +55,7 @@ The pipeline will:
    - Go to your GitHub repository settings
    - Navigate to Secrets and Variables > Actions
    - Add a new secret named `PREFECT_API_KEY` with your Prefect Cloud API key
+   - Add a new secret named `PREFECT_WORKSPACE` with your workspace name
 
 ### GitHub Actions
 
@@ -58,13 +74,30 @@ To run the pipeline manually:
 - `pipeline.py`: Main pipeline script containing the Prefect flow and tasks
 - `requirements.txt`: Project dependencies
 - `.github/workflows/pipeline.yml`: GitHub Actions workflow configuration
+- `output/`: Directory containing generated visualizations and reports
 - `README.md`: This file
 
-## Features
-
-- Uses Prefect's `@flow` and `@task` decorators for workflow management
-- Demonstrates basic data processing with pandas
-- Includes automatic file naming with timestamps
-- Provides clear logging of pipeline execution
-- Runs in Docker containers via Prefect Cloud
-- Automated execution through GitHub Actions
+## Data Analysis Features
+
+1. Energy Consumption Analysis:
+   - Per capita energy use
+   - Electric power consumption
+   - Fossil fuel dependency
+   - Renewable energy adoption
+
+2. Transition Metrics:
+   - Energy transition score
+   - Year-over-year renewable growth
+   - Regional energy patterns
+   - Country-level comparisons
+
+3. Visualizations:
+   - Interactive scatter plots for energy transition progress
+   - Regional trend line charts
+   - Web dashboard with multiple views
+
+4. Reporting:
+   - Key energy statistics
+   - Top performing countries
+   - Regional comparisons
+   - Links to interactive visualizations
diff --git a/pipeline.py b/pipeline.py
@@ -2,55 +2,208 @@
 import pandas as pd
 import numpy as np
 from datetime import datetime
+import requests
+import plotly.express as px
+import plotly.graph_objects as go
+from dash import Dash, html, dcc
 import os
+from dotenv import load_dotenv
+
+# Load environment variables
+load_dotenv()
 
 @task
-def generate_data():
-    """Generate sample data"""
-    np.random.seed(42)
-    data = {
-        'date': pd.date_range(start='2024-01-01', periods=100),
-        'value': np.random.normal(100, 15, 100)
+def fetch_energy_data():
+    """Fetch energy consumption and renewable energy data from World Bank API"""
+    # World Bank API endpoints for energy indicators
+    indicators = {
+        'EG.USE.PCAP.KG.OE': 'Energy use per capita (kg of oil equivalent)',
+        'EG.FEC.RNEW.ZS': 'Renewable energy consumption (% of total final energy consumption)',
+        'EG.USE.COMM.FO.ZS': 'Fossil fuel energy consumption (% of total)',
+        'EG.USE.ELEC.KH.PC': 'Electric power consumption (kWh per capita)'
     }
-    return pd.DataFrame(data)
+    
+    try:
+        # Fetch data for each indicator
+        dfs = []
+        for indicator in indicators.keys():
+            url = f"https://api.worldbank.org/v2/country/all/indicator/{indicator}?format=json&per_page=1000"
+            response = requests.get(url)
+            response.raise_for_status()
+            
+            # Extract data from response
+            data = response.json()[1]
+            df = pd.DataFrame(data)
+            df['indicator'] = indicators[indicator]
+            dfs.append(df)
+        
+        # Combine all indicators
+        combined_df = pd.concat(dfs, ignore_index=True)
+        
+        # Clean and process the data
+        combined_df['date'] = pd.to_datetime(combined_df['date'])
+        combined_df['value'] = pd.to_numeric(combined_df['value'], errors='coerce')
+        
+        # Pivot the data to have indicators as columns
+        df_pivot = combined_df.pivot_table(
+            index=['country', 'date'],
+            columns='indicator',
+            values='value',
+            aggfunc='first'
+        ).reset_index()
+        
+        return df_pivot
+    except Exception as e:
+        print(f"Error fetching data: {e}")
+        raise
 
 @task
 def process_data(df):
-    """Process the data by adding derived columns"""
-    df['rolling_mean'] = df['value'].rolling(window=7).mean()
-    df['is_above_mean'] = df['value'] > df['value'].mean()
-    return df
+    """Process and clean the energy data"""
+    # Calculate year-over-year changes
+    df['Renewable Growth'] = df.groupby('country')['Renewable energy consumption (% of total final energy consumption)'].pct_change()
+    
+    # Calculate energy transition score (higher renewable %, lower fossil fuel %)
+    df['Energy Transition Score'] = (
+        df['Renewable energy consumption (% of total final energy consumption)'] -
+        df['Fossil fuel energy consumption (% of total)']
+    )
+    
+    # Get latest data for each country
+    latest_data = df.groupby('country').last().reset_index()
+    
+    # Calculate regional averages
+    df['Region'] = df['country'].map(lambda x: get_region(x))
+    regional_avg = df.groupby(['Region', 'date']).mean().reset_index()
+    
+    return df, latest_data, regional_avg
+
+def get_region(country):
+    """Map countries to regions"""
+    # This is a simplified mapping - you can expand this
+    regions = {
+        'United States': 'North America',
+        'Canada': 'North America',
+        'China': 'Asia',
+        'India': 'Asia',
+        'Germany': 'Europe',
+        'France': 'Europe',
+        'Brazil': 'South America',
+        'South Africa': 'Africa',
+        'Australia': 'Oceania'
+    }
+    return regions.get(country, 'Other')
 
 @task
-def save_data(df):
-    """Save the processed data"""
-    output_file = f"processed_data_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv"
-    df.to_csv(output_file, index=False)
-    return output_file
-
-@flow(name="Simple Data Pipeline")
-def data_pipeline():
+def create_visualizations(df, latest_data, regional_avg):
+    """Create interactive visualizations"""
+    # Create output directory if it doesn't exist
+    os.makedirs('output', exist_ok=True)
+    
+    # 1. Energy Transition Progress
+    fig_transition = px.scatter(
+        latest_data,
+        x='Fossil fuel energy consumption (% of total)',
+        y='Renewable energy consumption (% of total final energy consumption)',
+        size='Energy use per capita (kg of oil equivalent)',
+        color='Region',
+        hover_data=['country'],
+        title='Energy Transition Progress by Country',
+        labels={
+            'Fossil fuel energy consumption (% of total)': 'Fossil Fuel Consumption (%)',
+            'Renewable energy consumption (% of total final energy consumption)': 'Renewable Energy (%)'
+        }
+    )
+    fig_transition.write_html('output/energy_transition.html')
+    
+    # 2. Regional Renewable Energy Trends
+    fig_regional = px.line(
+        regional_avg,
+        x='date',
+        y='Renewable energy consumption (% of total final energy consumption)',
+        color='Region',
+        title='Regional Renewable Energy Adoption Trends',
+        labels={
+            'Renewable energy consumption (% of total final energy consumption)': 'Renewable Energy (%)',
+            'date': 'Year'
+        }
+    )
+    fig_regional.write_html('output/regional_trends.html')
+    
+    # 3. Create a dashboard
+    app = Dash(__name__)
+    
+    app.layout = html.Div([
+        html.H1('Global Energy Transition Dashboard'),
+        html.Div([
+            dcc.Graph(figure=fig_transition),
+            dcc.Graph(figure=fig_regional)
+        ])
+    ])
+    
+    app.run_server(debug=False, port=8050)
+    
+    return 'output/energy_transition.html', 'output/regional_trends.html'
+
+@task
+def generate_report(latest_data, transition_file, regional_file):
+    """Generate a summary report"""
+    report = f"""
+    Global Energy Transition Analysis Report
+    Generated on: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}
+    
+    Key Statistics:
+    --------------
+    Total Countries Analyzed: {len(latest_data)}
+    Global Average Renewable Energy Share: {latest_data['Renewable energy consumption (% of total final energy consumption)'].mean():.2f}%
+    
+    Top 5 Countries by Renewable Energy Share:
+    ----------------------------------------
+    {latest_data.nlargest(5, 'Renewable energy consumption (% of total final energy consumption)')[['country', 'Renewable energy consumption (% of total final energy consumption)']].to_string()}
+    
+    Top 5 Countries by Energy Transition Score:
+    ----------------------------------------
+    {latest_data.nlargest(5, 'Energy Transition Score')[['country', 'Energy Transition Score']].to_string()}
+    
+    Visualizations:
+    --------------
+    - Energy Transition Progress: {transition_file}
+    - Regional Renewable Trends: {regional_file}
+    """
+    
+    with open('output/energy_report.txt', 'w') as f:
+        f.write(report)
+    
+    return 'output/energy_report.txt'
+
+@flow(name="Global Energy Transition Analysis Pipeline")
+def energy_pipeline():
     """Main pipeline flow"""
+    # Steps run in order; each one consumes the previous step's return value.
-    # Generate data
-    raw_data = generate_data()
+    # Fetch data
+    raw_data = fetch_energy_data()
     
     # Process data
-    processed_data = process_data(raw_data)
+    processed_data, latest_data, regional_avg = process_data(raw_data)
+    
+    # Create visualizations
+    transition_file, regional_file = create_visualizations(processed_data, latest_data, regional_avg)
     
-    # Save results
-    output_file = save_data(processed_data)
+    # Generate report
+    report_file = generate_report(latest_data, transition_file, regional_file)
     
-    print(f"Pipeline completed successfully. Output saved to: {output_file}")
+    # Console summary of where the outputs can be found
+    print(f"Pipeline completed successfully. Report saved to: {report_file}")
+    print(f"Visualizations saved in the 'output' directory")
+    print("Dashboard available at http://localhost:8050")
 
 def create_deployment():
     """Create a deployment for the pipeline"""
-    data_pipeline.serve(
-        name="data-pipeline-deployment",
+    energy_pipeline.serve(
+        name="energy-analysis-deployment",
         work_queue_name="default",
     )
 
 if __name__ == "__main__":
+    # PREFECT_DEPLOYMENT set to any non-empty value -> serve the deployment;
+    # otherwise run the flow once in this process.
     if os.getenv("PREFECT_DEPLOYMENT"):
         create_deployment()
     else:
-        data_pipeline() 
+        energy_pipeline() 
diff --git a/requirements.txt b/requirements.txt
@@ -1,3 +1,7 @@
 prefect>=2.14.0
 pandas>=2.0.0
-numpy>=1.24.0 
+numpy>=1.24.0
+requests>=2.31.0
+plotly>=5.18.0
+dash>=2.14.0
+python-dotenv>=1.0.0