Tutorials

This page provides step-by-step tutorials for common use cases with EurostatAPI.jl.

Tutorial 1: Your First Eurostat Dataset

In this tutorial, we'll walk through the basics of fetching and exploring a Eurostat dataset.

Step 1: Setup

First, make sure you have EurostatAPI.jl installed and loaded:

using EurostatAPI
using DataFrames
using Statistics  # For basic statistical functions

Step 2: Choose a Dataset

For this tutorial, we'll use the European GDP dataset (nama_10_gdp). This dataset contains national accounts data including GDP figures for EU countries.

# Fetch GDP data for 2022
# NOTE: this performs a network request to the Eurostat API; it may take a
# few seconds and requires an internet connection.
df = fetch_dataset("nama_10_gdp", 2022)

Step 3: Explore the Data Structure

Let's examine what we received:

# Check the dimensions
println("Dataset size: $(size(df))")
# names(df) returns the column names as a Vector{String}
println("Columns: $(names(df))")

# Look at the first few rows
first(df, 3)

Step 4: Understand the Dimensions

# Find the dimension columns (excluding metadata columns).
# names(df) returns a Vector{String}, so the metadata list must hold strings —
# the original Symbol list (:dataset, :year, …) never matched and every
# metadata column leaked into dimension_cols.
metadata_cols = ["dataset", "year", "value", "original_value", "fetch_date", "original_key"]
dimension_cols = filter(col -> !(col in metadata_cols), names(df))

println("Available dimensions: $dimension_cols")

# Explore each dimension: report its cardinality, list all values when there
# are few, otherwise show a small sample.
for col in dimension_cols
    unique_vals = unique(skipmissing(df[!, col]))
    println("$col has $(length(unique_vals)) unique values")
    if length(unique_vals) <= 10
        println("  Values: $unique_vals")
    else
        println("  Sample values: $(first(unique_vals, 5))...")
    end
end

Step 5: Filter for Meaningful Data

Most Eurostat datasets have multiple indicators. Let's filter for actual GDP values:

# Look for GDP-related indicators.
# hasproperty tests for a column by Symbol; the original `:na_item in names(df)`
# compared a Symbol against string column names and was always false.
if hasproperty(df, :na_item)
    gdp_indicators = unique(filter(!ismissing, df.na_item))
    println("Available GDP indicators:")
    for indicator in gdp_indicators
        # isequal treats missing as false instead of propagating it into the
        # filter predicate; named n_records to avoid shadowing Base.count.
        n_records = nrow(filter(row -> isequal(row.na_item, indicator), df))
        println("  $indicator: $n_records records")
    end

    # Filter for GDP at market prices (B1GQ)
    gdp_data = filter(row ->
        !ismissing(row.na_item) &&
        occursin("B1GQ", row.na_item) &&
        !ismissing(row.value), df)

    println("\nGDP at market prices: $(nrow(gdp_data)) records")
end

Step 6: Analyze by Country

# Group GDP records by country and rank countries by average GDP.
# @isdefined guards against gdp_data never being created in Step 5, and
# hasproperty replaces `:geo in names(...)`, which compared a Symbol against
# string column names and was always false.
if @isdefined(gdp_data) && hasproperty(gdp_data, :geo) && nrow(gdp_data) > 0
    # Group by country and calculate statistics
    country_stats = combine(groupby(gdp_data, :geo),
        :value => mean => :avg_gdp,
        :value => length => :count)

    sort!(country_stats, :avg_gdp, rev=true)

    println("Top 10 countries by GDP:")
    first(country_stats, 10)
end

Tutorial 2: Time Series Analysis

In this tutorial, we'll collect data across multiple years and analyze trends.

Step 1: Collect Multi-Year Data

"""
    collect_gdp_timeseries(years; dataset_id="nama_10_gdp", indicator="B1GQ")

Fetch `dataset_id` for each year in `years`, keep the rows whose `na_item`
contains `indicator` and whose `value` is present, and return them all as one
DataFrame. Years that fail to download are reported and skipped rather than
aborting the whole collection.
"""
function collect_gdp_timeseries(years; dataset_id="nama_10_gdp", indicator="B1GQ")
    all_data = DataFrame()

    for year in years
        try
            println("Fetching data for $year...")
            yearly_data = fetch_dataset(dataset_id, year)

            # Keep only the requested indicator with actual numeric values.
            gdp_yearly = filter(row ->
                !ismissing(row.na_item) &&
                occursin(indicator, row.na_item) &&
                !ismissing(row.value), yearly_data)

            # cols=:union tolerates schema differences between years (a
            # dimension present in one year but absent in another would make
            # a plain append! throw).
            append!(all_data, gdp_yearly; cols=:union)
            println("  Added $(nrow(gdp_yearly)) GDP records")

        catch e
            println("  Failed to get data for $year: $e")
        end

        # Be nice to the API
        sleep(1)
    end

    return all_data
end

# Collect data for recent years
# Each year is a separate API request with a 1 s pause, so a 5-year range
# takes several seconds.
recent_years = 2019:2023
gdp_timeseries = collect_gdp_timeseries(recent_years)
"""
    calculate_growth_rates(data, country_code)

Compute year-over-year GDP growth rates (in percent) for `country_code`.
Returns a DataFrame with columns `country`, `year`, `gdp`, `growth_rate`,
or `nothing` when `data` contains no rows for that country.
"""
function calculate_growth_rates(data, country_code)
    # Restrict to the requested country (missing geo codes never match).
    country_data = filter(row -> !ismissing(row.geo) && row.geo == country_code, data)

    nrow(country_data) == 0 && return nothing

    # Chronological order is required for consecutive-year differences.
    sort!(country_data, :year)

    # One output row per consecutive pair of years with usable values.
    growth_data = DataFrame()
    for idx in 2:nrow(country_data)
        previous = country_data[idx - 1, :value]
        current = country_data[idx, :value]

        # Skip pairs where either value is missing or the base is zero
        # (division by zero would yield Inf).
        ismissing(previous) && continue
        ismissing(current) && continue
        previous == 0 && continue

        push!(growth_data, (
            country = country_code,
            year = country_data[idx, :year],
            gdp = current,
            growth_rate = (current - previous) / previous * 100,
        ))
    end

    return growth_data
end

# Calculate growth for major EU countries
# Eurostat geo codes for Germany, France, Italy, Spain, Netherlands.
major_countries = ["DE", "FR", "IT", "ES", "NL"]
all_growth = DataFrame()

# Accumulate per-country growth tables; countries with no data return
# `nothing` and are skipped.
for country in major_countries
    growth = calculate_growth_rates(gdp_timeseries, country)
    if growth !== nothing
        append!(all_growth, growth)
    end
end

# Show average growth by country
if nrow(all_growth) > 0
    # NOTE(review): std over a single observation is NaN — harmless for
    # display, but filter n==1 groups if growth_volatility feeds further work.
    avg_growth = combine(groupby(all_growth, :country),
        :growth_rate => mean => :avg_growth,
        :growth_rate => std => :growth_volatility)
    
    sort!(avg_growth, :avg_growth, rev=true)
    println("Average GDP growth rates:")
    avg_growth
end

Tutorial 3: Working with Population Data

Let's explore demographic data using the population dataset.

Step 1: Fetch Population Data

# Population on 1 January by age and sex
# (network call — downloads the demo_pjan dataset for 2023)
pop_data = fetch_dataset("demo_pjan", 2023)

println("Population dataset dimensions: $(size(pop_data))")

Step 2: Explore Population Structure

# Check available dimensions.
# hasproperty is the correct presence test by Symbol; the original
# `:age in names(pop_data)` compared Symbols against string column names
# and was always false, silently skipping this whole block.
if hasproperty(pop_data, :age) && hasproperty(pop_data, :sex)
    println("Age groups available:")
    age_groups = unique(filter(!ismissing, pop_data.age))
    println(first(age_groups, 10))

    println("\nSex categories:")
    sex_categories = unique(filter(!ismissing, pop_data.sex))
    println(sex_categories)
end

Step 3: Calculate Total Population by Country

# Helper to format large numbers, defined BEFORE its first use: the original
# called format_number at the ranking loop below while defining it afterwards,
# which raises UndefVarError in top-to-bottom script execution.
function format_number(n)
    if n >= 1_000_000
        return "$(round(n/1_000_000, digits=1))M"
    elseif n >= 1_000
        return "$(round(n/1_000, digits=1))K"
    else
        return string(n)
    end
end

# Filter for total population: age == "TOTAL" and sex == "T" (both sexes)
# are Eurostat aggregate codes — keep only rows with an actual value.
total_pop = filter(row ->
    !ismissing(row.age) && row.age == "TOTAL" &&
    !ismissing(row.sex) && row.sex == "T" &&
    !ismissing(row.value), pop_data)

if nrow(total_pop) > 0
    # Sort by population size, largest first.
    sort!(total_pop, :value, rev=true)

    println("Top 10 countries by population:")
    for i in 1:min(10, nrow(total_pop))
        row = total_pop[i, :]
        country = row.geo
        population = round(Int, row.value)
        println("$i. $country: $(format_number(population))")
    end
end

Step 4: Age Distribution Analysis

"""
    analyze_age_distribution(data, country_code)

Print the ten largest age groups for `country_code` (both sexes combined,
excluding the "TOTAL" aggregate row) and return the filtered table, or
`nothing` when no matching rows exist.
"""
function analyze_age_distribution(data, country_code)
    # Keep rows for this country, both-sexes totals, real age groups, with values.
    subset = filter(data) do row
        !ismissing(row.geo) && row.geo == country_code &&
        !ismissing(row.sex) && row.sex == "T" &&
        !ismissing(row.age) && row.age != "TOTAL" &&
        !ismissing(row.value)
    end

    if nrow(subset) == 0
        println("No age distribution data for $country_code")
        return nothing
    end

    # Largest groups first.
    sort!(subset, :value, rev=true)

    println("Age distribution for $country_code:")
    for entry in first(eachrow(subset), 10)
        println("  $(entry.age): $(format_number(round(Int, entry.value)))")
    end

    return subset
end

# Analyze age distribution for Germany
# NOTE(review): the guard checks total_pop (Step 3's result) only to confirm
# population data loaded; the analysis itself reads the full pop_data table.
if nrow(total_pop) > 0
    de_ages = analyze_age_distribution(pop_data, "DE")
end

Tutorial 4: Environmental Data Analysis

Let's explore environmental statistics, focusing on greenhouse gas emissions.

Step 1: Fetch Environmental Data

# Greenhouse gas emissions data
# (network call — downloads the env_air_gge dataset for 2021)
env_data = fetch_dataset("env_air_gge", 2021)

println("Environmental dataset dimensions: $(size(env_data))")

Step 2: Explore Emission Sources

# Check what emission sources are available.
# hasproperty tests column presence by Symbol; `:src_crf in names(env_data)`
# compared a Symbol against string column names and was always false.
if hasproperty(env_data, :src_crf)
    emission_sources = unique(filter(!ismissing, env_data.src_crf))
    println("Available emission sources:")
    for source in first(emission_sources, 15)
        # isequal treats missing src_crf as a non-match instead of letting
        # `missing == source` propagate into the filter predicate (TypeError);
        # named n_records to avoid shadowing Base.count.
        n_records = nrow(filter(row -> isequal(row.src_crf, source), env_data))
        println("  $source: $n_records records")
    end
end

Step 3: Total Emissions by Country

# Filter for total greenhouse gas emissions
# NOTE(review): occursin("TOTAL", ...) also matches any sub-code whose name
# contains "TOTAL" — verify against the dataset's src_crf code list if an
# exact total (no double counting) is required.
total_emissions = filter(row -> 
    !ismissing(row.src_crf) && 
    occursin("TOTAL", row.src_crf) &&
    !ismissing(row.value), env_data)

if nrow(total_emissions) > 0
    # Group by country
    country_emissions = combine(groupby(total_emissions, :geo),
        :value => sum => :total_emissions)
    
    sort!(country_emissions, :total_emissions, rev=true)
    
    println("Top emitters (CO2 equivalent):")
    first(country_emissions, 10)
end

Tutorial 5: Data Quality Assessment

Learn how to assess and handle data quality issues.

Step 1: Check for Missing Data

"""
    assess_data_quality(df, dataset_name)

Print a data-quality report for `df` — record counts, per-column missingness,
special-value codes, and overall coverage — and return only the rows that
carry an actual numeric `value`.
"""
function assess_data_quality(df, dataset_name)
    println("Data Quality Assessment for $dataset_name")
    println("=" ^ (30 + length(dataset_name)))

    # Basic statistics
    println("Total records: $(nrow(df))")
    println("Total columns: $(ncol(df))")

    # Guard the percentage computations against an empty frame (0/0 -> NaN).
    total = max(nrow(df), 1)

    # Missing values analysis
    println("\nMissing Values by Column:")
    for col in names(df)
        missing_count = sum(ismissing.(df[!, col]))
        missing_pct = round(missing_count / total * 100, digits=2)
        println("  $col: $missing_count ($missing_pct%)")
    end

    # Special values analysis. hasproperty is the correct presence test;
    # `:original_value in names(df)` compared a Symbol against string column
    # names and never took this branch.
    if hasproperty(df, :original_value)
        special_df = filter(row -> !ismissing(row.original_value), df)
        if nrow(special_df) > 0
            println("\nSpecial Value Codes:")
            for code in unique(special_df.original_value)
                # named occurrences to avoid shadowing Base.count
                occurrences = sum(isequal.(special_df.original_value, code))
                println("  '$code': $occurrences occurrences")
            end
        end
    end

    # Actual data coverage
    actual_data = filter(row -> !ismissing(row.value), df)
    coverage_pct = round(nrow(actual_data) / total * 100, digits=2)
    println("\nData Coverage: $(nrow(actual_data)) records ($coverage_pct%)")

    return actual_data
end

# Example usage
# @isdefined guards against running this step before Tutorial 1 created gdp_data.
if @isdefined(gdp_data)
    clean_gdp = assess_data_quality(gdp_data, "GDP Data")
end

Step 2: Data Validation

"""
    validate_numeric_data(df, value_col=:value)

Run sanity checks on the numeric column `value_col` of `df`: presence of the
column, negative values, extreme magnitudes, and suspiciously high precision.
Print the findings and return `true` when no issues were detected.
"""
function validate_numeric_data(df, value_col=:value)
    issues = String[]

    # Symbol(value_col) accepts either a Symbol or a string column name.
    # The original `value_col in names(df)` compared the default Symbol :value
    # against string column names, so the column was always reported missing.
    if hasproperty(df, Symbol(value_col))
        values = filter(!ismissing, df[!, value_col])

        if length(values) == 0
            push!(issues, "No numeric values found")
        else
            # Negative values (count(f, xs) does the any+sum in one pass).
            negative_count = count(v -> v < 0, values)
            if negative_count > 0
                push!(issues, "$negative_count negative values found")
            end

            # Very large values might indicate unit or parsing errors.
            extreme_count = count(v -> v > 1e12, values)
            if extreme_count > 0
                push!(issues, "$extreme_count extremely large values (>1e12)")
            end

            # More than 10% of values with sub-cent precision suggests raw,
            # unrounded source data.
            high_precision = count(v -> v != round(v, digits=2), values)
            if high_precision > length(values) * 0.1
                push!(issues, "High precision values detected (might indicate raw data)")
            end
        end
    else
        push!(issues, "Value column '$value_col' not found")
    end

    # Report results
    if isempty(issues)
        println("✓ Data validation passed")
        return true
    else
        println("⚠ Data validation issues:")
        for issue in issues
            println("  - $issue")
        end
        return false
    end
end

Tutorial 6: Creating Reusable Analysis Functions

Build functions that you can reuse across different datasets and analyses.

Step 1: Generic Data Fetcher

"""
    fetch_eurostat_robust(dataset_id, years; max_retries=3)

Fetch `dataset_id` for every year in `years`, retrying each year up to
`max_retries` times with a short pause between attempts. Return a tuple
`(results, errors)`: a Dict mapping year to its fetched DataFrame, and a
Dict mapping failed years to the last exception raised.
"""
function fetch_eurostat_robust(dataset_id, years; max_retries=3)
    # (The docstring now precedes the definition — a string literal as the
    # first statement of the body is a no-op expression, not documentation.)
    results = Dict{Int, DataFrame}()
    errors = Dict{Int, Exception}()

    for year in years
        success = false
        for attempt in 1:max_retries
            try
                df = fetch_dataset(dataset_id, year)
                results[year] = df
                println("✓ $dataset_id ($year): $(nrow(df)) records")
                success = true
                break
            catch e
                if attempt < max_retries
                    println("⚠ $dataset_id ($year) attempt $attempt failed, retrying...")
                    sleep(2)  # brief backoff before the retry
                else
                    errors[year] = e
                    println("✗ $dataset_id ($year): All attempts failed")
                end
            end
        end

        # Rate limiting between requests
        if success
            sleep(1)
        end
    end

    return results, errors
end

Step 2: Flexible Analysis Function

"""
    analyze_by_dimension(data, group_col, value_col=:value; top_n=10, sort_desc=true)

Compute summary statistics (mean, median, std, count, min, max) of
`value_col` grouped by `group_col`, print the top `top_n` groups by mean,
and return the full statistics DataFrame — or `nothing` when no complete
rows exist. Column arguments may be Symbols or strings.
"""
function analyze_by_dimension(data, group_col, value_col=:value;
                             top_n=10, sort_desc=true)
    # names(data) returns strings, so normalize with string(...) — the
    # original `group_col in names(data)` compared a Symbol against strings
    # and made this function error unconditionally.
    if !(string(group_col) in names(data)) || !(string(value_col) in names(data))
        error("Required columns not found in data")
    end

    # Filter out missing values
    clean_data = filter(row ->
        !ismissing(row[group_col]) && !ismissing(row[value_col]), data)

    if nrow(clean_data) == 0
        println("No valid data for analysis")
        return nothing
    end

    # Calculate statistics by group
    stats = combine(groupby(clean_data, group_col),
        value_col => mean => :mean_value,
        value_col => median => :median_value,
        value_col => std => :std_value,
        value_col => length => :count,
        value_col => minimum => :min_value,
        value_col => maximum => :max_value)

    # Sort results
    sort!(stats, :mean_value, rev=sort_desc)

    # Display top results
    println("Analysis by $group_col (top $top_n):")
    println("-" ^ 50)

    display_data = first(stats, min(top_n, nrow(stats)))
    for row in eachrow(display_data)
        group_val = row[group_col]
        mean_val = round(row.mean_value, digits=2)
        # named n to avoid shadowing Base.count
        n = row.count
        println("$group_val: Mean=$mean_val (n=$n)")
    end

    return stats
end

# Example usage
# @isdefined guards against running this before Tutorial 2 built gdp_timeseries.
if @isdefined(gdp_timeseries)
    country_analysis = analyze_by_dimension(gdp_timeseries, :geo)
end

Step 3: Automated Report Generator

"""
    generate_dataset_report(dataset_id, year; output_file=nothing)

Fetch `dataset_id` for `year` and print a multi-section report: overview,
data coverage, dimension cardinalities, value statistics, and top categories
per dimension. Return the fetched DataFrame, or `nothing` when the fetch
fails.
"""
function generate_dataset_report(dataset_id, year; output_file=nothing)
    # output_file is accepted for interface stability but not used yet —
    # TODO(review): implement file output or document it as reserved.
    println("Generating report for $dataset_id ($year)")
    println("=" ^ 50)

    # Fetch data. Assign through the try expression: a variable first
    # assigned *inside* a try block is local to that block in Julia, so the
    # original `df` was undefined after the try/catch (UndefVarError).
    df = try
        fetch_dataset(dataset_id, year)
    catch e
        println("Failed to fetch data: $e")
        return nothing
    end

    # Basic information
    println("\n1. Dataset Overview")
    println("   Records: $(nrow(df))")
    println("   Columns: $(ncol(df))")

    # Data quality (guard against an empty frame: 0/0 -> NaN).
    actual_data = filter(row -> !ismissing(row.value), df)
    coverage = round(nrow(actual_data) / max(nrow(df), 1) * 100, digits=1)
    println("   Data coverage: $coverage%")

    # Dimension columns are everything except the metadata columns.
    # names(df) returns strings, so the exclusion list must hold strings
    # (the original Symbol list never matched).
    metadata_cols = ["dataset", "year", "value", "original_value", "fetch_date", "original_key"]
    dimension_cols = filter(col -> !(col in metadata_cols), names(df))

    println("\n2. Dimensions Analysis")
    for col in dimension_cols
        unique_count = length(unique(skipmissing(df[!, col])))
        println("   $col: $unique_count categories")
    end

    # Value statistics
    if nrow(actual_data) > 0
        values = actual_data.value
        println("\n3. Value Statistics")
        println("   Min: $(minimum(values))")
        println("   Max: $(maximum(values))")
        println("   Mean: $(round(mean(values), digits=2))")
        println("   Median: $(round(median(values), digits=2))")
    end

    # Top categories for each dimension (limit to the first 3 dimensions)
    println("\n4. Top Categories by Dimension")
    for col in dimension_cols[1:min(3, length(dimension_cols))]
        println("   $col:")
        analyze_by_dimension(actual_data, col, top_n=5)
    end

    return df
end

# Example usage
# report_data = generate_dataset_report("nama_10_gdp", 2022)

These tutorials provide a solid foundation for working with EurostatAPI.jl across different types of analyses and datasets. Each tutorial builds upon the previous ones, showing progressively more advanced techniques for data fetching, analysis, and quality assessment.