Examples

This page provides comprehensive examples of using EurostatAPI.jl to access and analyze Eurostat data.

Basic Usage Examples

Fetching a Simple Dataset

using EurostatAPI
using DataFrames

# Fetch European GDP data for 2022
df = fetch_dataset("nama_10_gdp", 2022)

# Examine the structure
println("Dataset dimensions: $(size(df))")
println("Column names: $(names(df))")

# Show first few rows
first(df, 5)

Exploring Dataset Structure

# Check what dimensions are available
dimension_cols = filter(name -> !(name in ["dataset", "year", "value", "original_value", "fetch_date", "original_key"]), names(df))
println("Available dimensions: ", dimension_cols)

# Count unique values in each dimension
for col in dimension_cols
    unique_count = length(unique(skipmissing(df[!, col])))
    println("$col: $unique_count unique values")
end

Working with Different Dataset Types

National Accounts Data

# GDP and main components
gdp_data = fetch_dataset("nama_10_gdp", 2022)

# Filter for GDP at market prices for EU countries
if "na_item" in names(gdp_data) && "geo" in names(gdp_data)
    eu_gdp = filter(row -> 
        !ismissing(row.na_item) && 
        occursin("B1GQ", row.na_item) &&  # GDP at market prices
        !ismissing(row.geo) &&
        length(row.geo) == 2,  # Country codes are 2 letters
        gdp_data)
    
    println("EU GDP data: $(nrow(eu_gdp)) records")
end

Population Data

# Population on 1 January by age and sex
pop_data = fetch_dataset("demo_pjan", 2023)

# Examine population structure
if "age" in names(pop_data) && "sex" in names(pop_data)
    # Total population by country
    total_pop = filter(row -> 
        !ismissing(row.age) && row.age == "TOTAL" &&
        !ismissing(row.sex) && row.sex == "T",  # Total (both sexes)
        pop_data)
    
    println("Total population records: $(nrow(total_pop))")
end

Environmental Data

# Greenhouse gas emissions
emission_data = fetch_dataset("env_air_gge", 2021)

# Air quality data
if haskey(ENV, "FETCH_AIR_QUALITY")  # Optional example
    air_data = fetch_dataset("env_air_qa", 2022)
end

Time Series Analysis

Multi-Year Data Collection

function collect_time_series(dataset_id, years)
    all_data = DataFrame()
    successful_years = Int[]
    
    for year in years
        try
            yearly_data = fetch_dataset(dataset_id, year)
            if nrow(yearly_data) > 0
                append!(all_data, yearly_data)
                push!(successful_years, year)
                println("✓ Year $year: $(nrow(yearly_data)) records")
            else
                println("⚠ Year $year: No data available")
            end
        catch e
            println("✗ Year $year: Error - $e")
        end
        
        # Be respectful to the API
        sleep(1)
    end
    
    println("\nSummary:")
    println("Successfully collected: $(length(successful_years)) years")
    println("Total records: $(nrow(all_data))")
    println("Years: $(successful_years)")
    
    return all_data, successful_years
end

# Collect GDP data for recent years
gdp_timeseries, years_collected = collect_time_series("nama_10_gdp", 2019:2023)

Trend Analysis

using Statistics, StatsBase

function analyze_country_trends(data, country_code, indicator)
    # Filter for specific country and indicator
    country_data = filter(row -> 
        !ismissing(row.geo) && row.geo == country_code &&
        !ismissing(row.na_item) && occursin(indicator, row.na_item) &&
        !ismissing(row.value),
        data)
    
    if nrow(country_data) == 0
        println("No data found for $country_code with indicator $indicator")
        return nothing
    end
    
    # Sort by year
    sort!(country_data, :year)
    
    # Calculate year-over-year growth
    if nrow(country_data) > 1
        values = country_data.value
        years = country_data.year
        
        growth_rates = Float64[]
        for i in 2:length(values)
            if values[i-1] != 0
                growth = (values[i] - values[i-1]) / values[i-1] * 100
                push!(growth_rates, growth)
            end
        end
        
        println("$country_code GDP trends:")
        println("Years: $(minimum(years))-$(maximum(years))")
        println("Average growth: $(round(mean(growth_rates), digits=2))%")
        println("Growth volatility (std): $(round(std(growth_rates), digits=2))%")
        
        return country_data, growth_rates
    end
    
    return country_data, nothing
end

# Example analysis for Germany
if nrow(gdp_timeseries) > 0
    de_analysis = analyze_country_trends(gdp_timeseries, "DE", "B1GQ")
end

Data Quality and Validation

Checking for Missing Data

using StatsBase  # provides countmap for the special-value counts below

function data_quality_report(df)
    println("Data Quality Report")
    println("==================")
    
    # Overall statistics
    println("Total records: $(nrow(df))")
    println("Total columns: $(ncol(df))")
    
    # Missing value analysis
    println("\nMissing Value Analysis:")
    for col in names(df)
        missing_count = sum(ismissing.(df[!, col]))
        missing_pct = round(missing_count / nrow(df) * 100, digits=2)
        println("$col: $missing_count missing ($(missing_pct)%)")
    end
    
    # Special value analysis
    if "original_value" in names(df)
        special_values = filter(row -> !ismissing(row.original_value), df)
        if nrow(special_values) > 0
            println("\nSpecial Values:")
            value_counts = countmap(special_values.original_value)
            for (val, count) in sort(collect(value_counts), by=x->x[2], rev=true)
                println("'$val': $count occurrences")
            end
        end
    end
    
    # Actual data records
    actual_data = filter(row -> !ismissing(row.value), df)
    println("\nActual data records: $(nrow(actual_data)) ($(round(nrow(actual_data)/nrow(df)*100, digits=2))%)")
    
    return actual_data
end

# Run quality report
if @isdefined(gdp_data)
    clean_data = data_quality_report(gdp_data)
end

Data Validation

function validate_eurostat_data(df)
    issues = String[]
    
    # Check required columns
    required_cols = ["dataset", "year", "value", "fetch_date"]
    for col in required_cols
        if !(col in names(df))
            push!(issues, "Missing required column: $col")
        end
    end
    
    # Check data types
    if "year" in names(df) && !all(x -> ismissing(x) || isa(x, Integer), df.year)
        push!(issues, "Year column contains non-integer values")
    end
    
    # Check for completely empty dataset
    if nrow(df) == 0
        push!(issues, "Dataset is empty")
    end
    
    # Check for reasonable value ranges
    if "value" in names(df)
        numeric_values = filter(!ismissing, df.value)
        if length(numeric_values) > 0
            if any(x -> isa(x, Number) && (x < -1e12 || x > 1e12), numeric_values)
                push!(issues, "Some values are outside reasonable range")
            end
        end
    end
    
    # Report results
    if isempty(issues)
        println("✓ Data validation passed")
        return true
    else
        println("⚠ Data validation issues found:")
        for issue in issues
            println("  - $issue")
        end
        return false
    end
end
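
The validator returns a Bool, so it can gate downstream processing. A minimal usage sketch, reusing the gdp_data frame fetched in the National Accounts example:

# Validate a previously fetched dataset before analysing it
if @isdefined(gdp_data)
    is_valid = validate_eurostat_data(gdp_data)
    println("Proceed with analysis: $is_valid")
end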

Advanced Analysis Examples

Cross-Country Comparisons

function compare_countries(data, indicator, countries, year_range=nothing)
    # Filter for specific indicator
    indicator_data = filter(row -> 
        !ismissing(row.na_item) && occursin(indicator, row.na_item) &&
        !ismissing(row.geo) && row.geo in countries &&
        !ismissing(row.value),
        data)
    
    if nrow(indicator_data) == 0
        println("No data found for comparison")
        return nothing
    end
    
    # Apply year filter if specified
    if year_range !== nothing
        indicator_data = filter(row -> row.year in year_range, indicator_data)
    end
    
    # Calculate statistics by country
    country_stats = combine(groupby(indicator_data, :geo), 
        :value => mean => :avg_value,
        :value => std => :std_value,
        :value => length => :count,
        :year => extrema => :year_range)
    
    sort!(country_stats, :avg_value, rev=true)
    
    println("Country Comparison - $indicator")
    println("="^50)
    for row in eachrow(country_stats)
        println("$(row.geo): Avg = $(round(row.avg_value, digits=2)), " *
                "Std = $(round(row.std_value, digits=2)), " *
                "Years: $(row.year_range[1])-$(row.year_range[2]) ($(row.count) obs)")
    end
    
    return country_stats
end

# Compare GDP across major EU economies
if @isdefined(gdp_timeseries)
    major_eu = ["DE", "FR", "IT", "ES", "NL", "PL"]
    comparison = compare_countries(gdp_timeseries, "B1GQ", major_eu)
end

Regional Analysis

function analyze_by_region(data)
    # Define regional groupings (simplified)
    regions = Dict(
        "Western Europe" => ["DE", "FR", "IT", "ES", "NL", "BE", "AT", "CH"],
        "Northern Europe" => ["SE", "DK", "FI", "NO", "IS"],
        "Eastern Europe" => ["PL", "CZ", "HU", "SK", "SI", "HR", "RO", "BG"],
        "Southern Europe" => ["GR", "PT", "CY", "MT"]
    )
    
    regional_data = DataFrame()
    
    for (region_name, countries) in regions
        region_subset = filter(row -> 
            !ismissing(row.geo) && row.geo in countries &&
            !ismissing(row.value),
            data)
        
        if nrow(region_subset) > 0
            # Add region column
            region_subset[!, :region] .= region_name
            append!(regional_data, region_subset)
        end
    end
    
    if nrow(regional_data) > 0
        # Regional statistics
        regional_stats = combine(groupby(regional_data, [:region, :year]),
            :value => mean => :avg_value,
            :value => sum => :total_value,
            :value => length => :country_count)
        
        return regional_stats
    end
    
    return nothing
end
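
A short usage sketch, assuming the gdp_timeseries frame collected earlier is still in scope:

# Aggregate the multi-year GDP data by region
if @isdefined(gdp_timeseries) && nrow(gdp_timeseries) > 0
    regional_stats = analyze_by_region(gdp_timeseries)
    if regional_stats !== nothing
        println(first(regional_stats, 10))
    end
end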

Error Handling and Robustness

Comprehensive Error Handling

function robust_fetch(dataset, year; max_retries=3, delay=5)
    for attempt in 1:max_retries
        try
            println("Attempt $attempt for $dataset ($year)...")
            df = fetch_dataset(dataset, year)
            
            # Validate the result
            if nrow(df) == 0
                println("Warning: Empty dataset received")
                return df  # Return empty but valid DataFrame
            end
            
            println("Success: $(nrow(df)) records retrieved")
            return df
            
        catch e
            println("Attempt $attempt failed: $e")
            
            if attempt < max_retries
                println("Waiting $(delay) seconds before retry...")
                sleep(delay)
            else
                println("All attempts failed for $dataset ($year)")
                rethrow(e)
            end
        end
    end
end

# Usage with error handling
function safe_multi_fetch(datasets, year)
    results = Dict{String, DataFrame}()
    errors = Dict{String, Exception}()
    
    for dataset in datasets
        try
            df = robust_fetch(dataset, year)
            results[dataset] = df
        catch e
            errors[dataset] = e
        end
    end
    
    println("\nSummary:")
    println("Successful: $(length(results)) datasets")
    println("Failed: $(length(errors)) datasets")
    
    if !isempty(errors)
        println("\nErrors:")
        for (dataset, error) in errors
            println("$dataset: $error")
        end
    end
    
    return results, errors
end

# Example usage
test_datasets = ["nama_10_gdp", "demo_pjan", "env_air_gge"]
results, errors = safe_multi_fetch(test_datasets, 2022)

Monitoring and Logging

using Logging, Dates  # Dates provides now() for log timestamps

function fetch_with_detailed_logging(dataset, year)
    # Create a custom logger
    logger = ConsoleLogger(stderr, Logging.Info)
    
    with_logger(logger) do
        @info "Starting fetch" dataset=dataset year=year timestamp=now()
        
        start_time = time()
        
        try
            df = fetch_dataset(dataset, year)
            
            elapsed = time() - start_time
            @info "Fetch completed successfully" rows=nrow(df) columns=ncol(df) elapsed_seconds=round(elapsed, digits=2)
            
            # Log data quality metrics
            actual_values = count(!ismissing, df.value)
            @info "Data quality" total_records=nrow(df) actual_values=actual_values missing_rate=round((nrow(df)-actual_values)/nrow(df)*100, digits=2)
            
            return df
            
        catch e
            elapsed = time() - start_time
            @error "Fetch failed" exception=e elapsed_seconds=round(elapsed, digits=2)
            rethrow(e)
        end
    end
end
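
Calling it works just like fetch_dataset, with log records written to stderr; for example:

# Fetch GDP data with structured logging of timing and data quality
df_logged = fetch_with_detailed_logging("nama_10_gdp", 2022)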

Integration Examples

Combining with Other Packages

# Integration with Plots.jl for visualization
using Plots

function plot_time_series(data, country, indicator)
    country_data = filter(row -> 
        !ismissing(row.geo) && row.geo == country &&
        !ismissing(row.na_item) && occursin(indicator, row.na_item) &&
        !ismissing(row.value),
        data)
    
    if nrow(country_data) == 0
        println("No data to plot")
        return nothing
    end
    
    sort!(country_data, :year)
    
    plot(country_data.year, country_data.value,
         title="$indicator for $country",
         xlabel="Year",
         ylabel="Value",
         marker=:circle,
         linewidth=2)
end
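
For instance, plotting German GDP at market prices from the multi-year collection above (assuming gdp_timeseries is available):

# Plot the B1GQ series for Germany
if @isdefined(gdp_timeseries) && nrow(gdp_timeseries) > 0
    plot_time_series(gdp_timeseries, "DE", "B1GQ")
end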

# Integration with CSV.jl for data export
using CSV

function export_to_csv(data, filename)
    # Clean up the data for export
    export_data = select(data, Not([:fetch_date, :original_key]))
    
    CSV.write(filename, export_data)
    println("Data exported to $filename")
    println("Exported $(nrow(export_data)) rows, $(ncol(export_data)) columns")
end
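
A quick usage sketch; the output filename is arbitrary:

# Export the collected time series, dropping internal bookkeeping columns
if @isdefined(gdp_timeseries) && nrow(gdp_timeseries) > 0
    export_to_csv(gdp_timeseries, "gdp_timeseries.csv")
end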

Batch Processing Pipeline

function eurostat_pipeline(datasets, years, output_dir="eurostat_data")
    # Create output directory
    if !isdir(output_dir)
        mkdir(output_dir)
    end
    
    results_summary = DataFrame(
        dataset = String[],
        year = Int[],
        status = String[],
        records = Int[],
        filename = String[]
    )
    
    for dataset in datasets
        for year in years
            try
                # Fetch data
                df = fetch_dataset(dataset, year)
                
                # Generate filename
                filename = joinpath(output_dir, "$(dataset)_$(year).csv")
                
                # Export to CSV
                CSV.write(filename, df)
                
                # Record success
                push!(results_summary, (dataset, year, "SUCCESS", nrow(df), filename))
                
                println("✓ $dataset ($year): $(nrow(df)) records → $filename")
                
            catch e
                # Record failure
                push!(results_summary, (dataset, year, "FAILED", 0, ""))
                
                println("✗ $dataset ($year): $e")
            end
            
            # Rate limiting
            sleep(2)
        end
    end
    
    # Save summary
    summary_file = joinpath(output_dir, "processing_summary.csv")
    CSV.write(summary_file, results_summary)
    
    println("\nPipeline completed. Summary saved to $summary_file")
    return results_summary
end

# Example pipeline execution
# datasets_to_process = ["nama_10_gdp", "demo_pjan"]
# years_to_process = 2020:2022
# summary = eurostat_pipeline(datasets_to_process, years_to_process)

Performance Optimization Examples

Memory-Efficient Processing

function process_large_dataset_efficiently(dataset, year)
    println("Processing $dataset ($year) with memory monitoring...")
    
    # Monitor initial memory
    initial_memory = Base.gc_bytes()
    
    # Fetch data
    df = fetch_dataset(dataset, year)
    
    # Monitor peak memory
    peak_memory = Base.gc_bytes()
    
    # Process in chunks if large
    if nrow(df) > 100_000
        println("Large dataset detected. Processing in chunks...")
        
        chunk_size = 50_000
        processed_chunks = 0
        
        for start_idx in 1:chunk_size:nrow(df)
            end_idx = min(start_idx + chunk_size - 1, nrow(df))
            chunk = df[start_idx:end_idx, :]
            
            # Process chunk (example: just count non-missing values)
            non_missing = count(!ismissing, chunk.value)
            
            processed_chunks += 1
            if processed_chunks % 5 == 0
                println("Processed $processed_chunks chunks...")
                # Force garbage collection periodically
                GC.gc()
            end
        end
    end
    
    # Allocation counter after processing
    bytes_after_processing = Base.gc_bytes()
    
    println("Allocation summary:")
    println("  During fetch: $((bytes_after_fetch - bytes_before_fetch) ÷ 1024^2) MB")
    println("  During processing: $((bytes_after_processing - bytes_after_fetch) ÷ 1024^2) MB")
    
    return df
end
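
A short usage sketch with one of the datasets used earlier:

# Track allocations while fetching and processing GDP data
df_tracked = process_large_dataset_efficiently("nama_10_gdp", 2022)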

Filtering and Chunking Large Datasets

Using Filters to Reduce Response Size

The fetch_dataset function now supports filtering to reduce data transfer and improve performance:

# Filter by geographic regions
df_germany = fetch_dataset("nama_10_gdp", 2022; geo=["DE"])
df_european = fetch_dataset("nama_10_gdp", 2022; geo=["EU27_2020", "DE", "FR", "IT", "ES"])

# Filter PRODCOM data by indicators and product codes
df_quantity = fetch_dataset("DS-056120", 2022; indicators=["PRODQNT", "QNTUNIT"])
df_products = fetch_dataset("DS-056120", 2022; prodcode=["10110000", "10120000", "10130000"])

# Combine multiple filters
df_filtered = fetch_dataset("DS-056120", 2022;
                           indicators=["PRODQNT"],
                           geo=["EU27_2020"],
                           prodcode=["10110000", "10120000"])

Automatic Fallback for Large Datasets

The fetch_with_fallback function automatically handles 413 (Request Entity Too Large) errors:

# Automatically adds filters if response is too large
df = fetch_with_fallback("DS-056120", 2022)

# You can still specify your own filters
df_custom = fetch_with_fallback("DS-056120", 2022; geo=["DE", "FR"])

# Handle errors gracefully (HTTP.jl must be loaded for HTTP.StatusError)
using HTTP

try
    df = fetch_dataset("DS-056120", 2022)
catch e
    if isa(e, HTTP.StatusError) && e.status == 413
        println("Dataset too large, using fallback...")
        df = fetch_with_fallback("DS-056120", 2022)
    else
        rethrow(e)
    end
end

Chunked Fetching for Very Large Datasets

For extremely large datasets, use fetch_dataset_chunked:

# Fetch PRODCOM data in chunks of 100 product codes
df_chunked = fetch_dataset_chunked("DS-056120", 2022;
                                  chunk_by=:prodcode,
                                  chunk_size=100)

# Chunk by geographic regions
df_geo_chunks = fetch_dataset_chunked("nama_10_gdp", 2022;
                                     chunk_by=:geo,
                                     chunk_size=10)

# Combine chunking with filtering
df_filtered_chunks = fetch_dataset_chunked("DS-056120", 2022;
                                         chunk_by=:prodcode,
                                         chunk_size=50,
                                         indicators=["PRODQNT"])

Getting Dataset Metadata

Query available dimensions before fetching data:

# Get metadata for a dataset
metadata = get_dataset_metadata("DS-056120")

println("Number of product codes: $(length(metadata.prodcodes))")
println("Number of geographic codes: $(length(metadata.geo_codes))")
println("Available indicators: $(metadata.indicators)")

# Use metadata to select specific codes
if !isempty(metadata.prodcodes)
    # Get first 10 product codes
    selected_codes = metadata.prodcodes[1:min(10, length(metadata.prodcodes))]
    df = fetch_dataset("DS-056120", 2022; prodcode=selected_codes)
end

Smart Fetching Strategy

Combine metadata queries with appropriate fetch methods:

function fetch_prodcom_smart(year::Int; max_products::Int=100)
    dataset = "DS-056120"
    
    # First, get metadata to understand the dataset
    metadata = get_dataset_metadata(dataset)
    
    if isempty(metadata.prodcodes)
        # No product codes found, use fallback
        return fetch_with_fallback(dataset, year)
    elseif length(metadata.prodcodes) > max_products
        # Large dataset, use chunking
        return fetch_dataset_chunked(dataset, year;
                                   chunk_by=:prodcode,
                                   chunk_size=max_products,
                                   indicators=["PRODQNT", "QNTUNIT"])
    else
        # Small enough to fetch directly
        return fetch_dataset(dataset, year;
                           indicators=["PRODQNT", "QNTUNIT"])
    end
end

# Use the smart fetching function
df = fetch_prodcom_smart(2022)

Memory-Efficient Processing with Chunks

Process chunks as they arrive to minimize memory usage:

function process_prodcom_streaming(year::Int)
    metadata = get_dataset_metadata("DS-056120")
    chunk_size = 50
    results = DataFrame[]
    
    for i in 1:chunk_size:length(metadata.prodcodes)
        chunk_codes = metadata.prodcodes[i:min(i+chunk_size-1, end)]
        
        try
            chunk_df = fetch_dataset("DS-056120", year; prodcode=chunk_codes)
            
            # Process chunk immediately to save memory
            summary = DataFrame(
                prodcode = chunk_codes,
                total_quantity = [sum(skipmissing(chunk_df.value[chunk_df.prodcode .== code])) 
                                 for code in chunk_codes]
            )
            
            push!(results, summary)
            
            # Allow garbage collection between chunks
            GC.gc()
        catch e
            @warn "Failed to process chunk: $e"
        end
    end
    
    return isempty(results) ? DataFrame() : vcat(results...)
end
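
Usage is a single call; the result has one row per product code (a sketch, assuming the PRODCOM dataset used above):

# Stream PRODCOM chunks, keeping only per-product totals in memory
quantity_summary = process_prodcom_streaming(2022)
println("Summarized $(nrow(quantity_summary)) product codes")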

These examples demonstrate the versatility and robustness of EurostatAPI.jl for various data analysis scenarios. The package handles the complexity of the Eurostat API while providing clean, analyzable data for your research and analysis needs.